Crawling Qiushibaike with the phpSpider Crawler (Improved Version)
This is an improved version of the program from the previous article, "Crawling Qiushibaike with the phpSpider Crawler". During testing we found that phpSpider deduplicates URLs within a single run, but when the program is run again it re-crawls pages it has already collected. To avoid this, we add a url field to record every crawled page address, and before each insert we simply check whether that URL has already been crawled.
Add a new field named url to the content table to store the page address, as sketched below.
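The article does not show the exact DDL, so the following is only a minimal sketch of the change; the column name url comes from the article, while the column type, length, and the optional index are assumptions:

-- Hypothetical DDL: add a url column to the existing content table
-- (type and length are assumptions; only the column name is fixed by the article)
ALTER TABLE `content` ADD COLUMN `url` VARCHAR(255) NOT NULL DEFAULT '';
-- Optional: an index speeds up the SELECT ... WHERE `url`='...' lookup used below
ALTER TABLE `content` ADD INDEX `idx_url` (`url`);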
Core file: test.php
<?php
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\db;
use phpspider\core\log;
/* Do NOT delete this comment */
/* 不要删除这段注释 */
$configs = array(
'name' => '糗事百科',
//'log_show' => true,
//'log_type' => 'error,debug',
//'multiserver' => true,
//'serverid' => 1,
//'tasknum' => 3,
//'save_running_state' => true,
//'input_encoding' => 'utf-8',
//'max_depth' => 3,
'domains' => array(
'qiushibaike.com',
'www.qiushibaike.com'
),
'scan_urls' => array(
'http://www.qiushibaike.com/'
),
'content_url_regexes' => array(
"http://www.qiushibaike.com/article/\d+"
),
'list_url_regexes' => array(
"http://www.qiushibaike.com/hot/page/\d+\?s=\d+"
),
'fields' => array(
array(
// Extract the article body from the content page
'name' => "body",
'selector' => "//*[@id='single-next-link']/div",
'required' => true
),
array(
// Extract the article author from the content page
'name' => "author",
'selector' => "//div[contains(@class,'author')]//h2",
'required' => true
),
array(
'name' => "url",
'selector' => "//div[contains(@class,'author1')]//h2", // placeholder selector; the real value is filled in by the on_extract_field callback
'required' => true,
),
),
'max_try' => 5,
'proxies' => array(
'http://H784U84R444YABQD:[email protected]:9010'
),
'export' => array(
'type' => 'db',
'table' => 'content',
),
'db_config' => array(
'host' => '127.0.0.1',
'port' => 3306,
'user' => 'root',
'pass' => 'kinglife',
'name' => 'phpspider',
),
);
// Database configuration
db::set_connect('default', $configs['db_config']);
// Initialize the database connection
db::init_mysql();
$spider = new phpspider($configs);
$spider->on_extract_field = function($fieldname, $data, $page)
{
// Replace this field's value with the URL of the current content page
if ($fieldname == 'url')
{
$data = $page['url'];
}
return $data;
};
$spider->on_extract_page = function($page, $data)
{
// Check whether this URL has already been saved; if so, skip exporting the page
$url = $data['url'];
$row = db::get_one("SELECT * FROM `content` WHERE `url`='{$url}'");
if (!empty($row))
{
log::info("Page url exists!");
return false;
}
return $data;
};
$spider->start();
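With this change in place, the spider can be re-run safely: pages whose URL already exists in the content table are logged as "Page url exists!" and skipped before export, while new pages are stored as before. Run it from the command line as usual, for example with php test.php.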