用phpSpider爬虫采集糗事百科(改进版)
接着上一篇《用phpSpider爬虫采集糗事百科》程序的改进版。测试时发现 phpSpider 在单次执行任务时自带 URL 去重,但如果重复执行程序,则会重复抓取已采集过的页面。因此我们需要新建一个 url 字段来保存历史抓取记录,每次入库前先判断该 URL 是否已经抓取过即可。
在 content 表中新建一个存放页面地址的字段:url
核心文件 test.php
<?php
declare(strict_types=1);

/**
 * phpSpider crawler for qiushibaike.com.
 *
 * Improvement over the previous version: a `url` column in the `content`
 * table records every page already scraped, and on_extract_page skips any
 * URL found there, so re-running the script does not insert duplicates.
 */
require_once __DIR__ . '/../autoloader.php';

use phpspider\core\phpspider;
use phpspider\core\db;
use phpspider\core\log;

/* Do NOT delete this comment */
/* 不要删除这段注释 */

$configs = array(
    'name' => '糗事百科',
    //'log_show' => true,
    //'log_type' => 'error,debug',
    //'multiserver' => true,
    //'serverid' => 1,
    //'tasknum' => 3,
    //'save_running_state' => true,
    //'input_encoding' => 'utf-8',
    //'max_depth' => 3,
    'domains' => array(
        'qiushibaike.com',
        'www.qiushibaike.com'
    ),
    'scan_urls' => array(
        'http://www.qiushibaike.com/'
    ),
    'content_url_regexes' => array(
        "http://www.qiushibaike.com/article/\d+"
    ),
    'list_url_regexes' => array(
        "http://www.qiushibaike.com/hot/page/\d+\?s=\d+"
    ),
    'fields' => array(
        array(
            // Article body extracted from the content page.
            'name' => "body",
            'selector' => "//*[@id='single-next-link']/div",
            'required' => true
        ),
        array(
            // Article author extracted from the content page.
            'name' => "author",
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true
        ),
        array(
            'name' => "url",
            // Placeholder selector; the real value is substituted in the
            // on_extract_field callback below with the current page URL.
            'selector' => "//div[contains(@class,'author1')]//h2",
            'required' => true,
        ),
    ),
    'max_try' => 5,
    'proxies' => array(
        'http://H784U84R444YABQD:[email protected]:9010'
    ),
    'export' => array(
        'type' => 'db',
        'table' => 'content',
    ),
    'db_config' => array(
        'host' => '127.0.0.1',
        'port' => 3306,
        'user' => 'root',
        'pass' => 'kinglife',
        'name' => 'phpspider',
    ),
);

// Database configuration and connection.
db::set_connect('default', $configs['db_config']);
db::init_mysql();

$spider = new phpspider($configs);

// Replace the dummy "url" field value with the current content-page URL.
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname == 'url') {
        $data = $page['url'];
    }
    return $data;
};

// Skip pages whose URL is already stored in the `content` table.
$spider->on_extract_page = function ($page, $data) {
    // db::get_one() offers no prepared statements, so escape the URL
    // before interpolating it into the query (guards against SQL injection
    // and against quotes breaking the statement).
    $url = addslashes($data['url']);
    $row = db::get_one("SELECT * FROM `content` WHERE `url`='{$url}'");
    // BUGFIX: the original used count($row) > 0, but db::get_one() returns
    // false/null when no row matches — count() on a non-Countable is a
    // fatal TypeError on PHP 8, crashing on every NEW url. empty() handles
    // false, null, and an empty array safely.
    if (!empty($row)) {
        log::info("Page url exists!");
        return false;
    }
    return $data;
};

$spider->start();