用phpSpider爬虫采集糗事百科(改进版)

爬虫技术PHP 1044

本文是上一篇《用phpSpider爬虫采集糗事百科》程序的改进版。测试时发现,phpSpider在单次执行任务时自带URL去重,但如果重复执行程序,仍会重复抓取已经采集过的页面。因此我们需要新建一个url字段来保存历史抓取记录,每次入库前先判断该URL是否已经抓取过即可。

content表中新建存放页面地址的字段: url

核心文件 test.php

<?php
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\db;
use phpspider\core\log;

/* Do NOT delete this comment */
/* 不要删除这段注释 */


// Spider configuration passed to the phpspider constructor and, for the
// 'db_config' entry, to db::set_connect().
$configs = array(
    'name' => '糗事百科',

    // Optional settings, left disabled:
    //'log_show' => true,
    //'log_type' => 'error,debug',
    //'multiserver' => true,
    //'serverid' => 1,
    //'tasknum' => 3,
    //'save_running_state' => true,
    //'input_encoding' => 'utf-8',
    //'max_depth' => 3,

    'domains' => array(
        'qiushibaike.com',
        'www.qiushibaike.com',
    ),
    'scan_urls' => array(
        'http://www.qiushibaike.com/',
    ),
    // URL patterns are regular expressions, so the dots in the host name are
    // escaped; an unescaped "." would match any character and accept
    // look-alike hosts such as "wwwXqiushibaike.com".
    'content_url_regexes' => array(
        'http://www\.qiushibaike\.com/article/\d+',
    ),
    'list_url_regexes' => array(
        'http://www\.qiushibaike\.com/hot/page/\d+\?s=\d+',
    ),
    'fields' => array(
        array(
            // Article body extracted from the content page.
            'name' => "body",
            'selector' => "//*[@id='single-next-link']/div",
            'required' => true,
        ),
        array(
            // Article author extracted from the content page.
            'name' => "author",
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
        array(
            // Placeholder selector: the value of this field is replaced with
            // the page URL in the on_extract_field callback.
            'name' => "url",
            'selector' => "//div[contains(@class,'author1')]//h2",
            'required' => true,
        ),
    ),
    'max_try' => 5,
    // NOTE(review): the credential/host part of this proxy URL appears to have
    // been mangled when the article was published (it reads "[email protected]").
    // Replace it with the real proxy endpoint before running, and keep the
    // credentials out of version control.
    'proxies' => array(
        'http://H784U84R444YABQD:[email protected]:9010',
    ),
    // Extracted records are exported to the `content` database table.
    'export' => array(
        'type' => 'db',
        'table' => 'content',
    ),
    // NOTE(review): hard-coded database credentials; prefer loading them from
    // the environment or an untracked config file.
    'db_config' => array(
        'host'  => '127.0.0.1',
        'port'  => 3306,
        'user'  => 'root',
        'pass'  => 'kinglife',
        'name'  => 'phpspider',
    ),
);

// Register the database settings from $configs['db_config'] under the
// connection name 'default'.
db::set_connect('default', $configs['db_config']);
// Initialise the MySQL connection (presumably using the settings registered
// above — this must run before the callbacks below issue queries).
db::init_mysql();

// Create the spider from the configuration array.
$spider = new phpspider($configs);

/**
 * Field-extraction callback.
 *
 * The 'url' field is configured with a placeholder selector; its value is
 * substituted here with the URL of the page being processed. Every other
 * field is passed through unchanged.
 *
 * @param mixed $fieldname Name of the field being extracted.
 * @param mixed $data      Value produced by the field's selector.
 * @param mixed $page      Page descriptor; its 'url' entry is read here.
 * @return mixed The page URL for the 'url' field, otherwise $data untouched.
 */
$spider->on_extract_field = function($fieldname, $data, $page)
{
    // Anything that is not the 'url' field keeps its extracted value.
    if ($fieldname != 'url')
    {
        return $data;
    }

    return $page['url'];
};


/**
 * Page-extraction callback: skip pages whose URL is already stored.
 *
 * Looks up the extracted 'url' field in the `content` table. When a row with
 * the same URL exists, an info message is logged and false is returned;
 * otherwise the extracted data is returned unchanged.
 *
 * @param mixed $page Page descriptor supplied by the spider (unused here).
 * @param mixed $data Extracted fields; the 'url' entry is read when present.
 * @return mixed false when the URL is already stored, otherwise $data.
 */
$spider->on_extract_page = function($page, $data)
{
    // Without a URL there is nothing to de-duplicate against; avoid an
    // undefined-index notice and a pointless query.
    if (!isset($data['url']) || $data['url'] === '')
    {
        return $data;
    }

    // The URL comes from a crawled page, so it is untrusted: escape quotes
    // and backslashes before embedding it in the SQL string.
    // NOTE(review): addslashes() is a stop-gap; switch to the driver's own
    // escaping or a prepared statement if the db layer exposes one.
    $url = addslashes($data['url']);

    // Only existence matters, so fetch a single small column instead of the
    // whole row.
    $row = db::get_one("SELECT `url` FROM `content` WHERE `url`='{$url}'");

    // empty() copes with the null/false a lookup yields when no row matches;
    // count() on such a value warns on PHP 7.2+ and throws on PHP 8.
    if (!empty($row))
    {
        log::info("Page url exists!");
        // NOTE(review): false is kept from the original as the "skip" signal;
        // confirm how the installed phpspider version treats a non-array
        // return value.
        return false;
    }

    return $data;
};

// Run the spider now that the configuration and both callbacks are in place.
$spider->start();

Post Comment