<delect id="sj01t"></delect>
  1. <em id="sj01t"><label id="sj01t"></label></em>
  2. <div id="sj01t"></div>
    1. <em id="sj01t"></em>

            <div id="sj01t"></div>

            PHP可以靈活配置使用的采集器

            時間:2025-08-14 16:36:52 php語言

            PHP可以靈活配置使用的采集器

              如何用PHP編寫一個可以靈活配置的采集器?下面就跟隨百分網小編一起了解一下吧。想了解更多相關信息,請持續關注應屆畢業生考試網!

              代碼:

              <?php

              /**
               * Configurable web scraper ("gatherer").
               *
               * Author:  Rain
               * Created: 2015-02-03 15:17:30
               * Version: V1.0
               */

              // ---------------------------------------------------------------------------
              // Configuration
              //
              // NOTE(review): the original explanatory comments in this section were lost
              // (each one had been reduced to an unparseable "/pic/..." fragment). The
              // comments below are rewritten from how the Gather class uses each setting.
              // ---------------------------------------------------------------------------

              // Database connection settings and the table that receives the scraped rows.
              define('DB_HOST', 'localhost');
              define('DB_USER', 'root');
              define('DB_PWD', 'test123456');
              define('DB_NAME', 'test_dbname');
              define('DB_CHARSET', 'utf8');
              define('TABLE_NAME', 'tb_book');

              // Character set of the scraped site; fetched pages are converted to utf-8
              // when this is anything other than 'utf-8'.
              define('WEB_CHARSET', 'gbk');

              // List-page URL template; %d is replaced with the page number via sprintf().
              // NOTE(review): the scheme/host part of this URL appears to have been
              // replaced by "/pic" during extraction - confirm the real value.
              define('WEB_LIST_URL', '/pic/book/1_%d.htm');

              // Last list page to fetch (the loop runs from PAGE_START to PAGE_COUNT inclusive).
              define('PAGE_COUNT', 14);

              // First list page to fetch.
              define('PAGE_START', 1);

              // Regex used to find content-page URLs inside a list page,
              // for example: /\/xuefu2008\/article\/details\/(\d)+/i
              define('WEB_CONTENT_URL_REG', '/\/book\/(\d)+\.htm/i');

              // Host that is prepended to relative URLs found in scraped pages.
              // NOTE(review): the original value was lost and the string literal was left
              // unterminated; the placeholder below only restores valid syntax and MUST be
              // replaced with the real site root (scheme + host, no trailing slash).
              define('WEB_HOST', 'http://www.example.com');

              // Regex that narrows a list page down to the region searched for content
              // links; the first capture group is used when present, else the whole match.
              define('WEB_LIST_POSTION', '/book_name\.gif(.*?)<td\swidth="15\%"\snowrap>/i');

              // Seconds to sleep after each processed content page.
              define('SLEEP_TIME', 1);

              // When true, the generated SQL and every extracted field value are printed.
              define('IS_DEBUG', false);

              // When false, rows are prepared and bound but never executed against the DB.
              define('INSERT_DB', true);

              // NOTE(review): not referenced by any code visible in this file section;
              // presumably consumed by the output helper - confirm.
              define('OUTPUT_SPEED', 1);

              // Text clean-up applied (case-insensitively) to every field value:
              // each key is replaced by its value.
              $text_filter = array(
                  '- 中華電腦書庫' => '',
                  '_電腦電子書' => '',
                  '_電腦書籍' => '',
                  '下載' => '',
              );

              // Column => value mapping for the INSERT statement.
              // A value that starts with '/' is treated as a regex run against the content
              // page (its first capture group becomes the column value); any other value
              // is inserted literally.
              $table_mapping = array(
                  'size' => '/軟件大小.*?000000>(.*?)<\/font>/i',
                  // NOTE(review): this looks like a literal image URL whose scheme/host was
                  // replaced by "/pic". Because it now starts with '/', it is treated as a
                  // regex, fails to match and causes every record to be skipped - restore
                  // the absolute URL.
                  'logo' => '/pic/index/uploads/images/20150105/0b8461910de101cc51a07684cdab797e.jpg',
                  'field1' => '/<title>(.*?)<\/title>/i',
                  'field2' => '/軟件簡介.*?000000>(.*?)<\/font>/i',
                  'field3' => '1',
                  'field4' => '1',
                  'field5' => '1',
                  'field6' => '電子書,計算機,圖像,圖形',
                  'platform' => 'window/Linux',
                  'ishot' => '1',
                  'agreement' => '免費',
                  'downurl' => '/(\/down\.asp\?id=.*?)"/i',
                  'istop' => '1',
              );

              // ---------------------------------------------------------------------------
              // Bootstrap: build the scraper (the constructor runs the start-up checks)
              // and start crawling.
              // ---------------------------------------------------------------------------
              $ga = new Gather();
              $ga->run();

              class Gather

              {

              /**
               * Build the scraper and immediately run the start-up checks
               * performed by init_check().
               */
              public function __construct()

              {

              $this->init_check();

              }

              /**
               * Crawl every configured list page, extract the mapped fields from each
               * content page it links to and insert one row per content page.
               *
               * Flow per list page (PAGE_START..PAGE_COUNT inclusive):
               *   1. fetch the list page and strip its line breaks;
               *   2. narrow it down with WEB_LIST_POSTION;
               *   3. collect content-page URLs with WEB_CONTENT_URL_REG;
               *   4. for each content page: fetch it, build every column value from
               *      $table_mapping, bind the values and (if INSERT_DB) execute the INSERT,
               *      then sleep SLEEP_TIME seconds.
               *
               * A content page is skipped entirely as soon as one regex-mapped field
               * fails to match.
               *
               * Uses the globals $table_mapping (column => regex or literal) and
               * $text_filter (search => replacement).
               *
               * @return void
               */
              public function run()
              {
                  global $table_mapping, $text_filter;

                  // The statement text depends only on the mapping, so build it once.
                  $columns = array_keys($table_mapping);
                  $sql = "INSERT INTO ".TABLE_NAME."(".implode(', ', $columns).")VALUES(:".implode(',:', $columns).")";

                  // Connection and prepared statement are created lazily on the first
                  // content page and then reused, instead of reconnecting for every row.
                  $dbh = null;
                  $sth = null;

                  for ($page = PAGE_START; $page <= PAGE_COUNT; $page++) {
                      $this->write('開始采集列表第'.$page.'頁的內容...');

                      $list_content = $this->get(sprintf(WEB_LIST_URL, $page));
                      if (empty($list_content)) {
                          $this->write('抓取的列表頁的內容為空,所以過濾掉');
                          continue;
                      }

                      // Flatten the page so the (non-dotall) regexes can span former lines.
                      $list_content = str_replace(array("\r", "\n"), '', $list_content);

                      // Narrow the page down to the region that holds the content links.
                      if (!preg_match(WEB_LIST_POSTION, $list_content, $list_search)) {
                          $this->write('精準匹配列表頁的內容失敗,所以過濾掉');
                          continue;
                      }
                      $list_content = isset($list_search[1]) ? $list_search[1] : $list_search[0];

                      // Collect the content-page URLs.
                      preg_match_all(WEB_CONTENT_URL_REG, $list_content, $match);
                      if (empty($match[0])) {
                          $this->write('列表頁面沒有抓取到內容,所以過濾掉');
                          continue;
                      }
                      $this->write('當前的列表頁面,總共匹配到:'.count($match[0]).'個內容頁');

                      foreach ($match[0] as $val) {
                          $web_content = $this->get($this->to_absolute_url($val));
                          if (empty($web_content)) {
                              $this->write('抓取的內容頁為空,所以過濾掉');
                              continue;
                          }

                          // Line breaks become a placeholder so the field regexes can match
                          // across lines; the placeholder is turned back into CRLF per field.
                          $web_content = str_replace("\r", '', $web_content);
                          $web_content = str_replace("\n", '【】', $web_content);

                          if (IS_DEBUG)
                              $this->write('執行SQL '.$sql);

                          if ($sth === null) {
                              try {
                                  $dbh = new PDO('mysql:dbname='.DB_NAME.';host='.DB_HOST, DB_USER, DB_PWD);
                              } catch (PDOException $e) {
                                  $this->write('Connection failed: ' . $e->getMessage(), true);
                                  // Never continue without a connection, whatever write() does.
                                  return;
                              }
                              $dbh->query("set names '".DB_CHARSET."'");
                              $sth = $dbh->prepare($sql);
                              if ($sth === false) {
                                  $this->write('Connection failed: unable to prepare '.$sql, true);
                                  return;
                              }
                          }

                          foreach ($table_mapping as $field => $reg) {
                              if (substr($reg, 0, 1) != '/') {
                                  // Literal value.
                                  $value = $reg;
                              } else {
                                  // Regex: pull the value out of the content page.
                                  $value = $this->extract_field_value($reg, $web_content);
                                  if ($value === false) {
                                      $this->write('對不起,匹配字段:'.$field.'失敗,過濾此記錄');
                                      // Skip this content page (no insert, no sleep).
                                      continue 2;
                                  }
                              }

                              // Restore the line breaks.
                              $value = str_replace('【】', "\r\n", $value);

                              // Apply the configured text clean-up.
                              if (is_array($text_filter) && !empty($text_filter))
                                  $value = str_ireplace(array_keys($text_filter), array_values($text_filter), $value);

                              if (IS_DEBUG)
                                  $this->write('*'."\t".'字段:'.$field.'  值:'."\n****************************************************\n".$value."\n****************************************************");

                              // The download link is stored as an absolute URL.
                              if ('downurl' == $field)
                                  $value = $this->to_absolute_url(trim($value));

                              $sth->bindValue(':'.$field, trim($value));
                          }

                          if (INSERT_DB)
                              $sth->execute();
                          $sth->closeCursor();

                          $this->write( '休息,暫停'.SLEEP_TIME.'秒后繼續抓取...');
                          sleep(SLEEP_TIME);
                      }
                  }

                  $this->write('', true);
              }

              /**
               * Turn a site-relative URL into an absolute one by prefixing WEB_HOST.
               * URLs that already start with http:// or https:// are returned unchanged.
               *
               * @param string $url
               * @return string
               */
              private function to_absolute_url($url)
              {
                  if (preg_match('#^https?://#i', $url))
                      return $url;

                  if (substr($url, 0, 1) != '/')
                      $url = '/'.$url;

                  return WEB_HOST.$url;
              }

              /**
               * Extract one field from a content page and normalise its HTML.
               *
               * The value is the first capture group of $reg (or the whole match when the
               * regex has no group). It is then passed through closetags(), stripped of
               * <script> blocks, has its <a> tags unwrapped (link text kept), gets
               * relative <img> sources made absolute, and has every <pre> tagged with
               * class "prettyprint".
               *
               * @param string $reg         PCRE pattern including delimiters.
               * @param string $web_content Page content with line breaks already replaced.
               * @return string|false The normalised value, or false when $reg did not match.
               */
              private function extract_field_value($reg, $web_content)
              {
                  if (!preg_match($reg, $web_content, $found))
                      return false;

                  $value = isset($found[1]) ? $found[1] : $found[0];
                  $value = $this->closetags($value);

                  // Drop scripts.
                  $value = preg_replace('/<script(.*?)>(.*?)<\/script>/i', '', $value);

                  // Unwrap links, keeping only their text.
                  $value = preg_replace('/<a(.*?)>(.*?)<\/a>/i', '${2}', $value);

                  // Make relative image sources absolute. Each distinct <img> tag is
                  // rewritten exactly once and applied with strtr(), so a source that
                  // occurs several times is not prefixed repeatedly.
                  if (preg_match_all('/<img.*?src=("|\')+(.*?)("|\')+.*?>/i', $value, $img_match, PREG_SET_ORDER)) {
                      $rewritten = array();
                      foreach ($img_match as $img) {
                          $tag = $img[0];
                          $src = $img[2];
                          if ($src === '' || isset($rewritten[$tag]) || preg_match('#^https?://#i', $src))
                              continue;
                          $rewritten[$tag] = str_replace($src, $this->to_absolute_url($src), $tag);
                      }
                      if (!empty($rewritten))
                          $value = strtr($value, $rewritten);
                  }

                  // Tag code blocks for syntax highlighting. (Line breaks inside <pre> are
                  // restored by the caller together with all other line breaks.)
                  $value = preg_replace('/<pre.*?>(.*?)<\/pre>/i', '<pre class="prettyprint">${1}</pre>', $value);

                  return $value;
              }

              /**
               * Append closing tags for HTML elements that were opened but never closed.
               *
               * Opening tags are processed innermost-first; each one either consumes one
               * matching closing tag already present in $html or gets a closing tag
               * appended at the end. Void elements (br, img, ...) and self-closed tags
               * (<x/>, <x />) are never closed. Tag names are compared case-insensitively;
               * an appended closing tag keeps the casing of its opening tag.
               *
               * This is a best-effort textual repair, not an HTML parser: stray closing
               * tags already in $html are left untouched.
               *
               * @param string $html HTML fragment.
               * @return string The fragment with missing closing tags appended.
               */
              protected function closetags($html)
              {
                  // Elements that never take a closing tag.
                  $arr_single_tags = array(
                      'meta', 'img', 'br', 'link', 'area',
                      'hr', 'input', 'base', 'col', 'embed', 'param', 'source', 'track', 'wbr',
                  );

                  // Opening tags: a name (digits allowed, e.g. h1), optional attributes that
                  // cannot run past the tag's own '>', and no '/' right before the '>'.
                  preg_match_all('#<([a-z][a-z0-9]*)(?:\s[^>]*)?(?<!/)>#i', $html, $result);
                  $openedtags = $result[1];

                  // Closing tags, lower-cased for comparison.
                  preg_match_all('#</([a-z][a-z0-9]*)\s*>#i', $html, $result);
                  $closedtags = array_map('strtolower', $result[1]);

                  // Walk the opening tags innermost-first so appended tags nest correctly.
                  foreach (array_reverse($openedtags) as $tag) {
                      $name = strtolower($tag);

                      if (in_array($name, $arr_single_tags, true))
                          continue;

                      $pos = array_search($name, $closedtags, true);
                      if ($pos === false) {
                          // No closing tag left for this element: add one.
                          $html .= '</' . $tag . '>';
                      } else {
                          // Consume the closing tag so it cannot satisfy another opener.
                          unset($closedtags[$pos]);
                      }
                  }

                  return $html;
              }

              /**
               * Start-up checks run from the constructor.
               *
               * Reports an error through write() when check_curl_support() fails, then
               * calls check_mysql_connect() and finally logs that initialisation passed.
               *
               * NOTE(review): write() is called with a second argument of true on the
               * cURL failure path; presumably that terminates the script - confirm,
               * because otherwise execution continues into the remaining checks.
               */
              protected function init_check()

              {

              if (!$this->check_curl_support())

              $this->write('對不起,請先開啟CURL的類庫的支持,否則無法執行', true);

              $this->check_mysql_connect();

              $this->write('程序初始化檢查通過,執行后續的流程...');

              }

              /**
               * Fetch a URL with cURL and return its body converted to utf-8.
               *
               * The request is sent with a fixed browser user agent. When WEB_CHARSET is
               * not utf-8 the body is converted with iconv(); bytes that are invalid in
               * WEB_CHARSET are dropped instead of failing the whole page.
               *
               * @param string $url  URL to fetch.
               * @param array  $data Extra HTTP request headers (passed as CURLOPT_HTTPHEADER).
               * @return string|false The utf-8 body, or false when the transfer or the
               *                      charset conversion failed (the failure is logged).
               */
              private function get($url, $data = array())
              {
                  $this->write('開始執行抓取: '.$url);

                  $ch = curl_init();
                  curl_setopt($ch, CURLOPT_URL, $url);
                  curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
                  curl_setopt($ch, CURLOPT_HEADER, 0);
                  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                  curl_setopt($ch, CURLOPT_HTTPHEADER, $data);

                  $ret = curl_exec($ch);
                  $error = curl_error($ch);
                  curl_close($ch);
                  unset($ch);

                  // curl_exec() returns false on failure; treat that as an error even if
                  // no error text is available.
                  if ($ret === false || !empty($error)) {
                      $this->write('程序抓取URL: '.$url.'發生錯誤,錯誤信息: '.$error);
                      return false;
                  }

                  // Compare case-insensitively so 'UTF-8' is not needlessly converted.
                  if (strcasecmp(WEB_CHARSET, 'utf-8') != 0) {
                      // //IGNORE drops bytes that are invalid in the source charset; without
                      // it a single stray byte makes iconv() reject the whole page.
                      $ret = iconv(WEB_CHARSET, 'utf-8//IGNORE', $ret);
                      if ($ret === false) {
                          $this->write('程序抓取URL: '.$url.'發生錯誤,錯誤信息: '.'iconv '.WEB_CHARSET.' -> utf-8 failed');
                          return false;
                      }
                  }

                  return $ret;
              }

              /pic/p>

              private function check_mysql_connect()

            【PHP可以靈活配置使用的采集器】相關文章:

            php學習之php配置03-11

            PHP基礎配置10-29

            PHP安裝與配置11-22

            PHP socket的配置及實例11-22

            如何配置php環境11-21

            php環境怎么配置12-25

            PHP配置文件詳解php.ini03-17

            PHP環境搭建與配置的方法01-25

            如何正確配置 Nginx + PHP03-01

            <delect id="sj01t"></delect>
            1. <em id="sj01t"><label id="sj01t"></label></em>
            2. <div id="sj01t"></div>
              1. <em id="sj01t"></em>

                      <div id="sj01t"></div>
                      黄色视频在线观看