Browse Source

Merge pull request '队列抓取' (#52) from wpd_spider_queue into master

Reviewed-on: #52
pull/53/head
pengda 5 days ago
parent
commit
dfab49e400
  1. 25
      config/define.php
  2. 253
      model/mSpider.php
  3. 11
      queue/config/daemonconf.php
  4. 43
      queue/crontab/spider_behavior_data.php
  5. 79
      queue/deal/get_weibo_behavior.php

25
config/define.php

@ -65,31 +65,6 @@
WEIBO_USER_LUO => '罗大伦',
);
define('_RC_SPIDER_IP', 'rc_spider_ip');
define('SPIDER_REQUEST_MAX_NUMS', 5);
define('SPIDER_ERROR_IP_GET_EMPTY', 101);
define('SPIDER_ERROR_IP_GET_OUT', 102);
define('SPIDER_ERROR_IP_GET_FAIL', 103);
define('SPIDER_ERROR_IP_EXPIERD', 104);
define('SPIDER_ERROR_IP_ASTABLE', 105);
define('SPIDER_ERROR_IP_UNKNOWN', 199);
define('SPIDER_ERROR_URL_GET_EMPTY', 201);
define('SPIDER_ERROR_URL_COOKIE_EXPIRED', 202);
define('SPIDER_ERROR_URL_GET_UNKNOWN', 299);
$GLOBALS['spider_error'] = array(
SPIDER_ERROR_IP_GET_EMPTY => '代理ip获取为空',
SPIDER_ERROR_IP_GET_OUT => '代理ip获取超额',
SPIDER_ERROR_IP_GET_FAIL => '代理ip获取失败',
SPIDER_ERROR_IP_EXPIERD => '代理ip失效',
SPIDER_ERROR_IP_ASTABLE => '代理ip不稳定',
SPIDER_ERROR_IP_UNKNOWN => '代理ip未知错误',
SPIDER_ERROR_URL_GET_EMPTY => '请求为空',
SPIDER_ERROR_URL_COOKIE_EXPIRED => 'cookie失效',
SPIDER_ERROR_URL_GET_UNKNOWN => '请求未知错误',
);
$GLOBALS['notice_mail_list'] = array(
'mouzhi@qq.com' => array(
'host' => 'smtp.qq.com',

253
model/mSpider.php

@ -7,14 +7,13 @@ include_once(SERVER_ROOT . "/model/mBase.php");
class mSpider extends mBase {
private $error_nums = 0;
public function __construct() {
$this->obj = new dWeibo();
$this->tbl = 'spider_weibo';
}
public function getZmhttpIp($task_key) {
//一分钟内不允许再次获取ip
$rdobj = $this->initRedis();
$ip_request = $rdobj->get('ip_request');
if ($ip_request) return false;
@ -27,61 +26,30 @@ class mSpider extends mBase {
$this->writeLog('spider', 'getZmhttpIp.log', $task_key . '|' . $jsoninfo);
$data = json_decode($jsoninfo, true);
$ip = $data['data'][0]['ip'];
$port = $data['data'][0]['port'];
$end_time = $data['data'][0]['end_time'];
if (empty($ip) || empty($port)) return false;
$rdobj = $this->initRedis();
$rdobj->setex($task_key, 6 * 60, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));
return array('ip' => $ip, 'port' => $port);
}
public function getNewHttpIp($task_key, $times = 1) {
$url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
$jsoninfo = $this->getCUrl($url);
$this->writeLog('spider', 'getNewHttpIp.log', $task_key . '|' . $jsoninfo);
//请求为空 重试第五次还是为空 则抛出异常
if (empty($jsoninfo)) {
if ($times >= SPIDER_REQUEST_MAX_NUMS) {
$this->setError(SPIDER_ERROR_IP_GET_EMPTY);
return false;
}
//请求为空 5秒后重试
sleep(5);
return $this->getNewHttpIp($task_key, $times + 1);
}
$data = json_decode($jsoninfo, true);
//套餐用完 需要换套餐了
if ($data['code'] == 10019 || $data['code'] == 10005) {
$this->setError(SPIDER_ERROR_IP_GET_OUT);
$this->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博数据抓取', $data['info']);
return false;
}
$ip = $data['data'][0]['ip'];
$port = $data['data'][0]['port'];
$end_time = $data['data'][0]['end_time'];
//其他异常错误
if (empty($ip) || empty($port)) {
$this->setError(SPIDER_ERROR_IP_GET_FAIL);
return false;
}
$expire_time = strtotime($end_time) - time();
if (empty($ip) || empty($port)) return false;
$rdobj = $this->initRedis();
$rdobj->setex($task_key, $expire_time, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));
$rdobj->setex($task_key, 3600 * 24, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));
return array('ip' => $ip, 'port' => $port);
}
public function getRequest($url, $headers = array(), $timeout = 60) {
public function getRequest($url, $headers = array(), $timeout = 30) {
if (empty($this->proxy_ip) || empty($this->proxy_port)) {
$this->setError('配置代理后再来请求吧');
return false;
}
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
@ -247,28 +215,7 @@ class mSpider extends mBase {
}
//原创微博 用户行为数据
public function spiderBehaviorOriginalData($uid, $page, $times = 1) {
$this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");
$rdobj = $this->initRedis();
$ipinfo = $rdobj->get(_RC_SPIDER_IP);
$ipdata = json_decode($ipinfo, true);
if (!$ipinfo) {
$ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
}
//获取代理ip失败
if (empty($ipdata)) {
$this->setError($this->getError());
return false;
}
$this->proxy_ip = $ipdata['ip'];
$this->proxy_port = $ipdata['port'];
$cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE);
if(empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE;
public function spiderBehaviorOriginalData($cookie, $uid, $page) {
$url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&hasori=1";
$headers = array(
"Cookie: SUB={$cookie}",
@ -277,110 +224,10 @@ class mSpider extends mBase {
"Referer: https://weibo.com/u/{$uid}?tabtype=feed"
);
$res = $this->getRequest($url, $headers);
//请求错误
if (empty($res)) {
//请求为空
$error_code = SPIDER_ERROR_URL_GET_EMPTY;
//curl错误
if ($this->curl_err) {
$this->error_nums++;
$error_code = $this->getSpiderErrors();
}
//ip不稳定 请求不作数
if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--;
//代理失效
if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP);
//连续五次请求错误 则抛出异常
if ($times >= SPIDER_REQUEST_MAX_NUMS) {
$this->setError($error_code);
return false;
}
sleep(2);
return $this->spiderBehaviorOriginalData($uid, $page, $times + 1);
}
$this->error_nums = 0;
if (strpos($res, '登录 - 微博')) {
$this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
return false;
}
$data = json_decode($res, 1);
if (!$data) {
$this->writeLog('spider', 'errorData.log', $res);
$this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
return false;
}
//获取完毕
if (empty($data['data']['list'])) return 'done';
$date = date('Y-m-d');
$bobj = new mWeiboBehavior();
$wobj = new mWeibo();
foreach ($data['data']['list'] as $da) {
//博主设置不可见
if (!isset($da['reposts_count'])) continue;
$temp = array(
'uid' => $uid,
'wid' => $da['id'],
'reposts_count' => $da['reposts_count'],
'comments_count' => $da['comments_count'],
'attitudes_count' => $da['attitudes_count'],
'date' => $date,
);
$res = $bobj->saveBehaviorData($temp);
if (!$res) {
$this->writeLog('spider', 'insert_error.log', json_encode($temp));
}
$created_at = new DateTime($da['created_at']); // 解析时间
$weibo = array(
'mblogid' => $da['mblogid'],
'text' => strip_tags($da['text']),
'refer' => '',
'pic_ids' => $da['pic_num'] > 0 ? json_encode($da['pic_ids']) : '',
'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
'video_cover' => isset($da['page_info']['page_pic']) ? pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME) : '',
'created_at' => $created_at->format('Y-m-d H:i:s')
);
$res = $wobj->saveWeiboData($temp, $weibo);
if (!$res) {
$this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo)));
}
}
$this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");
return true;
return $this->spiderBehavior($uid, $url, $headers);
}
//转发微博 用户行为数据
public function spiderBehaviorForwardData($uid, $page, $times = 1) {
$this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");
$rdobj = $this->initRedis();
$ipinfo = $rdobj->get(_RC_SPIDER_IP);
$ipdata = json_decode($ipinfo, true);
if (!$ipinfo) {
$ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
}
//获取代理ip失败
if (empty($ipdata)) {
$this->setError($this->getError());
return false;
}
$this->proxy_ip = $ipdata['ip'];
$this->proxy_port = $ipdata['port'];
$cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE);
if(empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE;
public function spiderBehaviorForwardData($cookie, $uid, $page) {
$endtime = strtotime("tomorrow");
$url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&endtime={$endtime}&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1";
$headers = array(
@ -389,57 +236,46 @@ class mSpider extends mBase {
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
"Referer: https://weibo.com/u/{$uid}?is_text=1&is_pic=1&is_video=1&is_music=1&is_forward=1&end_time={$endtime}"
);
return $this->spiderBehavior($uid, $url, $headers);
}
public function spiderBehavior($uid, $url, $headers) {
$res = $this->getRequest($url, $headers);
if (strpos($res, '登录 - 微博')) {
$this->setError('cookie失效');
return false;
}
//请求错误
if (empty($res)) {
//请求为空
$error_code = SPIDER_ERROR_URL_GET_EMPTY;
//curl错误
if ($this->curl_err) {
$this->error_nums++;
$error_code = $this->getSpiderErrors();
$this->getSpiderErrors();
}
//ip不稳定 请求不作数
if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--;
//代理失效
if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP);
//连续五次请求错误 则抛出异常
if ($times >= SPIDER_REQUEST_MAX_NUMS) {
$this->setError($error_code);
return false;
}
sleep(2);
return $this->spiderBehaviorForwardData($uid, $page, $times + 1);
}
$this->error_nums = 0;
if (strpos($res, '登录 - 微博')) {
$this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
$this->setError($this->curl_err);
return false;
}
$data = json_decode($res, 1);
if (!$data) {
$this->writeLog('spider', 'errorData.log', $res);
$this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
$this->setError('请求为空');
return false;
}
//获取完毕
if (empty($data['data']['list'])) return 'done';
if (empty($data['data']['list'])) return true;
$date = date('Y-m-d');
$bobj = new mWeiboBehavior();
$wobj = new mWeibo();
foreach ($data['data']['list'] as $item) {
if (!isset($item['user']['id']) || $item['user']['id'] != $uid) continue;
$da = $item['retweeted_status'];
foreach ($data['data']['list'] as $da) {
//有转发数据 取转发数据
if(isset($da['retweeted_status'])) $da = $da['retweeted_status'];
//博主设置不可见
if (!isset($da['reposts_count'])) continue;
//只取转发自己的数据
if (!isset($da['user']['id']) || $da['user']['id'] != $uid) continue;
$temp = array(
'uid' => $uid,
@ -457,7 +293,7 @@ class mSpider extends mBase {
$created_at = new DateTime($da['created_at']); // 解析时间
$weibo = array(
'mblogid' => $da['mblogid'],
'text' => strip_tags($da['text']),
'text' => $da['text_raw'],
'refer' => '',
'pic_ids' => $da['pic_num'] > 0 ? json_encode($da['pic_ids']) : '',
'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
@ -470,37 +306,28 @@ class mSpider extends mBase {
}
}
$this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");
return true;
}
private function getSpiderErrors() {
preg_match('/Unable to receive initial SOCKS5 response./', $this->curl_err, $match);
if ($match[0]) {
return SPIDER_ERROR_IP_EXPIERD;
$this->is_change_ip = true;
return true;
}
preg_match('/SOCKS5 read timeout/', $this->curl_err, $match);
if ($match[0]) {
return SPIDER_ERROR_IP_EXPIERD;
$this->is_change_ip = true;
return true;
}
$this->writeLog('spider', 'curl_error.log', $this->curl_err);
//ip不稳定
preg_match("/Can't complete SOCKS5 connection to 0.0.0.0:0/", $this->curl_err, $match);
preg_match('/Connection refused/', $this->curl_err, $match);
if ($match[0]) {
if ($this->error_nums < 5) return SPIDER_ERROR_IP_ASTABLE;
if ($this->error_nums == 5) return SPIDER_ERROR_IP_EXPIERD;
$this->is_change_ip = true;
return true;
}
//ip不稳定
preg_match("/Connection refused/", $this->curl_err, $match);
if ($match[0]) {
if ($this->error_nums < 5) return SPIDER_ERROR_IP_ASTABLE;
if ($this->error_nums == 5) return SPIDER_ERROR_IP_EXPIERD;
}
return SPIDER_ERROR_IP_UNKNOWN;
return true;
}
}

11
queue/config/daemonconf.php

@ -1,22 +1,19 @@
<?php
define('_RC_SPIDER_USE_IP', 'rc_spider_use_ip');
define('_RQ_SPIDER_WEIBO_DATA', 'rq_spider_weibo_data');
define('SPIDER_WEIBO_DATA', 'spider_weibo_data');
define('SPIDER_WEIBO_BEHAVIOR', 'spider_weibo_behavior');
$GLOBALS['DAEMON_MAXTIME'] = array(
SPIDER_WEIBO_DATA => 7200,
SPIDER_WEIBO_BEHAVIOR => 7200,
);
$GLOBALS['DAEMON_LIST'] = array(
//SPIDER_WEIBO_DATA => 'get_weibo_pic.php',
SPIDER_WEIBO_DATA => 'get_weibo_video.php',
SPIDER_WEIBO_BEHAVIOR => 'get_weibo_behavior.php',
);
$GLOBALS['DAEMON_LIST_DESC'] = array(
);
$GLOBALS['DAEMON_NUMLIMIT'] = array(
SPIDER_WEIBO_DATA => 5,
SPIDER_WEIBO_BEHAVIOR => 5,
);

43
queue/crontab/spider_behavior_data.php

@ -16,31 +16,26 @@ class spiderBehaviorData {
$user_list = array_keys($GLOBALS['WEIBO_USER_LIST']);
$obj = new mSpider();
$robj = $obj->initRedis();
$date = date("Y-m-d");
foreach ($user_list as $uid) {
//采集原创数据
for ($page = 1; $page <= 1000; $page++) {
$res = $obj->spiderBehaviorOriginalData($uid, $page);
//采集完毕
if($res === 'done') break;
if (!$res) {
$error_code = $obj->getError();
$obj->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博原创数据抓取', $GLOBALS['spider_error'][$error_code]);
break 2;
}
}
//采集转发数据
for ($page = 1; $page <= 1000; $page++) {
$res = $obj->spiderBehaviorForwardData($uid, $page);
//采集完毕
if($res === 'done') break;
if (!$res) {
$error_code = $obj->getError();
$obj->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博转发数据抓取', $GLOBALS['spider_error'][$error_code]);
break 2;
}
for ($page = 1; $page <= 120; $page++) {
//原创数据
$data = array(
'type' => 1,
'uid' => $uid,
'page' => $page,
'date' => $date,
);
$robj->lpush(_RQ_SPIDER_WEIBO_BEHAVIOR, json_encode($data));
//转发数据
$data = array(
'type' => 2,
'uid' => $uid,
'page' => $page,
'date' => $date,
);
$robj->lpush(_RQ_SPIDER_WEIBO_BEHAVIOR, json_encode($data));
}
}

79
queue/deal/get_weibo_behavior.php

@ -0,0 +1,79 @@
<?php
// Load the queue worker base; presumably this is where dealBase (extended below) is declared — confirm.
include_once dirname(dirname(__FILE__)).'/base/dealBase.php';
// In PHP CLI, $argv[0] is the script name as it was invoked.
define('DEAL_FLAG', $argv[0]);
// First command-line argument. deal() compares it against 0 to pick the single
// process that is allowed to request/discard the shared proxy IP.
define('PROC_CODE', $argv[1]);
/**
 * Queue worker that pops Weibo "behavior" crawl tasks from a Redis list and
 * runs them through mSpider using a proxy IP shared between worker processes.
 *
 * Each task is a JSON object with the keys `type` (1 = original posts,
 * anything else = forwarded posts), `uid`, `page` and `date` (Y-m-d).
 */
class getWeiboBehavior extends dealBase {

    /** Redis list the tasks are popped from (and re-queued to on failure). */
    private $task_key = _RQ_SPIDER_WEIBO_BEHAVIOR;

    /**
     * Set the daemon name and the process limit for this worker from the
     * daemon configuration.
     */
    public function setPara() {
        $this->spiderdaemon = SPIDER_WEIBO_BEHAVIOR;
        $this->processnum = $GLOBALS['DAEMON_NUMLIMIT'][SPIDER_WEIBO_BEHAVIOR];
    }

    /**
     * Process at most one queued task.
     *
     * @return bool true when the task was crawled successfully; false when
     *              there was no proxy, no task, an invalid/stale task, or the
     *              crawl failed (in which case the task is pushed back).
     */
    public function deal() {
        $this->is_change_ip = false;
        $obj = new mSpider();
        $rdobj = $obj->initRedis();

        // Proxy IP shared by all worker processes.
        $ipinfo = $rdobj->get(_RC_SPIDER_USE_IP);
        $ipdata = $ipinfo ? json_decode($ipinfo, true) : null;

        // Only process 0 requests a new proxy, and only while tasks are waiting.
        // The original guard was `llen(...) + 0 >= 0`, which is always true and
        // therefore requested proxies even for an empty queue.
        if (!$ipinfo && PROC_CODE == 0 && $rdobj->llen($this->task_key) + 0 > 0) {
            $ipdata = $obj->getZmhttpIp(_RC_SPIDER_USE_IP);
        }
        if (empty($ipdata)) {
            sleep(2);
            return false;
        }
        $obj->proxy_ip = $ipdata['ip'];
        $obj->proxy_port = $ipdata['port'];

        $rq_data = $rdobj->rpop($this->task_key);
        if (empty($rq_data)) {
            // Queue is empty: back off before polling again.
            sleep(60);
            return false;
        }
        $baselog = $rq_data . "|" . $ipdata['ip'] . "|" . $ipdata['port'] . "|" . PROC_CODE . "|";
        $fail_log = $this->spiderdaemon . '_fail.log';

        // Malformed task: drop it (not re-queued), but leave a trace in the fail log.
        $data = json_decode($rq_data, true);
        if (empty($data['type']) || empty($data['uid']) || empty($data['page']) || empty($data['date'])) {
            $obj->writeLog('spider', $fail_log, $baselog . 'invalid task data');
            return false;
        }

        // Task was queued on a different day: drop it.
        $date = date("Y-m-d");
        if ($data['date'] != $date) {
            $obj->writeLog('spider', $fail_log, $baselog . 'expired task');
            return false;
        }

        // Prefer the cookie stored in Redis, fall back to the configured constant.
        $cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE);
        if (empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE;

        if ($data['type'] == 1) {
            // Original posts
            $res = $obj->spiderBehaviorOriginalData($cookie, $data['uid'], $data['page']);
        } else {
            // Forwarded posts
            $res = $obj->spiderBehaviorForwardData($cookie, $data['uid'], $data['page']);
        }

        if (!$res) {
            $error = $obj->getError();
            if ($error === 'cookie失效') {
                // The original sent $data['info'], a key the task never has, so the
                // mail body was empty. Send the task context and the error instead.
                // NOTE(review): every failing task sends a mail while the cookie
                // stays expired; consider throttling.
                $obj->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博数据抓取', $baselog . $error);
            }
            // The flag lives on the spider object and may never have been set,
            // so read it defensively. Only process 0 discards the shared proxy.
            if (!empty($obj->is_change_ip) && PROC_CODE == 0) {
                $rdobj->del(_RC_SPIDER_USE_IP);
            }
            $obj->writeLog('spider', $fail_log, $baselog . $error);
            // Re-queue at the head; workers pop from the tail.
            $rdobj->lpush($this->task_key, $rq_data);
            return false;
        }

        $obj->writeLog('spider', $this->spiderdaemon . '_success.log', $baselog);
        return true;
    }
}
// Instantiate the worker. NOTE(review): nothing in this file calls setPara()/deal()
// directly, so the dealBase constructor presumably drives the processing loop — confirm.
new getWeiboBehavior();
Loading…
Cancel
Save