diff --git a/config/define.php b/config/define.php index eb5a6d1..e459407 100644 --- a/config/define.php +++ b/config/define.php @@ -65,31 +65,6 @@ WEIBO_USER_LUO => '罗大伦', ); - define('_RC_SPIDER_IP', 'rc_spider_ip'); - - define('SPIDER_REQUEST_MAX_NUMS', 5); - - define('SPIDER_ERROR_IP_GET_EMPTY', 101); - define('SPIDER_ERROR_IP_GET_OUT', 102); - define('SPIDER_ERROR_IP_GET_FAIL', 103); - define('SPIDER_ERROR_IP_EXPIERD', 104); - define('SPIDER_ERROR_IP_ASTABLE', 105); - define('SPIDER_ERROR_IP_UNKNOWN', 199); - define('SPIDER_ERROR_URL_GET_EMPTY', 201); - define('SPIDER_ERROR_URL_COOKIE_EXPIRED', 202); - define('SPIDER_ERROR_URL_GET_UNKNOWN', 299); - $GLOBALS['spider_error'] = array( - SPIDER_ERROR_IP_GET_EMPTY => '代理ip获取为空', - SPIDER_ERROR_IP_GET_OUT => '代理ip获取超额', - SPIDER_ERROR_IP_GET_FAIL => '代理ip获取失败', - SPIDER_ERROR_IP_EXPIERD => '代理ip失效', - SPIDER_ERROR_IP_ASTABLE => '代理ip不稳定', - SPIDER_ERROR_IP_UNKNOWN => '代理ip未知错误', - SPIDER_ERROR_URL_GET_EMPTY => '请求为空', - SPIDER_ERROR_URL_COOKIE_EXPIRED => 'cookie失效', - SPIDER_ERROR_URL_GET_UNKNOWN => '请求未知错误', - ); - $GLOBALS['notice_mail_list'] = array( 'mouzhi@qq.com' => array( 'host' => 'smtp.qq.com', diff --git a/model/mSpider.php b/model/mSpider.php index 6e8057a..0c06f16 100644 --- a/model/mSpider.php +++ b/model/mSpider.php @@ -7,14 +7,13 @@ include_once(SERVER_ROOT . "/model/mBase.php"); class mSpider extends mBase { - private $error_nums = 0; - public function __construct() { $this->obj = new dWeibo(); $this->tbl = 'spider_weibo'; } public function getZmhttpIp($task_key) { + //一分钟内不允许再次获取ip $rdobj = $this->initRedis(); $ip_request = $rdobj->get('ip_request'); if ($ip_request) return false; @@ -27,61 +26,30 @@ class mSpider extends mBase { $this->writeLog('spider', 'getZmhttpIp.log', $task_key . '|' . $jsoninfo); $data = json_decode($jsoninfo, true); - $ip = $data['data'][0]['ip']; - $port = $data['data'][0]['port']; - $end_time = $data['data'][0]['end_time']; - - if (empty($ip) || empty($port)) return false; - - $rdobj = $this->initRedis(); - $rdobj->setex($task_key, 6 * 60, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time))); - - return array('ip' => $ip, 'port' => $port); - } - - public function getNewHttpIp($task_key, $times = 1) { - $url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true"; - $jsoninfo = $this->getCUrl($url); - - $this->writeLog('spider', 'getNewHttpIp.log', $task_key . '|' . $jsoninfo); - - //请求为空 重试第五次还是为空 则抛出异常 - if (empty($jsoninfo)) { - if ($times >= SPIDER_REQUEST_MAX_NUMS) { - $this->setError(SPIDER_ERROR_IP_GET_EMPTY); - return false; - } - - //请求为空 5秒后重试 - sleep(5); - return $this->getNewHttpIp($task_key, $times + 1); - } - - $data = json_decode($jsoninfo, true); //套餐用完 需要换套餐了 if ($data['code'] == 10019 || $data['code'] == 10005) { - $this->setError(SPIDER_ERROR_IP_GET_OUT); + $this->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博数据抓取', $data['info']); return false; } $ip = $data['data'][0]['ip']; $port = $data['data'][0]['port']; $end_time = $data['data'][0]['end_time']; - //其他异常错误 - if (empty($ip) || empty($port)) { - $this->setError(SPIDER_ERROR_IP_GET_FAIL); - return false; - } - $expire_time = strtotime($end_time) - time(); + if (empty($ip) || empty($port)) return false; $rdobj = $this->initRedis(); - $rdobj->setex($task_key, $expire_time, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time))); + $rdobj->setex($task_key, 3600 * 24, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time))); return array('ip' => $ip, 'port' => $port); } - public function getRequest($url, $headers = array(), $timeout = 60) { + public function getRequest($url, $headers = array(), $timeout = 30) { + if (empty($this->proxy_ip) || empty($this->proxy_port)) { + $this->setError('配置代理后再来请求吧'); + return false; + } + $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); @@ -247,28 +215,7 @@ class mSpider extends mBase { } //原创微博 用户行为数据 - public function spiderBehaviorOriginalData($uid, $page, $times = 1) { - $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}"); - - $rdobj = $this->initRedis(); - - $ipinfo = $rdobj->get(_RC_SPIDER_IP); - $ipdata = json_decode($ipinfo, true); - if (!$ipinfo) { - $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP); - } - - //获取代理ip失败 - if (empty($ipdata)) { - $this->setError($this->getError()); - return false; - } - - $this->proxy_ip = $ipdata['ip']; - $this->proxy_port = $ipdata['port']; - - $cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE); - if(empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE; + public function spiderBehaviorOriginalData($cookie, $uid, $page) { $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&hasori=1"; $headers = array( "Cookie: SUB={$cookie}", @@ -277,110 +224,10 @@ class mSpider extends mBase { "Referer: https://weibo.com/u/{$uid}?tabtype=feed" ); - $res = $this->getRequest($url, $headers); - //请求错误 - if (empty($res)) { - //请求为空 - $error_code = SPIDER_ERROR_URL_GET_EMPTY; - //curl错误 - if ($this->curl_err) { - $this->error_nums++; - $error_code = $this->getSpiderErrors(); - } - //ip不稳定 请求不作数 - if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--; - //代理失效 - if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP); - - //连续五次请求错误 则抛出异常 - if ($times >= SPIDER_REQUEST_MAX_NUMS) { - $this->setError($error_code); - return false; - } - - sleep(2); - return $this->spiderBehaviorOriginalData($uid, $page, $times + 1); - } - $this->error_nums = 0; - - if (strpos($res, '登录 - 微博')) { - $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED); - return false; - } - - $data = json_decode($res, 1); - if (!$data) { - $this->writeLog('spider', 'errorData.log', $res); - $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN); - return false; - } - - //获取完毕 - if (empty($data['data']['list'])) return 'done'; - - $date = date('Y-m-d'); - $bobj = new mWeiboBehavior(); - $wobj = new mWeibo(); - foreach ($data['data']['list'] as $da) { - //博主设置不可见 - if (!isset($da['reposts_count'])) continue; - - $temp = array( - 'uid' => $uid, - 'wid' => $da['id'], - 'reposts_count' => $da['reposts_count'], - 'comments_count' => $da['comments_count'], - 'attitudes_count' => $da['attitudes_count'], - 'date' => $date, - ); - $res = $bobj->saveBehaviorData($temp); - if (!$res) { - $this->writeLog('spider', 'insert_error.log', json_encode($temp)); - } - - $created_at = new DateTime($da['created_at']); // 解析时间 - $weibo = array( - 'mblogid' => $da['mblogid'], - 'text' => strip_tags($da['text']), - 'refer' => '', - 'pic_ids' => $da['pic_num'] > 0 ? json_encode($da['pic_ids']) : '', - 'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '', - 'video_cover' => isset($da['page_info']['page_pic']) ? pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME) : '', - 'created_at' => $created_at->format('Y-m-d H:i:s') - ); - $res = $wobj->saveWeiboData($temp, $weibo); - if (!$res) { - $this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo))); - } - } - - $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}"); - return true; + return $this->spiderBehavior($uid, $url, $headers); } - //转发微博 用户行为数据 - public function spiderBehaviorForwardData($uid, $page, $times = 1) { - $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}"); - - $rdobj = $this->initRedis(); - - $ipinfo = $rdobj->get(_RC_SPIDER_IP); - $ipdata = json_decode($ipinfo, true); - if (!$ipinfo) { - $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP); - } - - //获取代理ip失败 - if (empty($ipdata)) { - $this->setError($this->getError()); - return false; - } - - $this->proxy_ip = $ipdata['ip']; - $this->proxy_port = $ipdata['port']; - - $cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE); - if(empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE; + public function spiderBehaviorForwardData($cookie, $uid, $page) { $endtime = strtotime("tomorrow"); $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&endtime={$endtime}&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1"; $headers = array( @@ -389,57 +236,46 @@ class mSpider extends mBase { "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36", "Referer: https://weibo.com/u/{$uid}?is_text=1&is_pic=1&is_video=1&is_music=1&is_forward=1&end_time={$endtime}" ); + + return $this->spiderBehavior($uid, $url, $headers); + } + + public function spiderBehavior($uid, $url, $headers) { $res = $this->getRequest($url, $headers); + if (strpos($res, '登录 - 微博')) { + $this->setError('cookie失效'); + return false; + } //请求错误 if (empty($res)) { - //请求为空 - $error_code = SPIDER_ERROR_URL_GET_EMPTY; - //curl错误 if ($this->curl_err) { - $this->error_nums++; - $error_code = $this->getSpiderErrors(); + $this->getSpiderErrors(); } - //ip不稳定 请求不作数 - if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--; - //代理失效 - if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP); - - //连续五次请求错误 则抛出异常 - if ($times >= SPIDER_REQUEST_MAX_NUMS) { - $this->setError($error_code); - return false; - } - - sleep(2); - return $this->spiderBehaviorForwardData($uid, $page, $times + 1); - } - $this->error_nums = 0; - - if (strpos($res, '登录 - 微博')) { - $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED); + $this->setError($this->curl_err); return false; } $data = json_decode($res, 1); if (!$data) { $this->writeLog('spider', 'errorData.log', $res); - $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN); + $this->setError('请求为空'); return false; } //获取完毕 - if (empty($data['data']['list'])) return 'done'; + if (empty($data['data']['list'])) return true; $date = date('Y-m-d'); $bobj = new mWeiboBehavior(); $wobj = new mWeibo(); - foreach ($data['data']['list'] as $item) { - if (!isset($item['user']['id']) || $item['user']['id'] != $uid) continue; - - $da = $item['retweeted_status']; + foreach ($data['data']['list'] as $da) { + //有转发数据 取转发数据 + if(isset($da['retweeted_status'])) $da = $da['retweeted_status']; //博主设置不可见 if (!isset($da['reposts_count'])) continue; + //只取转发自己的数据 + if (!isset($da['user']['id']) || $da['user']['id'] != $uid) continue; $temp = array( 'uid' => $uid, @@ -457,7 +293,7 @@ class mSpider extends mBase { $created_at = new DateTime($da['created_at']); // 解析时间 $weibo = array( 'mblogid' => $da['mblogid'], - 'text' => strip_tags($da['text']), + 'text' => $da['text_raw'], 'refer' => '', 'pic_ids' => $da['pic_num'] > 0 ? json_encode($da['pic_ids']) : '', 'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '', @@ -470,37 +306,28 @@ class mSpider extends mBase { } } - $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}"); return true; } private function getSpiderErrors() { preg_match('/Unable to receive initial SOCKS5 response./', $this->curl_err, $match); if ($match[0]) { - return SPIDER_ERROR_IP_EXPIERD; + $this->is_change_ip = true; + return true; } preg_match('/SOCKS5 read timeout/', $this->curl_err, $match); if ($match[0]) { - return SPIDER_ERROR_IP_EXPIERD; + $this->is_change_ip = true; + return true; } - $this->writeLog('spider', 'curl_error.log', $this->curl_err); - - //ip不稳定 - preg_match("/Can't complete SOCKS5 connection to 0.0.0.0:0/", $this->curl_err, $match); + preg_match('/Connection refused/', $this->curl_err, $match); if ($match[0]) { - if ($this->error_nums < 5) return SPIDER_ERROR_IP_ASTABLE; - if ($this->error_nums == 5) return SPIDER_ERROR_IP_EXPIERD; + $this->is_change_ip = true; + return true; } - //ip不稳定 - preg_match("/Connection refused/", $this->curl_err, $match); - if ($match[0]) { - if ($this->error_nums < 5) return SPIDER_ERROR_IP_ASTABLE; - if ($this->error_nums == 5) return SPIDER_ERROR_IP_EXPIERD; - } - - return SPIDER_ERROR_IP_UNKNOWN; + return true; } } \ No newline at end of file diff --git a/queue/config/daemonconf.php b/queue/config/daemonconf.php index f023c83..3bcb257 100644 --- a/queue/config/daemonconf.php +++ b/queue/config/daemonconf.php @@ -1,22 +1,19 @@ 7200, + SPIDER_WEIBO_BEHAVIOR => 7200, ); $GLOBALS['DAEMON_LIST'] = array( - //SPIDER_WEIBO_DATA => 'get_weibo_pic.php', - SPIDER_WEIBO_DATA => 'get_weibo_video.php', + SPIDER_WEIBO_BEHAVIOR => 'get_weibo_behavior.php', ); $GLOBALS['DAEMON_LIST_DESC'] = array( ); $GLOBALS['DAEMON_NUMLIMIT'] = array( - SPIDER_WEIBO_DATA => 5, + SPIDER_WEIBO_BEHAVIOR => 5, ); \ No newline at end of file diff --git a/queue/crontab/spider_behavior_data.php b/queue/crontab/spider_behavior_data.php index 98f5f55..a0d5c2e 100644 --- a/queue/crontab/spider_behavior_data.php +++ b/queue/crontab/spider_behavior_data.php @@ -16,31 +16,26 @@ class spiderBehaviorData { $user_list = array_keys($GLOBALS['WEIBO_USER_LIST']); $obj = new mSpider(); + $robj = $obj->initRedis(); + $date = date("Y-m-d"); foreach ($user_list as $uid) { - //采集原创数据 - for ($page = 1; $page <= 1000; $page++) { - $res = $obj->spiderBehaviorOriginalData($uid, $page); - //采集完毕 - if($res === 'done') break; - - if (!$res) { - $error_code = $obj->getError(); - $obj->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博原创数据抓取', $GLOBALS['spider_error'][$error_code]); - break 2; - } - } - - //采集转发数据 - for ($page = 1; $page <= 1000; $page++) { - $res = $obj->spiderBehaviorForwardData($uid, $page); - //采集完毕 - if($res === 'done') break; - - if (!$res) { - $error_code = $obj->getError(); - $obj->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博转发数据抓取', $GLOBALS['spider_error'][$error_code]); - break 2; - } + for ($page = 1; $page <= 120; $page++) { + //原创数据 + $data = array( + 'type' => 1, + 'uid' => $uid, + 'page' => $page, + 'date' => $date, + ); + $robj->lpush(_RQ_SPIDER_WEIBO_BEHAVIOR, json_encode($data)); + //转发数据 + $data = array( + 'type' => 2, + 'uid' => $uid, + 'page' => $page, + 'date' => $date, + ); + $robj->lpush(_RQ_SPIDER_WEIBO_BEHAVIOR, json_encode($data)); } } diff --git a/queue/deal/get_weibo_behavior.php b/queue/deal/get_weibo_behavior.php new file mode 100644 index 0000000..1a4439c --- /dev/null +++ b/queue/deal/get_weibo_behavior.php @@ -0,0 +1,79 @@ +spiderdaemon = SPIDER_WEIBO_BEHAVIOR; + $this->processnum = $GLOBALS['DAEMON_NUMLIMIT'][SPIDER_WEIBO_BEHAVIOR]; + } + + public function deal() { + $this->is_change_ip = false; + + $obj = new mSpider(); + $rdobj = $obj->initRedis(); + + $ipinfo = $rdobj->get(_RC_SPIDER_USE_IP); + $ipdata = json_decode($ipinfo, true); + if (!$ipinfo && PROC_CODE == 0 && $rdobj->llen($this->task_key) + 0 >= 0) { + $ipdata = $obj->getZmhttpIp(_RC_SPIDER_USE_IP); + } + if (empty($ipdata)) { + sleep(2); + return false; + } + + $obj->proxy_ip = $ipdata['ip']; + $obj->proxy_port = $ipdata['port']; + + $rq_data = $rdobj->rpop($this->task_key); + if (empty($rq_data)) { + sleep(60); + return false; + } + + $baselog = $rq_data . "|" . $ipdata['ip'] . "|" . $ipdata['port'] . "|" . PROC_CODE . "|"; + + //数据格式不正确 + $data = json_decode($rq_data, true); + if(empty($data['type']) || empty($data['uid']) || empty($data['page']) || empty($data['date'])) return false; + + //队列数据已过期 + $date = date("Y-m-d"); + if($data['date'] != $date) return false; + + $cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE); + if(empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE; + + if($data['type'] == 1){ + //原创数据 + $res = $obj->spiderBehaviorOriginalData($cookie, $data['uid'], $data['page']); + }else{ + //转发数据 + $res = $obj->spiderBehaviorForwardData($cookie, $data['uid'], $data['page']); + } + + if (!$res) { + $error = $obj->getError(); + if($error == 'cookie失效') $obj->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博数据抓取', $data['info']); + if ($obj->is_change_ip) { + // 0进程监控更换ip + if (PROC_CODE == 0) $rdobj->del(_RC_SPIDER_USE_IP); + } + $obj->writeLog('spider', $this->spiderdaemon.'_fail.log', $baselog.$error); + $rdobj->lpush($this->task_key, $rq_data); + return false; + } + + $obj->writeLog('spider', $this->spiderdaemon . '_success.log', $baselog); + return true; + } +} + +new getWeiboBehavior();