|
|
@ -7,6 +7,8 @@ include_once(SERVER_ROOT . "/model/mBase.php"); |
|
|
|
|
|
|
|
class mSpider extends mBase { |
|
|
|
|
|
|
|
private $error_nums = 0; |
|
|
|
|
|
|
|
public function __construct() { |
|
|
|
$this->obj = new dWeibo(); |
|
|
|
$this->tbl = 'spider_weibo'; |
|
|
@ -37,12 +39,49 @@ class mSpider extends mBase { |
|
|
|
return array('ip' => $ip, 'port' => $port); |
|
|
|
} |
|
|
|
|
|
|
|
public function getRequest($url, $headers = array(), $timeout = 60) { |
|
|
|
if (empty($this->proxy_ip) || empty($this->proxy_port)) { |
|
|
|
$this->setError('配置代理后再来请求吧'); |
|
|
|
/**
 * Request a fresh proxy from the proxy provider API and cache it in Redis.
 *
 * An empty provider response is retried after a 5 second pause until $times
 * reaches SPIDER_REQUEST_MAX_NUMS. Every failure path records an error code
 * through setError() and returns false.
 *
 * @param string $task_key Redis key under which the proxy entry is cached.
 * @param int    $times    Current attempt number (starts at 1, incremented on retry).
 * @return array|false array('ip' => ..., 'port' => ...) on success, false on failure.
 */
public function getNewHttpIp($task_key, $times = 1) {
    // NOTE(review): the provider token is embedded in the URL; consider moving it to configuration.
    $url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
    $jsoninfo = $this->getCUrl($url);

    $this->writeLog('spider', 'getNewHttpIp.log', $task_key . '|' . $jsoninfo);

    // Empty response: retry until the attempt limit is reached, then give up.
    if (empty($jsoninfo)) {
        if ($times >= SPIDER_REQUEST_MAX_NUMS) {
            $this->setError(SPIDER_ERROR_IP_GET_EMPTY);
            return false;
        }

        // Wait 5 seconds before asking the provider again.
        sleep(5);
        return $this->getNewHttpIp($task_key, $times + 1);
    }

    $data = json_decode($jsoninfo, true);

    // A body that is not a JSON object/array cannot carry a proxy entry.
    if (!is_array($data)) {
        $this->setError(SPIDER_ERROR_IP_GET_FAIL);
        return false;
    }

    // Provider plan is used up; a new plan has to be purchased.
    $code = isset($data['code']) ? $data['code'] : null;
    if ($code == 10019 || $code == 10005) {
        $this->setError(SPIDER_ERROR_IP_GET_OUT);
        return false;
    }

    $proxy = (isset($data['data'][0]) && is_array($data['data'][0])) ? $data['data'][0] : array();
    $ip = isset($proxy['ip']) ? $proxy['ip'] : '';
    $port = isset($proxy['port']) ? $proxy['port'] : '';
    $end_time = isset($proxy['end_time']) ? $proxy['end_time'] : '';

    // Any other unexpected response shape.
    if (empty($ip) || empty($port)) {
        $this->setError(SPIDER_ERROR_IP_GET_FAIL);
        return false;
    }

    // Cache only for a positive lifetime: Redis rejects SETEX with a TTL <= 0,
    // and strtotime() returns false when end_time cannot be parsed.
    $end_ts = is_string($end_time) ? strtotime($end_time) : false;
    $expire_time = ($end_ts === false) ? 0 : $end_ts - time();
    if ($expire_time > 0) {
        $rdobj = $this->initRedis();
        $rdobj->setex($task_key, $expire_time, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));
    }

    return array('ip' => $ip, 'port' => $port);
}
|
|
|
|
|
|
|
public function getRequest($url, $headers = array(), $timeout = 60) { |
|
|
|
$ch = curl_init(); |
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_URL, $url); |
|
|
@ -206,4 +245,260 @@ class mSpider extends mBase { |
|
|
|
|
|
|
|
return $this->obj->selectAll('spider_weibo', array('sql' => $where, 'vals' => array()), 'id desc ', array($offset, $page_size)); |
|
|
|
} |
|
|
|
|
|
|
|
/**
 * Original posts: crawl one page of a user's original Weibo posts and store
 * the per-post behavior counters and the post content.
 *
 * A failed or empty HTTP response is retried (2 seconds apart) until $times
 * reaches SPIDER_REQUEST_MAX_NUMS. Attempts that fail because the proxy is
 * classified as unstable are not counted; an expired proxy is removed from
 * Redis so the next attempt fetches a new one.
 *
 * @param int|string $uid   Weibo user id.
 * @param int        $page  Page number to request.
 * @param int        $times Current attempt number (starts at 1).
 * @return bool|string true when the page was processed, 'done' when the page
 *                     has no posts, false on error (code stored via setError()).
 */
public function spiderBehaviorOriginalData($uid, $page, $times = 1) {
    $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");

    $rdobj = $this->initRedis();

    // Reuse the cached proxy; fetch a new one on a cache miss or when the
    // cached entry is corrupt or lacks ip/port.
    $ipinfo = $rdobj->get(_RC_SPIDER_IP);
    $ipdata = $ipinfo ? json_decode($ipinfo, true) : null;
    if (!is_array($ipdata) || empty($ipdata['ip']) || empty($ipdata['port'])) {
        $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
    }

    // Could not obtain a proxy.
    if (empty($ipdata)) {
        $this->setError($this->getError());
        return false;
    }

    $this->proxy_ip = $ipdata['ip'];
    $this->proxy_port = $ipdata['port'];

    $cookie = WEIBO_LOGIN_COOKIE;
    $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&hasori=1";
    $headers = array(
        "Cookie: SUB={$cookie}",
        "x-requested-with: XMLHttpRequest",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Referer: https://weibo.com/u/{$uid}?tabtype=feed"
    );

    $res = $this->getRequest($url, $headers);

    // Request failed.
    if (empty($res)) {
        // Default: the response was simply empty.
        $error_code = SPIDER_ERROR_URL_GET_EMPTY;

        // cURL reported an error: classify it.
        if ($this->curl_err) {
            $this->error_nums++;
            $error_code = $this->getSpiderErrors();
        }

        // Unstable proxy: this attempt does not count towards the limit.
        if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--;

        // Proxy expired: drop it so the next attempt fetches a new one.
        if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP);

        // Attempt limit reached: report the error.
        if ($times >= SPIDER_REQUEST_MAX_NUMS) {
            $this->setError($error_code);
            return false;
        }

        sleep(2);
        return $this->spiderBehaviorOriginalData($uid, $page, $times + 1);
    }

    $this->error_nums = 0;

    // A login page instead of JSON means the cookie has expired. Compare with
    // false explicitly: strpos() returns 0 for a match at the very start.
    if (strpos($res, '登录 - 微博') !== false) {
        $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
        return false;
    }

    $data = json_decode($res, 1);
    if (!$data || !is_array($data)) {
        $this->writeLog('spider', 'errorData.log', $res);
        $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
        return false;
    }

    // No more posts on this page: crawling is finished.
    if (empty($data['data']['list'])) return 'done';

    $date = date('Y-m-d');
    $bobj = new mWeiboBehavior();
    $wobj = new mWeibo();

    foreach ($data['data']['list'] as $da) {
        // Counters are absent when the author made the post invisible.
        if (!isset($da['reposts_count'])) continue;

        $temp = array(
            'uid' => $uid,
            'wid' => $da['id'],
            'reposts_count' => $da['reposts_count'],
            'comments_count' => $da['comments_count'],
            'attitudes_count' => $da['attitudes_count'],
            'date' => $date,
        );

        $saved = $bobj->saveBehaviorData($temp);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
        }

        // Parse the post time; one malformed timestamp must not abort the page.
        try {
            $created_at = new DateTime(isset($da['created_at']) ? $da['created_at'] : 'now');
        } catch (Exception $e) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
            continue;
        }

        $has_pics = isset($da['pic_num'], $da['pic_ids']) && $da['pic_num'] > 0;
        $weibo = array(
            'mblogid' => $da['mblogid'],
            'text' => strip_tags($da['text']),
            'refer' => '',
            'pic_ids' => $has_pics ? json_encode($da['pic_ids']) : '',
            'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
            'video_cover' => isset($da['page_info']['page_pic']) ? pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME) : '',
            'created_at' => $created_at->format('Y-m-d H:i:s')
        );

        $saved = $wobj->saveWeiboData($temp, $weibo);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo)));
        }
    }

    $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");
    return true;
}
|
|
|
|
|
|
|
/**
 * Forwarded posts: crawl one page of a user's reposts and store the behavior
 * counters and content of the posts that were forwarded.
 *
 * A failed or empty HTTP response is retried (2 seconds apart) until $times
 * reaches SPIDER_REQUEST_MAX_NUMS. Attempts that fail because the proxy is
 * classified as unstable are not counted; an expired proxy is removed from
 * Redis so the next attempt fetches a new one.
 *
 * @param int|string $uid   Weibo user id.
 * @param int        $page  Page number to request.
 * @param int        $times Current attempt number (starts at 1).
 * @return bool|string true when the page was processed, 'done' when the page
 *                     has no posts, false on error (code stored via setError()).
 */
public function spiderBehaviorForwardData($uid, $page, $times = 1) {
    $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");

    $rdobj = $this->initRedis();

    // Reuse the cached proxy; fetch a new one on a cache miss or when the
    // cached entry is corrupt or lacks ip/port.
    $ipinfo = $rdobj->get(_RC_SPIDER_IP);
    $ipdata = $ipinfo ? json_decode($ipinfo, true) : null;
    if (!is_array($ipdata) || empty($ipdata['ip']) || empty($ipdata['port'])) {
        $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
    }

    // Could not obtain a proxy.
    if (empty($ipdata)) {
        $this->setError($this->getError());
        return false;
    }

    $this->proxy_ip = $ipdata['ip'];
    $this->proxy_port = $ipdata['port'];

    $cookie = WEIBO_LOGIN_COOKIE;
    $endtime = strtotime("tomorrow");
    $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&endtime={$endtime}&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1";
    $headers = array(
        "Cookie: SUB={$cookie}",
        "x-requested-with: XMLHttpRequest",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Referer: https://weibo.com/u/{$uid}?is_text=1&is_pic=1&is_video=1&is_music=1&is_forward=1&end_time={$endtime}"
    );

    $res = $this->getRequest($url, $headers);

    // Request failed.
    if (empty($res)) {
        // Default: the response was simply empty.
        $error_code = SPIDER_ERROR_URL_GET_EMPTY;

        // cURL reported an error: classify it.
        if ($this->curl_err) {
            $this->error_nums++;
            $error_code = $this->getSpiderErrors();
        }

        // Unstable proxy: this attempt does not count towards the limit.
        if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--;

        // Proxy expired: drop it so the next attempt fetches a new one.
        if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP);

        // Attempt limit reached: report the error.
        if ($times >= SPIDER_REQUEST_MAX_NUMS) {
            $this->setError($error_code);
            return false;
        }

        sleep(2);
        return $this->spiderBehaviorForwardData($uid, $page, $times + 1);
    }

    $this->error_nums = 0;

    // A login page instead of JSON means the cookie has expired. Compare with
    // false explicitly: strpos() returns 0 for a match at the very start.
    if (strpos($res, '登录 - 微博') !== false) {
        $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
        return false;
    }

    $data = json_decode($res, 1);
    if (!$data || !is_array($data)) {
        $this->writeLog('spider', 'errorData.log', $res);
        $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
        return false;
    }

    // No more posts on this page: crawling is finished.
    if (empty($data['data']['list'])) return 'done';

    $date = date('Y-m-d');
    $bobj = new mWeiboBehavior();
    $wobj = new mWeibo();

    foreach ($data['data']['list'] as $item) {
        // Only keep entries posted by the requested user.
        if (!isset($item['user']['id']) || $item['user']['id'] != $uid) continue;

        // The forwarded (source) post; skip entries that carry none.
        if (!isset($item['retweeted_status']) || !is_array($item['retweeted_status'])) continue;
        $da = $item['retweeted_status'];

        // Counters are absent when the author made the post invisible.
        if (!isset($da['reposts_count'])) continue;

        $temp = array(
            'uid' => $uid,
            'wid' => $da['id'],
            'reposts_count' => $da['reposts_count'],
            'comments_count' => $da['comments_count'],
            'attitudes_count' => $da['attitudes_count'],
            'date' => $date,
        );

        $saved = $bobj->saveBehaviorData($temp);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
        }

        // Parse the post time; one malformed timestamp must not abort the page.
        try {
            $created_at = new DateTime(isset($da['created_at']) ? $da['created_at'] : 'now');
        } catch (Exception $e) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
            continue;
        }

        $has_pics = isset($da['pic_num'], $da['pic_ids']) && $da['pic_num'] > 0;
        $weibo = array(
            'mblogid' => $da['mblogid'],
            'text' => strip_tags($da['text']),
            'refer' => '',
            'pic_ids' => $has_pics ? json_encode($da['pic_ids']) : '',
            'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
            'video_cover' => isset($da['page_info']['page_pic']) ? pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME) : '',
            'created_at' => $created_at->format('Y-m-d H:i:s')
        );

        $saved = $wobj->saveWeiboData($temp, $weibo);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo)));
        }
    }

    $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");
    return true;
}
|
|
|
|
|
|
|
/**
 * Map the last cURL error message ($this->curl_err) to a spider error code.
 *
 * SOCKS5 handshake/read failures are reported as an expired proxy right away.
 * Connection-level failures are reported as an unstable proxy while fewer than
 * 5 errors have accumulated in $this->error_nums, and as an expired proxy on
 * exactly the 5th. Anything else is reported as an unknown proxy error.
 *
 * @return mixed One of SPIDER_ERROR_IP_EXPIERD, SPIDER_ERROR_IP_ASTABLE or
 *               SPIDER_ERROR_IP_UNKNOWN.
 */
private function getSpiderErrors() {
    // Messages that mean the proxy is no longer usable.
    $expired_patterns = array(
        '/Unable to receive initial SOCKS5 response./',
        '/SOCKS5 read timeout/',
    );
    foreach ($expired_patterns as $pattern) {
        // Use preg_match()'s return value: the match array is empty when
        // nothing matched, so reading index 0 would raise an undefined offset.
        if (preg_match($pattern, $this->curl_err) === 1) {
            return SPIDER_ERROR_IP_EXPIERD;
        }
    }

    $this->writeLog('spider', 'curl_error.log', $this->curl_err);

    // Messages that mean the proxy is unstable.
    $unstable_patterns = array(
        "/Can't complete SOCKS5 connection to 0.0.0.0:0/",
        "/Connection refused/",
    );
    foreach ($unstable_patterns as $pattern) {
        if (preg_match($pattern, $this->curl_err) !== 1) continue;

        if ($this->error_nums < 5) return SPIDER_ERROR_IP_ASTABLE;
        if ($this->error_nums == 5) return SPIDER_ERROR_IP_EXPIERD;
        // NOTE(review): with more than 5 accumulated errors this falls through
        // to SPIDER_ERROR_IP_UNKNOWN - confirm that is intended.
    }

    return SPIDER_ERROR_IP_UNKNOWN;
}
|
|
|
} |