You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

504 lines
19 KiB

2 months ago
<?php
/**
*
*/
2 months ago
include_once(SERVER_ROOT . "/model/mBase.php");
2 months ago
class mSpider extends mBase {
4 days ago
// Number of consecutive cURL failures: the spider methods increment it on every cURL error
// and reset it to 0 once a non-empty response arrives; getSpiderErrors() reads it to decide
// between the "unstable proxy" and "expired proxy" error codes.
private $error_nums = 0;
2 months ago
/**
 * Set the default table name and attach the Weibo data-access object.
 *
 * NOTE(review): the parent constructor is not invoked here — confirm mBase needs no initialisation.
 */
public function __construct() {
    $this->tbl = 'spider_weibo';
    $this->obj = new dWeibo();
}
/**
 * Request one proxy from the provider and cache it in Redis under $task_key for 6 minutes.
 *
 * Calls are throttled: the `ip_request` Redis flag (60 s TTL) blocks any further provider
 * request until it expires.
 *
 * @param string $task_key Redis key under which the proxy (ip, port, end_time) is stored as JSON.
 * @return array|false array('ip' => ..., 'port' => ...) on success; false when throttled or when
 *                     the provider response does not contain a usable ip/port.
 */
public function getZmhttpIp($task_key) {
    $rdobj = $this->initRedis();

    // Throttle guard: a call within the last 60 seconds left this flag behind.
    if ($rdobj->get('ip_request')) return false;
    $rdobj->setex('ip_request', 60, 1);

    // NOTE(review): the provider token is hard-coded in source; it should live in configuration.
    $url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
    $jsoninfo = $this->getCUrl($url);
    $this->writeLog('spider', 'getZmhttpIp.log', $task_key . '|' . $jsoninfo);

    // A failed request, a non-JSON body or an error payload has no data[0]; bail out
    // before indexing into it instead of raising undefined-offset warnings.
    $data = json_decode($jsoninfo, true);
    if (!is_array($data) || !isset($data['data'][0]) || !is_array($data['data'][0])) return false;

    $proxy = $data['data'][0];
    $ip = isset($proxy['ip']) ? $proxy['ip'] : '';
    $port = isset($proxy['port']) ? $proxy['port'] : '';
    $end_time = isset($proxy['end_time']) ? $proxy['end_time'] : null;
    if (empty($ip) || empty($port)) return false;

    $rdobj->setex($task_key, 6 * 60, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));
    return array('ip' => $ip, 'port' => $port);
}
4 days ago
/**
 * Request a fresh proxy from the provider and cache it in Redis until the provider-reported end time.
 *
 * An empty provider response is retried after a 5 second pause until $times reaches
 * SPIDER_REQUEST_MAX_NUMS. On failure an error code is recorded through setError():
 * SPIDER_ERROR_IP_GET_EMPTY (still empty after the retries), SPIDER_ERROR_IP_GET_OUT
 * (provider codes 10019 / 10005) or SPIDER_ERROR_IP_GET_FAIL (no ip/port in the response).
 *
 * @param string $task_key Redis key under which the proxy (ip, port, end_time) is stored as JSON.
 * @param int    $times    Attempt counter used by the retry recursion; callers normally omit it.
 * @return array|false array('ip' => ..., 'port' => ...) on success, false on failure.
 */
public function getNewHttpIp($task_key, $times = 1) {
    // NOTE(review): the provider token is hard-coded in source; it should live in configuration.
    $url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
    $jsoninfo = $this->getCUrl($url);
    $this->writeLog('spider', 'getNewHttpIp.log', $task_key . '|' . $jsoninfo);

    // Empty response: retry, and give up once the maximum number of attempts is reached.
    if (empty($jsoninfo)) {
        if ($times >= SPIDER_REQUEST_MAX_NUMS) {
            $this->setError(SPIDER_ERROR_IP_GET_EMPTY);
            return false;
        }
        // Wait 5 seconds before asking again.
        sleep(5);
        return $this->getNewHttpIp($task_key, $times + 1);
    }

    $data = json_decode($jsoninfo, true);

    // The proxy plan is used up and has to be replaced (per the original author's note).
    $code = (is_array($data) && isset($data['code'])) ? $data['code'] : null;
    if ($code == 10019 || $code == 10005) {
        $this->setError(SPIDER_ERROR_IP_GET_OUT);
        return false;
    }

    // Read the first proxy entry defensively: a non-JSON body or an unexpected payload has no data[0].
    $proxy = (is_array($data) && isset($data['data'][0]) && is_array($data['data'][0])) ? $data['data'][0] : array();
    $ip = isset($proxy['ip']) ? $proxy['ip'] : '';
    $port = isset($proxy['port']) ? $proxy['port'] : '';
    $end_time = isset($proxy['end_time']) ? $proxy['end_time'] : '';

    // Any other abnormal response.
    if (empty($ip) || empty($port)) {
        $this->setError(SPIDER_ERROR_IP_GET_FAIL);
        return false;
    }

    // Cache only while the lease is still running: SETEX needs a positive TTL, and a missing,
    // unparsable or already elapsed end_time would otherwise send Redis an invalid expire time.
    $end_ts = ($end_time === '' || $end_time === null) ? false : strtotime($end_time);
    $expire_time = ($end_ts === false) ? 0 : $end_ts - time();
    if ($expire_time > 0) {
        $rdobj = $this->initRedis();
        $rdobj->setex($task_key, $expire_time, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));
    }
    return array('ip' => $ip, 'port' => $port);
}
/**
 * Issue an HTTP GET with cURL and return the response body.
 *
 * The request is routed through a socks5h proxy when $this->proxy_port is set, and uses
 * $this->cookie_path as cookie file and cookie jar when it is non-empty. The cURL error
 * text of the call is stored in $this->curl_err.
 *
 * @param string $url     Target URL (also sent as the Referer).
 * @param array  $headers Raw header lines; skipped when empty.
 * @param int    $timeout Total request timeout in seconds.
 * @return string|false Response body, or false when curl_exec() fails.
 */
public function getRequest($url, $headers = array(), $timeout = 60) {
    // Collect the options in the order they are applied.
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_POST => false,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_REFERER => $url,
    );
    if (!empty($headers)) {
        $options[CURLOPT_HTTPHEADER] = $headers;
    }
    $options[CURLOPT_HTTP_VERSION] = CURL_HTTP_VERSION_1_1;
    $options[CURLOPT_TIMEOUT] = $timeout;
    if ($this->proxy_port) {
        $options[CURLOPT_PROXY] = 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port;
    }
    if (!empty($this->cookie_path)) {
        $options[CURLOPT_COOKIEFILE] = $this->cookie_path;
        $options[CURLOPT_COOKIEJAR] = $this->cookie_path;
    }
    // Follow redirects (e.g. a 302) so the content of the final URL is returned;
    // the paperpass report download relies on this.
    $options[CURLOPT_FOLLOWLOCATION] = 1;
    // Skip SSL verification (if needed).
    // NOTE(review): peer and host verification are both disabled for every request.
    $options[CURLOPT_SSL_VERIFYPEER] = false;
    $options[CURLOPT_SSL_VERIFYHOST] = false;

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }
    $body = curl_exec($handle);
    $this->curl_err = curl_error($handle);
    curl_close($handle);

    // With CURLOPT_RETURNTRANSFER set, curl_exec() already yields the body or false.
    return $body;
}
/**
 * Download a Weibo video through the configured SOCKS5 proxy.
 *
 * Refuses to run (setError + false) unless both $this->proxy_ip and $this->proxy_port are set.
 * The cURL error text of the call is stored in $this->curl_err.
 *
 * @param string $url Video URL.
 * @return string|false Response body, or false when no proxy is configured or curl_exec() fails.
 */
public function getWeiboVideo($url) {
    $hasProxy = !empty($this->proxy_ip) && !empty($this->proxy_port);
    if (!$hasProxy) {
        $this->setError('配置代理后再来请求吧');
        return false;
    }

    // Options in the order they are applied.
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        // Route through the SOCKS5 proxy.
        CURLOPT_PROXY => 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port,
        // Let the SOCKS5 proxy resolve host names.
        CURLOPT_PROXYTYPE => CURLPROXY_SOCKS5_HOSTNAME,
        CURLOPT_HTTPHEADER => array(
            "Referer: https://weibo.com",
            "User-Agent: " . $this->getRandUserAgent(),
        ),
        CURLOPT_TIMEOUT => 600,
        CURLOPT_CONNECTTIMEOUT => 30,
    );

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }
    $body = curl_exec($handle);
    $this->curl_err = curl_error($handle);
    curl_close($handle);

    // With CURLOPT_RETURNTRANSFER set, curl_exec() already yields the body or false.
    return $body;
}
/**
 * Fetch a single post from the mobile site (m.weibo.cn) and extract its main fields.
 *
 * The page embeds the post as JSON in a `$render_data` script variable; that JSON is
 * extracted with a regular expression and its `status` object is read.
 *
 * @param string|int $wid Weibo post id.
 * @return array|string|false Field array (text, pic_ids, video_url, video_cover, video_cover_url,
 *                            created_at, reposts_count, comments_count, attitudes_count and, for a
 *                            repost, retweeted_status with the same fields plus wid);
 *                            the string 'error' when the page holds no parsable status (the page
 *                            HTML is stored via setError()); false when the HTTP request fails.
 * @throws Exception when a created_at value cannot be parsed by DateTime.
 */
public function getWapWeibo($wid) {
    $url = "https://m.weibo.cn/detail/{$wid}";
    // NOTE(review): a login cookie is hard-coded here; it should come from configuration.
    $cookie_header = 'Cookie: SUB=_2A25K3UsmDeRhGeFH41MV8ibNyjSIHXVpk8LurDV6PUJbktAYLUzAkW1NekLV22E63ymp9LDusz92jTcj_StK-KEN;';
    $headers = array(
        $cookie_header,
        "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
    );
    $html = $this->getRequest($url, $headers);
    if (!$html) {
        $this->setError($this->curl_err);
        $this->writeLog('spider', 'curl_error.log', $this->curl_err . "|" . $url . "|" . $html);
        return false;
    }

    $content = array();
    if (preg_match('/var\s+\$render_data\s*=\s*(\[\{.*?\}\])\[0\]\s*\|\|\s*\{};/s', $html, $matches)) {
        // $matches[1] is the embedded JSON array; its first element carries the status.
        $data = json_decode($matches[1], true);
        if (!empty($data) && isset($data[0]['status'])) {
            $content = $data[0]['status'];
        }
    }
    if (empty($content)) {
        $this->setError($html);
        return 'error';
    }

    $return = $this->extractWapStatusFields($content);

    // Reposts carry the original post; !empty() avoids an undefined-index warning on plain posts.
    if (!empty($content['retweeted_status'])) {
        $retweeted = $content['retweeted_status'];
        // Fix: the cover URL of the original post now stays inside retweeted_status instead of
        // overwriting the top-level video_cover_url, as the previous copy-pasted code did.
        $return['retweeted_status'] = array_merge(
            array('wid' => $retweeted['id']),
            $this->extractWapStatusFields($retweeted)
        );
    }
    return $return;
}

/**
 * Map one status object from the mobile page JSON to the flat field set returned by getWapWeibo().
 *
 * @param array $status Decoded status (the post itself or its retweeted_status).
 * @return array
 * @throws Exception when created_at cannot be parsed by DateTime.
 */
private function extractWapStatusFields($status) {
    $has_pics = isset($status['pic_num']) && $status['pic_num'] > 0;
    $created_at = new DateTime($status['created_at']); // parse the post time
    return array(
        'text' => strip_tags($status['text']),
        'pic_ids' => $has_pics ? json_encode($status['pic_ids']) : '',
        'video_url' => isset($status['page_info']['media_info']['stream_url']) ? $status['page_info']['media_info']['stream_url'] : '',
        'video_cover' => isset($status['page_info']['page_pic']['pid']) ? $status['page_info']['page_pic']['pid'] : '',
        'video_cover_url' => isset($status['page_info']['page_pic']['url']) ? $status['page_info']['page_pic']['url'] : '',
        'created_at' => $created_at->format('Y-m-d H:i:s'),
        'reposts_count' => $status['reposts_count'],
        'comments_count' => $status['comments_count'],
        'attitudes_count' => $status['attitudes_count'],
    );
}
/**
 * Return one page of rows from `spider_weibo`, newest id first.
 *
 * @param array $condition column => value pairs joined with AND; a scalar becomes `col=value`,
 *                         an array becomes `col in (v1,v2,...)`.
 * @param int   $page_num  1-based page number.
 * @param int   $page_size Rows per page.
 * @return mixed Whatever the data layer's selectAll() returns.
 *
 * NOTE(review): keys and values are interpolated into the SQL text verbatim, so callers must
 * only pass trusted, already-quoted input. The empty 'vals' array suggests selectAll() may
 * accept bound parameters — confirm and switch to them.
 */
public function getWeiboList($condition, $page_num, $page_size) {
    // A page number below 1 would produce a negative offset, which is not valid in a LIMIT clause.
    $offset = max(0, ($page_num - 1) * $page_size);
    $where = "1=1 ";
    if (!empty($condition)) {
        foreach ($condition as $key => $val) {
            if (!is_array($val)) {
                $where .= " and {$key}={$val}";
                continue;
            }
            if (empty($val)) {
                // `col in ()` is a SQL syntax error; membership in an empty set matches no row.
                $where .= " and 1=0";
                continue;
            }
            $val = implode(',', $val);
            $where .= " and {$key} in ({$val})";
        }
    }
    return $this->obj->selectAll('spider_weibo', array('sql' => $where, 'vals' => array()), 'id desc ', array($offset, $page_size));
}
4 days ago
/**
 * Original posts: crawl one page of a user's own posts and store the engagement counters
 * (reposts / comments / likes) plus the post content.
 *
 * The request goes through the proxy cached in Redis under _RC_SPIDER_IP; a new proxy is
 * requested when none is cached. Empty responses are retried after a 2 second pause until
 * $times reaches SPIDER_REQUEST_MAX_NUMS.
 *
 * @param string|int $uid   Weibo user id.
 * @param int        $page  Page number passed to the searchProfile endpoint.
 * @param int        $times Attempt counter used by the retry recursion; callers normally omit it.
 * @return bool|string true when the page was processed, 'done' when the page holds no posts,
 *                     false on failure (error code recorded through setError()).
 * @throws Exception when a post's created_at cannot be parsed by DateTime.
 */
public function spiderBehaviorOriginalData($uid, $page, $times = 1) {
    $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");
    $rdobj = $this->initRedis();
    $ipinfo = $rdobj->get(_RC_SPIDER_IP);
    $ipdata = $ipinfo ? json_decode($ipinfo, true) : null;
    // Request a new proxy when nothing is cached, and also when the cached entry cannot be
    // decoded into an ip/port pair (previously such an entry blocked crawling until it expired).
    if (empty($ipdata['ip']) || empty($ipdata['port'])) {
        $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
    }
    // Could not obtain a proxy; getNewHttpIp() has recorded the reason.
    if (empty($ipdata)) {
        $this->setError($this->getError());
        return false;
    }
    $this->proxy_ip = $ipdata['ip'];
    $this->proxy_port = $ipdata['port'];

    $cookie = WEIBO_LOGIN_COOKIE;
    $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&hasori=1";
    $headers = array(
        "Cookie: SUB={$cookie}",
        "x-requested-with: XMLHttpRequest",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Referer: https://weibo.com/u/{$uid}?tabtype=feed"
    );
    $res = $this->getRequest($url, $headers);

    // Request failed or came back empty.
    if (empty($res)) {
        // Default: empty response.
        $error_code = SPIDER_ERROR_URL_GET_EMPTY;
        // cURL reported an error: classify it.
        if ($this->curl_err) {
            $this->error_nums++;
            $error_code = $this->getSpiderErrors();
        }
        // Unstable proxy: this attempt does not count towards the retry limit.
        if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--;
        // Proxy no longer usable: drop it so the retry requests a new one.
        if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP);
        // Give up once the maximum number of counted attempts is reached.
        if ($times >= SPIDER_REQUEST_MAX_NUMS) {
            $this->setError($error_code);
            return false;
        }
        sleep(2);
        return $this->spiderBehaviorOriginalData($uid, $page, $times + 1);
    }
    $this->error_nums = 0;

    // Login page served instead of JSON: the cookie has expired.
    // Compare against false explicitly: strpos() returns 0 (falsy) for a match at the very start.
    if (strpos($res, '登录 - 微博') !== false) {
        $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
        return false;
    }
    $data = json_decode($res, true);
    if (!$data) {
        $this->writeLog('spider', 'errorData.log', $res);
        $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
        return false;
    }
    // No posts on this page: crawling is finished.
    if (empty($data['data']['list'])) return 'done';

    $date = date('Y-m-d');
    $bobj = new mWeiboBehavior();
    $wobj = new mWeibo();
    foreach ($data['data']['list'] as $da) {
        // Counters are absent when the author made the post not visible.
        if (!isset($da['reposts_count'])) continue;
        $temp = array(
            'uid' => $uid,
            'wid' => $da['id'],
            'reposts_count' => $da['reposts_count'],
            'comments_count' => $da['comments_count'],
            'attitudes_count' => $da['attitudes_count'],
            'date' => $date,
        );
        $saved = $bobj->saveBehaviorData($temp);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
        }

        $created_at = new DateTime($da['created_at']); // parse the post time
        $has_pics = isset($da['pic_num']) && $da['pic_num'] > 0;
        // basename() needs a string; skip the cover when page_pic has any other shape.
        $has_cover = isset($da['page_info']['page_pic']) && is_string($da['page_info']['page_pic']);
        $weibo = array(
            'mblogid' => $da['mblogid'],
            'text' => strip_tags($da['text']),
            'refer' => '',
            'pic_ids' => $has_pics ? json_encode($da['pic_ids']) : '',
            'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
            'video_cover' => $has_cover ? pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME) : '',
            'created_at' => $created_at->format('Y-m-d H:i:s')
        );
        $saved = $wobj->saveWeiboData($temp, $weibo);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo)));
        }
    }
    $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");
    return true;
}
/**
 * Reposts: crawl one page of a user's reposts and store, for each reposted original post,
 * its engagement counters (reposts / comments / likes) plus its content.
 *
 * The request goes through the proxy cached in Redis under _RC_SPIDER_IP; a new proxy is
 * requested when none is cached. Empty responses are retried after a 2 second pause until
 * $times reaches SPIDER_REQUEST_MAX_NUMS.
 *
 * @param string|int $uid   Weibo user id.
 * @param int        $page  Page number passed to the searchProfile endpoint.
 * @param int        $times Attempt counter used by the retry recursion; callers normally omit it.
 * @return bool|string true when the page was processed, 'done' when the page holds no posts,
 *                     false on failure (error code recorded through setError()).
 * @throws Exception when a post's created_at cannot be parsed by DateTime.
 */
public function spiderBehaviorForwardData($uid, $page, $times = 1) {
    $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");
    $rdobj = $this->initRedis();
    $ipinfo = $rdobj->get(_RC_SPIDER_IP);
    $ipdata = $ipinfo ? json_decode($ipinfo, true) : null;
    // Request a new proxy when nothing is cached, and also when the cached entry cannot be
    // decoded into an ip/port pair (previously such an entry blocked crawling until it expired).
    if (empty($ipdata['ip']) || empty($ipdata['port'])) {
        $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
    }
    // Could not obtain a proxy; getNewHttpIp() has recorded the reason.
    if (empty($ipdata)) {
        $this->setError($this->getError());
        return false;
    }
    $this->proxy_ip = $ipdata['ip'];
    $this->proxy_port = $ipdata['port'];

    $cookie = WEIBO_LOGIN_COOKIE;
    $endtime = strtotime("tomorrow");
    $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&endtime={$endtime}&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1";
    $headers = array(
        "Cookie: SUB={$cookie}",
        "x-requested-with: XMLHttpRequest",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Referer: https://weibo.com/u/{$uid}?is_text=1&is_pic=1&is_video=1&is_music=1&is_forward=1&end_time={$endtime}"
    );
    $res = $this->getRequest($url, $headers);

    // Request failed or came back empty.
    if (empty($res)) {
        // Default: empty response.
        $error_code = SPIDER_ERROR_URL_GET_EMPTY;
        // cURL reported an error: classify it.
        if ($this->curl_err) {
            $this->error_nums++;
            $error_code = $this->getSpiderErrors();
        }
        // Unstable proxy: this attempt does not count towards the retry limit.
        if ($error_code == SPIDER_ERROR_IP_ASTABLE) $times--;
        // Proxy no longer usable: drop it so the retry requests a new one.
        if ($error_code == SPIDER_ERROR_IP_EXPIERD) $rdobj->del(_RC_SPIDER_IP);
        // Give up once the maximum number of counted attempts is reached.
        if ($times >= SPIDER_REQUEST_MAX_NUMS) {
            $this->setError($error_code);
            return false;
        }
        sleep(2);
        return $this->spiderBehaviorForwardData($uid, $page, $times + 1);
    }
    $this->error_nums = 0;

    // Login page served instead of JSON: the cookie has expired.
    // Compare against false explicitly: strpos() returns 0 (falsy) for a match at the very start.
    if (strpos($res, '登录 - 微博') !== false) {
        $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
        return false;
    }
    $data = json_decode($res, true);
    if (!$data) {
        $this->writeLog('spider', 'errorData.log', $res);
        $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
        return false;
    }
    // No posts on this page: crawling is finished.
    if (empty($data['data']['list'])) return 'done';

    $date = date('Y-m-d');
    $bobj = new mWeiboBehavior();
    $wobj = new mWeibo();
    foreach ($data['data']['list'] as $item) {
        // Only entries posted by the requested user are considered.
        if (!isset($item['user']['id']) || $item['user']['id'] != $uid) continue;
        // Entries without a reposted original have nothing to record.
        if (empty($item['retweeted_status'])) continue;
        $da = $item['retweeted_status'];
        // Counters are absent when the author made the post not visible.
        if (!isset($da['reposts_count'])) continue;
        $temp = array(
            'uid' => $uid,
            'wid' => $da['id'],
            'reposts_count' => $da['reposts_count'],
            'comments_count' => $da['comments_count'],
            'attitudes_count' => $da['attitudes_count'],
            'date' => $date,
        );
        $saved = $bobj->saveBehaviorData($temp);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
        }

        $created_at = new DateTime($da['created_at']); // parse the post time
        $has_pics = isset($da['pic_num']) && $da['pic_num'] > 0;
        // basename() needs a string; skip the cover when page_pic has any other shape.
        $has_cover = isset($da['page_info']['page_pic']) && is_string($da['page_info']['page_pic']);
        $weibo = array(
            'mblogid' => $da['mblogid'],
            'text' => strip_tags($da['text']),
            'refer' => '',
            'pic_ids' => $has_pics ? json_encode($da['pic_ids']) : '',
            'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
            'video_cover' => $has_cover ? pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME) : '',
            'created_at' => $created_at->format('Y-m-d H:i:s')
        );
        $saved = $wobj->saveWeiboData($temp, $weibo);
        if (!$saved) {
            $this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo)));
        }
    }
    $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");
    return true;
}
/**
 * Translate the last cURL error text ($this->curl_err) into a spider error code.
 *
 * - SOCKS5 handshake failure or read timeout: SPIDER_ERROR_IP_EXPIERD.
 * - "Can't complete SOCKS5 connection" / "Connection refused": SPIDER_ERROR_IP_ASTABLE while
 *   fewer than 5 consecutive errors were counted, SPIDER_ERROR_IP_EXPIERD on exactly the 5th.
 * - Anything else: SPIDER_ERROR_IP_UNKNOWN (the error text is written to curl_error.log).
 *
 * @return mixed One of the SPIDER_ERROR_IP_* constants.
 */
private function getSpiderErrors() {
    // Use preg_match()'s return value: reading $match[0] after a failed match raised an
    // undefined-offset warning for every pattern that did not match.
    if (preg_match('/Unable to receive initial SOCKS5 response./', $this->curl_err)) {
        return SPIDER_ERROR_IP_EXPIERD;
    }
    if (preg_match('/SOCKS5 read timeout/', $this->curl_err)) {
        return SPIDER_ERROR_IP_EXPIERD;
    }
    $this->writeLog('spider', 'curl_error.log', $this->curl_err);

    // Messages that indicate an unstable proxy.
    $unstable_patterns = array(
        "/Can't complete SOCKS5 connection to 0.0.0.0:0/",
        "/Connection refused/",
    );
    foreach ($unstable_patterns as $pattern) {
        if (!preg_match($pattern, $this->curl_err)) continue;
        if ($this->error_nums < 5) return SPIDER_ERROR_IP_ASTABLE;
        if ($this->error_nums == 5) return SPIDER_ERROR_IP_EXPIERD;
        // NOTE(review): beyond 5 consecutive errors this falls through to SPIDER_ERROR_IP_UNKNOWN,
        // as before — confirm that is intended rather than ">= 5".
    }
    return SPIDER_ERROR_IP_UNKNOWN;
}
2 months ago
}