You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
209 lines
8.2 KiB
209 lines
8.2 KiB
<?php
|
|
/**
|
|
*
|
|
*/
|
|
include_once(SERVER_ROOT."/model/mBase.php");
|
|
|
|
|
|
/**
 * Model for the Weibo spider.
 *
 * Responsibilities visible in this class:
 *  - obtain a proxy endpoint from the proxy vendor API and cache it in Redis;
 *  - perform HTTP GET requests through a SOCKS5 proxy with cURL;
 *  - parse a mobile Weibo detail page into a flat array of fields;
 *  - list stored rows from the `spider_weibo` table.
 *
 * The properties $proxy_ip, $proxy_port, $cookie_path and $curl_err are read
 * and written here but are not declared in this class.
 * NOTE(review): presumably they are declared on mBase or assigned by the
 * caller before a request is made - confirm.
 */
class mSpider extends mBase {

    /**
     * Wires the data-access object and the default table name.
     */
    public function __construct() {
        $this->obj = new dWeibo();
        $this->tbl = 'spider_weibo';
    }

    /**
     * Fetches one proxy endpoint from the vendor API and caches it in Redis.
     *
     * Calls are throttled through the Redis key 'aaa' (60 second TTL): while
     * that key exists the method returns false without contacting the API.
     * NOTE(review): the get/setex pair is not atomic, so two concurrent
     * callers can both pass the throttle - confirm whether that matters.
     *
     * @param string $task_key Redis key under which the proxy is cached
     *                         (6 minute TTL) as JSON {ip, port, end_time}.
     * @return array|false array('ip' => ..., 'port' => ...) on success; false
     *                     when throttled or when the API response does not
     *                     contain a usable ip/port.
     */
    public function getZmhttpIp($task_key) {
        $rdobj = $this->initRedis();

        if ($rdobj->get('aaa')) {
            return false;
        }
        $rdobj->setex('aaa', 60, 1);

        // NOTE(review): the vendor API token is hard-coded in this URL;
        // consider moving it to configuration.
        $url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
        $jsoninfo = $this->getCUrl($url);

        $this->writeLog('spider', 'getZmhttpIp.log', $task_key . '|' . $jsoninfo);

        // The response may be empty, non-JSON, or carry no entries; guard every
        // level instead of indexing blindly into the decoded value.
        $data = json_decode($jsoninfo, true);
        $entry = (is_array($data) && isset($data['data'][0]) && is_array($data['data'][0]))
            ? $data['data'][0]
            : array();

        $ip = isset($entry['ip']) ? $entry['ip'] : '';
        $port = isset($entry['port']) ? $entry['port'] : '';
        $end_time = isset($entry['end_time']) ? $entry['end_time'] : null;

        if (empty($ip) || empty($port)) {
            return false;
        }

        // Kept from the original: the Redis handle is re-acquired after the
        // HTTP call. NOTE(review): possibly redundant - confirm what
        // initRedis() returns on repeated calls.
        $rdobj = $this->initRedis();
        $rdobj->setex($task_key, 6 * 60, json_encode(array('ip' => $ip, 'port' => $port, 'end_time' => $end_time)));

        return array('ip' => $ip, 'port' => $port);
    }

    /**
     * Performs a GET request through the configured SOCKS5 proxy.
     *
     * The request uses the URL itself as Referer, follows redirects, forces
     * HTTP/1.1 and, when $this->cookie_path is set, reads and writes cookies
     * in that file. The cURL error text of the call is stored in
     * $this->curl_err.
     *
     * @param string $url     Target URL.
     * @param array  $headers Raw header lines passed to CURLOPT_HTTPHEADER.
     * @param int    $timeout Total request timeout passed to CURLOPT_TIMEOUT.
     * @return string|false Response body, or false when no proxy is configured
     *                      or the transfer failed.
     */
    public function getRequest($url, $headers = array(), $timeout = 60) {
        if (!$this->ensureProxyConfigured()) {
            return false;
        }

        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, false);

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, $url);

        if (!empty($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }

        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        // The guard above guarantees both proxy values are non-empty here.
        curl_setopt($ch, CURLOPT_PROXY, 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port);

        if (!empty($this->cookie_path)) {
            curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie_path);
            curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie_path);
        }

        // Follow 302 redirects so the content of the final URL is returned
        // (used by the paperpass report download).
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

        // TLS peer/host verification is disabled.
        // NOTE(review): this allows interception by the proxy; enable
        // verification if the targets permit it.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

        return $this->finishCurlRequest($ch);
    }

    /**
     * Downloads a Weibo video through the configured SOCKS5 proxy.
     *
     * Sends a weibo.com Referer and a User-Agent from getRandUserAgent(),
     * follows redirects, and allows up to 600 seconds for the transfer with a
     * 30 second connect timeout. The cURL error text is stored in
     * $this->curl_err.
     *
     * @param string $url Video URL.
     * @return string|false Response body, or false when no proxy is configured
     *                      or the transfer failed.
     */
    public function getWeiboVideo($url) {
        if (!$this->ensureProxyConfigured()) {
            return false;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        // SOCKS5 proxy; host names are resolved by the proxy.
        curl_setopt($ch, CURLOPT_PROXY, 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port);
        curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array(
            "Referer: https://weibo.com",
            "User-Agent: {$this->getRandUserAgent()}"
        ));
        curl_setopt($ch, CURLOPT_TIMEOUT, 600);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);

        return $this->finishCurlRequest($ch);
    }

    /**
     * Fetches the mobile detail page of a Weibo post and extracts its fields.
     *
     * The page embeds the post as JSON in a `var $render_data = [...]`
     * script assignment; that JSON is located with a regular expression and
     * decoded, and its first element's `status` entry is used.
     *
     * @param string|int $wid Weibo post id, interpolated into the detail URL.
     * @return array|string|false
     *   - false when the HTTP request failed (error set from $this->curl_err);
     *   - the string 'error' when no status data could be extracted (the raw
     *     HTML is passed to setError());
     *   - otherwise an array with keys text, pic_ids, video_url, video_cover,
     *     video_cover_url, created_at, reposts_count, comments_count,
     *     attitudes_count, plus retweeted_status (same keys and `wid`) when
     *     the post is a retweet.
     * @throws Exception when a created_at value cannot be parsed by DateTime.
     */
    public function getWapWeibo($wid) {
        $url = "https://m.weibo.cn/detail/{$wid}";

        // NOTE(review): a session cookie is hard-coded here; consider moving
        // it to configuration.
        $cookie_header = 'Cookie: SUB=_2A25K3UsmDeRhGeFH41MV8ibNyjSIHXVpk8LurDV6PUJbktAYLUzAkW1NekLV22E63ymp9LDusz92jTcj_StK-KEN;';
        $headers = array(
            $cookie_header,
            "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
        );

        $html = $this->getRequest($url, $headers);
        if (!$html) {
            $this->setError($this->curl_err);
            $this->writeLog('spider', 'curl_error.log', $this->curl_err."|".$url."|".$html);
            return false;
        }

        $content = array();
        if (preg_match('/var\s+\$render_data\s*=\s*(\[\{.*?\}\])\[0\]\s*\|\|\s*\{};/s', $html, $matches)) {
            // $matches[1] is the JSON array literal assigned to $render_data.
            $data = json_decode($matches[1], true);

            if (!empty($data) && isset($data[0]['status'])) {
                $content = $data[0]['status'];
            }
        }
        if (empty($content)) {
            $this->setError($html);
            return 'error';
        }

        $return = $this->buildStatusFields($content);

        // Original posts carry no retweeted_status key, so test with empty()
        // rather than reading the index directly.
        if (!empty($content['retweeted_status']) && is_array($content['retweeted_status'])) {
            $retweeted = $content['retweeted_status'];

            // Array union keeps `wid` first and then appends the shared fields.
            $return['retweeted_status'] = array('wid' => $retweeted['id']) + $this->buildStatusFields($retweeted);

            // The retweeted post's cover URL used to be written to the
            // top-level key, overwriting the outer post's own value. It now
            // lives under retweeted_status; the top-level key is only
            // back-filled when the outer post has no cover URL, so existing
            // readers of that key keep receiving a value.
            if ($return['video_cover_url'] === '') {
                $return['video_cover_url'] = $return['retweeted_status']['video_cover_url'];
            }
        }

        return $return;
    }

    /**
     * Returns one page of rows from `spider_weibo`, newest id first.
     *
     * Each condition entry becomes `and <key>=<value>`, or
     * `and <key> in (<v1>,<v2>,...)` when the value is an array.
     *
     * NOTE(review): keys and values are concatenated into the SQL text
     * unescaped and unquoted. Callers must pass trusted column names and
     * numeric (or already quoted) values only; prefer bound parameters if the
     * data layer's 'vals' entry supports them.
     *
     * @param array $condition Map of column => scalar or list of values.
     * @param int   $page_num  1-based page number; values below 1 are treated
     *                         as 1.
     * @param int   $page_size Rows per page; negative values are treated as 0.
     * @return mixed Whatever dWeibo::selectAll() returns.
     */
    public function getWeiboList($condition, $page_num, $page_size) {
        // Clamp paging so a bad page number cannot yield a negative offset.
        $page_num = max(1, (int) $page_num);
        $page_size = max(0, (int) $page_size);
        $offset = ($page_num - 1) * $page_size;

        $where = "1=1 ";
        if (!empty($condition) && is_array($condition)) {
            foreach ($condition as $key => $val) {
                if (!is_array($val)) {
                    $where .= " and {$key}={$val}";
                    continue;
                }

                if (empty($val)) {
                    // `in ()` is invalid SQL; an empty set matches no rows.
                    $where .= " and 1=0";
                    continue;
                }

                $list = implode(',', $val);
                $where .= " and {$key} in ({$list})";
            }
        }

        return $this->obj->selectAll('spider_weibo', array('sql' => $where, 'vals' => array()), 'id desc ', array($offset, $page_size));
    }

    /**
     * Checks that a proxy ip and port are set; records an error otherwise.
     *
     * @return bool true when both $this->proxy_ip and $this->proxy_port are
     *              non-empty.
     */
    private function ensureProxyConfigured() {
        if (empty($this->proxy_ip) || empty($this->proxy_port)) {
            $this->setError('配置代理后再来请求吧');
            return false;
        }

        return true;
    }

    /**
     * Executes a prepared cURL handle, records its error text and closes it.
     *
     * @param resource|object $ch Handle returned by curl_init().
     * @return string|false Result of curl_exec().
     */
    private function finishCurlRequest($ch) {
        $result = curl_exec($ch);
        $this->curl_err = curl_error($ch);

        curl_close($ch);

        return $result;
    }

    /**
     * Extracts the fields shared by a post and its retweeted post.
     *
     * @param array $status Decoded `status` (or `retweeted_status`) entry.
     * @return array Keys: text, pic_ids, video_url, video_cover,
     *               video_cover_url, created_at, reposts_count,
     *               comments_count, attitudes_count.
     * @throws Exception when created_at cannot be parsed by DateTime.
     */
    private function buildStatusFields($status) {
        $fields = array();

        $fields['text'] = strip_tags($status['text']);

        // Picture ids are stored as a JSON string, or '' when there are none.
        $has_pics = !empty($status['pic_num']) && $status['pic_num'] > 0;
        $fields['pic_ids'] = $has_pics ? json_encode($status['pic_ids']) : '';

        $fields['video_url'] = isset($status['page_info']['media_info']['stream_url'])
            ? $status['page_info']['media_info']['stream_url']
            : '';
        $fields['video_cover'] = isset($status['page_info']['page_pic']['pid'])
            ? $status['page_info']['page_pic']['pid']
            : '';
        $fields['video_cover_url'] = isset($status['page_info']['page_pic']['url'])
            ? $status['page_info']['page_pic']['url']
            : '';

        // Normalise the source timestamp to 'Y-m-d H:i:s'.
        $date = new DateTime($status['created_at']);
        $fields['created_at'] = $date->format('Y-m-d H:i:s');

        $fields['reposts_count'] = $status['reposts_count'];
        $fields['comments_count'] = $status['comments_count'];
        $fields['attitudes_count'] = $status['attitudes_count'];

        return $fields;
    }
}
|