obj = new dWeibo();
        $this->tbl = 'spider_weibo';
    }

    /**
     * Build the URL of the proxy vendor endpoint that hands out one proxy IP.
     *
     * NOTE(review): the API token is hard-coded here; it should be moved to
     * configuration so it does not live in source control.
     *
     * @return string
     */
    private function buildProxyApiUrl()
    {
        return "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
    }

    /**
     * Pull the first proxy entry out of a decoded proxy-API response.
     *
     * @param mixed $data Result of json_decode() on the API response.
     * @return array|false array('ip' => ..., 'port' => ..., 'end_time' => ...) or
     *                     false when the response has no usable ip/port.
     */
    private function extractProxyEntry($data)
    {
        if (!is_array($data) || !isset($data['data'][0]) || !is_array($data['data'][0])) {
            return false;
        }

        $entry = $data['data'][0];
        $ip = isset($entry['ip']) ? $entry['ip'] : '';
        $port = isset($entry['port']) ? $entry['port'] : '';
        if (empty($ip) || empty($port)) {
            return false;
        }

        return array(
            'ip' => $ip,
            'port' => $port,
            'end_time' => isset($entry['end_time']) ? $entry['end_time'] : null,
        );
    }

    /**
     * Request one proxy IP from the vendor API, at most once per 60 seconds.
     *
     * The 'ip_request' Redis key acts as the throttle flag. On success the proxy
     * is cached under $task_key for six minutes.
     *
     * @param string $task_key Redis key under which the proxy is cached.
     * @return array|false array('ip' => ..., 'port' => ...) or false when throttled
     *                     or when the API returned no usable proxy.
     */
    public function getZmhttpIp($task_key)
    {
        $rdobj = $this->initRedis();

        // Throttle flag still set: another request was issued within the last minute.
        if ($rdobj->get('ip_request')) {
            return false;
        }
        $rdobj->setex('ip_request', 60, 1);

        $jsoninfo = $this->getCUrl($this->buildProxyApiUrl());
        $this->writeLog('spider', 'getZmhttpIp.log', $task_key . '|' . $jsoninfo);

        $proxy = $this->extractProxyEntry(json_decode($jsoninfo, true));
        if ($proxy === false) {
            return false;
        }

        $rdobj->setex($task_key, 6 * 60, json_encode($proxy));

        return array('ip' => $proxy['ip'], 'port' => $proxy['port']);
    }

    /**
     * Request a new proxy IP from the vendor API, retrying on empty responses.
     *
     * An empty response is retried after 5 seconds until $times reaches
     * SPIDER_REQUEST_MAX_NUMS. On failure the matching SPIDER_ERROR_IP_* code is
     * passed to setError(). On success the proxy is cached under $task_key until
     * the end time reported by the API.
     *
     * @param string $task_key Redis key under which the proxy is cached.
     * @param int    $times    Current attempt number (1-based).
     * @return array|false array('ip' => ..., 'port' => ...) or false on failure.
     */
    public function getNewHttpIp($task_key, $times = 1)
    {
        $jsoninfo = $this->getCUrl($this->buildProxyApiUrl());
        $this->writeLog('spider', 'getNewHttpIp.log', $task_key . '|' . $jsoninfo);

        // Empty response: retry, and give up once the attempt limit is reached.
        if (empty($jsoninfo)) {
            if ($times >= SPIDER_REQUEST_MAX_NUMS) {
                $this->setError(SPIDER_ERROR_IP_GET_EMPTY);
                return false;
            }
            sleep(5);
            return $this->getNewHttpIp($task_key, $times + 1);
        }

        $data = json_decode($jsoninfo, true);

        // These codes are treated as "package used up, a new package is needed".
        $code = (is_array($data) && isset($data['code'])) ? $data['code'] : null;
        if ($code == 10019 || $code == 10005) {
            $this->setError(SPIDER_ERROR_IP_GET_OUT);
            return false;
        }

        // Any other response without a usable ip/port is a generic failure.
        $proxy = $this->extractProxyEntry($data);
        if ($proxy === false) {
            $this->setError(SPIDER_ERROR_IP_GET_FAIL);
            return false;
        }

        // Cache only with a valid positive TTL: a missing, unparsable or already
        // elapsed end_time would otherwise be sent to Redis as an invalid expiry.
        $end_ts = empty($proxy['end_time']) ? false : strtotime($proxy['end_time']);
        $expire_time = ($end_ts === false) ? 0 : $end_ts - time();
        if ($expire_time > 0) {
            $rdobj = $this->initRedis();
            $rdobj->setex($task_key, $expire_time, json_encode($proxy));
        }

        return array('ip' => $proxy['ip'], 'port' => $proxy['port']);
    }

    /**
     * Perform a GET request, routed through the configured SOCKS5 proxy when
     * $this->proxy_port is set, and using $this->cookie_path as cookie jar when set.
     *
     * The cURL error text of the request is stored in $this->curl_err.
     *
     * NOTE(review): TLS peer and host verification are disabled below.
     *
     * @param string $url     Target URL (also sent as Referer).
     * @param array  $headers Extra HTTP header lines.
     * @param int    $timeout Total timeout in seconds.
     * @return string|false Response body, or false when cURL failed.
     */
    public function getRequest($url, $headers = array(), $timeout = 60)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        if (!empty($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }
        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        if ($this->proxy_port) {
            curl_setopt($ch, CURLOPT_PROXY, 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port);
        }

        if (!empty($this->cookie_path)) {
            curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie_path);
            curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie_path);
        }

        // Follow redirects so the content of the final URL is returned.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

        // Skip TLS certificate verification.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

        $result = curl_exec($ch);
        $this->curl_err = curl_error($ch);
        curl_close($ch);

        // With CURLOPT_RETURNTRANSFER, curl_exec() yields the body or false.
        return $result;
    }

    /**
     * Download a Weibo video through the configured SOCKS5 proxy.
     *
     * Refuses to run without a proxy. The cURL error text is stored in
     * $this->curl_err.
     *
     * @param string $url Video URL.
     * @return string|false Response body, or false when no proxy is configured
     *                      or cURL failed.
     */
    public function getWeiboVideo($url)
    {
        if (empty($this->proxy_ip) || empty($this->proxy_port)) {
            $this->setError('配置代理后再来请求吧');
            return false;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        // SOCKS5 proxy; host names are resolved by the proxy.
        curl_setopt($ch, CURLOPT_PROXY, 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port);
        curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            "Referer: https://weibo.com",
            "User-Agent: {$this->getRandUserAgent()}"
        ]);
        curl_setopt($ch, CURLOPT_TIMEOUT, 600);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);

        $result = curl_exec($ch);
        $this->curl_err = curl_error($ch);
        curl_close($ch);

        return $result;
    }

    /**
     * JSON-encode the picture ids of a status, or return '' when it has no pictures.
     *
     * @param array $status Decoded status.
     * @return string
     */
    private function encodeStatusPicIds($status)
    {
        if (!isset($status['pic_num']) || !($status['pic_num'] > 0)) {
            return '';
        }

        return json_encode(isset($status['pic_ids']) ? $status['pic_ids'] : null);
    }

    /**
     * Map one status from the m.weibo.cn detail page to the flat result shape.
     *
     * @param array $status Decoded status.
     * @return array Keys: text, pic_ids, video_url, video_cover, video_cover_url,
     *               created_at, reposts_count, comments_count, attitudes_count.
     * @throws Exception When created_at cannot be parsed by DateTime.
     */
    private function formatWapStatus(array $status)
    {
        $page_info = (isset($status['page_info']) && is_array($status['page_info']))
            ? $status['page_info']
            : array();

        // Parse the source timestamp and normalise it to "Y-m-d H:i:s".
        $created = new DateTime(isset($status['created_at']) ? $status['created_at'] : 'now');

        return array(
            'text' => strip_tags(isset($status['text']) ? $status['text'] : ''),
            'pic_ids' => $this->encodeStatusPicIds($status),
            'video_url' => isset($page_info['media_info']['stream_url']) ? $page_info['media_info']['stream_url'] : '',
            'video_cover' => isset($page_info['page_pic']['pid']) ? $page_info['page_pic']['pid'] : '',
            'video_cover_url' => isset($page_info['page_pic']['url']) ? $page_info['page_pic']['url'] : '',
            'created_at' => $created->format('Y-m-d H:i:s'),
            'reposts_count' => isset($status['reposts_count']) ? $status['reposts_count'] : null,
            'comments_count' => isset($status['comments_count']) ? $status['comments_count'] : null,
            'attitudes_count' => isset($status['attitudes_count']) ? $status['attitudes_count'] : null,
        );
    }

    /**
     * Fetch one post from the m.weibo.cn detail page and extract its fields.
     *
     * The post is read from the `$render_data` JSON embedded in the page. For a
     * repost, the original post is returned under 'retweeted_status' with the same
     * fields plus 'wid'; its cover URL is stored there as well and no longer
     * overwrites the top-level 'video_cover_url'.
     *
     * NOTE(review): the login cookie is hard-coded; it should come from
     * configuration.
     *
     * @param string|int $wid Weibo post id.
     * @return array|string|false Extracted fields; the string 'error' when the page
     *                            contained no status (the page HTML is passed to
     *                            setError()); false when the request failed.
     * @throws Exception When a created_at value cannot be parsed by DateTime.
     */
    public function getWapWeibo($wid)
    {
        $url = "https://m.weibo.cn/detail/{$wid}";
        $cooike_header = 'Cookie: SUB=_2A25K3UsmDeRhGeFH41MV8ibNyjSIHXVpk8LurDV6PUJbktAYLUzAkW1NekLV22E63ymp9LDusz92jTcj_StK-KEN;';
        $headers = array(
            $cooike_header,
            "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
        );

        $html = $this->getRequest($url, $headers);
        if (!$html) {
            $this->setError($this->curl_err);
            $this->writeLog('spider', 'curl_error.log', $this->curl_err . "|" . $url . "|" . $html);
            return false;
        }

        // Extract and decode the JSON literal assigned to $render_data in the page.
        $content = array();
        if (preg_match('/var\s+\$render_data\s*=\s*(\[\{.*?\}\])\[0\]\s*\|\|\s*\{};/s', $html, $matches)) {
            $data = json_decode($matches[1], true);
            if (!empty($data) && isset($data[0]['status'])) {
                $content = $data[0]['status'];
            }
        }

        if (empty($content) || !is_array($content)) {
            $this->setError($html);
            return 'error';
        }

        $return = $this->formatWapStatus($content);

        if (!empty($content['retweeted_status']) && is_array($content['retweeted_status'])) {
            $origin = $content['retweeted_status'];
            $return['retweeted_status'] = array_merge(
                array('wid' => isset($origin['id']) ? $origin['id'] : null),
                $this->formatWapStatus($origin)
            );
        }

        return $return;
    }

    /**
     * List rows of spider_weibo matching simple equality / IN conditions.
     *
     * NOTE(review): column names and values are interpolated into the WHERE clause
     * without quoting or binding, so callers must pass only trusted, SQL-ready
     * values. Switching to bound parameters needs the placeholder convention of
     * selectAll(), which is not visible here.
     *
     * @param array $condition column => scalar (equality) or column => array (IN).
     * @param int   $page_num  1-based page number.
     * @param int   $page_size Rows per page.
     * @return mixed Whatever $this->obj->selectAll() returns.
     */
    public function getWeiboList($condition, $page_num, $page_size)
    {
        // Normalise paging so a page below 1 cannot yield a negative offset.
        $page_size = max(0, (int) $page_size);
        $offset = (max(1, (int) $page_num) - 1) * $page_size;

        $where = "1=1 ";
        if (!empty($condition)) {
            foreach ($condition as $key => $val) {
                if (is_array($val)) {
                    if (empty($val)) {
                        // "in ()" is invalid SQL; an empty set matches nothing.
                        $where .= " and 1=0";
                        continue;
                    }
                    $val = implode(',', $val);
                    $where .= " and {$key} in ({$val})";
                } else {
                    $where .= " and {$key}={$val}";
                }
            }
        }

        return $this->obj->selectAll('spider_weibo', array('sql' => $where, 'vals' => array()), 'id desc ', array($offset, $page_size));
    }

    /**
     * Crawl one page of a user's original posts and store behaviour data.
     *
     * @param string|int $uid   Weibo user id.
     * @param int        $page  Page number of the profile search.
     * @param int        $times Current attempt number (1-based).
     * @return bool|string true when the page was stored, 'done' when the page has
     *                     no more items, false on failure (see setError()).
     */
    public function spiderBehaviorOriginalData($uid, $page, $times = 1)
    {
        return $this->crawlBehaviorPage($uid, $page, $times, false);
    }

    /**
     * Crawl one page of a user's reposts and store behaviour data of the
     * reposted (original) statuses.
     *
     * @param string|int $uid   Weibo user id.
     * @param int        $page  Page number of the profile search.
     * @param int        $times Current attempt number (1-based).
     * @return bool|string true when the page was stored, 'done' when the page has
     *                     no more items, false on failure (see setError()).
     */
    public function spiderBehaviorForwardData($uid, $page, $times = 1)
    {
        return $this->crawlBehaviorPage($uid, $page, $times, true);
    }

    /**
     * Load the spider proxy from Redis (requesting a new one when the cached entry
     * is missing or unusable) and apply it to $this->proxy_ip / $this->proxy_port.
     *
     * @param object $rdobj Redis client returned by initRedis().
     * @return bool false when no proxy could be obtained.
     */
    private function applySpiderProxy($rdobj)
    {
        $cached = $rdobj->get(_RC_SPIDER_IP);
        $ipdata = $cached ? json_decode($cached, true) : null;

        // A missing or corrupt cache entry must not lead to an un-proxied request.
        if (!is_array($ipdata) || empty($ipdata['ip']) || empty($ipdata['port'])) {
            $ipdata = $this->getNewHttpIp(_RC_SPIDER_IP);
        }

        if (empty($ipdata)) {
            // Keep the error already recorded by getNewHttpIp().
            $this->setError($this->getError());
            return false;
        }

        $this->proxy_ip = $ipdata['ip'];
        $this->proxy_port = $ipdata['port'];

        return true;
    }

    /**
     * Build the profile-search URL and request headers for one crawl attempt.
     *
     * @param string|int $uid     Weibo user id.
     * @param int        $page    Page number.
     * @param bool       $forward true for the repost listing, false for originals.
     * @return array array($url, $headers)
     */
    private function buildBehaviorRequest($uid, $page, $forward)
    {
        $cookie = WEIBO_LOGIN_COOKIE;

        if ($forward) {
            $endtime = strtotime("tomorrow");
            $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&endtime={$endtime}&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1";
            $referer = "Referer: https://weibo.com/u/{$uid}?is_text=1&is_pic=1&is_video=1&is_music=1&is_forward=1&end_time={$endtime}";
        } else {
            $url = "https://weibo.com/ajax/statuses/searchProfile?uid={$uid}&page={$page}&hasori=1";
            $referer = "Referer: https://weibo.com/u/{$uid}?tabtype=feed";
        }

        $headers = array(
            "Cookie: SUB={$cookie}",
            "x-requested-with: XMLHttpRequest",
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            $referer,
        );

        return array($url, $headers);
    }

    /**
     * Store the behaviour counters and the post itself for one status.
     * Failed inserts are written to insert_error.log and do not stop the crawl.
     *
     * @param string|int $uid  Weibo user id the crawl belongs to.
     * @param array      $da   Decoded status.
     * @param string     $date Crawl date (Y-m-d).
     * @param object     $bobj mWeiboBehavior instance.
     * @param object     $wobj mWeibo instance.
     * @return void
     * @throws Exception When created_at cannot be parsed by DateTime.
     */
    private function persistBehaviorStatus($uid, $da, $date, $bobj, $wobj)
    {
        $temp = array(
            'uid' => $uid,
            'wid' => $da['id'],
            'reposts_count' => $da['reposts_count'],
            'comments_count' => $da['comments_count'],
            'attitudes_count' => $da['attitudes_count'],
            'date' => $date,
        );
        if (!$bobj->saveBehaviorData($temp)) {
            $this->writeLog('spider', 'insert_error.log', json_encode($temp));
        }

        // Parse the source timestamp and normalise it to "Y-m-d H:i:s".
        $created_at = new DateTime($da['created_at']);

        // The cover id is the file name (without extension) of the cover URL.
        $video_cover = '';
        if (isset($da['page_info']['page_pic']) && is_string($da['page_info']['page_pic'])) {
            $video_cover = pathinfo(basename($da['page_info']['page_pic']), PATHINFO_FILENAME);
        }

        $weibo = array(
            'mblogid' => $da['mblogid'],
            'text' => strip_tags($da['text']),
            'refer' => '',
            'pic_ids' => $this->encodeStatusPicIds($da),
            'video_url' => isset($da['page_info']['media_info']['stream_url']) ? $da['page_info']['media_info']['stream_url'] : '',
            'video_cover' => $video_cover,
            'created_at' => $created_at->format('Y-m-d H:i:s')
        );
        if (!$wobj->saveWeiboData($temp, $weibo)) {
            $this->writeLog('spider', 'insert_error.log', json_encode(array_merge($temp, $weibo)));
        }
    }

    /**
     * Shared implementation of the two behaviour crawlers.
     *
     * Each attempt resolves the proxy, requests the page and, on an empty
     * response, classifies the failure:
     *  - unstable proxy: the attempt is not counted;
     *  - expired proxy: the cached proxy is deleted so the next attempt gets a new one;
     *  - once $times reaches SPIDER_REQUEST_MAX_NUMS the error is recorded and the
     *    crawl gives up.
     * Attempts are separated by a 2 second pause.
     *
     * @param string|int $uid     Weibo user id.
     * @param int        $page    Page number.
     * @param int        $times   Current attempt number (1-based).
     * @param bool       $forward true to crawl reposts, false to crawl originals.
     * @return bool|string true, 'done' or false (see the public wrappers).
     */
    private function crawlBehaviorPage($uid, $page, $times, $forward)
    {
        while (true) {
            $this->writeLog('spider', 'getData.log', "{$uid}|{$page}|{$times}");

            $rdobj = $this->initRedis();
            if (!$this->applySpiderProxy($rdobj)) {
                return false;
            }

            list($url, $headers) = $this->buildBehaviorRequest($uid, $page, $forward);
            $body = $this->getRequest($url, $headers);
            if (!empty($body)) {
                break;
            }

            // Empty response; refine the error code when cURL reported an error.
            $error_code = SPIDER_ERROR_URL_GET_EMPTY;
            if ($this->curl_err) {
                $this->error_nums++;
                $error_code = $this->getSpiderErrors();
            }

            // Unstable proxy: this attempt does not count.
            if ($error_code == SPIDER_ERROR_IP_ASTABLE) {
                $times--;
            }

            // Expired proxy: drop it so the next attempt requests a new one.
            if ($error_code == SPIDER_ERROR_IP_EXPIERD) {
                $rdobj->del(_RC_SPIDER_IP);
            }

            // Attempt limit reached: report the last error.
            if ($times >= SPIDER_REQUEST_MAX_NUMS) {
                $this->setError($error_code);
                return false;
            }

            sleep(2);
            $times++;
        }

        $this->error_nums = 0;

        // A login page instead of JSON means the cookie is no longer valid.
        if (strpos($body, '登录 - 微博') !== false) {
            $this->setError(SPIDER_ERROR_URL_COOKIE_EXPIRED);
            return false;
        }

        $data = json_decode($body, true);
        if (!$data) {
            $this->writeLog('spider', 'errorData.log', $body);
            $this->setError(SPIDER_ERROR_URL_GET_UNKNOWN);
            return false;
        }

        // No items on this page: the crawl is finished.
        if (empty($data['data']['list'])) {
            return 'done';
        }

        $date = date('Y-m-d');
        $bobj = new mWeiboBehavior();
        $wobj = new mWeibo();

        foreach ($data['data']['list'] as $item) {
            if ($forward) {
                // Keep only reposts made by this user; store the reposted status.
                if (!isset($item['user']['id']) || $item['user']['id'] != $uid) {
                    continue;
                }
                $da = isset($item['retweeted_status']) ? $item['retweeted_status'] : null;
            } else {
                $da = $item;
            }

            // No counters available (the author made the post invisible).
            if (!isset($da['reposts_count'])) {
                continue;
            }

            $this->persistBehaviorStatus($uid, $da, $date, $bobj, $wobj);
        }

        $this->writeLog('spider', 'getDone.log', "{$uid}|{$page}|{$times}");

        return true;
    }

    /**
     * Map the last cURL error text ($this->curl_err) to a SPIDER_ERROR_IP_* code.
     *
     * SOCKS5 handshake/read failures mean the proxy has expired. Connection
     * failures count as "unstable" while $this->error_nums is below 5 and as
     * "expired" when it is exactly 5; any other case, including counts above 5,
     * is reported as unknown.
     *
     * @return mixed One of SPIDER_ERROR_IP_EXPIERD, SPIDER_ERROR_IP_ASTABLE,
     *               SPIDER_ERROR_IP_UNKNOWN.
     */
    private function getSpiderErrors()
    {
        $err = (string) $this->curl_err;

        if (preg_match('/Unable to receive initial SOCKS5 response./', $err) === 1
            || preg_match('/SOCKS5 read timeout/', $err) === 1
        ) {
            return SPIDER_ERROR_IP_EXPIERD;
        }

        $this->writeLog('spider', 'curl_error.log', $this->curl_err);

        // Connection-level failures indicate an unstable proxy.
        $unstable = preg_match("/Can't complete SOCKS5 connection to 0.0.0.0:0/", $err) === 1
            || preg_match("/Connection refused/", $err) === 1;
        if ($unstable) {
            if ($this->error_nums < 5) {
                return SPIDER_ERROR_IP_ASTABLE;
            }
            if ($this->error_nums == 5) {
                return SPIDER_ERROR_IP_EXPIERD;
            }
        }

        return SPIDER_ERROR_IP_UNKNOWN;
    }
}