Browse Source

评论数据入库

xmz_weibo_demo
longchao 1 day ago
parent
commit
289d014850
  1. 13
      config/define.php
  2. 1
      data/dWeibo.php
  3. 71
      model/mSpider.php
  4. 4
      model/mWeibo.php
  5. 26
      model/mWeiboComments.php
  6. 5
      queue/config/daemonconf.php
  7. 38
      queue/crontab/add_spider_weibo_comment_task.php
  8. 103
      queue/deal/spider_weibo_comment.php
  9. 8
      tools/filter_comments.php

13
config/define.php

@ -53,6 +53,7 @@
// NOTE(review): hard-coded Weibo login cookie value committed to the repository;
// presumably sent as the SUB cookie by the crawlers — confirm, and consider moving it out of version control.
define('WEIBO_LOGIN_COOKIE', '_2A25FMHeEDeRhGeFH41MV8ibNyjSIHXVmTPVMrDV8PUNbmtAYLWLlkW9NekLV22uXkAnuAlCngnjVjozSXR5mujv7');
// presumably the Redis queue key for weibo-behavior crawl tasks — TODO confirm against its consumers
define('_RQ_SPIDER_WEIBO_BEHAVIOR', 'rq_spider_weibo_behavior');
// Redis list key holding pending comment-crawl tasks (producers lpush, the worker rpops).
define('_RQ_SPIDER_WEIBO_COMMENT', 'rq_spider_weibo_comment');
// URL template with a single %s placeholder; NOTE(review): presumably filled with a weibo id — confirm.
define('WEIBO_DETAIL_URL', 'https://weibo.yizherenxin.cn/detail/%s?v=1');
@ -82,4 +83,14 @@
// Log file name passed to writeLog() for comment-crawl log entries.
define('ZHISHIKU_SPIDER_COMMENT', 'zhishiku_spider_comment.log');
// Root directory for temporary crawler files.
define('ZHISHIKU_SPIDER_TEMP_PATH', DATACENTER_ROOT.'/zhishiku_spider_temp/');
// Path template with two %s placeholders for per-comment JSON dumps.
// Defined exactly once: a second define() of the same name is rejected by PHP
// ("constant already defined") and only produces noise in the logs.
define('ZHISHIKU_SPIDER_COMMENT_PATH', ZHISHIKU_SPIDER_TEMP_PATH.'/spider_comment/%s/%s.log');
// Comment-crawl status: 0 = not crawled, 1 = crawl interrupted by an error, 2 = crawled successfully
define('SPIDER_COMMENT_STATUS_NO', 0);
define('SPIDER_COMMENT_STATUS_ERROR', 1);
define('SPIDER_COMMENT_STATUS_SUCCESS', 2);
// Redis set of weibo ids that already have a comment-crawl task queued (used for de-duplication).
define('_RS_SPIDER_WEIBO_COMMENT', 'rs_spider_weibo_comment');
// Redis key that may hold a replacement login cookie; the worker falls back to WEIBO_LOGIN_COOKIE2 when it is empty.
define('_RC_WEIBO_LOGIN_COOKIE2', 'rc_weibo_login_cookie2');
// NOTE(review): hard-coded credential; consider moving it out of version control.
define('WEIBO_LOGIN_COOKIE2', '_2A25FdJmRDeRhGeFG71sY9CbOyDuIHXVmC5NZrDV8PUNbmtAYLUj1kW9NeUknrDj_DFv7tUuAQqx1kWSg_y3B4Qe3');

1
data/dWeibo.php

@ -24,6 +24,7 @@ class dWeibo extends dBase {
'comments_count',
'attitudes_count',
'status',
'spider_comment_status'
),
);

71
model/mSpider.php

@ -7,6 +7,8 @@ include_once(SERVER_ROOT . "/model/mBase.php");
class mSpider extends mBase {
public $is_change_ip = false;
public function __construct() {
$this->obj = new dWeibo();
$this->tbl = 'spider_weibo';
@ -15,20 +17,22 @@ class mSpider extends mBase {
public function getZmhttpIp($task_key) {
//一分钟内不允许再次获取ip
$rdobj = $this->initRedis();
$ip_request = $rdobj->get('ip_request');
$ip_request_key = 'ip_request_'. $task_key;
$ip_request = $rdobj->get($ip_request_key);
if ($ip_request) return false;
$rdobj->setex('ip_request', 60, 1);
$rdobj->setex($ip_request_key, 60, 1);
$url = "http://proxy.siyetian.com/apis_get.html?token=AesJWLNp2a65kaJdXTqFFeNpWT35ERNpnTn1STqFUeORUR31kaNh3TUl0dPRUQy4ERJdXT6lVN.QMxkTO0MjM0cTM&limit=1&type=1&time=&data_format=json&showTimeEnd=true";
$jsoninfo = $this->getCUrl($url);
$this->writeLog('spider', 'getZmhttpIp.log', $task_key . '|' . $jsoninfo);
$this->writeLog('spider', 'get_zmhttp_ip.log', $task_key . '|' . $jsoninfo);
$data = json_decode($jsoninfo, true);
//套餐用完 需要换套餐了
if ($data['code'] == 10019 || $data['code'] == 10005) {
$this->sendMail(array('1026652509@qq.com'), date('Y年m月d日', time()) . '-微博数据抓取', $data['info']);
if($task_key == _RC_SPIDER_WEIBO_COMMENT_USE_IP) $this->sendMail(array('1432334894@qq.com'), date('Y年m月d日', time()) . '-微博评论数据抓取', $data['info']);
return false;
}
@ -44,8 +48,8 @@ class mSpider extends mBase {
return array('ip' => $ip, 'port' => $port);
}
public function getRequest($url, $headers = array(), $timeout = 30) {
if (empty($this->proxy_ip) || empty($this->proxy_port)) {
public function getRequest($url, $headers = array(), $timeout = 30, $use_proxy = true) {
if ((empty($this->proxy_ip) || empty($this->proxy_port)) && $use_proxy) {
$this->setError('配置代理后再来请求吧');
return false;
}
@ -55,7 +59,6 @@ class mSpider extends mBase {
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_POST, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $url);
@ -64,7 +67,7 @@ class mSpider extends mBase {
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
if ($this->proxy_port) {
if ($this->proxy_port && $use_proxy) {
curl_setopt($ch, CURLOPT_PROXY, 'socks5h://' . $this->proxy_ip . ":" . $this->proxy_port);
}
@ -82,7 +85,6 @@ class mSpider extends mBase {
$result = curl_exec($ch);
$this->curl_err = curl_error($ch);
curl_close($ch);
if ($result !== false) return $result;
@ -328,10 +330,63 @@ class mSpider extends mBase {
return true;
}
preg_match('/Connection timed out after/', $this->curl_err, $match);
if ($match[0]) {
$this->is_change_ip = true;
return true;
}
return true;
}
/**
 * Remove every byte sequence made of a lead byte in 0xF0-0xF7 followed by
 * three bytes in 0x80-0xBF, i.e. the byte layout of a 4-byte UTF-8 character.
 *
 * @param string $str Input text.
 * @return string|null The text without those sequences (null if preg_replace fails).
 */
private function remove4ByteChars($str) {
    $four_byte_sequence = '/[\xF0-\xF7][\x80-\xBF]{3}/';
    $stripped = preg_replace($four_byte_sequence, '', $str);
    return $stripped;
}
/**
 * Request one page of comments for a weibo from the buildComments endpoint.
 *
 * @param string     $cookie  Value sent as the SUB cookie.
 * @param int|string $uid     Id of the weibo author (used in the query string and the Referer).
 * @param int|string $wid     Weibo id sent as the `id` query parameter.
 * @param string     $mblogid Blog id used to build the Referer header.
 * @param int|string $max_id  Paging cursor; only added to the URL when greater than 0.
 * @return string|false Raw JSON response body, or false on failure (see requestComment()).
 */
public function spiderComments($cookie, $uid, $wid, $mblogid, $max_id = 0) {
    // The paging cursor is the only difference between the first-page URL and follow-up URLs.
    $paging = $max_id > 0 ? "&max_id={$max_id}" : '';
    $url = "https://weibo.com/ajax/statuses/buildComments?flow=1&is_reload=1&id={$wid}&is_show_bulletin=2&is_mix=0{$paging}&count=10&uid={$uid}&fetch_level=0&locale=zh-CN";

    // Single header set: the earlier Chrome/136 copy was always overwritten before use.
    $headers = array(
        "Cookie: SUB={$cookie}",
        "x-requested-with: XMLHttpRequest",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
        "Referer: https://weibo.com/{$uid}/{$mblogid}"
    );

    return $this->requestComment($uid, $url, $headers);
}
/**
 * Perform the comment request through the configured proxy and validate the response.
 *
 * @param int|string $uid     Author id, only used in the failure log line.
 * @param string     $url     Fully built request URL.
 * @param array      $headers HTTP header lines.
 * @return string|false The raw response body when it decodes to non-empty JSON, false otherwise
 *                      (the reason is recorded through setError()).
 */
public function requestComment($uid, $url, $headers) {
    $res = $this->getRequest($url, $headers, 30, true);

    // Handle transport failures first so that string functions never run on a false/empty body.
    if (empty($res)) {
        if ($this->curl_err) {
            $this->getSpiderErrors();
            $this->setError($this->curl_err);
        }
        return false;
    }

    // The login page title in the body means the cookie is no longer accepted.
    // Compare against false explicitly: strpos() returns 0 for a match at the very start.
    if (strpos($res, '登录 - 微博') !== false) {
        $this->setError('cookie失效');
        return false;
    }

    $data = json_decode($res, true);
    if (!$data) {
        // Log the decoder's message instead of the falsy decode result, which carried no
        // information and triggered an "Array to string conversion" warning for "{}" / "[]".
        $this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '请求失败:'.$res.'|'.$uid.'|'.$url.'|'.json_last_error_msg());
        $this->setError('请求为空');
        return false;
    }

    return $res;
}
}

4
model/mWeibo.php

@ -14,7 +14,7 @@ class mWeibo extends mBase {
$this->tbl = 'spider_weibo';
}
public function getWeiboList($condition, $page_num, $page_size) {
public function getWeiboList($condition, $page_num, $page_size, $is_all_data = true) {
$offset = ($page_num - 1) * $page_size;
$where = "1=1 ";
@ -32,6 +32,8 @@ class mWeibo extends mBase {
$list = $this->obj->selectAll($this->tbl, array('sql' => $where, 'vals' => array()), 'created_at desc', array($offset, $page_size));
if (empty($list)) return array();
if(!$is_all_data) return $list;
foreach ($list as &$v) {
$v['name'] = '微博粉丝';
if ($v['uid'] == 2282201403) $v['name'] = '张宝旬';

26
model/mWeiboComments.php

@ -24,14 +24,9 @@ class mWeiboComments extends mBase {
return $this->obj->select($this->tbl, array('sql' => '`weibo_data_id`=?', 'vals' => array($weibo_data_id)));
}
public function isNeedInsertData($weibo_id, $max_weibo_data_id) {
$max_weibo_data_id_info = $this->getCommentByWeiboId($max_weibo_data_id, 1, 1);
if(empty($max_weibo_data_id_info)) return true;
$weibo_info = $this->getCommentByWeiboId($weibo_id, 1, 1);
if(empty($weibo_info)) return true;
if($max_weibo_data_id <= $weibo_info[0]['weibo_data_id']) return false;
public function isNeedInsertData($weibo_id, $max_weibo_data_id, $comment_count) {
$max_weibo_data_id_info = $this->getCommentByWeiboId($max_weibo_data_id, 1, 1, 'id desc');
if(empty($max_weibo_data_id_info) || $max_weibo_data_id_info['spider_comment_status']) return true;
return true;
}
@ -76,7 +71,7 @@ class mWeiboComments extends mBase {
if(!is_dir($dir)) {
$this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '创建目录失败:'.$source_json_save_path);
continue;
return false;
}
if(file_exists($source_json_save_path)) continue;
@ -84,17 +79,16 @@ class mWeiboComments extends mBase {
file_put_contents($source_json_save_path, json_encode($comment), LOCK_EX);
if(!file_exists($source_json_save_path) || filesize($source_json_save_path) < 10) {
$this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '评论保存到文件失败:'.$source_json_save_path);
unset($source_data['data'][$key]);
continue;
return false;
}
chmod($source_json_save_path, 0755);
chown($source_json_save_path, 'nobody');
chgrp($source_json_save_path, 'nobody');
}
if(empty($source_data['data'])) return true;
$sqls = array();
$max_id = 0;
foreach($source_data['data'] as $comment) {
$content = $comment['text'];
$weibo_data_id = $comment['id']+0;
@ -104,6 +98,7 @@ class mWeiboComments extends mBase {
'sql' => 'insert into '.$this->tbl.' (`weibo_id`, `weibo_data_id`, `content`, `comment_time`) values (?, ?, ?, ?) ON DUPLICATE KEY UPDATE `weibo_data_id`=?',
'vals' => array($weibo_id, $weibo_data_id, $content, $comment_time, $weibo_data_id),
);
$max_id = $weibo_data_id - 1;
}
$res = $this->obj->execTrans2($sqls);
@ -113,7 +108,12 @@ class mWeiboComments extends mBase {
return false;
}
return true;
$rdata['total_number'] = $source_data['total_number'];
$rdata['max_id'] = $max_id;
$rdata['weibo_id'] = $weibo_id;
$rdata['is_load_all'] = count($source_data['data']) < 20 ? true : false;
return $rdata;
}
}

5
queue/config/daemonconf.php

@ -1,14 +1,18 @@
<?php
// Redis cache keys for the proxy currently in use.
// presumably _RC_SPIDER_USE_IP serves the weibo-behavior crawler — TODO confirm against its consumers
define('_RC_SPIDER_USE_IP', 'rc_spider_use_ip');
// Read by the comment worker, which JSON-decodes the value and uses its 'ip' and 'port' entries.
define('_RC_SPIDER_WEIBO_COMMENT_USE_IP', 'rc_spider_weibo_comment_use_ip');
// Daemon identifiers used as keys in the $GLOBALS['DAEMON_*'] tables below.
define('SPIDER_WEIBO_BEHAVIOR', 'spider_weibo_behavior');
define('SPIDER_WEIBO_COMMENT', 'spider_weibo_comment');
// Per-daemon time limit. NOTE(review): presumably a maximum run time in seconds — confirm in the daemon runner.
$GLOBALS['DAEMON_MAXTIME'] = array(
SPIDER_WEIBO_BEHAVIOR => 7200,
SPIDER_WEIBO_COMMENT => 7200,
);
// Daemon identifier => worker script file name.
$GLOBALS['DAEMON_LIST'] = array(
SPIDER_WEIBO_BEHAVIOR => 'get_weibo_behavior.php',
SPIDER_WEIBO_COMMENT => 'spider_weibo_comment.php',
);
$GLOBALS['DAEMON_LIST_DESC'] = array(
@ -16,4 +20,5 @@ $GLOBALS['DAEMON_LIST_DESC'] = array(
// Per-daemon process count; the comment worker copies its entry into $this->processnum in setPara().
$GLOBALS['DAEMON_NUMLIMIT'] = array(
SPIDER_WEIBO_BEHAVIOR => 5,
SPIDER_WEIBO_COMMENT => 2,
);

38
queue/crontab/add_spider_weibo_comment_task.php

@ -0,0 +1,38 @@
<?php
/**
* 抓取微博评论任务
* @package crontab
*/
include_once(dirname(dirname(dirname(__FILE__))) . "/library/publicBase.php");
/**
 * Cron job that queues comment-crawl tasks for weibos whose comments have not been crawled yet.
 *
 * Each task is a JSON object {"weibo_id": <id>, "max_id": 0} pushed onto the Redis list
 * _RQ_SPIDER_WEIBO_COMMENT; the Redis set _RS_SPIDER_WEIBO_COMMENT remembers which ids were queued.
 */
class addSpiderWeiboCommentTask {
    /**
     * Runs the whole job at construction time.
     */
    public function __construct() {
        $per_page = 500;
        $filter = array(
            'uid' => WEIBO_USER_ZHANG,
            'spider_comment_status' => SPIDER_COMMENT_STATUS_NO,
        );

        $weibo_model = new mWeibo();
        $redis = $weibo_model->initRedis();

        // Do nothing while more than 50 tasks are still waiting in the queue.
        $pending = $redis->llen(_RQ_SPIDER_WEIBO_COMMENT) + 0;
        if ($pending > 50) {
            return true;
        }

        // Only the first page is processed per run.
        for ($page = 1; $page <= 1; $page++) {
            $rows = $weibo_model->getWeiboList($filter, $page, $per_page);
            if (empty($rows)) {
                break;
            }

            foreach ($rows as $row) {
                $id = $row['id'];

                // Skip weibos that already have a task in flight.
                if ($redis->sIsMember(_RS_SPIDER_WEIBO_COMMENT, $id)) {
                    continue;
                }

                $task = array('weibo_id' => $id, 'max_id' => 0);
                $redis->lpush(_RQ_SPIDER_WEIBO_COMMENT, json_encode($task));
                $redis->sAdd(_RS_SPIDER_WEIBO_COMMENT, $id);
            }
        }

        return true;
    }
}

new addSpiderWeiboCommentTask();

103
queue/deal/spider_weibo_comment.php

@ -0,0 +1,103 @@
<?php
include_once dirname(dirname(__FILE__)).'/base/dealBase.php';
define('PROC_CODE', $argv[2]);
/**
 * Queue worker for comment crawling.
 *
 * Each deal() call pops one task from the Redis list _RQ_SPIDER_WEIBO_COMMENT, fetches one page
 * of comments through the shared proxy, stores it via mWeiboComments::addComment() and then either
 * re-queues the follow-up page or marks the weibo as successfully crawled.
 */
class spiderWeiboComment extends dealBase {
    /** @var string Redis list the worker pops tasks from and pushes retries / follow-ups onto. */
    private $task_key = _RQ_SPIDER_WEIBO_COMMENT;

    /**
     * Configure the daemon: process count from DAEMON_NUMLIMIT and loop mode on.
     * NOTE(review): how processnum / is_while are interpreted is defined in dealBase — confirm there.
     */
    public function setPara() {
        $this->processnum = $GLOBALS['DAEMON_NUMLIMIT'][SPIDER_WEIBO_COMMENT];
        $this->is_while = true;
    }

    /**
     * Process one queued task.
     *
     * @return bool true when a page was stored (and re-queued or finished), false when the
     *              iteration was aborted (no proxy, request failure, storage failure).
     */
    public function deal() {
        $obj = new mSpider();
        $rdobj = $obj->initRedis();

        // Proxy shared between worker processes; only process 0 requests a new one.
        $ipinfo = $rdobj->get(_RC_SPIDER_WEIBO_COMMENT_USE_IP);
        $ipdata = json_decode($ipinfo, true);
        if (!$ipinfo && PROC_CODE == 0 && ($rdobj->llen($this->task_key) + 0 > 0 || !$this->is_while)) {
            $ipdata = $obj->getZmhttpIp(_RC_SPIDER_WEIBO_COMMENT_USE_IP);
        }
        if (empty($ipdata)) {
            sleep(2);
            return false;
        }
        $obj->proxy_ip = $ipdata['ip'];
        $obj->proxy_port = $ipdata['port'];

        $raw_task = $rdobj->rpop($this->task_key);
        if (empty($raw_task)) exit();

        $rq_data = json_decode($raw_task, true);
        // isset() keeps malformed payloads (null, scalars) from raising offset warnings.
        $weibo_id = isset($rq_data['weibo_id']) ? $rq_data['weibo_id'] + 0 : 0;
        $max_id = isset($rq_data['max_id']) ? $rq_data['max_id'] + 0 : 0;
        if ($weibo_id <= 0 || $max_id < 0) {
            // Log the payload exactly as it was popped; the previous code called json_decode()
            // on the already-decoded value, which cannot produce a loggable string.
            $obj->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, 'queue:数据格式不正确:'.$raw_task);
            exit;
        }

        $baselog = $weibo_id . "|" . $max_id . "|" . $ipdata['ip'] . "|" . $ipdata['port'] . "|" . PROC_CODE . "|";

        $wobj = new mWeibo();
        $weibo_info = $wobj->getWeiboById($weibo_id);
        if (empty($weibo_info)) {
            $obj->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '微博信息不存在:'.$baselog);
            exit;
        }

        // A cookie stored in Redis overrides the hard-coded fallback.
        $cookie = $rdobj->get(_RC_WEIBO_LOGIN_COOKIE2);
        if (empty($cookie)) $cookie = WEIBO_LOGIN_COOKIE2;

        $res = $obj->spiderComments($cookie, $weibo_info['uid'], $weibo_info['wid'], $weibo_info['mblogid'], $max_id);
        if (!$res) {
            $error = $obj->getError();
            $obj->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '队列评论抓取失败,重试中:'.$error.'|'.$baselog);
            if ($error == 'cookie失效') {
                // Nothing can succeed until the cookie is replaced: notify, drop the queue and the
                // de-duplication set, and stop this worker.
                $obj->sendMail(array('1432334894@qq.com'), date('Y年m月d日', time()) . '-微博评论数据抓取', $error);
                $rdobj->del($this->task_key);
                $rdobj->del(_RS_SPIDER_WEIBO_COMMENT);
                exit;
            }
            // Put the task back for a later retry; process 0 drops the proxy when it was flagged as bad.
            $rdobj->lpush($this->task_key, json_encode($rq_data));
            if ($obj->is_change_ip && PROC_CODE == 0) $rdobj->del(_RC_SPIDER_WEIBO_COMMENT_USE_IP);
            sleep(30);
            return false;
        }

        $cobj = new mWeiboComments();
        $res = $cobj->addComment($weibo_id, $res);
        if (!$res) {
            $error = $cobj->getError().'|评论json数据保存到数据库失败~';
            $obj->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, $baselog.'|'.$error);
            $wobj->updateWeibo($weibo_id, array('spider_comment_status' => SPIDER_COMMENT_STATUS_ERROR));
            return false;
        }

        // More pages remain: queue the follow-up task built by addComment().
        // NOTE(review): addComment() appears able to return a bare true when there is nothing to
        // insert — confirm. Only an array result can describe a follow-up page; anything else is
        // treated as "finished" instead of pushing the JSON literal `true` onto the queue.
        if (is_array($res) && empty($res['is_load_all'])) {
            $rdobj->lpush($this->task_key, json_encode($res));
            return true;
        }

        $res = $wobj->updateWeibo($weibo_id, array('spider_comment_status' => SPIDER_COMMENT_STATUS_SUCCESS));
        if (!$res) {
            $error = $wobj->getError();
            $obj->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '更新微博状态失败:'.$error.'|'.$baselog);
            sleep(2);
            return false;
        }

        $obj->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '抓取成功:'.$baselog);
        // The weibo is done; release its id from the de-duplication set.
        $rdobj->sRem(_RS_SPIDER_WEIBO_COMMENT, $weibo_id);
        sleep(3);
        return true;
    }
}

new spiderWeiboComment();

8
tools/filter_comments.php

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save