Browse Source

评论抓取模型层方法

xmz_weibo_demo
longchao 3 days ago
parent
commit
dec4d72ebb
  1. 5
      config/define.php
  2. 39
      data/dBase.php
  3. 23
      data/dWeiboComments.php
  4. 145
      model/mWeiboComments.php
  5. 7
      tools/filter_comments.php

5
config/define.php

@ -78,3 +78,8 @@
// Behavior type identifiers.
define('BEHAVIOR_TYPE_COMMENTS', 1);
define('BEHAVIOR_TYPE_ATTITUDES', 2);
// Log directory name and log file name used by the comment spider.
define('ZHISHIKU_SPIDER_LOG', 'zhishiku_spider_log');
define('ZHISHIKU_SPIDER_COMMENT', 'zhishiku_spider_comment.log');
// Root directory for raw spider payloads; note that it already ends with a slash.
define('ZHISHIKU_SPIDER_TEMP_PATH', DATACENTER_ROOT.'/zhishiku_spider_temp/');
// sprintf() template for one stored comment: two %s placeholders, directory name then file name.
// No leading slash here: ZHISHIKU_SPIDER_TEMP_PATH supplies the separator, which avoids a "//" in every path.
define('ZHISHIKU_SPIDER_COMMENT_PATH', ZHISHIKU_SPIDER_TEMP_PATH.'spider_comment/%s/%s.log');

39
data/dBase.php

@ -68,7 +68,7 @@ class dBase extends publicBase {
try {
$conn[PDO::ATTR_TIMEOUT] = 3;
if($GLOBALS['pconnect_db']===true) $conn[PDO::ATTR_PERSISTENT] = true;
$conn[PDO::MYSQL_ATTR_INIT_COMMAND] = "SET NAMES UTF8;";
$conn[PDO::MYSQL_ATTR_INIT_COMMAND] = "SET NAMES utf8mb4;";
$db = new PDO('mysql:host='.$cnf['host'].';port='.$cnf['port'].';dbname='.$cnf['db'],$cnf['user'],$cnf['pwd'], $conn);
} catch(PDOException $e) {
//error_log('['.date('Y-m-d H:i:s').'][first-fail]'.implode('|', $cnf).':'.$e->getMessage()."\n", 3, LOG_PATH_BASE.'/mysql/stat_todb_'.date('Y-m-d').'.log');
@ -76,7 +76,7 @@ class dBase extends publicBase {
try {
$conn[PDO::ATTR_TIMEOUT] = 3;
if($GLOBALS['pconnect_db']===true) $conn[PDO::ATTR_PERSISTENT] = true;
$conn[PDO::MYSQL_ATTR_INIT_COMMAND] = "SET NAMES UTF8;";
$conn[PDO::MYSQL_ATTR_INIT_COMMAND] = "SET NAMES utf8mb4;";
$db = new PDO('mysql:host='.$cnf['host'].';port='.$cnf['port'].';dbname='.$cnf['db'],$cnf['user'],$cnf['pwd'], $conn);
} catch(PDOException $e) {
//error_log('['.date('Y-m-d H:i:s').'][second-fail]'.implode('|', $cnf).':'.$e->getMessage()."\n", 3, LOG_PATH_BASE.'/mysql/stat_todb_'.date('Y-m-d').'.log');
@ -124,6 +124,40 @@ class dBase extends publicBase {
}
}
/**
 * Execute a list of SQL statements inside one database transaction.
 *
 * Statements are prepared and executed in array order. Entries whose 'sql'
 * is empty after trimming are skipped. If any statement fails, or a
 * PDOException is thrown, the transaction is rolled back and false is
 * returned. The target tables must use InnoDB.
 *
 * @param array $sqls list of entries shaped like
 *                    array('sql' => $sql, 'vals' => $vals), where 'vals'
 *                    holds the bound parameters (may be omitted when the
 *                    statement has no placeholders)
 * @return bool true when the transaction was committed, false otherwise
 */
public function execTrans2($sqls) {
    // Flag the instance as master before obtaining the connection.
    $this->ismaster = true;
    self::getInstance();
    try {
        self::$link->beginTransaction();
        foreach ($sqls as $sqlval) {
            if (trim($sqlval['sql']) == '') continue;
            $vals = isset($sqlval['vals']) ? $sqlval['vals'] : array();
            $st = self::$link->prepare($sqlval['sql']);
            $res = $st->execute($vals);
            if (!$res) {
                error_log(date('Y-m-d H:i:s').'|'.$sqlval['sql'].':'.json_encode($st->errorInfo())."\n", 3, LOG_PATH_BASE.'/mysql/execTrans_zhishiku_'.date('Y-m-d').'.log');
                self::$link->rollBack();
                return false;
            }
            $st->closeCursor();
        }
        self::$link->commit();
    } catch (PDOException $e) {
        error_log(date('Y-m-d H:i:s').'|PDOException:'.$e->getMessage()."\n", 3, LOG_PATH_BASE.'/mysql/execTrans_zhishiku_'.date('Y-m-d').'.log');
        // Without this rollback the transaction stays open on the link and
        // the next beginTransaction() on the same connection fails.
        try {
            if (self::$link->inTransaction()) {
                self::$link->rollBack();
            }
        } catch (PDOException $rollbackError) {
            // The connection is unusable; there is nothing left to undo.
        }
        return false;
    }
    return true;
}
/**
* 批量插入
* 封装标准方法
@ -746,6 +780,7 @@ class dBase extends publicBase {
$st = self::$link->prepare($sql);
$res = $st->execute();
if(!$res) {
error_log(date('Y-m-d H:i:s').'|'.$sql.':'.json_encode($st->errorInfo())."\n", 3, LOG_PATH_BASE.'/mysql/execTrans_zhishiku_'.date('Y-m-d').'.log');
self::$link->rollBack();
return false;
}

23
data/dWeiboComments.php

@ -0,0 +1,23 @@
<?php
/**
*
*/
include_once SERVER_ROOT . '/data/dBase.php';
/**
 * Data-layer class for the `spider_weibo_comments` table.
 *
 * It only declares table metadata; all query behavior comes from dBase.
 * NOTE(review): how dBase consumes $fieldlist and $primary_keys is not
 * visible here — confirm against dBase before relying on it.
 */
class dWeiboComments extends dBase
{
    /** Column names, keyed by table name. */
    protected $fieldlist = [
        'spider_weibo_comments' => [
            'id',
            'weibo_id',
            'content',
            'weibo_data_id',
            'is_search',
            'comment_time',
            'create_time',
        ],
    ];

    /** No primary keys are declared for this table. */
    protected $primary_keys = [];
}

145
model/mWeiboComments.php

@ -6,67 +6,114 @@ include_once(SERVER_ROOT . "/model/mBase.php");
/**
 * Model for crawled Weibo comments.
 *
 * Provides lookups on the `spider_weibo_comments` table, stores freshly
 * crawled comment payloads (one JSON file per comment plus a database row),
 * and offers a keyword/regex based sentiment classifier.
 */
class mWeiboComments extends mBase {
    /**
     * Sentiment rules. 'keywords' are matched as plain substrings and
     * 'patterns' as regular expressions. Each keyword is listed once:
     * matching is substring based, so repeated entries add nothing.
     */
    private $config = [
        'positive' => [
            'keywords' => [
                '有效', '好用', '好了', '治愈', '见效', '不错', '管用', '有用', '感恩',
                '灵验', '缓解', '痊愈', '神效', '推荐', '感谢',
                '有效果', '奏效', '改善', '见效快', '神奇'
            ],
            'patterns' => [
                '/(喝|吃|用)了?\s*\d+\s*次?[就]?(好|痊愈|缓解|不咳|不痛)了/u',
                '/(效果|结果)\s*(非常|很)?\s*(好|明显|不错|显著|惊人)/u',
                '/(尝试|试了|用了)\s*\d+\s*[天个]?\s*(就)?(好|痊愈|缓解|见效)/u',
                '/(咳嗽|咳|痛|疼|症状)\s*(明显)?\s*(减轻|缓解|消失|好了)/u',
                '/(感谢|谢谢)\s*(分享|博主|张医生|宝旬)/u'
            ]
        ],
        'negative' => [
            'keywords' => [
                '无效', '没用', '不好', '没效果', '不行', '不见效', '没好转',
                '加重', '恶化', '白费', '失望', '避雷',
                '无效果', '无改善', '没作用', '不灵', '骗人', '忽悠', '上当'
            ],
            'patterns' => [
                '/(还是|依然|仍旧|仍然)\s*(咳|难受|痛|疼|没效果)/u',
                '/(一点|完全|根本|丝毫)\s*没(效果|用|好转|作用|改善)/u',
                '/(不仅|不但)\s*没(好|改善).*(反而|而且)\s*(加重|恶化)/u',
                '/(浪费|白费|白忙|白折腾)\s*(时间|精力)/u',
                '/(失望|后悔|上当|骗人|忽悠|别信)\s*(了|吧|!)/u'
            ]
        ]
    ];

    /** Data-layer object used for every query. */
    private $obj;

    /** Name of the comments table. */
    private $tbl;

    public function __construct() {
        $this->obj = new dWeiboComments();
        $this->tbl = 'spider_weibo_comments';
    }

    /**
     * Normalize a comment before classification: collapse whitespace runs to
     * a single space, drop every character that is not a Han character,
     * punctuation, word character or whitespace, then trim.
     *
     * @param string $comment raw comment text
     * @return string normalized text
     */
    private function preprocess($comment) {
        $comment = preg_replace('/\s+/u', ' ', $comment);
        $comment = preg_replace('/[^\p{Han}\p{P}\w\s]/u', '', $comment);
        return trim($comment);
    }

    /**
     * Classify a comment as 'negative', 'positive' or 'neutral'.
     *
     * Keywords are checked before patterns, and within each kind the
     * negative rules are checked before the positive ones, so that a phrase
     * such as "不见效" is not mistaken for its positive substring "见效".
     *
     * @param string $comment raw comment text
     * @return string 'negative', 'positive' or 'neutral'
     */
    public function analyze($comment) {
        $comment = $this->preprocess($comment);
        foreach (['keywords', 'patterns'] as $kind) {
            foreach (['negative', 'positive'] as $label) {
                foreach ($this->config[$label][$kind] as $rule) {
                    if ($kind === 'keywords') {
                        $hit = (mb_strpos($comment, $rule) !== false);
                    } else {
                        $hit = preg_match($rule, $comment);
                    }
                    if ($hit) return $label;
                }
            }
        }
        return 'neutral';
    }

    /**
     * Fetch the stored comments of one weibo.
     *
     * Paging is applied only when both $page and $limit are positive.
     *
     * @param mixed  $weibo_id value matched against the `weibo_id` column
     * @param int    $page     1-based page number, 0 disables paging
     * @param int    $limit    page size, 0 disables paging
     * @param string $order    order expression handed to the data layer unchanged;
     *                         NOTE(review): presumably ends up in the SQL text,
     *                         so it must never come from untrusted input — confirm in dBase
     * @return mixed whatever dBase::selectAll() returns (indexed as a list of rows by isNeedInsertData())
     */
    public function getCommentByWeiboId($weibo_id, $page = 0, $limit = 0, $order = 'id asc') {
        $limit_info = ($page > 0 && $limit > 0) ? [($page - 1) * $limit, $limit] : [];
        $where = ['sql' => '`weibo_id`=?', 'vals' => [$weibo_id]];
        return $this->obj->selectAll($this->tbl, $where, $order, $limit_info);
    }

    /**
     * Fetch a stored comment by the id it carries in the crawled payload.
     *
     * @param mixed $weibo_data_id value matched against the `weibo_data_id` column
     * @return mixed whatever dBase::select() returns
     */
    public function getCommentByWeiboDataId($weibo_data_id) {
        $where = ['sql' => '`weibo_data_id`=?', 'vals' => [$weibo_data_id]];
        return $this->obj->select($this->tbl, $where);
    }

    /**
     * Decide whether newly crawled comments still need to be inserted.
     *
     * @param mixed $weibo_id          weibo whose comments are being crawled
     * @param mixed $max_weibo_data_id largest comment id seen in the crawl
     * @return bool false only when a stored row for $weibo_id already has a
     *              `weibo_data_id` at or above $max_weibo_data_id
     */
    public function isNeedInsertData($weibo_id, $max_weibo_data_id) {
        // NOTE(review): this hands a comment id to a lookup keyed on `weibo_id`;
        // getCommentByWeiboDataId() may have been intended — confirm before changing.
        $max_weibo_data_id_info = $this->getCommentByWeiboId($max_weibo_data_id, 1, 1);
        if (empty($max_weibo_data_id_info)) return true;

        // Default order is 'id asc', so this is the lowest-id stored row.
        $weibo_info = $this->getCommentByWeiboId($weibo_id, 1, 1);
        if (empty($weibo_info)) return true;

        return !($max_weibo_data_id <= $weibo_info[0]['weibo_data_id']);
    }

    /**
     * Store one crawled page of comments for a weibo.
     *
     * Each comment is first written as JSON to its own file (path built from
     * ZHISHIKU_SPIDER_COMMENT_PATH, the weibo's `wid` and the comment id),
     * then all comments are upserted into the comments table in a single
     * transaction. A comment whose file could not be written is left out of
     * the database batch.
     *
     * @param mixed  $weibo_id    id passed to mWeibo::getWeiboById()
     * @param string $source_json raw crawler response; expected to decode to
     *                            an array with 'ok' == 1 and a 'data' list whose
     *                            items carry 'id', 'text' and 'created_at'
     * @return bool false when the weibo is unknown, the payload is not a
     *              successful response, or the database write fails; true otherwise
     */
    public function addComment($weibo_id, $source_json) {
        $weibo_model = new mWeibo();
        $weibo_info = $weibo_model->getWeiboById($weibo_id);
        if (empty($weibo_info)) {
            $this->setError('微博不存在');
            $this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '微博不存在:'.$weibo_id);
            return false;
        }

        // json_decode() yields null on malformed input, so check the shape first.
        $source_data = json_decode($source_json, true);
        if (!is_array($source_data) || !isset($source_data['ok']) || $source_data['ok'] != 1) {
            $this->setError('抓取失败');
            $this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '抓取评论失败:'.$source_json);
            return false;
        }

        $this->ensureDirectory(ZHISHIKU_SPIDER_TEMP_PATH);

        $comments = (isset($source_data['data']) && is_array($source_data['data'])) ? $source_data['data'] : [];
        foreach ($comments as $key => $comment) {
            $weibo_data_id = $comment['id'] + 0;
            $source_json_save_path = sprintf(ZHISHIKU_SPIDER_COMMENT_PATH, $weibo_info['wid'], $weibo_data_id);

            // Create the shared parent first, then the per-weibo directory.
            $dir = dirname($source_json_save_path);
            $this->ensureDirectory(dirname($dir));
            if (!$this->ensureDirectory($dir)) {
                // NOTE(review): the comment stays in the database batch even though
                // no file was written; only a failed write below removes it — confirm intent.
                $this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '创建目录失败:'.$source_json_save_path);
                continue;
            }

            // Already archived by an earlier run.
            if (file_exists($source_json_save_path)) continue;

            $written = file_put_contents($source_json_save_path, json_encode($comment), LOCK_EX);
            if ($written === false || $written < 10) {
                $this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '评论保存到文件失败:'.$source_json_save_path);
                unset($comments[$key]);
                continue;
            }
            chmod($source_json_save_path, 0755);
            chown($source_json_save_path, 'nobody');
            chgrp($source_json_save_path, 'nobody');
        }

        if (empty($comments)) return true;

        $sqls = [];
        foreach ($comments as $comment) {
            $weibo_data_id = $comment['id'] + 0;
            $comment_time = date('Y-m-d H:i:s', strtotime($comment['created_at']));
            $sqls[] = [
                'sql' => 'insert into '.$this->tbl.' (`weibo_id`, `weibo_data_id`, `content`, `comment_time`) values (?, ?, ?, ?) ON DUPLICATE KEY UPDATE `weibo_data_id`=?',
                'vals' => [$weibo_id, $weibo_data_id, $comment['text'], $comment_time, $weibo_data_id],
            ];
        }

        if (!$this->obj->execTrans2($sqls)) {
            $this->setError('保存评论失败');
            $this->writeLog(ZHISHIKU_SPIDER_LOG, ZHISHIKU_SPIDER_COMMENT, '保存评论失败:'.json_encode($sqls));
            return false;
        }
        return true;
    }

    /**
     * Create $dir (mode 0755, recursively) when it is missing and hand it to
     * user and group 'nobody'.
     *
     * @param string $dir directory path
     * @return bool true when the directory exists afterwards
     */
    private function ensureDirectory($dir) {
        // Change ownership only for a directory this call actually created.
        if (!is_dir($dir) && mkdir($dir, 0755, true)) {
            chown($dir, 'nobody');
            chgrp($dir, 'nobody');
        }
        return is_dir($dir);
    }
}

7
tools/filter_comments.php

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save