评论筛选python脚本

6 months ago · e15eea2be8
1 changed files with 92 additions and 143 deletions
--- a/tools/filter_comments.py
+++ b/tools/filter_comments.py
@@ -1,11 +1,43 @@
 # -*- coding: utf-8 -*-
 from snownlp import seg
 from snownlp import SnowNLP
+import pymysql
+import configparser
+import os
+
+def get_db_config():
+    """Load the MySQL connection settings from ``../config/database.ini``.
+
+    The path is resolved relative to this script's directory. Values are
+    read from the ``simplyphp`` section and every double-quote character is
+    removed from them (presumably because the ini values are quoted).
+
+    Returns:
+        dict: Keyword arguments for ``pymysql.connect`` with the keys
+        ``host``, ``user``, ``password``, ``database`` and ``charset``.
+
+    Raises:
+        FileNotFoundError: If the ini file is missing or cannot be read.
+        configparser.Error: If the section or one of its options is missing.
+    """
+    config_path = os.path.join(os.path.dirname(__file__), '../config/database.ini')
+    config = configparser.ConfigParser()
+    # ConfigParser.read() silently skips files it cannot open and returns the
+    # list of files it actually parsed; check it so a missing file fails with
+    # a clear message instead of a NoSectionError further down.
+    if not config.read(config_path):
+        raise FileNotFoundError(
+            f'Database config file not found or unreadable: {config_path}'
+        )
+
+    def option(name):
+        # Read one option from the [simplyphp] section, dropping double quotes.
+        return config.get('simplyphp', name).replace('"', '')
+
+    return {
+        'host': option('master'),
+        'user': option('user'),
+        'password': option('passwd'),
+        'database': option('db'),
+        'charset': 'utf8mb4',
+    }
+
+# Database connection settings, loaded once when this module is executed
+DB_CONFIG = get_db_config()
+
+def get_comments_from_db(limit=1000):
+    """Fetch a batch of Weibo comments whose ``is_search`` is -1.
+
+    Args:
+        limit: Maximum number of rows to fetch. Defaults to 1000, the batch
+            size that used to be hard-coded in the query.
+
+    Returns:
+        The rows returned by ``cursor.fetchall()``; because a ``DictCursor``
+        is used, each row is a dict with the keys ``id`` and ``content``.
+    """
+    connection = pymysql.connect(**DB_CONFIG)
+    try:
+        with connection.cursor(pymysql.cursors.DictCursor) as cursor:
+            sql = (
+                "SELECT id,content FROM spider_weibo_comments "
+                "where is_search=-1 limit 0,%s"
+            )
+            # int() makes sure the bound value is rendered as a bare number,
+            # which is what the LIMIT clause requires.
+            cursor.execute(sql, (int(limit),))
+            return cursor.fetchall()
+    finally:
+        # Always release the connection, even if the query raises.
+        connection.close()

 #pip install snownlp
 def filter_medical_comments(comments_list):
    filtered_comments = []
-
+    update_records = []
    positive_keywords = [
        '有效', '好用', '管用', '有用', '效果好', '见效', '有效果', '显著', '明显',
        '改善', '缓解', '康复', '痊愈', '立竿见影', '有奇效', '灵验', '奏效',
@@ -25,7 +57,7 @@ def filter_medical_comments(comments_list):
        '非常明显', '特别显著', '极度舒适', '超级满意', '无比惊喜', '彻底解决',
        '完全康复', '根本改善', '质的飞跃', '翻天覆地', '脱胎换骨', '焕然一新',

-        '解决了', '治好了', '好多了', '舒服多了', '舒缓了', '减轻了', '消失',
+        '解决了', '治好了', '好多了', '舒服多了', '舒缓了', '减轻了', '消失', '好了',
        '根除', '治愈', '解救', '化解', '战胜', '摆脱', '修复', '解救', '消除',
        '消退', '痊愈', '康复', '愈合', '好转', '恢复', '根治', '清除', '驱散',
        '击退', '控制', '抑制', '止住', '缓解', '舒缓', '镇定', '安抚', '平复',
@@ -341,17 +373,20 @@ def filter_medical_comments(comments_list):
            return True
        return False

-    for comment in comments_list:
-        comment = comment.replace('#张宝旬妙招#', '')
+    for commentItem in comments_list:
+        comment_id = commentItem['id']
+        comment = commentItem['content'].replace('#张宝旬妙招#', '')
+
        if is_question(comment):
+            update_records.append((0, comment_id))
            continue

        s = SnowNLP(comment)
        sentiment_score = s.sentiments

-        print(comment)
-        print(sentiment_score)
-        print('*'*100)
+        # print(comment)
+        # print(sentiment_score)
+        # print('*'*100)

        contains_positive = any(keyword in comment for keyword in positive_keywords)
        contains_negative = any(keyword in comment for keyword in negative_keywords)
@@ -359,145 +394,59 @@ def filter_medical_comments(comments_list):

        if (contains_medical and (contains_positive or contains_negative or sentiment_score > 0.7 or sentiment_score < 0.3)) or ((contains_positive or contains_negative) and (sentiment_score > 0.7 or sentiment_score < 0.3)):
        # if contains_positive or contains_negative or sentiment_score > 0.7 or sentiment_score < 0.3:
-            filtered_comments.append({
-                'comment': comment,
-                'sentiment': sentiment_score,
-                'is_positive': contains_positive or sentiment_score > 0.7,
-                'is_negative': contains_negative or sentiment_score < 0.3,
-                # 'is_positive': sentiment_score > 0.5,
-                # 'is_negative': sentiment_score < 0.3,
-                # 'has_medical_reference': contains_medical
-            })
+            # filtered_comments.append({
+            #     'comment': comment,
+            #     'sentiment': sentiment_score,
+            #     'is_positive': contains_positive or sentiment_score > 0.7,
+            #     'is_negative': contains_negative or sentiment_score < 0.3,
+            #     # 'is_positive': sentiment_score > 0.5,
+            #     # 'is_negative': sentiment_score < 0.3,
+            #     # 'has_medical_reference': contains_medical
+            # })
+            update_records.append((1, comment_id))
+        else:
+            update_records.append((0, comment_id))
+
+    update_database(update_records)

    return filtered_comments

-def classify_comment_effectiveness(comment):
-    """
-    判断单条评论是否在讨论方法的有效性
-
-    Args:
-        comment: 评论文本
-
-    Returns:
-        dict: 包含评论分类结果的字典
-    """
-    s = SnowNLP(comment)
-    words = s.words
-
-    # 定义有效性相关的词汇
-    effectiveness_words = ['有效', '无效', '好用', '不好用', '管用', '不管用', '有用', '没用',
-                          '效果好', '效果差', '见效', '不见效', '有效果', '没效果']
-
-    # 检查评论中是否包含有效性相关词汇
-    discussing_effectiveness = any(word in effectiveness_words for word in words)
-
-    return {
-        'comment': comment,
-        'sentiment': s.sentiments,
-        'discussing_effectiveness': discussing_effectiveness,
-        'is_positive': s.sentiments > 0.6,
-        'is_negative': s.sentiments < 0.4
-    }
-
-def demo():
-    """示例使用"""
-    comments = [
-        "这个方法真的很有效，我用了三天好了！",
-        "对我来说作用不大",
-        "谢谢博主分享的小妙招，确实管用",
-        "这个偏方我试过了，没什么效果",
-        "今天天气真好，我去公园散步了",
-        "中医调理方法不错，但需要坚持",
-        "这个秘方太神奇了，我妈妈用了效果特别好",
-        "我觉得这个治疗方法完全是骗人的",
-        "博主推荐的食疗方子我试了，感觉身体好多了",
-        "个人认为，相信和那种专注的意念能发动到大能量的。想信看不到的力量",
-        "挖出的梨肉不要了吗？张老师",
-        "张老师，讲讲肝硬度偏高",
-        "四月天有次吹了冷风，第二天咽炎发作，喉咙处动不动奇痒无比，每次都会干呕，中午碰巧吃了鱼你在一起的青花椒的鱼，心想明天可能连话都说不出来，没想到晚上就不痒了，大家有这种情况的可以试试",
-        "原来里面还要加水的，我一直都是拿个碗盛着，🍐在中间放点花椒就蒸，我以为就是喝那个蒸出来的汁水",
-        "嗓子疼，花椒蒸梨",
-        "一次做好一件事很容易，难的是坚持去做好每一件事。一切成就的取得，都离不开持续的努力。所以，既要有做好一件事的决心，也要有不断努力的耐心。",
-        "宝藏老师这梨挖得是真好啊肉能挖到这么薄，我只能挖出中间的芯，再往外挖，大多都得挖穿洞了，咱没这技术，意思到了就行了，但我会把里面的梨肉戳一戳，尽量让梨汁释放多一些",
-        "早晨突然嗓子疼，刚按完少商和大鱼际稍微缓解了一下，一打开微博就看见张老师发的花椒梨，晚上吃起来，五虎汤晚上也喝起来",
-        "刀片嗓，#张宝旬妙招# 嗓子疼，花椒蒸梨",
-        "各位大神，白头发越来越多有什么好的方法嘛",
-        "这个好",
-        "转发微博",
-        "刀片嗓 花椒蒸梨",
-        "#张宝旬妙招# 存着",
-        "👍👍👍",
-        "感谢张医生",
-        "收藏",
-        "养生 食疗 花椒梨 咽炎 嗓子",
-        "刀片桑，嗓子疼。用花椒蒸梨",
-        "喝起来",
-        "有空可以学着做做！",
-        "刀片嗓",
-        "谢谢分享",
-        "明天我也来蒸起吃，相信张医生",
-        "什么效果",
-        "原来要加水，一直冰糖加花椒",
-        "真实用的妙招",
-        "@我的印象笔记",
-        "这个梨挖的真专业啊，有没有开挖梨课 我老是挖裂 然后一煮芝麻全部漏出来",
-        "👍",
-        "收藏",
-        "花椒蒸梨",
-        "转发微博",
-        "转发微博",
-        "收藏",
-        "大家这行动力真不是盖的！我就知道转转转！嗓子一有情况少商或三商挨着都掐   再不行就什么关冲啥的都挨一遍！",
-        "感谢张老师分享",
-        "挖掉中间的核就可以了，不要放白糖，放冰糖、花椒。然后拿三个牙签扎起来。这个是甘肃兰州的传统冬季小吃，叫＂热冬果＂。兰州市中心还有这么一个塑像呢：老爷爷挑着担子在街头卖热冬果。我们小时候很多家冬天炉子上都煨着这么一锅热冬果。",
-        "花椒蒸梨有什么保健作用",
-        "哇塞",
-        "我以前看个偏方喝花椒水减肥，我喝了一口差点把自己送走，几粒花椒劲特别大，嗓子一下堵住了，上不来气，在地上跳脚，那种绝望的感觉，记忆犹新。",
-        "以后多来点图文",
-        "👍🏻👍🏻",
-        "转发微博",
-        "转发微博",
-        "去年冬天寒咳，直接切梨加花椒加少许水蒸半个小时，喝几次就好了",
-        "消炎？",
-        "老师  梨肉呢？",
-        "看着都舒服",
-        "👍",
-        "刀片嗓  花椒蒸梨",
-        "放黑豆一起蒸 可以治疗喉咙发炎疼，立竿见影",
-        "好用",
-        "老师，用嗓过度也可以用这个方法吗？",
-        "张老师，小孩咽炎怎么办呀",
-        "这个图示好",
-        "这个季节买的梨肉外表看起来好的切开就是黄的了可以吃吗换了好几家都是这样。刀片嗓，水泥鼻子不停的有鼻涕黄痰。",
-        "谢谢宝医！",
-        "转发微博",
-        "谢谢",
-        "转发微博",
-        "转",
-        "转发收藏",
-        "老师的洞挖得忒大啊",
-        "又学一招",
-        "小妙招大用途",
-        "学到了",
-        "想问问玫瑰痤疮 酒渣鼻怎么治啊很困扰",
-        "牛",
-        "这个季节买的梨肉外表看起来好的切开就是黄的了可以吃吗换了好几家都是这样。刀片嗓，水泥鼻子不停的有鼻涕黄痰。"
-        "感谢张老师分享",
-        "早晨突然嗓子疼，刚按完少商和大鱼际稍微缓解了一下，一打开微博就看见张老师发的花椒梨，晚上吃起来，五虎汤晚上也喝起来",
-        "刚炒菜时手被热油烫到发红，一边冲冷水一边拿白糖往疼的地方撒，刚开始有些刺疼，接着感觉有点烧烧的，一个多小时后把白糖洗掉啥事了[赞][赞]想起之前被热水烫到冲冷水几个小时，敷烫伤膏也还是疼的睡不着，这次真是全程很顺利了"
-        "我每天晚上泡完脚就是做这个动作，然后勾脚尖绷脚尖，入睡快，半夜也很少醒",
-        "昨晚就是那样做，很快就睡着了"
-    ]
-
-    filtered = filter_medical_comments(comments)
-    print(f"筛选出 {len(filtered)} 条相关评论:")
-    for item in filtered:
-        print(f"评论: {item['comment']}")
-        print(f"情感得分: {item['sentiment']:.2f}")
-        print(f"是否正面: {item['is_positive']}")
-        print(f"是否负面: {item['is_negative']}")
-        print("-" * 50)
+def update_database(records):
+    """Write ``is_search`` values back to ``spider_weibo_comments``.
+
+    Args:
+        records: Sequence of ``(is_search, id)`` tuples; each one sets
+            ``is_search`` on the row with the given ``id``.
+
+    All updates are issued through one ``executemany`` call and committed
+    together. If anything raises before the commit, the connection is closed
+    without committing.
+    """
+    if not records:
+        # Nothing to write: do not open a database connection for no work.
+        return
+
+    connection = pymysql.connect(**DB_CONFIG)
+    try:
+        with connection.cursor() as cursor:
+            sql = "UPDATE spider_weibo_comments SET is_search = %s WHERE id = %s"
+            cursor.executemany(sql, records)
+        connection.commit()
+    finally:
+        connection.close()
+
+def domain():
+    """Process all pending comments, one batch at a time.
+
+    Repeatedly fetches a batch with ``get_comments_from_db`` and hands it to
+    ``filter_medical_comments`` (which writes the results back to the
+    database), until a batch smaller than the page size comes back. Returns
+    normally once the last batch has been handled.
+    """
+    page_size = 1000  # must match the batch size fetched by get_comments_from_db()
+
+    # Loop instead of recursing: one stack frame per batch would hit Python's
+    # recursion limit on a large backlog.
+    while True:
+        contentdata = get_comments_from_db()
+        length = len(contentdata)
+
+        if length >= page_size:
+            print(f'共{length}条数据')
+        if length > 0:
+            filter_medical_comments(contentdata)
+
+        if length < page_size:
+            # A short (or empty) batch means there is nothing left to fetch.
+            print(f'已经是最后一页，没有数据了:{length}')
+            return

 if __name__ == "__main__":
-    demo()
+    domain()