# -*- coding: utf-8 -*-
# @Time: 18:01
# @Author: ZYP
# @File: SearchSimPaper.py
# @mail: zypsunshine1@gmail.com
# @Software: PyCharm

# =========================================================================================
# Find similar documents.
#   1. Intersect the submitted document's keywords with the per-class inverted tables.
#   2. Compute keyword-level similarity between each selected candidate and the
#      submitted document.
#   3. Pick the most similar documents, rank them, and return them.
# =========================================================================================

import gc
import math
import time
from collections import defaultdict

import numpy as np
from pymysql.converters import escape_string
from sklearn.metrics.pairwise import cosine_similarity

from util import cut_text, l2_normal, get_word_vec


def load_inverted_table(class_list, mysql, log):
    """Aggregate the inverted tables of the predicted classes.

    Returns the merged inverted table {word: 'doc_id1,doc_id2,...'} and the total
    number of papers stored under those classes.
    """
    total_inverted_dict1 = {}  # merged inverted table {word: 'doc_id1,doc_id2,...'}
    total_nums1 = 0            # total number of papers across the selected classes
    for label_num in class_list:
        conn, cursor = mysql.open()

        # Fetch this class's posting lists.
        select_sql = """select word, paper_doc_id from word_map_paper_{};""".format(str(label_num))
        s_time1 = time.time()
        cursor.execute(select_sql)
        for word, paper_doc_id in cursor.fetchall():
            if word not in total_inverted_dict1:
                total_inverted_dict1[word] = paper_doc_id
            else:
                # Plain concatenation may leave duplicate doc_ids; every consumer
                # deduplicates later with set(...split(',')).
                total_inverted_dict1[word] = total_inverted_dict1[word] + ',' + paper_doc_id
        e_time1 = time.time()
        log.log('Loading the inverted table of class {} took {}s'.format(str(label_num), e_time1 - s_time1))

        # Accumulate the number of papers stored under this class.
        s_time2 = time.time()
        select_paper_num_sql = """select count_number from count_map where label_num={};""".format(str(label_num))
        cursor.execute(select_paper_num_sql)
        for nums in cursor.fetchall():
            total_nums1 += int(nums[0])
        e_time2 = time.time()
        log.log('Counting the papers of class {} took {}s'.format(str(label_num), e_time2 - s_time2))

        mysql.close(cursor, conn)
    return total_inverted_dict1, total_nums1
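
# -----------------------------------------------------------------------------------------
# Minimal sketch (not called by the pipeline) of the posting-list convention used above:
# each word maps to a comma-separated doc_id string, merging is plain concatenation, and
# consumers deduplicate with set(...split(',')), as search_sim_paper does below. The
# words and doc_ids here are made up for illustration.
# -----------------------------------------------------------------------------------------
def _demo_merge_posting_lists():
    rows_per_label = [
        [('神经网络', 'doc1,doc2'), ('卷积', 'doc2')],  # e.g. rows from word_map_paper_1
        [('神经网络', 'doc2,doc3')],                    # e.g. rows from word_map_paper_7
    ]
    merged = {}
    for rows in rows_per_label:
        for word, doc_ids in rows:
            merged[word] = doc_ids if word not in merged else merged[word] + ',' + doc_ids
    # Duplicates survive the merge ('doc2' appears twice) and are removed on read:
    assert set(merged['神经网络'].split(',')) == {'doc1', 'doc2', 'doc3'}
    return merged
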
def select_sim_doc_message(sim_doc1, mysql):
    """Look up each candidate doc_id in the database and compute its mean document vector.

    :param sim_doc1: list of candidate documents, [doc_id1, doc_id2, ...]
    :return: {doc_id: (doc_avg_vec, doc_path)}
    """
    all_paper_vec_dict = {}
    conn, cursor = mysql.open()
    for doc_id in sim_doc1:
        select_sql = """
            select tb1.doc_id, tb1.title, tb1.abst_zh, tb2.vsm, tb1.content from (
                (select doc_id, title, abst_zh, content from main_table_paper_detail_message) tb1
                left join (select doc_id, vsm from id_keywords_weights) tb2
                on tb1.doc_id=tb2.doc_id
            ) where tb1.doc_id="{}";""".format(escape_string(doc_id))
        cursor.execute(select_sql)
        sim_doc_id, sim_title, sim_abst, sim_vsm, sim_content_path = cursor.fetchone()

        # The vsm column stores 'word@#$@weight' pairs joined by '&*^%'.
        sim_vsm_dict = {pair.split('@#$@')[0]: float(pair.split('@#$@')[1])
                        for pair in sim_vsm.split('&*^%')}

        vector_paper = []
        value_sum = 0.0
        for word, weight in sim_vsm_dict.items():
            # Scale the keyword weight by where the word occurs: title > abstract > body.
            if word in sim_title:
                value = 0.5 * weight
            elif word in sim_abst:
                value = 0.3 * weight
            else:
                value = 0.2 * weight
            word_vec = get_word_vec(word)
            if isinstance(word_vec, int):
                # get_word_vec returns an int sentinel for out-of-vocabulary words.
                continue
            vector_paper.append(word_vec * value)
            value_sum += value
        del sim_vsm_dict
        gc.collect()

        if not vector_paper:
            # Safety guard: skip documents whose keywords are all out of vocabulary.
            continue
        # Weighted mean of the document's keyword vectors.
        avg_vector = np.array(np.sum(np.array(vector_paper, dtype=np.float32), axis=0) / value_sum)
        all_paper_vec_dict[doc_id] = (avg_vector, sim_content_path)
    mysql.close(cursor, conn)
    return all_paper_vec_dict


def submit_paper_avg_vec(paper_dict1, tf_weight_dict):
    """Compute the submitted document's mean vector from its TF weights; returns a numpy array."""
    vector_paper = []
    value_sum = 0.0
    for word, weight in tf_weight_dict.items():
        # Same position-based scaling as in select_sim_doc_message.
        if word in paper_dict1['title']:
            value = 0.5 * weight
        elif word in paper_dict1['abst_zh']:
            value = 0.3 * weight
        else:
            value = 0.2 * weight
        word_vec = get_word_vec(word)
        if isinstance(word_vec, int):
            # Out-of-vocabulary sentinel; skip the word.
            continue
        vector_paper.append(word_vec * value)
        value_sum += value
    # Weighted mean of the keyword vectors (assumes at least one in-vocabulary keyword).
    avg_vector = np.array(np.sum(np.array(vector_paper, dtype=np.float32), axis=0) / value_sum)
    return avg_vector


def compare_sim_in_papers(check_vector, sim_message, top=40):
    """Compute cosine similarity between the submitted document and every candidate.

    :param check_vector: text vector of the submitted document
    :param sim_message: candidate documents, {doc_id: (doc_avg_vec, doc_path)}
    :param top: number of most similar documents to return
    :return: {doc_id: (similarity_score, doc_path)}
    """
    sim_res_dict = {}
    for doc_id, (vector, content_path) in sim_message.items():
        # cosine_similarity on the two stacked vectors yields a 2x2 matrix;
        # [0][1] is the similarity between them.
        sim_res_dict[doc_id] = (str(cosine_similarity([check_vector, vector])[0][1]), content_path)
    ranked = sorted(sim_res_dict.items(), key=lambda x: float(x[1][0]), reverse=True)
    return {key: value for key, value in ranked[:top]}
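
# -----------------------------------------------------------------------------------------
# Toy illustration of the position-weighted mean shared by select_sim_doc_message and
# submit_paper_avg_vec: a keyword's weight is scaled by 0.5 / 0.3 / 0.2 depending on
# whether it appears in the title, the abstract, or only the body, and the document
# vector is sum(word_vec * value) / sum(value). The 3-d vectors and weights below are
# made up; the real pipeline uses get_word_vec embeddings and TF-IDF weights.
# -----------------------------------------------------------------------------------------
def _demo_position_weighted_mean():
    toy_vecs = {'检索': np.array([1.0, 0.0, 0.0]),
                '排序': np.array([0.0, 1.0, 0.0])}
    toy_weights = {'检索': 0.8, '排序': 0.6}
    title, abst = '文档检索方法', '本文研究排序'
    numerator, value_sum = np.zeros(3), 0.0
    for word, weight in toy_weights.items():
        if word in title:
            value = 0.5 * weight      # title hit
        elif word in abst:
            value = 0.3 * weight      # abstract hit
        else:
            value = 0.2 * weight      # body only
        numerator += toy_vecs[word] * value
        value_sum += value
    return numerator / value_sum      # ≈ array([0.69, 0.31, 0.0])
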
def search_sim_paper(paper_dict, class_list, mysql, log, top=100):
    """Query the database for documents similar to the submitted paper and return the
    top matches for sentence-level duplicate checking.

    :param paper_dict: preprocessed, structured submitted paper
    :param class_list: list of class ids predicted for the submitted paper
    :param mysql: database wrapper providing open()/close()
    :param log: logger providing log()
    :param top: number of documents to return
    :return: {doc_id: (similarity_score, doc_path)}
    """
    all_str = paper_dict['title'] + '。' + paper_dict['abst_zh'] + '。' + paper_dict['content']

    # Merge the per-class inverted tables and count the papers they cover.
    s0 = time.time()
    total_inverted_dict, total_nums = load_inverted_table(class_list, mysql, log)
    e0 = time.time()
    log.log('Querying the inverted tables took {}s'.format(e0 - s0))

    # Word-frequency dict of the submitted document {word1: fre1, word2: fre2, ...}.
    s1 = time.time()
    word_fre_dict = cut_text(all_str, tokenizer='jieba')
    e1 = time.time()
    log.log('Tokenization took {}s'.format(e1 - s1))

    # TF-IDF for every word of the submitted document.
    s2 = time.time()
    tf_idf_dict = {}
    total_freq = sum(word_fre_dict.values())  # hoisted out of the loop
    for word, freq in word_fre_dict.items():
        if freq <= 2:
            continue
        tf = freq / total_freq
        if word in total_inverted_dict:
            # +1 smoothing on the document frequency.
            idf = math.log(total_nums / (len(set(total_inverted_dict[word].split(','))) + 1))
        else:
            idf = math.log(total_nums)
        tf_idf_dict[word] = tf * idf
    e2 = time.time()
    log.log('Computing TF-IDF for the submitted document took {}s'.format(e2 - s2))

    # Top-15 words and their normalized weights.
    s3 = time.time()
    tf_dict = l2_normal(tf_idf_dict)
    e3 = time.time()
    log.log('Weight normalization took {}s'.format(e3 - s3))

    # Count, per candidate doc_id, how many of the submitted document's keywords it shares.
    s4 = time.time()
    count_words_num = defaultdict(int)
    for word, weight in tf_dict.items():
        if word in total_inverted_dict:
            for doc_id in set(total_inverted_dict[word].split(',')):
                count_words_num[doc_id] += 1
    e4 = time.time()
    log.log('Counting doc_id overlaps took {}s'.format(e4 - s4))

    # Rank candidates by overlap count and keep the 200 most similar documents.
    count_word_num = {i: j for i, j in sorted(count_words_num.items(), key=lambda x: x[1], reverse=True)}
    sim_doc = list(count_word_num.keys())[:200]

    # Mean document vector of each of the 200 candidates.
    s_time1 = time.time()
    sim_paper_vec_dict = select_sim_doc_message(sim_doc, mysql)
    e_time1 = time.time()
    log.log('Computing the 200 candidate mean vectors took {}s'.format(e_time1 - s_time1))

    # Mean document vector of the submitted document.
    s_time2 = time.time()
    submit_vec = submit_paper_avg_vec(paper_dict, tf_dict)
    e_time2 = time.time()
    log.log("Computing the submitted document's mean vector took {}s".format(e_time2 - s_time2))

    # Similarity between the submitted document and the candidates, ranked; the top
    # documents are then used for full-text duplicate checking.
    s_time3 = time.time()
    sim_paper_dict = compare_sim_in_papers(submit_vec, sim_paper_vec_dict, top=top)
    e_time3 = time.time()
    log.log('Computing and ranking the similarities took {}s'.format(e_time3 - s_time3))

    del total_inverted_dict
    del total_nums
    del submit_vec
    del sim_paper_vec_dict
    del count_word_num
    del sim_doc
    del word_fre_dict
    gc.collect()
    return sim_paper_dict
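
# -----------------------------------------------------------------------------------------
# Hypothetical usage sketch. The real `mysql` wrapper (with .open() -> (conn, cursor) and
# .close(cursor, conn)) and logger live elsewhere in the project, so the actual call is
# left commented out; `_PrintLogger` and `mysql_wrapper` are stand-ins, not project code.
# -----------------------------------------------------------------------------------------
if __name__ == '__main__':
    class _PrintLogger:
        """Minimal stand-in exposing the .log() method used above."""

        def log(self, msg):
            print(msg)

    paper = {'title': '示例标题', 'abst_zh': '示例摘要', 'content': '示例正文'}
    # sim_papers = search_sim_paper(paper, class_list=[1, 7], mysql=mysql_wrapper,
    #                               log=_PrintLogger(), top=100)
    # for doc_id, (score, path) in sim_papers.items():
    #     print(doc_id, score, path)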