# -*- coding: utf-8 -*-
# @Time: 18:01
# @Author: ZYP
# @File: SearchSimPaper.py
# @mail: zypsunshine1@gmail.com
# @Software: PyCharm

# =========================================================================================
# Search for similar documents
#   · Intersect the keyword sets of library documents with the submitted document
#   · Compute keyword-level similarity between the selected documents and the submitted one
#   · Pick the most similar documents, sort them, and return the result
# =========================================================================================

import math

import numpy as np
from collections import defaultdict
from pymysql.converters import escape_string
from sklearn.metrics.pairwise import cosine_similarity

from util import cut_text, l2_normal, mysql, get_word_vec


def load_inverted_table(class_list):
    """Given the predicted classes, merge the per-class inverted indexes and
    return the merged dict plus the total number of papers in those classes."""
    # Merged inverted index: {word: [doc_id1, doc_id2, doc_id3, ...]}
    total_inverted_dict1 = {}
    # Running total of papers across all selected classes
    total_nums1 = 0
    for label_num in class_list:
        select_sql = """
            select word, paper_doc_id from word_map_paper_{};
        """.format(str(label_num))
        mysql.cursor.execute(select_sql)
        for word, paper_doc_id in mysql.cursor.fetchall():
            if word not in total_inverted_dict1:
                total_inverted_dict1[word] = paper_doc_id.split(',')
            else:
                # Union of the doc ids collected so far and the new ones, de-duplicated and sorted
                total_inverted_dict1[word] = sorted(set(total_inverted_dict1[word] + paper_doc_id.split(',')))

        select_paper_num_sql = """
            select count(*) from main_table_paper_detail_message where label_num={};
        """.format(label_num)
        mysql.cursor.execute(select_paper_num_sql)
        for nums in mysql.cursor.fetchall():
            # fetchall() yields one-element tuples, so unpack the count
            total_nums1 += nums[0]

    return total_inverted_dict1, total_nums1


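# Illustration with made-up values: for predicted classes [3, 7] the merge
# might produce
#   load_inverted_table([3, 7])
#   -> ({'word_a': ['doc_001', 'doc_042'], 'word_b': ['doc_042']}, 1280)
# i.e. each word's doc-id lists are unioned across the per-class tables and
# the paper counts of both classes are summed.

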
def select_sim_doc_message(sim_doc1):
    """
    Look up each similar doc_id in the database, compute its mean document
    vector, and return them as a dict {doc_id: mean document vector, ...}.
    :param sim_doc1: list of similar documents, [doc_id1, doc_id2, ...]
    :return: {doc_id: (doc_avg_vec, doc_path)}
    """
    all_paper_vec_dict = {}
    for doc_id in sim_doc1:
        select_sql = """
            select tb1.doc_id, tb1.title, tb1.abst_zh, tb2.vsm, tb1.content from
            (
                (select doc_id, title, abst_zh, content from main_table_paper_detail_message) tb1
                left join
                (select doc_id, vsm from id_keywords_weights1) tb2
                on
                tb1.doc_id=tb2.doc_id
            ) where tb1.doc_id="{}";
        """.format(escape_string(doc_id))

        mysql.cursor.execute(select_sql)

        sim_doc_id, sim_title, sim_abst, sim_vsm, sim_content_path = mysql.cursor.fetchone()
        # vsm is stored as "word1=weight1,word2=weight2,..."; parse it into a dict
        sim_vsm_dict = {weight.split('=')[0]: float(weight.split('=')[1]) for weight in sim_vsm.split(',')}
        vector_paper = []
        value_sum = 0.0
        for word, weight in sim_vsm_dict.items():
            # Weight each keyword by where it occurs: title > abstract > body
            if word in sim_title:
                value = 0.5 * weight
            elif word in sim_abst:
                value = 0.3 * weight
            else:
                value = 0.2 * weight

            word_vec = get_word_vec(word)
            if word_vec == 0:
                # No embedding for this word; skip it
                continue
            vector_paper.append(word_vec * value)
            value_sum += value

        # Weighted mean vector over the document's keywords
        # avg_vector = np.array(np.sum(np.array(vector_paper, dtype=np.float32), axis=0) / len(vector_paper))
        avg_vector = np.array(np.sum(np.array(vector_paper, dtype=np.float32), axis=0) / value_sum)
        all_paper_vec_dict[doc_id] = (avg_vector, sim_content_path)

    return all_paper_vec_dict


def submit_paper_avg_vec(paper_dict1, tf_weight_dict):
    """Compute the mean vector of the submitted document from its tf values
    and return it as a numpy array."""
    vector_paper = []
    value_sum = 0.0
    for word, weight in tf_weight_dict.items():
        # Same position weighting as for library documents: title > abstract > body
        if word in paper_dict1['title']:
            value = 0.5 * weight
        elif word in paper_dict1['abst_zh']:
            value = 0.3 * weight
        else:
            value = 0.2 * weight

        word_vec = get_word_vec(word)
        if word_vec == 0:
            # No embedding for this word; skip it
            continue

        vector_paper.append(word_vec * value)
        value_sum += value

    # avg_vector = np.array(np.sum(np.array(vector_paper, dtype=np.float32), axis=0) / len(vector_paper))
    avg_vector = np.array(np.sum(np.array(vector_paper, dtype=np.float32), axis=0) / value_sum)

    return avg_vector


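# select_sim_doc_message() and submit_paper_avg_vec() above repeat the same
# position-weighted averaging step. A minimal shared helper is sketched below
# (not part of the original module; it assumes, as the `== 0` checks above
# suggest, that get_word_vec returns the scalar 0 for out-of-vocabulary words
# and a numpy vector otherwise).
def _weighted_avg_vec(weight_dict, title, abstract):
    """Average the word vectors, weighting each word by where it occurs:
    0.5 for title words, 0.3 for abstract words, 0.2 for everything else."""
    vectors = []
    value_sum = 0.0
    for word, weight in weight_dict.items():
        value = (0.5 if word in title else 0.3 if word in abstract else 0.2) * weight
        word_vec = get_word_vec(word)
        if isinstance(word_vec, (int, float)):  # assumed OOV sentinel: scalar 0
            continue
        vectors.append(word_vec * value)
        value_sum += value
    return np.sum(np.array(vectors, dtype=np.float32), axis=0) / value_sum

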
def compare_sim_in_papers(check_vector, sim_message, top=40):
    """
    Compute document-to-document similarity using cosine similarity.
    :param check_vector: text vector of the submitted document
    :param sim_message: the candidate similar documents, stored as a dict
    :param top: return the N most similar documents
    :return: dict of similar documents, {doc_id: (similarity score, doc path)}
    """
    sim_res_dict = {}
    for doc_id, (vector, content_path) in sim_message.items():
        # sim_res_dict[doc_id] = cosine_similarity([scalar(check_vector), scalar(vector)])[0][1]
        sim_res_dict[doc_id] = (cosine_similarity([check_vector, vector])[0][1], content_path)
    # Sort by the similarity score (x[1][0]), not by the document path
    _ = sorted(sim_res_dict.items(), key=lambda x: x[1][0], reverse=True)
    return {key: value for key, value in _[:top]}


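# Design note: the loop above makes one cosine_similarity call per candidate.
# Since all document vectors share the same dimensionality, the candidates
# could also be scored in a single call (sketch, not the original behaviour):
#
#   pairs = list(sim_message.values())
#   mat = np.stack([vec for vec, _ in pairs])
#   scores = cosine_similarity(check_vector.reshape(1, -1), mat)[0]

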
def search_sim_paper(paper_dict, class_list, top=10):
    """
    Given the submitted paper's dict, search the library for similar documents
    and return the top most similar ones for sentence-by-sentence duplicate checking.
    :param paper_dict: the preprocessed, structured submitted paper
    :param class_list: list of class ids predicted for the submitted paper
    :param top: return the first top documents
    :return: dict of similar documents, {doc_id: (similarity score, doc path)}
    """
    all_str = paper_dict['title'] + '。' + paper_dict['key_words'] + '。' + paper_dict['content']

    # Merge the inverted indexes and count the total number of papers;
    # total_inverted_dict is the merged index
    total_inverted_dict, total_nums = load_inverted_table(class_list)

    # Word-frequency dict of the submitted document: {word1: fre1, word2: fre2, ...}
    word_fre_dict = cut_text(all_str, tokenizer='jieba')

    # Compute tf-idf for every word of the submitted document
    tf_idf_dict = {}
    total_freq = sum(word_fre_dict.values())
    for word, freq in word_fre_dict.items():
        tf = freq / total_freq
        if word in total_inverted_dict:
            idf = math.log(total_nums / (len(total_inverted_dict[word]) + 1))
        else:
            # Words absent from the index get the maximum idf
            idf = math.log(total_nums)

        tf_idf_dict[word] = tf * idf

    # Top-15 words and their normalized weights
    tf_dict = l2_normal(tf_idf_dict)

    # Count how many of the submitted document's keywords each library document shares
    count_words_num = defaultdict(int)
    for word, weight in tf_dict.items():
        if word in total_inverted_dict:
            for doc_id in total_inverted_dict[word]:
                count_words_num[doc_id] += 1

    # Sort documents by shared-keyword count, descending
    count_word_num = {i: j for i, j in sorted(count_words_num.items(), key=lambda x: x[1], reverse=True)}

    # Take the 200 documents with the largest keyword overlap as candidates
    sim_doc = list(count_word_num.keys())[:200]

    # Mean document vector for each candidate document
    sim_paper_vec_dict = select_sim_doc_message(sim_doc)

    # Mean document vector of the submitted document
    submit_vec = submit_paper_avg_vec(paper_dict, tf_dict)

    # Score the candidates against the submitted document, sort, and keep the
    # top documents for full-document duplicate checking
    sim_paper_dict = compare_sim_in_papers(submit_vec, sim_paper_vec_dict, top=top)

    return sim_paper_dict


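if __name__ == '__main__':
    # Minimal usage sketch. The field names match the keys the functions above
    # read; the values are placeholders, not real data, and running this
    # requires the MySQL tables and the word-vector model behind util.
    demo_paper = {
        'title': 'A demo title',
        'key_words': 'keyword one;keyword two',
        'abst_zh': 'A demo abstract.',
        'content': 'The full text of the submitted paper.',
    }
    demo_classes = [3, 7]  # class ids from the upstream classifier
    for doc_id, (score, path) in search_sim_paper(demo_paper, demo_classes, top=10).items():
        print(doc_id, round(float(score), 4), path)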