对送检文档进行查重
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

47 lines
1.7 KiB

# -*- coding: utf-8 -*-
# @Time: 9:59
# @Author:ZYP
# @File:CheckPaper.py
# @mail:zypsunshine1@gmail.com
# @Software: PyCharm
# =========================================================================================
# 查重主函数
# · 进行文档之间的相似度查询
# =========================================================================================
from SearchSimSentence import check_repeat_by_model, check_repeat_by_word2vec
from LoadRoformer import pred_class_num
from SearchSimPaper import search_sim_paper
from util import deal_paper, save_result
def main(target_paper_path='', output_path='', threshold=0.85):
    """Run the duplicate check for one submitted paper and save the result.

    The steps are: parse the paper, predict the categories to search in,
    retrieve similar documents, compare the paper against them, and save
    the comparison result.

    Args:
        target_paper_path: Path of the paper to be checked. Defaults to the
            empty placeholder used by the original script.
        output_path: Path the result is written to. Defaults to the empty
            placeholder used by the original script.
        threshold: Similarity threshold forwarded to the sentence-level
            comparison. Defaults to 0.85.
    """
    # Convert the paper into the format {title:..., abst_zh:..., content:...}.
    paper_dict = deal_paper(target_paper_path)

    # Run the classification model to decide which categories to search in.
    class_list = pred_class_num(paper_dict)

    # Similar documents are returned as a dict:
    # {doc_id: similarity score against the submitted paper}.
    sim_paper_id_dict = search_sim_paper(paper_dict, class_list)

    # Check the retrieved documents one by one, sentence by sentence:
    # {doc_id: {sent1: [sim_sent, ...], sent2: [sim_sent, ...]}}.
    # This variant uses the method from the patent: compute a weight for each
    # word, then take the mean.
    # Alternative (BERT model, sentence-to-sentence similarity):
    #   check_repeat_by_model(paper_dict, sim_paper_id_dict, threshold)
    result = check_repeat_by_word2vec(paper_dict, sim_paper_id_dict, threshold)

    # Save the result.
    save_result(result, output_path)
# Script entry point: run the check only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()