# -*- coding: utf-8 -*-
# @Time: 9:59
# @Author: ZYP
# @File: CheckPaper.py
# @mail: zypsunshine1@gmail.com
# @Software: PyCharm
# =========================================================================================
# Plagiarism-check entry point
# · Runs the document-to-document similarity check
# =========================================================================================
from SearchSimSentence import check_repeat_by_model, check_repeat_by_word2vec
from LoadRoformer import pred_class_num
from SearchSimPaper import search_sim_paper
from util import deal_paper, save_result


def main():
    # Path of the document to be checked
    target_paper_path = ''
    # Path for the output result
    output_path = ''
    # Repetition (similarity) threshold
    threshold = 0.85

    # Normalize the paper into the expected format: {title: ..., abst_zh: ..., content: ...}
    paper_dict = deal_paper(target_paper_path)

    # Run the classification model to decide which categories to search for duplicates in
    class_list = pred_class_num(paper_dict)

    # Retrieve similar documents, returned as a dict: {doc_id: similarity score with the submitted document}
    sim_paper_id_dict = search_sim_paper(paper_dict, class_list)

    # Check the submitted document against each retrieved document, sentence by sentence.
    # Result format: {doc_id: {sent1: [sim_sent, ...], sent2: [sim_sent, ...]}}
    # Uses the method from the patent: weight each word's vector, then average
    result = check_repeat_by_word2vec(paper_dict, sim_paper_id_dict, threshold)
    # Alternative: compare sentences pairwise with a BERT model
    # result = check_repeat_by_model(paper_dict, sim_paper_id_dict, threshold)

    # Save the result
    save_result(result, output_path)


if __name__ == '__main__':
    main()
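

# ------------------------------------------------------------------------------------------
# Illustrative sketch (not called by main): a minimal example of the kind of sentence
# comparison check_repeat_by_word2vec is expected to perform, assuming each sentence is
# represented by the average of its word vectors and pairs are scored with cosine
# similarity against `threshold`. The word-vector lookup (`w2v`) and the token lists are
# placeholders; the actual logic lives in SearchSimSentence.check_repeat_by_word2vec.
# ------------------------------------------------------------------------------------------
import numpy as np


def _avg_vector_sim_sketch(sent_a_tokens, sent_b_tokens, w2v, threshold=0.85):
    """Return True if the two tokenized sentences count as duplicated under the threshold."""
    # Keep only tokens that have a vector in the (hypothetical) word2vec lookup table.
    vecs_a = [w2v[t] for t in sent_a_tokens if t in w2v]
    vecs_b = [w2v[t] for t in sent_b_tokens if t in w2v]
    if not vecs_a or not vecs_b:
        return False
    # Average the word vectors of each sentence.
    vec_a = np.mean(vecs_a, axis=0)
    vec_b = np.mean(vecs_b, axis=0)
    # Cosine similarity between the two sentence vectors.
    score = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-12))
    return score >= threshold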