# -*- coding: utf-8 -*-
# @Time: 9:59
# @Author: ZYP
# @File: CheckPaper.py
# @mail: zypsunshine1@gmail.com
# @Software: PyCharm
# =========================================================================================
# Plagiarism-check main entry point.
# · Performs document-to-document similarity checking.
# =========================================================================================

from SearchSimSentence import check_repeat_by_model, check_repeat_by_word2vec
from LoadRoformer import pred_class_num
from SearchSimPaper import search_sim_paper
from util import deal_paper, save_result
def main(target_paper_path='', output_path='', threshold=0.85):
    """Run the plagiarism-check pipeline for a single document.

    Pipeline: load/normalize the document -> classify it to narrow the
    search space -> retrieve candidate similar documents -> compare the
    document sentence-by-sentence against each candidate -> save results.

    Parameters
    ----------
    target_paper_path : str
        Path of the document to check for duplication.
    output_path : str
        Path where the check result is written.
    threshold : float
        Similarity threshold above which a sentence pair counts as a
        duplicate (default 0.85, as in the original script).
    """
    # Normalize the raw paper into the expected format:
    # {title: ..., abst_zh: ..., content: ...}
    paper_dict = deal_paper(target_paper_path)

    # Run the classification model to decide which categories the
    # duplicate search should be restricted to.
    class_list = pred_class_num(paper_dict)

    # Retrieve candidate similar documents, returned as a dict:
    # {doc_id: similarity score against the submitted document}
    sim_paper_id_dict = search_sim_paper(paper_dict, class_list)

    # Compare the submitted document against each candidate, document by
    # document and sentence by sentence. Result shape:
    # {doc_id: {sent1: [sim_sent, ...], sent2: [sim_sent, ...]}}
    # Uses the patent method: per-word weights averaged into a vector.
    result = check_repeat_by_word2vec(paper_dict, sim_paper_id_dict, threshold)
    # Alternative: BERT-based sentence-to-sentence similarity comparison.
    # result = check_repeat_by_model(paper_dict, sim_paper_id_dict, threshold)

    # Persist the check result.
    save_result(result, output_path)
|
|
|
|
|
|
# Script entry point: run the check only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()