# -*- coding:utf-8 -*-
import os
import numpy as np
from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
from Rouge_w import Rouge_w, Rouge_l
import json
# import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
import uuid
import time
import redis
from threading import Thread
from multiprocessing import Pool

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

# pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=16, password="zhicheng123*")

pool = redis.ConnectionPool(host='192.168.31.74', port=63179, max_connections=100, db=17, password="zhicheng123*")
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

db_key_querying = 'querying_check_task'
db_key_queryset = 'queryset_check_task'
db_key_query_recall = 'query_recall'

short_sentence_len = 7
long_sentence_len = 200
nums_cpus = 24
rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()
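# Task-flow note: handle_query() writes each upload to ./request_data_logs/<id>.json and
# pushes {"id", "path"} onto the `query_recall` list; the classify_accurate_check() worker
# thread lpop's that list, runs the check, stores the result-file path under the task id
# (expiring after 86400 s), and removes the id from the `querying_check_task` set.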
def jaccard_similarity(s1, s2):
    set1 = set(s1)
    set2 = set(s2)
    intersection = set1 & set2
    union = set1 | set2
    return len(intersection) / len(union)
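# Example (illustrative only): jaccard_similarity("大型商业建筑", "大型商用建筑") compares the
# character sets {大,型,商,业,建,筑} and {大,型,商,用,建,筑}: |intersection| = 5, |union| = 7,
# so the score is 5/7 ≈ 0.714. Inputs can be any iterables of characters or tokens.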
# def bert_check(text, recall_data_list):
#     '''
#     BERT-based duplicate check
#     :return:
#     '''
#
#     sen_0 = [text] * len(recall_data_list)
#     sen_1 = [i[0] for i in recall_data_list]
#
#     return_list = []
#     request_json = {
#         "texts": [sen_0, sen_1],
#     }
#     paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
#     score_list = paper_dict["res"]
#
#     # TODO: revisit later
#     # return_list.append(re1[0][1])
#     # return_list.append(re1[0][0])
#     if 1 in score_list:
#         index_score = score_list.index(1)
#     else:
#         index_score = "NaN"
#
#     if index_score == "NaN":
#         return_list.append(0)
#         return_list.append("")
#     else:
#         return_list.append(1)
#         return_list.append(index_score)
#
#     return return_list
def rouge_value_self(data_1, data_2):
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list
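# Note: rouge_value_self() uses the ' '.join / .split(" ") round-trip to turn each Chinese
# sentence into a per-character token list before scoring, so Rouge_l effectively computes
# character-level ROUGE-L; the first two return slots (ROUGE-1/ROUGE-2) are left empty.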
def strsim_value(data_1, data_2):
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    sim_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        sim_score = jaccard_similarity(sen_1, sen_2)
        sim_list.append(sim_score)

    # same return shape as rouge_value_self() so the two are interchangeable
    return "", "", sim_list
def rouge_pre(text, df_train_nuoche):
    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list
def rouge_pre_m(text, df_train_nuoche):
    return_list = []
    index_rouge_list = []

    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = strsim_value(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.extend(re1)

    return return_list
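# Example (illustrative): with recall_data_list = [["句A", "f1.txt"], ["句B", "f2.txt"]],
# rouge_pre_m(text, recall_data_list) returns every candidate index paired with its
# Jaccard score, sorted descending, e.g. [(0, 0.82), (1, 0.31)] -- unlike rouge_pre(),
# which keeps only the single best [score, index] pair.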
def rouge_pre_m_1(bool_check_sentense, content_list, recall_data_list):
    # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]

    bool_check_sentense_new = []
    for bool_check_sentense_dan in bool_check_sentense:
        bool_check_sentense_new_dan = []

        text_list = []
        data_list = []
        linshi = []
        for i in bool_check_sentense_dan:
            text1 = content_list[i[0]]
            text2 = recall_data_list[i[1]][0]
            linshi.append([i[0], i[1]])
            text_list.append(text1)
            data_list.append(text2)
        _, _, rouge_l_list = rouge_value_self(text_list, data_list)
        for i in range(len(rouge_l_list)):
            if rouge_l_list[i] > 0.47:
                bool_check_sentense_new_dan.append(linshi[i])
        if bool_check_sentense_new_dan != []:
            bool_check_sentense_new.append(bool_check_sentense_new_dan)
    return bool_check_sentense_new
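# Filtering is two-staged: chapter_check() first keeps candidate pairs whose Jaccard
# score exceeds 0.3, then rouge_pre_m_1() re-scores those pairs with character-level
# ROUGE-L and keeps only matches above 0.47.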
# Example for a single chapter
def similar_content_func():
    '''
    Duplicated-article entries
    :return:
    '''
    return [{
        "content": "重复的内容标红",
        "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
        "title": "标题",
        "year": "日期",
        "degree": "来源",
        "author": "作者"
    }]
def original_text_contrast_func(sentence_dan, content_list):
    '''
    :param sentence_dan: sentence_dan[0] = [original sentence index, group-1 sentence,
        similar sentence for group 1, red-marked part of the group-1 sentence,
        red-marked part of the similar sentence, filename of the similar sentence]
    :param content_list:
    :return:
    '''

    '''
    {
        'original_text': '此处有 52 字相似\n影响 2.吉比特清仓式分红的外部效应 (1)提高了外界关注度 公司派发现金股利,可以向市场传递消息,提高外部关注度 表6\n<red>“吉比特”百度搜索指数 <table><tr><td><p><strong>年份</strong></p</red>\n></td><td><p><strong>搜索日均指数(次)</strong></p></td><td><p><stro',
        'dan_sentence_word_nums': 52,
        'similar_content': [{
            'paper_red_len_word': 52,
            'content': '浅谈幼儿说谎的成因及对策<table><tr<red>><td><p><strong>姓名</strong></p><p><strong>学号</</red>strong></p></td><td><p><strong>:</strong></p><p><strong>:</strong></p></td><td><p>李媛媛</p><p>202101016968</p></td></tr><tr><td><p><strong>专业</strong></p></td><td><p><strong>:</strong></p></td><td><p>学前教育</p></td></tr><tr><td><p><strong>层次</strong></p></td><td><p><strong>:</strong></p></td><td><p>专升本</p></td></tr><tr><td><p><strong>站点</strong></p></td><td><p><strong>:</strong></p></td><td></td></tr></table>',
            'thesis_info': '14397770 ',
            'title': '14397770',
            'year': '',
            'degree': '',
            'author': '',
            'paper_len_word': ''
        }, {
            'paper_red_len_word': 52,
            'content': '浅谈幼儿说谎的成因及对策<table><tr<red>><td><p><strong>姓名</strong></p><p><strong>学号</</red>strong></p></td><td><p><strong>:</strong></p><p><strong>:</strong></p></td><td><p>李媛媛</p><p>202101016968</p></td></tr><tr><td><p><strong>专业</strong></p></td><td><p><strong>:</strong></p></td><td><p>学前教育</p></td></tr><tr><td><p><strong>层次</strong></p></td><td><p><strong>:</strong></p></td><td><p>专升本</p></td></tr><tr><td><p><strong>站点</strong></p></td><td><p><strong>:</strong></p></td><td></td></tr></table>',
            'thesis_info': '14397770 ',
            'title': '14397770',
            'year': '',
            'degree': '',
            'author': '',
            'paper_len_word': ''
        }]
    }
    '''

    if sentence_dan != []:
        original_text = ""
        start = len(sentence_dan[0][1])
        end = 0
        similar_content = []
        for dan_sen_info in sentence_dan:  # there may be several; for now assume one

            similar_content_dan = {
                "paper_red_len_word": "",
                "content": "重复的内容标红",
                "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
                "title": "标题",
                "year": "日期",
                "degree": "来源",
                "author": "作者",
                "paper_len_word": ""
            }

            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(dan_sen_info[1], dan_sen_info[1], 0, len(dan_sen_info[1]))  # the last two arguments are the red-mark indices; still to be refined
            sentence_1_bool, sentence_1_dan_red = original_text_marked_red(dan_sen_info[2], dan_sen_info[2], 0, len(dan_sen_info[2]))  # the last two arguments are the red-mark indices; still to be refined

            if sentence_0_bool == False or sentence_1_bool == False:
                continue

            start_dan = sentence_0_dan_red.index("<red>")
            end_dan = sentence_0_dan_red.index("</red>") - len("<red>")
            # Assumed fix: accumulate the overall red span across matched sentences
            # (start/end were initialised to len(sentence)/0 for min/max accumulation
            # but never updated, leaving end - start negative).
            start = min(start, start_dan)
            end = max(end, end_dan)

            similar_content_dan["content"] = sentence_1_dan_red
            similar_content_dan["title"] = dan_sen_info[5]
            similar_content_dan["author"] = ""
            similar_content_dan["degree"] = ""
            similar_content_dan["year"] = ""
            similar_content_dan["paper_len_word"] = ""
            similar_content_dan["paper_red_len_word"] = end_dan - start_dan

            thesis_info = " ".join(
                [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
                 similar_content_dan["year"]])
            similar_content_dan["thesis_info"] = thesis_info

            similar_content.append(similar_content_dan)

        original_text_list = list(sentence_dan[0][1])
        # original_text_list.insert(end, "</red>\n")
        # original_text_list.insert(start, "\n<red>")
        target_text_str = "".join(["\n<red>"] + original_text_list[start: end] + ["</red>\n"])

        original_text_start = "".join(original_text_list[:start])
        original_text_end = "".join(original_text_list[end:])

        print(sentence_dan)
        # sentence_dan[0][4] is expected to hold the neighbouring sentence indices here
        if sentence_dan[0][4][0] - 1 < 0:
            start_sen = ""
        else:
            start_sen = content_list[sentence_dan[0][4][0] - 1]

        if sentence_dan[0][4][-1] + 1 >= len(content_list):
            end_sen = ""
        else:
            end_sen = content_list[sentence_dan[0][4][-1] + 1]

        start_sen = start_sen + original_text_start
        end_sen = original_text_end + end_sen
        original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60]
    else:
        original_text = ""
        end = 0
        start = 0
        similar_content = []
    return_info = {
        "original_text": original_text,
        "dan_sentence_word_nums": end - start,
        "similar_content": similar_content
    }
    return return_info
def repeat_quote_info_func(original_text_contrast, section_words):
    '''
    Repeated-citation information
    :return:
    '''
    chongfuwendang = {}

    for sentence_dan in original_text_contrast:
        for i in sentence_dan["similar_content"]:
            thesis_info = i["thesis_info"]
            if thesis_info not in chongfuwendang:
                chongfuwendang[thesis_info] = {
                    "quote": False,
                    "thesis_author": i["author"],
                    "thesis_date": i["year"],
                    "thesis_info": thesis_info,
                    "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100,
                    # str(round(repeat_rate, 1)) + "%"
                    # round(repetition_rate, 3) * 100
                    "thesis_title": i["title"],
                    "thesis_link": "",
                    "thesis_publish": i["degree"],
                    "thesis_repeat_word": i["paper_red_len_word"],
                    "thesis_teacher": "",
                    "paper_len_word": i["paper_len_word"]
                }
            else:
                chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
                chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
                                                                     section_words) * 100
    chongfuwendang = sorted(chongfuwendang.items(),
                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)

    chongfuwendang_list = []

    for i in chongfuwendang:
        chongfuwendang_dan = i[1]
        print(chongfuwendang_dan)
        chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%"
        chongfuwendang_list.append(chongfuwendang_dan)

    return chongfuwendang_list
def total_data_func(section_data_list):
    '''
    Overall statistics
    :return:
    '''
    # "end_page_index": 0,
    # "name": "第1部分",
    # "repeat_rate": repeat_rate,
    # "repeat_words": repeat_words,
    # "start_page_index": 0,
    # "words": section_words,
    # "original_text": original_text,
    # "original_text_oneself": original_text,
    # "original_text_contrast" (detailed duplicate-comparison info): original_text_contrast,
    # "repeat_quote_info" (repeated-citation info): repeat_quote_info

    repeat_words = 0
    words = 0

    for i in section_data_list:
        repeat_words += i["repeat_words"]
        words += i["words"]

    baifenbi = (repeat_words / words) * 100
    exclude_personal_rate = str(round(baifenbi, 1)) + "%"
    exclude_quote_rate = str(round(baifenbi, 1)) + "%"
    single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
    total_repeat_rate = str(round(baifenbi, 1)) + "%"
    total_repeat_words = repeat_words
    total_words = words

    print(exclude_personal_rate)

    return {
        "back_repeat_words": "",
        "exclude_personal_rate": exclude_personal_rate,
        "exclude_quote_rate": exclude_quote_rate,
        "front_repeat_words": "",
        "single_max_rate": single_max_rate,
        "single_max_repeat_words": single_max_repeat_words,
        "suspected_paragraph": "",
        "suspected_paragraph_max_repeat_words": "",
        "suspected_paragraph_min_repeat_words": "",
        "total_paragraph": "",
        "total_repeat_rate": total_repeat_rate,
        "total_repeat_words": total_repeat_words,
        "total_words": total_words,
        "tables": 0
    }
def section_data_func_dan():
    '''
    Per-section info (single section)
    :return:
    '''
    # {
    #     "section_name": section name,
    #     "section_repeat_rate": repeat rate,
    #     "section_repeat_words": repeated word count,
    #     "section_words": section word count,
    #     "oneself_repeat_words": repeated word count excluding the author's own work,
    #     "reference_repeat_words": repeated word count excluding citations,
    #     "section_oneself_rate": repeat rate excluding the author's own work
    # }

    return {
        "section_name": "",
        "section_repeat_rate": "",
        "section_repeat_words": "",
        "section_words": "",
        "oneself_repeat_words": "",
        "reference_repeat_words": "",
        "section_oneself_rate": ""
    }
def section_data_func(section_details):
    '''
    Per-section info
    :return:
    '''
    # "end_page_index": 0,
    # "name": "第1部分",
    # "repeat_rate": repeat_rate,
    # "repeat_words": repeat_words,
    # "start_page_index": 0,
    # "words": section_words,
    # "original_text": original_text,
    # "original_text_oneself": original_text,
    # "original_text_contrast" (detailed duplicate-comparison info): original_text_contrast,
    # "repeat_quote_info" (repeated-citation info): repeat_quote_info

    section_name = section_details["name"]
    section_repeat_rate = section_details["repeat_rate"]
    section_repeat_words = section_details["repeat_words"]
    section_words = section_details["words"]
    oneself_repeat_words = section_details["repeat_words"]
    reference_repeat_words = section_details["repeat_words"]
    section_oneself_rate = section_details["repeat_rate"]

    return {
        "section_name": section_name,
        "section_repeat_rate": section_repeat_rate,
        "section_repeat_words": section_repeat_words,
        "section_words": section_words,
        "oneself_repeat_words": oneself_repeat_words,
        "reference_repeat_words": reference_repeat_words,
        "section_oneself_rate": section_oneself_rate
    }
def section_details_func(data_section_dan, num_words, content_list, index_content_list_dan):
    '''
    Detailed section info
    :param data_section_dan: similarity info for every item in this section;
        data_section_dan[0][0] = [original sentence index, group-1 sentence, group-1 similar sentence,
        red part of the group-1 sentence, red part of the similar sentence, similar-sentence filename]
    :param num_words: word count of one section
    :param content_list: all sentences of one section []
    :param index_content_list_dan: section id
    :return:
    '''
    original_text_contrast = []
    section_repeat_rate = ""
    repeat_words = 0
    section_words = num_words
    oneself_repeat_words = ""
    reference_repeat_words = ""
    section_oneself_rate = ""
    original_text_list = []

    for sentence_dan in data_section_dan:
        original_text_contrast_dan = original_text_contrast_func(sentence_dan, content_list)
        original_text_contrast.append(original_text_contrast_dan)
        repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
        original_text_list.append(original_text_contrast_dan["original_text"])

    original_text = "".join(original_text_list)
    repeat_rate = (repeat_words / section_words) * 100
    repeat_rate = str(round(repeat_rate, 1)) + "%"

    repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)

    return {
        "end_page_index": 0,
        "name": "第{}部分".format(str(index_content_list_dan)),
        "repeat_rate": repeat_rate,
        "repeat_words": repeat_words,
        "start_page_index": 0,
        "words": section_words,
        "original_text": original_text,
        "original_text_oneself": original_text,
        "original_text_contrast": original_text_contrast,
        "repeat_quote_info": repeat_quote_info
    }
def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index_content_list):
    # similar_content_control, paper_dict, num_words, title, author, content_list
    '''
    Build the response dict
    :param similar_content_control_zong:
    :param num_words_zong:
    :param chapter_data:
    :param index_content_list:
    :return:
    '''
    # iterate over the (possibly multiple) chapters
    section_details_list = []
    for data_section_dan, num_words, content_list, index_content_list_dan in zip(similar_content_control_zong,
                                                                                 num_words_zong, chapter_data,
                                                                                 index_content_list):
        # detailed info for this section
        section_details = section_details_func(data_section_dan, num_words, content_list, index_content_list_dan)
        section_details_list.append(section_details)

    # per-section summaries

    section_data_list = []
    for section_details in section_details_list:
        section_data = section_data_func(section_details)
        section_data_list.append(section_data)

    total_data = total_data_func(section_details_list)

    time_format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(int(time.time()))
    dt = time.strftime(time_format, value)

    paper_data = {
        "author": "",
        "check_time": dt,
        "time_range": "1900-01-01至2023-08-08",
        "title": "",
        "total_data": total_data,
        "section_data": section_data_list,
        "section_details": section_details_list
    }

    return paper_data
def split_chapter(content_list):
    '''
    :param content_list:
    :return: [[[sentence, sentence, ... sentence], 2000], [[sentence, sentence, ... sentence], 2000]]
    '''

    content_list_new = []
    zishu = 9000  # target chapter size in characters
    dangqianzishu = 0  # running character count for the current chunk

    i = 0
    content_list_dan = []
    while True:
        if i >= len(content_list):
            if content_list_dan != []:
                content_list_new.append([content_list_dan, dangqianzishu])
            break
        content_list_dan.append(content_list[i])
        dangqianzishu += len(content_list[i])
        if dangqianzishu > zishu:
            content_list_new.append([content_list_dan, dangqianzishu])
            dangqianzishu = 0
            content_list_dan = []
        i += 1

    return content_list_new
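# Example (illustrative): with zishu = 9000, a sentence list totalling ~21,000 characters
# is greedily packed into chunks, e.g. [[sents_1, 9005], [sents_2, 9012], [sents_3, 2983]];
# a chunk is closed as soon as its running character count first exceeds the threshold.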
# def biahong_rule(s1, s2):
#     '''
#
#     :param s1:
#     :param s2:
#     :return:
#     {
#         "probabilities": null,
#         "resilt": [
#             [
#                 "而且受大型火灾事件的影响,会对公众心理造成相应的伤害,火灾发生后让人们产生恐惧、烦躁等心理现象,这也会妨碍对火灾的管理。1大型商业建筑火灾的特点大型企业的商业建筑一般设计为商业综合体,其中包括办公,餐饮,商铺,娱乐等活动场所,具有建筑市场规模大、人数多、火灾荷载大、消防救援难等特点,其建筑火灾具有以下特点。1.1可燃物种类多,火灾荷载集中大型企业综合商业建筑设计一般主要包括室内步行街和大量的商铺柜台等部分。",
#                 "火灾的管理。1大型商业建筑火灾的特点大型企业的商业建筑一般设计为商业综合体,其中包括办公,餐饮,商铺,娱乐等活动场所,具有建筑市场规模大、人数多、火灾荷载大、消防救援难等特点,其建筑火灾具有以下特点。1.1可燃物种类多,火灾荷载集中大型企业综合商业建筑设计一般主要包括室内步行街和大量的商铺柜台等部分",
#                 "1大型商业建筑火灾的特点大型商业建筑一般为商业综合体,其中包含商铺、娱乐等多种场所,具有建筑规模大、人员多、火灾荷载大、扑救困难等特点,其建筑火灾有以下特点。1.1可燃物种类多,火灾荷载集中大型综合体商业建筑中一般包含室内步行街与大量百货专柜等部分。1111",
#                 "1大型商业建筑火灾的特点大型商业建筑一般为商业综合体,其中包含商铺、娱乐等多种场所,具有建筑规模大、人员多、火灾荷载大、扑救困难等特点,其建筑火灾有以下特点。1.1可燃物种类多,火灾荷载集中大型综合体商业建筑中一般包含室内步行街与大量百货专柜等部分。1111",
#                 [
#                     54,
#                     204,
#                     0,
#                     126
#                 ]
#             ],
#         ]
#     }
#     '''
#
#     id_start_1_dan_best = 0
#     id_end_1_dan_best = len(s1) - 1
#     id_start_2_dan_best = 0
#     id_end_2_dan_best = len(s2) - 1
#     sim_score_best_best = 0
#
#     while True:
#         if sim_score_best >= 0.75:
#             break
#         else:
#             id_start_1_dan = 0
#             id_end_1_dan = len(s1) - 1
#             id_start_2_dan = 0
#             id_end_2_dan = len(s2) - 1
#
#             sen_list = [
#                 [s1[id_start_1_dan_best+1:id_end_1_dan_best + 1], s2[id_start_2_dan_best:id_end_2_dan_best + 1]],
#                 [s1[id_start_1_dan_best:id_end_1_dan_best], s2[id_start_2_dan_best:id_end_2_dan_best + 1]],
#                 [s1[id_start_1_dan_best:id_end_1_dan_best + 1], s2[id_start_2_dan_best + 1:id_end_2_dan_best + 1]],
#                 [s1[id_start_1_dan_best:id_end_1_dan_best + 1], s2[id_start_2_dan_best:id_end_2_dan_best]]
#             ]
#             for i in sen_list:
#                 sim_score = jaccard_similarity(i[0], i[1])
#
#
#     return
def check_red(bool_check_sentense, data_zong, recall_data_list, dan_chapter_data):
    # print("data_zong", data_zong)
    biao_red = biaohong(bool_check_sentense, data_zong,
                        recall_data_list)  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]

    print("biao_red", str(biao_red))

    original_sentence_index = []
    # for i in biao_red:
    #     for j in i:
    #         original_sentence_index.append(j[0])

    sentence_0_list = []
    sentence_1_list = []
    sim_paper_name = []

    for i in range(len(biao_red)):
        for j in range(len(biao_red[i])):
            print("i,j", i, j)
            # if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
            #     sentence_0_list.append("".join([content_list[biao_red[i][j][0][0]], content_list[biao_red[i][j][0][1]], content_list[biao_red[i][j][0][2]]]))
            #     sentence_1_list.append(
            #         "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]]))
            #     sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1])
            # else:
            #     continue

            file_name = recall_data_list[biao_red[i][j][1][1]][1]
            sentence_0_list_dan = []
            sentence_1_list_dan = []
            sentence_0_list_dan_index = []
            # houxuna_file_list = [
            #     [recall_data_list[biao_red[i][j][1][0]][1], dan_chapter_data[biao_red[i][j][0][0]],
            #      recall_data_list[biao_red[i][j][1][0]][0]],
            #     [recall_data_list[biao_red[i][j][1][1]][1], dan_chapter_data[biao_red[i][j][0][1]],
            #      recall_data_list[biao_red[i][j][1][1]][0]],
            #     [recall_data_list[biao_red[i][j][1][2]][1], dan_chapter_data[biao_red[i][j][0][2]],
            #      recall_data_list[biao_red[i][j][1][2]][0]]
            # ]

            sentence_0_list_dan = [dan_chapter_data[biao_red[i][j][0][index_simsentence]] for index_simsentence in
                                   range(len(biao_red[i][j][0]))]
            houxuna_file_list = [[recall_data_list[biao_red[i][j][1][index_simsentence]][1],
                                  recall_data_list[biao_red[i][j][1][index_simsentence]][0]] for index_simsentence in
                                 range(len(biao_red[i][j][0]))]

            for dan_sen_info in houxuna_file_list:
                if dan_sen_info[0] == file_name:
                    sentence_1_list_dan.append(dan_sen_info[1])
            if sentence_0_list_dan != [] and sentence_1_list_dan != []:
                sentence_0_list.append("".join(sentence_0_list_dan))
                sentence_1_list.append("".join(sentence_1_list_dan))
                original_sentence_index.append(biao_red[i][j][0])
                sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1])

    print("待标红句子筛选完成")
    sentence_0_list_new = []
    sentence_1_list_new = []

    for i in zip(sentence_0_list, sentence_1_list):
        if len(i[0]) + len(i[1]) < 1200:
            sentence_0_list_new.append(i[0])
            sentence_1_list_new.append(i[1])
        else:
            print(len(i[0]) + len(i[1]))
            continue
    t2 = time.time()

    print()
    for i in sentence_0_list_new:
        print("sentence_0_list_new", i)
    if sentence_0_list_new == sentence_1_list_new == []:
        paper_dict = []
    else:
        print("sentence_0_list_new", len(sentence_0_list_new))
        print("sentence_1_list_new", len(sentence_1_list_new))

        # ================================================================================================
        # deep-learning-based red marking
        paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

    return original_sentence_index, sim_paper_name, sentence_0_list_new, sentence_1_list_new, paper_dict
def chapter_check(dan_chapter_data, recall_data_list):
    # =============================================================================================
    # multiprocessing version
    # ROUGE-based duplicate check
    # t1_0 = time.time()
    # rst = []
    # p = Pool(nums_cpus)  # pool with n worker processes
    #
    # print("num_words", num_words)
    # for i in range(len(content_list)):
    #     text = content_list[i]
    #     a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
    #     rst.append(a)
    # p.close()
    # p.join()  # wait for all child processes; join() must come after close(), and no new
    #           # process can be added once close() has been called
    #
    # print("筛选句子完成")
    # rst = [i.get() for i in rst]
    #
    # t2_0 = time.time()
    # print(t2_0 - t1_0)

    # =========================================================================================================

    rst = []
    for i in range(len(dan_chapter_data)):
        text = dan_chapter_data[i]
        rst.append(rouge_pre_m(text, recall_data_list))

    # ========================================================================================================

    data_zong = []
    for i in range(len(rst)):
        # print(rst[i])
        data_zong.append(rst[i])

    t0 = time.time()
    # BERT-based duplicate check
    # for text in content_list:
    #     bert_pre_list = bert_check(text, recall_data_list)
    #     data_zong.append(bert_pre_list)
    t1 = time.time()
    original_dict = []

    # find the indices of similar sentences
    bool_check_sentense = []  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]

    # ROUGE pass
    for i in range(len(data_zong)):
        bool_check_sentense_dan = []  # [[1, 223],[1, 226], [1, 562]]
        for j in range(len(data_zong[i])):
            if data_zong[i][j][1] > 0.3:
                # print("data_zong[{}][{}]".format(i, j), data_zong[i][j][0])
                bool_check_sentense_dan.append([i, data_zong[i][j][0]])
        if bool_check_sentense_dan != []:
            bool_check_sentense.append(bool_check_sentense_dan)

    # second filtering pass, again with ROUGE

    if bool_check_sentense == []:
        pass
    bool_check_sentense = rouge_pre_m_1(bool_check_sentense, dan_chapter_data,
                                        recall_data_list)  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]

    print("bool_check_sentense", bool_check_sentense)
    print("找出相似的句子序号完成")

    # rule-based red marking
    # original_sentence_index, sim_paper_name, sentence_0_list_new, sentence_1_list_new, paper_dict = check_red(bool_check_sentense, data_zong, recall_data_list, dan_chapter_data)

    # t3 = time.time()
    # print("标红完成")
    #
    # print("paper_dict", paper_dict)
    # print("sentence_0_list_new", sentence_0_list_new)
    # print("sentence_1_list_new", sentence_1_list_new)
    # print("sim_paper_name", sim_paper_name)
    # similar_content_control = [[]]
    # # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
    # #     json.dump(paper_dict, f, ensure_ascii=False)
    # if sentence_0_list_new != []:
    #     sentence_0_list_new_cursor = sentence_0_list_new[0]
    #     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(
    #             range(len(paper_dict)),
    #             sentence_0_list_new,
    #             sentence_1_list_new,
    #             sim_paper_name,
    #             original_sentence_index):
    #
    #         if sentence_0_list_new_cursor != sentence_0_dan:
    #             similar_content_control.append(
    #                 [[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]])
    #             sentence_0_list_new_cursor = sentence_0_dan
    #         else:
    #             similar_content_control[-1].append(
    #                 [paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan])
    '''
    similar_content_control = [
        [
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '华帝股份有限公司偿债能力分析重庆工商大学派斯学院 会计学院 2019级会计学1班 张修俊指导老师 刘芮', '14398460', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '环境设计专业学生指导教师 赵伊静关键词:萌宠乐园;室内动物园;空间设计', '14398100', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '姓 名: 马康凡指导教师: 杨华答辩日期: 二〇二三年 四 月', '14398611', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '专 业 会计学指导教师 马英二〇二二 年 六 月 一 日', '14398819', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '专 业 中医学指导教师 闫彪二零二三 年 一 月 二十五 日', '14397732', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '学 号 193060083 年级 2019指导教师 黄鹏2023年4月9日', '14398099', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '准考证号 058321204541指导教师 朱晓亚2023年3月24日', '14398377', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '学生姓名 李佳盈指导教师 邢志涛完成时间 2023 年 4 月', '14398090', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '姓 名: 许诺指导教师: 杨根红专 业: 汉语言文学', '14398355', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '学生姓名 黎秋霞指导教师 诸葛佳琳完成时间 2023 年 3 月', '14398446', [0, 1, 2]],
            [0, '吉比特清仓式分红案例研究重庆工商大学派斯学院 会计学 2019级会本8班 谭迅指导教师 赵青华', '函 授 站 专业班级指导教师 专业技术职务和谐小区项目施工质量管理研究', '14398189', [0, 1, 2]]
        ]
        ...
        [
            [original sentence index, group-1 sentence, group-1 similar sentence, red part of the group-1 sentence, red part of the similar sentence, similar-sentence filename]
            [original sentence index, group-2 sentence, group-2 similar sentence, red part of the group-1 sentence, red part of the similar sentence, similar-sentence filename]
        ]  # the first group of similar sentences
    ]

    '''
    similar_content_control = []
    for sen_info_list in bool_check_sentense:
        sen_info_new = []
        for sen_info in sen_info_list:
            yuan_sen_id = sen_info[0]
            sim_sen_id = sen_info[1]
            sen_info_new.append([yuan_sen_id, dan_chapter_data[yuan_sen_id], recall_data_list[sim_sen_id][0],
                                 dan_chapter_data[yuan_sen_id], recall_data_list[sim_sen_id][0],
                                 recall_data_list[sim_sen_id][
                                     1]])  # [original sentence index, sentence, similar sentence, red part of the sentence, red part of the similar sentence, similar-sentence filename]
        similar_content_control.append(sen_info_new)
    return similar_content_control
def accurate_check_rouge(text_paper, recall_data_list):
    '''
    Precise duplicate check: find the similar sentences
    :param text_paper:
    :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    :return:
    '''
    # text preprocessing
    # content_list = []
    print("text_paper", len(text_paper))
    content_list_old = text_paper.split("\n")
    sentence_word_nums = 0

    # preprocessing: filter sentences
    # TODO: use a new method to strip useless leading/trailing content
    content_list = []
    for sen_zhang in content_list_old:
        if len(sen_zhang) <= short_sentence_len:
            continue
        elif sen_zhang == "。":
            continue
        else:
            content_list.extend([i for i in str(sen_zhang).split("。") if i != ""])
    # split into chapters
    content_list_zong = split_chapter(content_list)

    similar_content_control_zong = []
    paper_dict_zong = []
    num_words_zong = []
    chapter_data = []
    index_content_list = []

    for index_content_list_zong in range(len(content_list_zong)):
        dan_chapter_data, dan_chapter_num_words = content_list_zong[index_content_list_zong][0], \
            content_list_zong[index_content_list_zong][1]

        similar_content_control = chapter_check(dan_chapter_data, recall_data_list)
        similar_content_control_zong.append(similar_content_control)
        num_words_zong.append(dan_chapter_num_words)
        chapter_data.append(dan_chapter_data)
        index_content_list.append(index_content_list_zong)

    paper_data = check_dict(similar_content_control_zong, num_words_zong, chapter_data, index_content_list)
    return paper_data
def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
    '''
    Indices to mark red, e.g. [[0,1,2],[3,4,5]]
    :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
    :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
    '''

    # print("bool_check_sentense", bool_check_sentense)
    biao_red = []
    i = 0
    start = -1
    end = -1
    tiaochu = False
    while True:
        # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
        #         + 1 >= len(df_train_nuoche):
        #     break

        if i >= len(bool_check_sentense):
            break

        for j in bool_check_sentense[i]:
            # print("j", j)
            if j[0] + 1 > len(data_zong):
                tiaochu = True
                break

        # if bool_check_sentense[i][0][0] + 1 >= len(data_zong):
        #     if bool_check_sentense[]
        #     bool_check_sentense[i][0][0] + 1 = bool_check_sentense[i + 1][0][0]
        #     break

        for j in bool_check_sentense[i]:
            if j[1] + 1 >= len(df_train_nuoche):
                tiaochu = True
                break

        if tiaochu == True:
            break

        # elif bool_check_sentense[i-1][0][0] == start:
        #     biao_red_dan = []
        #     for j in range(len(bool_check_sentense[i-1])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
        #         biao_red_dan.append([[bool_check_sentense[i-1][j][0], bool_check_sentense[i-1][j][0] + 1, bool_check_sentense[i-1][j][0] + 2],
        #                              [bool_check_sentense[i-1][j][1] - 1, bool_check_sentense[i-1][j][1], bool_check_sentense[i+1][j][1] + 1]])
        #     biao_red.append(biao_red_dan)
        #
        # elif bool_check_sentense[i+1][0][0] == end:
        #     biao_red_dan = []
        #     for j in range(len(bool_check_sentense[i+1])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
        #         biao_red_dan.append([[bool_check_sentense[i+1][j][0]-2, bool_check_sentense[i+1][j][0]-1, bool_check_sentense[i+1][j][0]],
        #                              [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]])
        #     biao_red.append(biao_red_dan)

        elif i == len(bool_check_sentense) - 1:
            if end == bool_check_sentense[i][0][0]:
                i += 1
                break
            elif bool_check_sentense[i][0][0] - 1 == end + 1 and bool_check_sentense[i][0][0] == len(data_zong) - 1:
                index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)]
            elif bool_check_sentense[i][0][0] - 1 == end and bool_check_sentense[i][0][0] == len(data_zong) - 1:
                index_list = [ii for ii in range(bool_check_sentense[i][0][0], bool_check_sentense[i][0][0] + 1)]
            elif bool_check_sentense[i][0][0] - 1 > end + 1 and bool_check_sentense[i][0][0] == len(data_zong) - 1:
                index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)]
            else:
                index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 2)]

            biaohongset = set()
            biao_red_dan = []
            for j in range(len(bool_check_sentense[
                                   i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
                if bool_check_sentense[i][j][1] not in biaohongset:
                    biao_red_dan.append([index_list,
                                         [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1],
                                          bool_check_sentense[i][j][1] + 1]])
                    biaohongset.add(bool_check_sentense[i][j][1] - 1)
                    biaohongset.add(bool_check_sentense[i][j][1])
                    biaohongset.add(bool_check_sentense[i][j][1] + 1)
                else:
                    continue

            i += 1
            biao_red.append(biao_red_dan)
            break

        elif bool_check_sentense[i][0][0] - 1 == start:
            i += 1
            continue
        elif bool_check_sentense[i][0][0] == end:
            i += 1
            continue
        elif bool_check_sentense[i][0][0] - 1 == end:
            i += 1
            continue
        else:
            biaohongset = set()
            biao_red_dan = []
            for j in range(len(bool_check_sentense[
                                   i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
                if bool_check_sentense[i][j][1] not in biaohongset:
                    biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0],
                                          bool_check_sentense[i][j][0] + 1],
                                         [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1],
                                          bool_check_sentense[i][j][1] + 1]])
                    biaohongset.add(bool_check_sentense[i][j][1] - 1)
                    biaohongset.add(bool_check_sentense[i][j][1])
                    biaohongset.add(bool_check_sentense[i][j][1] + 1)
                else:
                    continue

            start = bool_check_sentense[i][0][0] - 1
            end = bool_check_sentense[i][0][0] + 1

            if bool_check_sentense[i - 1][0][0] == start:
                for j in range(len(bool_check_sentense[
                                       i - 1])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
                    if bool_check_sentense[i - 1][j][1] not in biaohongset:
                        biao_red_dan.append([[bool_check_sentense[i - 1][j][0], bool_check_sentense[i - 1][j][0] + 1,
                                              bool_check_sentense[i - 1][j][0] + 2],
                                             [bool_check_sentense[i - 1][j][1] - 1, bool_check_sentense[i - 1][j][1],
                                              bool_check_sentense[i - 1][j][1] + 1]])
                        biaohongset.add(bool_check_sentense[i - 1][j][1] - 1)
                        biaohongset.add(bool_check_sentense[i - 1][j][1])
                        biaohongset.add(bool_check_sentense[i - 1][j][1] + 1)
                    else:
                        continue

            if bool_check_sentense[i + 1][0][0] == end:
                for j in range(len(bool_check_sentense[
                                       i + 1])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
                    if bool_check_sentense[i + 1][j][1] not in biaohongset:
                        biao_red_dan.append([[bool_check_sentense[i + 1][j][0] - 2,
                                              bool_check_sentense[i + 1][j][0] - 1, bool_check_sentense[i + 1][j][0]],
                                             [bool_check_sentense[i + 1][j][1] - 1, bool_check_sentense[i + 1][j][1],
                                              bool_check_sentense[i + 1][j][1] + 1]])
                        biaohongset.add(bool_check_sentense[i + 1][j][1] - 1)
                        biaohongset.add(bool_check_sentense[i + 1][j][1])
                        biaohongset.add(bool_check_sentense[i + 1][j][1] + 1)
                    else:
                        continue

            i += 1
            biao_red.append(biao_red_dan)

    return biao_red  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
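# Design note: biaohong() expands every matched sentence pair into a 3-sentence window
# (i-1, i, i+1 on both the source and candidate side, with clamping near chapter
# boundaries in the last-element branch), skips sentences already covered by the
# previous window via the start/end trackers, and uses `biaohongset` to avoid emitting
# the same candidate window twice. check_red() later joins each window into one text
# chunk and ships it to the BERT service for character-level marking.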
def dialog_line_parse(url, text):
    """
    Send data to the model service and return its result
    :param url: model url
    :param text: payload for the model
    :return: model response
    """

    response = requests.post(
        url,
        json=text,
        timeout=100000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return {}
def is_english_char(char):
    code = ord(char)
    return 32 <= code <= 126
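# Example (illustrative): is_english_char("a") and is_english_char(" ") are True
# (printable ASCII, code points 32-126); is_english_char("中") is False.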
def original_text_marked_red(text_original, bert_text, start, end):
    '''
    Locate the red-marked span within the original text
    :param text_original:
    :param bert_text:
    :param start:
    :param end:
    :return:
    '''
    try:
        fuhao = ["\n"]
        up_pointer = 0
        down_pointer = 0

        pointer_list = []

        bert_text_list = list(bert_text)
        bert_text_list.insert(start, "<red>")
        bert_text_list.insert(end + 2, "</red>")

        text_original_list = list(text_original)

        up = 0
        down = 0

        while True:
            if up == len(text_original_list):
                break

            if text_original_list[up] == bert_text_list[down]:
                up += 1
                down += 1

            else:
                if bert_text_list[down] == "<red>":
                    down += 1
                elif bert_text_list[down] == "</red>":
                    down += 1
                else:
                    bert_text_list.insert(down, text_original_list[up])
                    up += 1
                    down += 1

        bert_text = "".join(bert_text_list)
        return True, bert_text
    except:
        print("句子标红报错")
        print(text_original, bert_text)
        return False, ""
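# Design note: original_text_marked_red() walks text_original and the tagged bert_text
# with two pointers: matching characters advance both, the "<red>"/"</red>" list tokens
# advance only the tagged side, and any character missing from the tagged side is
# re-inserted, so the returned string is the full original text with the red span tagged
# in place.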
def biaohong_bert_predict(sentence_0_list, sentence_1_list):
    '''
    Find the characters to mark red
    :param sentence_0_list:
    :param sentence_1_list:
    :return:
    '''

    # "resilt" is the (misspelled) key used by the remote service's response
    paper_dict = \
        dialog_line_parse("http://192.168.31.74:16003/",
                          {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[
            "resilt"]

    return paper_dict
def ulit_text(title, text):
    data = []
    try:
        text = json.loads(text)["content"]
    except:
        pass

    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
    text_list = text.split("\n")

    for i in text_list:
        data.append([i, title])
    return data
def run_query(conn, sql, params):
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        result = cursor.fetchall()
    return result
# def processing_one_text(paper_id):
#     conn = pymysql.connect(
#         host='192.168.31.145',
#         port=3306,
#         user='root',
#         password='123456',
#         db='zhiwang_db',
#         charset='utf8mb4',
#         cursorclass=pymysql.cursors.DictCursor
#     )
#
#     sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
#     params = (paper_id,)
#
#     result = run_query(conn, sql, params)
#
#     conn.close()
#     print(result[0]['title'], result[0]['author'])
#     title = result[0]['title']
#     author = result[0]['author']
#     degree = result[0]['degree']
#     year = result[0]['content'].split("/")[5]
#     content_path = result[0]['content']
#
#     try:
#         with open(content_path, encoding="utf-8") as f:
#             text = f.read()
#     except:
#         with open(content_path, encoding="gbk") as f:
#             text = f.read()
#
#     paper_info = {
#         "title": title,
#         "author": author,
#         "degree": degree,
#         "year": year,
#         "paper_len_word": len(text)
#     }
#     data = ulit_text(paper_info, text)
#     return data


from clickhouse_driver import Client
class PureClient:
    def __init__(self, database='mini_check'):
        # only the local service address is needed here
        self.client = Client(host="192.168.31.74", port=9000, user='default',
                             password='zhicheng123*', database=database)

    def run(self, sql):
        client = self.client
        collection = client.query_dataframe(sql)
        return collection
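# Example (illustrative): PureClient().run("SELECT * FROM user_table LIMIT 1") returns a
# pandas DataFrame via clickhouse_driver's query_dataframe(); `database` defaults to
# 'mini_check' and the connection targets 192.168.31.74:9000.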
def processing_one_text(user_uuid):
    pureclient = PureClient()
    print("paper_id", user_uuid)
    # NOTE: user_uuid is interpolated directly into the SQL string; this assumes
    # trusted input (no parameter binding is done here)
    sql = f"SELECT * FROM user_table WHERE user_uuid='{user_uuid}'"
    result = pureclient.run(sql)
    return result
def ulit_recall_paper(uuid_user):
    '''
    Read and parse the recalled papers for this user
    :param uuid_user:
    :return data: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    '''

    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data
    res = processing_one_text(uuid_user)
    res_list = res.values.tolist()

    data = []
    for res_dan in res_list:
        user_uuid = res_dan[0]
        file_path = res_dan[1]
        is_delete = res_dan[2]
        if is_delete == 1:
            try:
                with open(file_path, encoding="gbk") as f:
                    text = f.read()
            except:
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

            content_list_old = text.split("\n")
            sentence_word_nums = 0

            # preprocessing: filter sentences
            # TODO: use a new method to strip useless leading/trailing content
            content_list = []
            for sen_zhang in content_list_old:
                if len(sen_zhang) <= short_sentence_len:
                    continue
                elif sen_zhang == "。":
                    continue
                else:
                    content_list.extend([i for i in str(sen_zhang).split("。") if i != ""])

            filename = file_path.split("/")[-1].split(".")[0]
            for sentence in content_list:
                if sentence != "":
                    data.append([sentence.strip("\n"), filename])

    return data
def recall_10(queue_uuid, title, abst_zh, content):
    '''
    Yupeng's recall interface
    :param queue_uuid:
    :param title:
    :param abst_zh:
    :param content:
    :return:
    '''

    request_json = {
        "uuid": queue_uuid,
        "title": title,
        "abst_zh": abst_zh,
        "content": content
    }
    print(request_json)
    dialog_line_parse("http://192.168.31.145:50004/check1", request_json)
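# Note: in the current flow recall_10() is only invoked from the commented-out
# classify_recall() path; classify_accurate_check() instead pulls recalled papers
# straight from ClickHouse via ulit_recall_paper().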
def uilt_content(content):
    zhaiyao_list = ["摘要"]
    zhaiyao_en_list = ["Abstract", "abstract"]
    mulu_list = ["目录"]
    key_word_list = ["关键词"]
    caikanwenxian = ["参考文献"]
    key_word_bool = False
    key_word_str = ""
    zhaiyao_bool = False
    zhaiyao_en_bool = False
    zhaiyao_str = ""
    zhaiyao_en_str = ""
    mulu_str = ""
    zhaiyao_text = ""
    mulu_bool = False

    pantten_zhaiyao = r'(摘\s*要)'
    result_biaoti_list = re.findall(pantten_zhaiyao, content)
    if len(result_biaoti_list) != 0:
        zhaiyao_str = result_biaoti_list[0]
        zhaiyao_bool = True
    else:
        zhaiyao_bool = False

    for i in zhaiyao_en_list:
        if i in content:
            zhaiyao_en_bool = True
            zhaiyao_en_str = i
            break

    for i in mulu_list:
        if i in content:
            mulu_str = i
            mulu_bool = True
            break

    for i in key_word_list:
        if i in content:
            key_word_str = i
            key_word_bool = True
            break

    if zhaiyao_bool == True and key_word_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    elif zhaiyao_bool == True and zhaiyao_en_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    elif zhaiyao_bool == True and mulu_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    if zhaiyao_text == "":
        content = str(content).replace("。\n", "。")
        content_list = content.split("。")
        zhaiyao_text = "".join(content_list[:15])
    return zhaiyao_text
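# Note: uilt_content() slices the abstract out of the full text by locating the span
# between "摘要" and the first present anchor, tried in order: "关键词", an English
# "Abstract"/"abstract" marker, then "目录"; if none of those anchors match, it falls
# back to the first 15 "。"-delimited sentences.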
def ulit_request_file(file):
    content = ""  # default so non-txt uploads do not raise NameError
    file_name = file.filename
    if file_name.split(".")[-1] == "txt":
        file_name_save = "data/request/{}".format(file_name)
        file.save(file_name_save)
        try:
            with open(file_name_save, encoding="gbk") as f:
                content = f.read()
        except:
            with open(file_name_save, encoding="utf-8") as f:
                content = f.read()

    # content = " ".join([i for i in content.split("\n") if i != ""])
    # abst_zh = uilt_content(content)
    abst_zh = ""
    return abst_zh, content
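# Note: ulit_request_file() currently returns ("", full_text) because abstract
# extraction via uilt_content() is commented out, so downstream code receives an
# empty abst_zh.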
# @app.route("/", methods=["POST"])
# def handle_query():
#     print(request.remote_addr)
#
#     # request.form.get('prompt')
#     dataBases = request.form.get("dataBases")
#     minSimilarity = request.form.get("minSimilarity")  # txt
#     minWords = request.form.get("minWords")
#     title = request.form.get("title")
#     author = request.form.get("author")  # txt
#     file = request.files.get('file')
#     token = request.form.get("token")
#     account = request.form.get("account")
#     goodsId = request.form.get("goodsId")
#     callbackUrl = request.form.get("callbackUrl")
#
#
#     t0 = time.time()
#     abst_zh, content = ulit_request_file(file)
#
#     # call Yupeng's service to fetch the similar papers
#     # recall_data_list_dict = recall_10(title, abst_zh, content)
#
#     t1 = time.time()
#     print("查找相似的50篇完成")
#     with open("data/rell_json.txt") as f:
#         recall_data_list_dict = eval(f.read())
#
#     # read the articles and convert them into the expected format
#     recall_data_list = ulit_recall_paper(recall_data_list_dict)
#     print("文章格式转化完成")
#
#     # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
#
#     # enter the precise duplicate-check stage
#     print("进入精确查重系统")
#     return_list = accurate_check_rouge(title, author, content, recall_data_list)
#
#     print("召回50篇", t1 - t0)
#
#     return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
#     return jsonify(return_text)  # return the result


# def classify_recall():  # invoke the model; cap the batch size
#     while True:
#         if redis_.llen(db_key_query) == 0:  # keep polling while the queue is empty
#             time.sleep(3)
#             continue
#         query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query payload
#         data_dict_path = json.loads(query)
#         path = data_dict_path['path']
#         # text_type = data_dict["text_type"]
#
#         with open(path, encoding='utf8') as f1:
#             # load the request object from file
#             data_dict = json.load(f1)
#
#         queue_uuid = data_dict['id']
#         print(queue_uuid)
#         dataBases = data_dict['dataBases']
#         minSimilarity = data_dict['minSimilarity']
#         minWords = data_dict['minWords']
#         title = data_dict['title']
#         author = data_dict['author']
#         abst_zh = data_dict['abst_zh']
#         content = data_dict['content']
#         token = data_dict['token']
#         account = data_dict['account']
#         goodsId = data_dict['goodsId']
#         callbackUrl = data_dict['callbackUrl']
#
#         # call Yupeng's service to fetch the similar papers
#         recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
#
#         # print("查找相似的50篇完成")
#         # with open("data/rell_json.txt") as f:
#         #     recall_data_list_dict = eval(f.read())
#
#         # read the articles and convert them into the expected format
def classify_accurate_check():
    while True:
        if redis_.llen(db_key_query_recall) == 0:  # keep polling while the queue is empty
            time.sleep(3)
            continue

        query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8')  # fetch the query payload
        query_recall_dict = json.loads(
            query_recall)  # db_key_query_recall json.dumps({"id": id_, "path": load_request_path})
        query_recall_uuid = query_recall_dict["id"]
        data_dict_path = query_recall_dict["path"]
        print(data_dict_path)

        # d = {
        #     "uuid_user": uuid_user,
        #     "content": content
        # }
        with open(data_dict_path, encoding='utf8') as f:
            data_dict = json.loads(f.read())

        uuid_user = data_dict['uuid_user']
        abstract = data_dict['content'][0]
        content = data_dict['content'][1]
        # try:
        recall_data_list = ulit_recall_paper(uuid_user)

        print("查找相似的50篇完成")

        # with open("data/rell_json.txt") as f:
        #     recall_data_list_dict = eval(f.read())
        # recall_data_list = ulit_recall_paper(recall_data_list_dict)

        print("文章格式转化完成")

        # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()

        # enter the precise duplicate-check stage
        print("进入精确查重系统")

        return_list = accurate_check_rouge(content, recall_data_list)

        return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}

        load_result_path = "./new_data_logs/{}.json".format(query_recall_uuid)

        print("queue_uuid: ", query_recall_uuid)
        print("load_result_path: ", load_result_path)

        with open(load_result_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False is needed to write Chinese characters instead of Unicode escapes
            # indent=4 pretty-prints the JSON
            json.dump(return_text, f2, ensure_ascii=False, indent=4)

        print(query_recall_uuid)
        print(load_result_path)
        redis_.set(query_recall_uuid, load_result_path, 86400)
        redis_.srem(db_key_querying, query_recall_uuid)
        # except:
        #     return_text = {"resilt": "", "probabilities": None, "status_code": 401}
        #     load_result_path = "./new_data_logs/{}.json".format(queue_uuid)
        #
        #     print("queue_uuid: ", queue_uuid)
        #     print("load_result_path: ", load_result_path)
        #
        #     with open(load_result_path, 'w', encoding='utf8') as f2:
        #         # ensure_ascii=False is needed to write Chinese characters instead of Unicode escapes
        #         # indent=4 pretty-prints the JSON
        #         json.dump(return_text, f2, ensure_ascii=False, indent=4)
        #
        #     print(queue_uuid)
        #     print(load_result_path)
        #     redis_.set(queue_uuid, load_result_path, 86400)
        #     redis_.srem(db_key_querying, queue_uuid)
@app.route("/", methods=["POST"])
def handle_query():
    try:
        print(request.remote_addr)

        uuid_user = request.form.get("uuid")
        file = request.files.get('file')

        # ulit_request_file returns (abst_zh, text); the tuple is stored whole
        # and unpacked by the worker as content[0] / content[1]
        content = ulit_request_file(file)

        id_ = str(uuid.uuid1())  # generate a unique id for this query
        # bind the text to the query id
        # recall_10(id_, title, abst_zh, content)
        d = {
            "uuid_user": uuid_user,
            "content": content
        }
        load_request_path = './request_data_logs/{}.json'.format(id_)
        with open(load_request_path, 'w', encoding='utf8') as f2:
            json.dump(d, f2, ensure_ascii=False, indent=4)

        redis_.rpush(db_key_query_recall, json.dumps({"id": id_, "path": load_request_path}))  # enqueue in redis
        return_text = {
            'code': 0,
            'msg': "请求成功",
            'data': {
                'balances': "",
                'orderId': id_,
                'consumeNum': ""
            }
        }

        print("ok")
    except:
        return_text = {'code': 1}
    return jsonify(return_text)  # return the result
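# Example request (illustrative): POST a multipart form with fields `uuid` and `file`
# (a .txt document) to http://<host>:20000/; the response carries the task id in
# data.orderId, which can later be used to look up the result-file path in redis.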
t1 = Thread(target=classify_accurate_check)
t1.start()

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=20000, threaded=True)