diff --git a/flask_check_bert.py b/flask_check_bert.py
index 267dd7d..30b398a 100644
--- a/flask_check_bert.py
+++ b/flask_check_bert.py
@@ -4,7 +4,7 @@ from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
-from Rouge_w import Rouge_w,Rouge_l
+from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql
import re
@@ -15,6 +15,8 @@ import uuid
import time
import redis
from threading import Thread
+from multiprocessing import Pool
+
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False
@@ -25,7 +27,7 @@ db_key_query = 'query'
db_key_querying = 'querying'
db_key_queryset = 'queryset'
-nums_cpus = 16
+nums_cpus = 24
rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()
@@ -65,7 +67,6 @@ def bert_check(text, recall_data_list):
return return_list
-
def rouge_value_self(data_1, data_2):
data_1 = [' '.join(i) for i in data_1]
data_2 = [' '.join(i) for i in data_2]
@@ -81,7 +82,6 @@ def rouge_value_self(data_1, data_2):
def rouge_pre(text, df_train_nuoche):
-
return_list = []
index_rouge_list = []
text_list = [text] * len(df_train_nuoche)
@@ -100,12 +100,307 @@ def rouge_pre(text, df_train_nuoche):
return return_list
+def rouge_pre_m(text, df_train_nuoche):
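+    '''
+    Pool-friendly variant of rouge_pre: scores text against every recalled
+    sentence and returns (candidate_index, ROUGE-L) pairs, highest score first.
+    '''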
+ return_list = []
+ index_rouge_list = []
+
+ text_list = [text] * len(df_train_nuoche)
+
+ data_list = []
+ for data_dan in df_train_nuoche:
+ data_list.append(data_dan[0])
+ rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+ index_rouge_list.extend(rouge_l)
+
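+    # Rank every candidate index by its ROUGE-L score, highest first, producing
+    # (candidate_index, score) pairs for the downstream 0.47 threshold filter.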
+ re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
+
+ return_list.extend(re1)
+
+ return return_list
+
+
+# Example built around a single section
+def similar_content_func():
+    '''
+    Template entry for duplicated (similar) source content.
+    :return: list with a single placeholder dict
+    '''
+ return [{
+ "content": "重复的内容标红",
+ "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
+ "title": "标题",
+ "year": "日期",
+ "degree": "来源",
+ "author": "作者"
+ }]
+
+
+def original_text_contrast_func(data_sentence_dan, paper_dict):
+    '''
+    Detailed comparison info for one duplicated sentence group.
+    :param data_sentence_dan: matches that share the same original sentence
+    :param paper_dict: red-marking results from the BERT service
+    :return: dict with the marked original text, repeated word count and similar sources
+    '''
+
+ original_text = ""
+ start = len(data_sentence_dan[0][1])
+ end = 0
+ similar_content = []
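+    # start/end converge on the outermost highlighted span of the original
+    # sentence: start is initialised past the end and shrinks, end grows from 0.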
+    for i in data_sentence_dan:  # there may be several matches; for now assume one
+
+ similar_content_dan = {
+ "paper_red_len_word": "",
+ "content": "重复的内容标红",
+ "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
+ "title": "标题",
+ "year": "日期",
+ "degree": "来源",
+ "author": "作者",
+ "paper_len_word": ""
+ }
+
+ sentence_0_bool, sentence_0_dan_red = original_text_marked_red(i[1], paper_dict[i[0]][0],
+ paper_dict[i[0]][
+ 1]) # text_original, bert_text, bert_text_pre
+
+ sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2],
+ paper_dict[i[0]][
+ 3]) # text_original, bert_text, bert_text_pre
+
+ start_dan = sentence_0_dan_red.index("")
+ end_dan = sentence_0_dan_red.index("") - len("")
+
+ if start_dan < start:
+ start = start_dan
+ if end_dan > end:
+ end = end_dan
+
+        if not sentence_0_bool or not sentence_1_bool:
+ continue
+
+ similar_content_dan["content"] = sentence_1_dan_red
+ similar_content_dan["title"] = i[3]["title"]
+ similar_content_dan["author"] = i[3]["author"]
+ similar_content_dan["degree"] = i[3]["degree"]
+ similar_content_dan["year"] = i[3]["year"]
+ similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
+ similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+
+ thesis_info = " ".join(
+ [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
+ similar_content_dan["year"]])
+ similar_content_dan["thesis_info"] = thesis_info
+
+ similar_content.append(similar_content_dan)
+
+ original_text_list = list(data_sentence_dan[0][1])
+ original_text_list.insert(end, "")
+ original_text_list.insert(start, "")
+ original_text = "".join(original_text_list)
+
+ return_info = {
+ "original_text": original_text,
+ "dan_sentence_word_nums": end - start,
+ "similar_content": similar_content
+ }
+ return return_info
+
+
+def repeat_quote_info_func(original_text_contrast):
+    '''
+    Aggregate repetition info per cited source document.
+    :param original_text_contrast: contrast records produced per sentence group
+    :return: list of per-source stats, sorted ascending by repeat rate
+    '''
+ chongfuwendang = {}
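+    # Keyed by thesis_info; each value aggregates repeat statistics for one source paper.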
+
+ for sentence_dan in original_text_contrast:
+ for i in sentence_dan["similar_content"]:
+ thesis_info = i["thesis_info"]
+ if thesis_info not in chongfuwendang:
+ chongfuwendang[thesis_info] = {
+ "quote": False,
+ "thesis_author": i["author"],
+ "thesis_date": i["year"],
+ "thesis_info": thesis_info,
+ "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+ # round(repetition_rate, 3) * 100
+ "thesis_title": i["title"],
+ "thesis_link": "",
+ "thesis_publish": i["degree"],
+ "thesis_repeat_word": i["paper_red_len_word"],
+ "thesis_teacher": "",
+ "paper_len_word": i["paper_len_word"]
+ }
+ else:
+ chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
+ chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
+ chongfuwendang[thesis_info][
+ "paper_len_word"]) * 100
+ chongfuwendang = sorted(chongfuwendang.items(),
+ key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+ chongfuwendang_list = [i[1] for i in chongfuwendang]
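+    # (ascending order: the most-repeated source comes last)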
+
+ return chongfuwendang_list
+
+
+def total_data_func(section_data_list):
+    '''
+    Document-level totals aggregated over all section_details dicts.
+    :return: dict of overall statistics
+    '''
+ # "end_page_index": 0,
+ # "name": "第1部分",
+ # "repeat_rate": repeat_rate,
+ # "repeat_words": repeat_words,
+ # "start_page_index": 0,
+ # "words": section_words,
+ # "original_text": original_text,
+ # "original_text_oneself": original_text,
+ # "original_text_contrast/重复的对比详细信息": original_text_contrast,
+ # "repeat_quote_info/重复的引用信息": repeat_quote_info
+
+ repeat_words = 0
+ words = 0
+
+ for i in section_data_list:
+ repeat_words += i["repeat_words"]
+ words += i["words"]
+
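+    # Personal- and quote-exclusion logic is not implemented yet, so the three
+    # percentage fields below all share the same overall ratio.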
+ exclude_personal_rate = str(repeat_words / words * 100) + "%"
+ exclude_quote_rate = str(repeat_words / words * 100) + "%"
+    # repeat_quote_info is sorted ascending, so the most-repeated source is the last entry
+    single_max_rate = section_data_list[0]["repeat_quote_info"][-1]["thesis_repeat_rate"]
+    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][-1]["thesis_repeat_word"]
+ total_repeat_rate = str(repeat_words / words * 100) + "%"
+ total_repeat_words = repeat_words
+ total_words = words
+
+ return {
+ "back_repeat_words": "",
+ "exclude_personal_rate": exclude_personal_rate,
+ "exclude_quote_rate": exclude_quote_rate,
+ "front_repeat_words": "",
+ "single_max_rate": single_max_rate,
+ "single_max_repeat_words": single_max_repeat_words,
+ "suspected_paragraph": "",
+ "suspected_paragraph_max_repeat_words": "",
+ "suspected_paragraph_min_repeat_words": "",
+ "total_paragraph": "",
+ "total_repeat_rate": total_repeat_rate,
+ "total_repeat_words": total_repeat_words,
+ "total_words": total_words,
+ "tables": 0
+ }
+
+
+def section_data_func_dan():
+    '''
+    Empty template for a single section-info entry.
+    :return: dict of blank section fields
+    '''
+    # {
+    #     "section_name": section name,
+    #     "section_repeat_rate": repeat rate,
+    #     "section_repeat_words": repeated word count,
+    #     "section_words": section word count,
+    #     "oneself_repeat_words": repeated words excluding the author's own papers,
+    #     "reference_repeat_words": repeated words excluding quotations,
+    #     "section_oneself_rate": repeat rate excluding the author's own papers
+    # }
+
+ return {
+ "section_name": "",
+ "section_repeat_rate": "",
+ "section_repeat_words": "",
+ "section_words": "",
+ "oneself_repeat_words": "",
+ "reference_repeat_words": "",
+ "section_oneself_rate": ""
+ }
+
+
+def section_data_func(section_details):
+    '''
+    Per-section summary derived from a section_details dict.
+    :return: section_data dict
+    '''
+ # "end_page_index": 0,
+ # "name": "第1部分",
+ # "repeat_rate": repeat_rate,
+ # "repeat_words": repeat_words,
+ # "start_page_index": 0,
+ # "words": section_words,
+ # "original_text": original_text,
+ # "original_text_oneself": original_text,
+ # "original_text_contrast/重复的对比详细信息": original_text_contrast,
+ # "repeat_quote_info/重复的引用信息": repeat_quote_info
+
+ section_name = section_details["name"]
+ section_repeat_rate = section_details["repeat_rate"]
+ section_repeat_words = section_details["repeat_words"]
+ section_words = section_details["words"]
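+    # Self- and quote-excluded figures are not computed separately yet, so they
+    # mirror the plain repeat values.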
+ oneself_repeat_words = section_details["repeat_words"]
+ reference_repeat_words = section_details["repeat_words"]
+ section_oneself_rate = section_details["repeat_rate"]
+
+ return {
+ "section_name": section_name,
+ "section_repeat_rate": section_repeat_rate,
+ "section_repeat_words": section_repeat_words,
+ "section_words": section_words,
+ "oneself_repeat_words": oneself_repeat_words,
+ "reference_repeat_words": reference_repeat_words,
+ "section_oneself_rate": section_oneself_rate
+ }
+
+
+def section_details_func(data_section_dan, paper_dict):
+    '''
+    Detailed info for one section.
+    :param data_section_dan: sentence groups belonging to this section
+    :param paper_dict: red-marking results from the BERT service
+    :return: section_details dict
+    '''
+ original_text_contrast = []
+ section_repeat_rate = ""
+ repeat_words = 0
+ section_words = 0
+ oneself_repeat_words = ""
+ reference_repeat_words = ""
+ section_oneself_rate = ""
+ original_text_list = []
+
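+    # For each duplicated sentence group: build its contrast record, accumulate
+    # repeated/total word counts, and collect the red-marked original text.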
+ for sentence_dan in data_section_dan:
+ original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
+ original_text_contrast.append(original_text_contrast_dan)
+ repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
+ original_text_list.append(original_text_contrast_dan["original_text"])
+ section_words += len(sentence_dan[0][1])
+
+ original_text = "。".join(original_text_list)
+ repeat_rate = repeat_words / section_words
+
+ repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+
+ return {
+ "end_page_index": 0,
+ "name": "第1部分",
+ "repeat_rate": repeat_rate,
+ "repeat_words": repeat_words,
+ "start_page_index": 0,
+ "words": section_words,
+ "original_text": original_text,
+ "original_text_oneself": original_text,
+ "original_text_contrast": original_text_contrast,
+ "repeat_quote_info": repeat_quote_info
+ }
+
+
def accurate_check_rouge(
title,
author,
text_paper,
recall_data_list
- ):
+):
'''
Exact duplicate check: find similar sentences.
:param text:
@@ -120,9 +415,23 @@ def accurate_check_rouge(
sentence_word_nums = 0
# ROUGE-based duplicate check
- for text in centent_list:
- rouge_pre_list = rouge_pre(text, recall_data_list)
- data_zong.append(rouge_pre_list)
+ rst = []
+    p = Pool(nums_cpus)  # process pool with nums_cpus worker subprocesses
+
+ print("centent_list", centent_list)
+
+ for i in range(len(centent_list)):
+ text = centent_list[i]
+ a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
+ rst.append(a)
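+    # rst collects AsyncResult handles; the real lists are fetched with .get() after join().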
+ p.close()
+    p.join()  # wait for all workers to finish; close() must be called before join(), and no new tasks can be added after close()
+
+ rst = [i.get() for i in rst]
+
+ for i in range(len(rst)):
+ print(rst[i])
+ data_zong.append(rst[i])
t0 = time.time()
# BERT-based duplicate check
@@ -132,7 +441,6 @@ def accurate_check_rouge(
t1 = time.time()
original_dict = []
-
# collect the indices of similar sentences
bool_check_sentense = []
# BERT algorithm
@@ -142,12 +450,14 @@ def accurate_check_rouge(
# ROUGE algorithm
for i in range(len(data_zong)):
- if data_zong[i][0] > 0.47:
- bool_check_sentense.append([i,data_zong[i][1]])
- biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
-
- print("bert精确查重时间", t1-t0)
+ for j in range(len(data_zong[i])):
+ if data_zong[i][j][1] > 0.47:
+ bool_check_sentense.append([i, data_zong[i][j][0]])
+ biao_red = biaohong(bool_check_sentense, data_zong,
+ recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+ print("bert精确查重时间", t1 - t0)
+ print(biao_red)
sentence_0_list = []
sentence_1_list = []
@@ -156,7 +466,8 @@ def accurate_check_rouge(
for i in biao_red:
if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
- sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+ sentence_1_list.append(
+ "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
sim_paper_name.append(recall_data_list[i[1][0]][1])
else:
continue
@@ -164,7 +475,6 @@ def accurate_check_rouge(
sentence_0_list_new = []
sentence_1_list_new = []
-
for i in zip(sentence_0_list, sentence_1_list):
if len(i[0]) + len(i[1]) < 1200:
sentence_0_list_new.append(i[0])
@@ -183,141 +493,59 @@ def accurate_check_rouge(
chongfuwendang = {}
- for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
-
- print([sentence_0_dan, sentence_1_dan])
- original_text_contrast_dict = {
- "original_text": "",
- "similar_content": [
- {
- "content": "",
- "thesis_info": "",
- "title": "",
- "year": "",
- "degree": "",
- "author": "",
- }
- ]
- }
- try:
- sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre
- except:
- print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
- continue
- # 9/0
- sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3]) # text_original, bert_text, bert_text_pre
+ print("paper_dict", paper_dict)
+ print("sentence_0_list_new", sentence_0_list_new)
+ print("sentence_1_list_new", sentence_1_list_new)
+ print("sim_paper_name", sim_paper_name)
+ similar_content_control = [[]]
- if sentence_0_bool == False or sentence_1_bool == False:
- continue
+ with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
+ json.dump(paper_dict, f, ensure_ascii=False)
- dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
- sentence_word_nums += dan_sentence_word_nums
-
- original_text.append(sentence_0_dan_red)
- original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
- dan_sentence_word_nums) + sentence_0_dan_red
-
- thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
- original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
- original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
- original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
- original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
- original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
- original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info
-
- original_text_contrast.append(original_text_contrast_dict)
-
- # for i in repeat_quote_info:
- # if
-
- if thesis_info not in chongfuwendang:
- chongfuwendang[thesis_info] = {
- "quote": False,
- "thesis_author": sim_paper_name_dan["author"],
- "thesis_date" : sim_paper_name_dan["year"],
- "thesis_info" : thesis_info,
- "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100, #round(repetition_rate, 3) * 100
- "thesis_title": sim_paper_name_dan["title"],
- "thesis_link": "",
- "thesis_publish": sim_paper_name_dan["degree"],
- "thesis_repeat_word": dan_sentence_word_nums,
- "thesis_teacher": "",
- "paper_len_word": sim_paper_name_dan["paper_len_word"]
- }
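+    # Group consecutive matches that share the same original sentence so that one
+    # sentence with several similar sources becomes a single contrast record.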
+ sentence_0_list_new_cursor = sentence_0_list_new[0]
+ for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
+ sentence_0_list_new,
+ sentence_1_list_new,
+ sim_paper_name):
+
+        if sentence_0_list_new_cursor != sentence_0_dan:
+            sentence_0_list_new_cursor = sentence_0_dan
+            similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
else:
- chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
- chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
+ similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
+ data = [similar_content_control]
- chongfuwendang = sorted(chongfuwendang.items(),
- key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+    # simulate multiple sections
+ section_details_list = []
+ for data_dan in data:
+ data_section_dan = data_dan
+        # detailed info for this section
+ section_details = section_details_func(data_section_dan, paper_dict)
+ section_details_list.append(section_details)
- for i in range(len(chongfuwendang)):
- repeat_paper_one_info_dict = chongfuwendang[i][1]
- repeat_paper_one_info_dict.pop("paper_len_word")
- repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
- repeat_quote_info.append(repeat_paper_one_info_dict)
+    # build the per-section summaries
- original_text = "。".join(original_text)
+    section_data_list = []
+    for section_details in section_details_list:
+        section_data = section_data_func(section_details)
+        section_data_list.append(section_data)
- repetition_rate = sentence_word_nums/len(text_paper)
- repetition_rate = round(repetition_rate, 3) * 100
+ total_data = total_data_func(section_details_list)
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
- return {
+ paper_data = {
"author": author,
"check_time": dt,
- "title": title,
"time_range": "1900-01-01至2023-08-08",
- "section_data": [
- {
- "oneself_repeat_words": sentence_word_nums,
- "reference_repeat_words": sentence_word_nums,
- "section_name": "第1部分",
- "section_oneself_rate": "{}%".format(repetition_rate),
- "section_repeat_rate": "{}%".format(repetition_rate),
- "section_repeat_words": sentence_word_nums,
- "section_words": len(text_paper)
- }
- ],
- "section_details": [
- {
- "end_page_index": 0,
- "name": "",
- "repeat_rate": "",
- "repeat_words": "",
- "words": "",
- "original_text": original_text,
- "original_text_oneself": original_text,
- "original_text_contrast": original_text_contrast,
- "repeat_quote_info": repeat_quote_info
- }
- ],
- "total_data": {
- "back_repeat_words": "",
- "exclude_personal_rate": "{}%".format(repetition_rate),
- "exclude_quote_rate": "{}%".format(repetition_rate),
- "foot_end_note": "0",
- "front_repeat_words": "",
- "single_max_rate": "",
- "single_max_repeat_words": "",
- "suspected_paragraph": "1",
- "suspected_paragraph_max_repeat_words": "",
- "suspected_paragraph_min_repeat_words": "",
- "tables": "0",
- "total_paragraph": "1",
- "total_repeat_rate": "{}%".format(repetition_rate),
- "total_repeat_words": sentence_word_nums,
- "total_words": len(text_paper)
- }
+ "title": title,
+ "total_data": total_data,
+ "section_data": section_data_list,
+ "section_details": section_details_list
}
-
-
-
+ return paper_data
def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
@@ -331,15 +559,16 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
start = -1
end = -1
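+    # Walk the matched sentence indices, merging each hit into a three-sentence
+    # window and skipping hits whose window would overlap the previous one.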
while True:
- if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+        if (i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong)
+                or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche)):
break
- elif bool_check_sentense[i][0]-1 == start:
+ elif bool_check_sentense[i][0] - 1 == start:
i += 1
continue
elif bool_check_sentense[i][0] == end:
i += 1
continue
- elif bool_check_sentense[i][0]-1 == end:
+ elif bool_check_sentense[i][0] - 1 == end:
i += 1
continue
else:
@@ -347,10 +576,10 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
- biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
- [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
- start = bool_check_sentense[i][0]-1
- end = bool_check_sentense[i][0]+1
+ biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+ [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+ start = bool_check_sentense[i][0] - 1
+ end = bool_check_sentense[i][0] + 1
i += 1
return biao_red
@@ -378,8 +607,8 @@ def dialog_line_parse(url, text):
# "".format(url, response.status_code, response.text)
# )
print("【{}】 Failed to get a proper response from remote "
- "server. Status Code: {}. Response: {}"
- "".format(url, response.status_code, response.text))
+ "server. Status Code: {}. Response: {}"
+ "".format(url, response.status_code, response.text))
print(text)
return {}
@@ -410,7 +639,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
while True:
if down_pointer >= len(bert_text_pre):
break
- elif down_pointer == len(bert_text_pre)-1:
+ elif down_pointer == len(bert_text_pre) - 1:
if bert_text[up_pointer] == bert_text_pre[down_pointer]:
pointer_list.append(up_pointer)
break
@@ -428,7 +657,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
up_pointer += 1
down_pointer += 1
else:
- if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+ if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
up_pointer += 1
down_pointer += 5
pointer_list.append(up_pointer)
@@ -441,12 +670,11 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
down_pointer = 0
pointer_list = []
-
start = pointer_list[0]
end = pointer_list[-1]
bert_text_list = list(bert_text)
bert_text_list.insert(start, "")
- bert_text_list.insert(end + 2 , "")
+ bert_text_list.insert(end + 2, "")
text_original_list = list(text_original)
@@ -482,30 +710,14 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list):
:return:
'''
- # sentence_0_list = []
- # sentence_1_list = []
- # sim_paper_name = []
- #
- # for i in biaohong_list:
- # sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]]))
- # sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]]))
-
- paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
-
- # paper_dict
- # print("原文:".format(i), paper_dict[i][0])
- # print("原文标红:".format(i), paper_dict[i][1])
- # print("相似:".format(i), paper_dict[i][2])
- # print("相似标红:".format(i), paper_dict[i][3])
-
- # original_text
- #
- #
- # for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list):
- # original_text_marked_red
+    paper_dict = dialog_line_parse(
+        "http://192.168.31.74:16003/",
+        {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list},
+    )["resilt"]
return paper_dict
+
def ulit_text(title, text):
data = []
try:
@@ -520,6 +732,7 @@ def ulit_text(title, text):
data.append([i, title])
return data
+
def run_query(conn, sql, params):
with conn.cursor() as cursor:
cursor.execute(sql, params)
@@ -587,9 +800,8 @@ def ulit_recall_paper(recall_data_list_dict):
# data.append([sentence, filename])
# return data
-
data = []
- for i in list(recall_data_list_dict.items())[:5]:
+ for i in list(recall_data_list_dict.items())[:10]:
data_one = processing_one_text(i[0])
data.extend(data_one)
@@ -652,18 +864,18 @@ def uilt_content(content):
key_word_bool = True
break
- if zhaiyao_bool== True and zhaiyao_en_bool == True:
- pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str)
+ if zhaiyao_bool == True and zhaiyao_en_bool == True:
+ pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
elif zhaiyao_bool == True and key_word_bool == True:
- pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str )
+ pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
elif zhaiyao_bool == True and mulu_bool == True:
- pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str)
+ pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
@@ -688,7 +900,6 @@ def ulit_request_file(file):
return abst_zh, content
-
# @app.route("/", methods=["POST"])
# def handle_query():
# print(request.remote_addr)
@@ -761,7 +972,6 @@ def classify(): # 调用模型,设置最大batch_size
goodsId = data_dict['goodsId']
callbackUrl = data_dict['callbackUrl']
-
# call Yupeng's recall service for the ten most similar papers
# recall_data_list_dict = recall_10(title, abst_zh, content)
@@ -805,10 +1015,10 @@ def handle_query():
# request.form.get('prompt')
dataBases = request.form.get("dataBases")
- minSimilarity = request.form.get("minSimilarity") # txt
+ minSimilarity = request.form.get("minSimilarity") # txt
minWords = request.form.get("minWords")
title = request.form.get("title")
- author = request.form.get("author") # txt
+ author = request.form.get("author") # txt
file = request.files.get('file')
token = request.form.get("token")
account = request.form.get("account")
@@ -833,7 +1043,7 @@ def handle_query():
'account': account,
'goodsId': goodsId,
'callbackUrl': callbackUrl
- }
+ }
# bind the text to the query id
print(d)
@@ -860,8 +1070,9 @@ def handle_query():
return_text = {'code': 1}
return jsonify(return_text)  # return the result
+
t = Thread(target=classify)
t.start()
if __name__ == "__main__":
- app.run(host="0.0.0.0", port=16001, threaded=True, debug=True, use_reloader=False)
\ No newline at end of file
+ app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py
index ac36ec1..30b398a 100644
--- a/flask_check_bert_test.py
+++ b/flask_check_bert_test.py
@@ -4,7 +4,7 @@ from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
-from Rouge_w import Rouge_w,Rouge_l
+from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql
import re
@@ -16,6 +16,7 @@ import time
import redis
from threading import Thread
from multiprocessing import Pool
+
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False
@@ -66,7 +67,6 @@ def bert_check(text, recall_data_list):
return return_list
-
def rouge_value_self(data_1, data_2):
data_1 = [' '.join(i) for i in data_1]
data_2 = [' '.join(i) for i in data_2]
@@ -82,7 +82,6 @@ def rouge_value_self(data_1, data_2):
def rouge_pre(text, df_train_nuoche):
-
return_list = []
index_rouge_list = []
text_list = [text] * len(df_train_nuoche)
@@ -102,7 +101,6 @@ def rouge_pre(text, df_train_nuoche):
def rouge_pre_m(text, df_train_nuoche):
-
return_list = []
index_rouge_list = []
@@ -144,12 +142,11 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
:return:
'''
-
original_text = ""
start = len(data_sentence_dan[0][1])
end = 0
similar_content = []
- for i in data_sentence_dan: #可能有很多个暂且确定是一个
+    for i in data_sentence_dan:  # there may be several matches; for now assume one
similar_content_dan = {
"paper_red_len_word": "",
@@ -216,7 +213,6 @@ def repeat_quote_info_func(original_text_contrast):
'''
chongfuwendang = {}
-
for sentence_dan in original_text_contrast:
for i in sentence_dan["similar_content"]:
thesis_info = i["thesis_info"]
@@ -238,7 +234,8 @@ def repeat_quote_info_func(original_text_contrast):
else:
chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
- chongfuwendang[thesis_info]["paper_len_word"]) * 100
+ chongfuwendang[thesis_info][
+ "paper_len_word"]) * 100
chongfuwendang = sorted(chongfuwendang.items(),
key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
chongfuwendang_list = [i[1] for i in chongfuwendang]
@@ -265,16 +262,15 @@ def total_data_func(section_data_list):
repeat_words = 0
words = 0
-
for i in section_data_list:
repeat_words += i["repeat_words"]
words += i["words"]
- exclude_personal_rate = str(repeat_words/words * 100) + "%"
- exclude_quote_rate = str(repeat_words/words * 100) + "%"
+ exclude_personal_rate = str(repeat_words / words * 100) + "%"
+ exclude_quote_rate = str(repeat_words / words * 100) + "%"
single_max_rate = section_data_list[0]["repeat_quote_info"][-1]["thesis_repeat_rate"]
single_max_repeat_words = section_data_list[0]["repeat_quote_info"][-1]["thesis_repeat_word"]
- total_repeat_rate = str(repeat_words/words * 100) + "%"
+ total_repeat_rate = str(repeat_words / words * 100) + "%"
total_repeat_words = repeat_words
total_words = words
@@ -321,6 +317,7 @@ def section_data_func_dan():
"section_oneself_rate": ""
}
+
def section_data_func(section_details):
'''
Per-section summary derived from a section_details dict.
@@ -380,13 +377,10 @@ def section_details_func(data_section_dan, paper_dict):
section_words += len(sentence_dan[0][1])
original_text = "。".join(original_text_list)
- repeat_rate = repeat_words/section_words
+ repeat_rate = repeat_words / section_words
repeat_quote_info = repeat_quote_info_func(original_text_contrast)
-
-
-
return {
"end_page_index": 0,
"name": "第1部分",
@@ -401,13 +395,12 @@ def section_details_func(data_section_dan, paper_dict):
}
-
def accurate_check_rouge(
title,
author,
text_paper,
recall_data_list
- ):
+):
'''
Exact duplicate check: find similar sentences.
:param text:
@@ -448,7 +441,6 @@ def accurate_check_rouge(
t1 = time.time()
original_dict = []
-
# collect the indices of similar sentences
bool_check_sentense = []
# BERT algorithm
@@ -460,13 +452,13 @@ def accurate_check_rouge(
for i in range(len(data_zong)):
for j in range(len(data_zong[i])):
if data_zong[i][j][1] > 0.47:
- bool_check_sentense.append([i,data_zong[i][j][0]])
- biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+ bool_check_sentense.append([i, data_zong[i][j][0]])
+ biao_red = biaohong(bool_check_sentense, data_zong,
+ recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
- print("bert精确查重时间", t1-t0)
+ print("bert精确查重时间", t1 - t0)
print(biao_red)
-
sentence_0_list = []
sentence_1_list = []
sim_paper_name = []
@@ -474,7 +466,8 @@ def accurate_check_rouge(
for i in biao_red:
if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
- sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+ sentence_1_list.append(
+ "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
sim_paper_name.append(recall_data_list[i[1][0]][1])
else:
continue
@@ -482,7 +475,6 @@ def accurate_check_rouge(
sentence_0_list_new = []
sentence_1_list_new = []
-
for i in zip(sentence_0_list, sentence_1_list):
if len(i[0]) + len(i[1]) < 1200:
sentence_0_list_new.append(i[0])
@@ -556,8 +548,6 @@ def accurate_check_rouge(
return paper_data
-
-
def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
'''
Indices of sentences to mark red, e.g. [[0,1,2],[3,4,5]]
@@ -569,15 +559,16 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
start = -1
end = -1
while True:
- if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+        if (i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong)
+                or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche)):
break
- elif bool_check_sentense[i][0]-1 == start:
+ elif bool_check_sentense[i][0] - 1 == start:
i += 1
continue
elif bool_check_sentense[i][0] == end:
i += 1
continue
- elif bool_check_sentense[i][0]-1 == end:
+ elif bool_check_sentense[i][0] - 1 == end:
i += 1
continue
else:
@@ -585,10 +576,10 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
- biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
- [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
- start = bool_check_sentense[i][0]-1
- end = bool_check_sentense[i][0]+1
+ biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+ [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+ start = bool_check_sentense[i][0] - 1
+ end = bool_check_sentense[i][0] + 1
i += 1
return biao_red
@@ -616,8 +607,8 @@ def dialog_line_parse(url, text):
# "".format(url, response.status_code, response.text)
# )
print("【{}】 Failed to get a proper response from remote "
- "server. Status Code: {}. Response: {}"
- "".format(url, response.status_code, response.text))
+ "server. Status Code: {}. Response: {}"
+ "".format(url, response.status_code, response.text))
print(text)
return {}
@@ -648,7 +639,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
while True:
if down_pointer >= len(bert_text_pre):
break
- elif down_pointer == len(bert_text_pre)-1:
+ elif down_pointer == len(bert_text_pre) - 1:
if bert_text[up_pointer] == bert_text_pre[down_pointer]:
pointer_list.append(up_pointer)
break
@@ -666,7 +657,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
up_pointer += 1
down_pointer += 1
else:
- if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+ if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
up_pointer += 1
down_pointer += 5
pointer_list.append(up_pointer)
@@ -679,12 +670,11 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
down_pointer = 0
pointer_list = []
-
start = pointer_list[0]
end = pointer_list[-1]
bert_text_list = list(bert_text)
bert_text_list.insert(start, "")
- bert_text_list.insert(end + 2 , "")
+ bert_text_list.insert(end + 2, "")
text_original_list = list(text_original)
@@ -720,10 +710,14 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list):
:return:
'''
- paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
+    paper_dict = dialog_line_parse(
+        "http://192.168.31.74:16003/",
+        {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list},
+    )["resilt"]
return paper_dict
+
def ulit_text(title, text):
data = []
try:
@@ -738,6 +732,7 @@ def ulit_text(title, text):
data.append([i, title])
return data
+
def run_query(conn, sql, params):
with conn.cursor() as cursor:
cursor.execute(sql, params)
@@ -805,7 +800,6 @@ def ulit_recall_paper(recall_data_list_dict):
# data.append([sentence, filename])
# return data
-
data = []
for i in list(recall_data_list_dict.items())[:10]:
data_one = processing_one_text(i[0])
@@ -870,18 +864,18 @@ def uilt_content(content):
key_word_bool = True
break
- if zhaiyao_bool== True and zhaiyao_en_bool == True:
- pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str)
+ if zhaiyao_bool == True and zhaiyao_en_bool == True:
+ pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
elif zhaiyao_bool == True and key_word_bool == True:
- pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str )
+ pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
elif zhaiyao_bool == True and mulu_bool == True:
- pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str)
+ pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
@@ -906,7 +900,6 @@ def ulit_request_file(file):
return abst_zh, content
-
# @app.route("/", methods=["POST"])
# def handle_query():
# print(request.remote_addr)
@@ -979,7 +972,6 @@ def classify(): # 调用模型,设置最大batch_size
goodsId = data_dict['goodsId']
callbackUrl = data_dict['callbackUrl']
-
# call Yupeng's recall service for the ten most similar papers
# recall_data_list_dict = recall_10(title, abst_zh, content)
@@ -1023,10 +1015,10 @@ def handle_query():
# request.form.get('prompt')
dataBases = request.form.get("dataBases")
- minSimilarity = request.form.get("minSimilarity") # txt
+ minSimilarity = request.form.get("minSimilarity") # txt
minWords = request.form.get("minWords")
title = request.form.get("title")
- author = request.form.get("author") # txt
+ author = request.form.get("author") # txt
file = request.files.get('file')
token = request.form.get("token")
account = request.form.get("account")
@@ -1051,7 +1043,7 @@ def handle_query():
'account': account,
'goodsId': goodsId,
'callbackUrl': callbackUrl
- }
+ }
# bind the text to the query id
print(d)
@@ -1078,8 +1070,9 @@ def handle_query():
return_text = {'code': 1}
return jsonify(return_text)  # return the result
+
t = Thread(target=classify)
t.start()
if __name__ == "__main__":
- app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file
+ app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)