@@ -4,7 +4,7 @@ from numpy.linalg import norm
 import pandas as pd
 # from rouge import Rouge
 from rouge_chinese import Rouge
-from Rouge_w import Rouge_w,Rouge_l
+from Rouge_w import Rouge_w, Rouge_l
 import json
 import pymysql
 import re

@@ -15,6 +15,8 @@ import uuid
 import time
 import redis
 from threading import Thread
+
+from multiprocessing import Pool
 
 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False

@@ -25,7 +27,7 @@ db_key_query = 'query'
 db_key_querying = 'querying'
 db_key_queryset = 'queryset'
 
-nums_cpus = 16
+nums_cpus = 24
 rouge = Rouge()
 rouge_model = Rouge_w()
 rouge_l_model = Rouge_l()

@@ -65,7 +67,6 @@ def bert_check(text, recall_data_list):
     return return_list
 
 
-
 def rouge_value_self(data_1, data_2):
     data_1 = [' '.join(i) for i in data_1]
     data_2 = [' '.join(i) for i in data_2]

@@ -81,7 +82,6 @@ def rouge_value_self(data_1, data_2):
 
 
-
 def rouge_pre(text, df_train_nuoche):
     return_list = []
     index_rouge_list = []
     text_list = [text] * len(df_train_nuoche)

@@ -100,12 +100,307 @@ def rouge_pre(text, df_train_nuoche):
     return return_list
 
 
+def rouge_pre_m(text, df_train_nuoche):
+    return_list = []
+    index_rouge_list = []
+    text_list = [text] * len(df_train_nuoche)
+
+    data_list = []
+    for data_dan in df_train_nuoche:
+        data_list.append(data_dan[0])
+    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+    index_rouge_list.extend(rouge_l)
+
+    # keep (candidate_index, rouge_l_score) pairs, best match first
+    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
+    return_list.extend(re1)
+
+    return return_list
+
+
+# Using a single section as an example
+def similar_content_func():
+    '''
+    Template for one duplicated-source entry
+    :return:
+    '''
+    return [{
+        "content": "重复的内容标红",
+        "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
+        "title": "标题",
+        "year": "日期",
+        "degree": "来源",
+        "author": "作者"
+    }]
+
+
+def original_text_contrast_func(data_sentence_dan, paper_dict):
+    '''
+    Detailed comparison info for one duplicated passage
+    :param similar_content:
+    :return:
+    '''
+    original_text = ""
+    start = len(data_sentence_dan[0][1])
+    end = 0
+    similar_content = []
+    for i in data_sentence_dan:  # there may be several sources; handled one by one
+        similar_content_dan = {
+            "paper_red_len_word": "",
+            "content": "重复的内容标红",
+            "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
+            "title": "标题",
+            "year": "日期",
+            "degree": "来源",
+            "author": "作者",
+            "paper_len_word": ""
+        }
+
+        sentence_0_bool, sentence_0_dan_red = original_text_marked_red(
+            i[1], paper_dict[i[0]][0], paper_dict[i[0]][1])  # text_original, bert_text, bert_text_pre
+        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(
+            i[2], paper_dict[i[0]][2], paper_dict[i[0]][3])  # text_original, bert_text, bert_text_pre
+
+        start_dan = sentence_0_dan_red.index("<red>")
+        end_dan = sentence_0_dan_red.index("</red>") - len("<red>")
+
+        if start_dan < start:
+            start = start_dan
+        if end_dan > end:
+            end = end_dan
+
+        if sentence_0_bool == False or sentence_1_bool == False:
+            continue
+
+        similar_content_dan["content"] = sentence_1_dan_red
+        similar_content_dan["title"] = i[3]["title"]
+        similar_content_dan["author"] = i[3]["author"]
+        similar_content_dan["degree"] = i[3]["degree"]
+        similar_content_dan["year"] = i[3]["year"]
+        similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
+        similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+
+        thesis_info = " ".join(
+            [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
+             similar_content_dan["year"]])
+        similar_content_dan["thesis_info"] = thesis_info
+
+        similar_content.append(similar_content_dan)
+
+    original_text_list = list(data_sentence_dan[0][1])
+    original_text_list.insert(end, "</red>")
+    original_text_list.insert(start, "<red>")
+    original_text = "".join(original_text_list)
+
+    return_info = {
+        "original_text": original_text,
+        "dan_sentence_word_nums": end - start,
+        "similar_content": similar_content
+    }
+    return return_info
+
+
+def repeat_quote_info_func(original_text_contrast):
+    '''
+    Aggregated info about the duplicated (quoted) source papers
+    :return:
+    '''
+    chongfuwendang = {}
+
+    for sentence_dan in original_text_contrast:
+        for i in sentence_dan["similar_content"]:
+            thesis_info = i["thesis_info"]
+            if thesis_info not in chongfuwendang:
+                chongfuwendang[thesis_info] = {
+                    "quote": False,
+                    "thesis_author": i["author"],
+                    "thesis_date": i["year"],
+                    "thesis_info": thesis_info,
+                    "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+                    # round(repetition_rate, 3) * 100
+                    "thesis_title": i["title"],
+                    "thesis_link": "",
+                    "thesis_publish": i["degree"],
+                    "thesis_repeat_word": i["paper_red_len_word"],
+                    "thesis_teacher": "",
+                    "paper_len_word": i["paper_len_word"]
+                }
+            else:
+                chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
+                chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
+                                                                     chongfuwendang[thesis_info]["paper_len_word"]) * 100
+
+    chongfuwendang = sorted(chongfuwendang.items(),
+                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+    chongfuwendang_list = [i[1] for i in chongfuwendang]
+
+    return chongfuwendang_list
+
+
+def total_data_func(section_data_list):
+    '''
+    Overall statistics across all sections
+    :return:
+    '''
+    # expected keys of each element:
+    # "end_page_index": 0,
+    # "name": "第1部分",
+    # "repeat_rate": repeat_rate,
+    # "repeat_words": repeat_words,
+    # "start_page_index": 0,
+    # "words": section_words,
+    # "original_text": original_text,
+    # "original_text_oneself": original_text,
+    # "original_text_contrast" (detailed duplicate comparison): original_text_contrast,
+    # "repeat_quote_info" (duplicated-source info): repeat_quote_info
+
+    repeat_words = 0
+    words = 0
+
+    for i in section_data_list:
+        repeat_words += i["repeat_words"]
+        words += i["words"]
+
+    exclude_personal_rate = str(repeat_words / words * 100) + "%"
+    exclude_quote_rate = str(repeat_words / words * 100) + "%"
+    single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
+    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
+    total_repeat_rate = str(repeat_words / words * 100) + "%"
+    total_repeat_words = repeat_words
+    total_words = words
+
+    return {
+        "back_repeat_words": "",
+        "exclude_personal_rate": exclude_personal_rate,
+        "exclude_quote_rate": exclude_quote_rate,
+        "front_repeat_words": "",
+        "single_max_rate": single_max_rate,
+        "single_max_repeat_words": single_max_repeat_words,
+        "suspected_paragraph": "",
+        "suspected_paragraph_max_repeat_words": "",
+        "suspected_paragraph_min_repeat_words": "",
+        "total_paragraph": "",
+        "total_repeat_rate": total_repeat_rate,
+        "total_repeat_words": total_repeat_words,
+        "total_words": total_words,
+        "tables": 0
+    }
+
+
+def section_data_func_dan():
+    '''
+    Single-section info (empty template)
+    :return:
+    '''
+    # {
+    #     "section_name": section name,
+    #     "section_repeat_rate": repeat rate,
+    #     "section_repeat_words": repeated word count,
+    #     "section_words": section word count,
+    #     "oneself_repeat_words": repeated words excluding the author's own papers,
+    #     "reference_repeat_words": repeated words excluding quotations,
+    #     "section_oneself_rate": repeat rate excluding the author's own papers
+    # }
+    return {
+        "section_name": "",
+        "section_repeat_rate": "",
+        "section_repeat_words": "",
+        "section_words": "",
+        "oneself_repeat_words": "",
+        "reference_repeat_words": "",
+        "section_oneself_rate": ""
+    }
+
+
+def section_data_func(section_details):
+    '''
+    Section info derived from the section details
+    :return:
+    '''
+    # input keys:
+    # "end_page_index": 0,
+    # "name": "第1部分",
+    # "repeat_rate": repeat_rate,
+    # "repeat_words": repeat_words,
+    # "start_page_index": 0,
+    # "words": section_words,
+    # "original_text": original_text,
+    # "original_text_oneself": original_text,
+    # "original_text_contrast" (detailed duplicate comparison): original_text_contrast,
+    # "repeat_quote_info" (duplicated-source info): repeat_quote_info
+
+    section_name = section_details["name"]
+    section_repeat_rate = section_details["repeat_rate"]
+    section_repeat_words = section_details["repeat_words"]
+    section_words = section_details["words"]
+    oneself_repeat_words = section_details["repeat_words"]
+    reference_repeat_words = section_details["repeat_words"]
+    section_oneself_rate = section_details["repeat_rate"]
+
+    return {
+        "section_name": section_name,
+        "section_repeat_rate": section_repeat_rate,
+        "section_repeat_words": section_repeat_words,
+        "section_words": section_words,
+        "oneself_repeat_words": oneself_repeat_words,
+        "reference_repeat_words": reference_repeat_words,
+        "section_oneself_rate": section_oneself_rate
+    }
+
+
+def section_details_func(data_section_dan, paper_dict):
+    '''
+    Detailed info for one section
+    :param original_text_contrast:
+    :param repeat_quote_info:
+    :return:
+    '''
+    original_text_contrast = []
+    section_repeat_rate = ""
+    repeat_words = 0
+    section_words = 0
+    oneself_repeat_words = ""
+    reference_repeat_words = ""
+    section_oneself_rate = ""
+    original_text_list = []
+
+    for sentence_dan in data_section_dan:
+        original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
+        original_text_contrast.append(original_text_contrast_dan)
+        repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
+        original_text_list.append(original_text_contrast_dan["original_text"])
+        section_words += len(sentence_dan[0][1])
+
+    original_text = "。".join(original_text_list)
+    repeat_rate = repeat_words / section_words
+
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+
+    return {
+        "end_page_index": 0,
+        "name": "第1部分",
+        "repeat_rate": repeat_rate,
+        "repeat_words": repeat_words,
+        "start_page_index": 0,
+        "words": section_words,
+        "original_text": original_text,
+        "original_text_oneself": original_text,
+        "original_text_contrast": original_text_contrast,
+        "repeat_quote_info": repeat_quote_info
+    }
+
+
 def accurate_check_rouge(
         title,
         author,
         text_paper,
         recall_data_list
 ):
     '''
     Exact duplicate check: find similar sentences
     :param text:

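Note on the new helpers: rouge_pre_m returns every candidate as a (candidate_index, rouge_l_score) pair, sorted best first, instead of the flat scores rouge_pre produced, and the threshold loop later in this file indexes into those pairs. A minimal sketch of the character-level ROUGE comparison that rouge_value_self is built on (rouge_chinese expects space-separated tokens; the strings here are made up):

    from rouge_chinese import Rouge

    rouge = Rouge()
    hyp = ' '.join("大型商业建筑人员疏散设计")      # split into single characters
    ref = ' '.join("大型商业建筑的人员疏散设计研究")
    scores = rouge.get_scores([hyp], [ref])        # one score dict per pair
    rouge_l = [s['rouge-l']['f'] for s in scores]  # F-measure, as used for ranking
    print(rouge_l)
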
@@ -120,9 +415,23 @@ def accurate_check_rouge(
     sentence_word_nums = 0
 
     # ROUGE-based duplicate check
-    for text in centent_list:
-        rouge_pre_list = rouge_pre(text, recall_data_list)
-        data_zong.append(rouge_pre_list)
+    rst = []
+    p = Pool(nums_cpus)  # process pool with nums_cpus worker processes
+    print("centent_list", centent_list)
+
+    for i in range(len(centent_list)):
+        text = centent_list[i]
+        a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
+        rst.append(a)
+    p.close()
+    p.join()  # wait for all workers to finish; close() must precede join(), and no new tasks can be submitted after close()
+
+    rst = [i.get() for i in rst]
+
+    for i in range(len(rst)):
+        print(rst[i])
+        data_zong.append(rst[i])
 
     t0 = time.time()
     # BERT-based duplicate check

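The serial rouge_pre loop is replaced by a fan-out over a process pool. A self-contained sketch of the same apply_async pattern, under the assumption that the worker is a module-level function (a pickling requirement); worker here is a toy stand-in, not the real rouge_pre_m:

    from multiprocessing import Pool

    def worker(text, candidates):
        # toy scorer standing in for rouge_pre_m
        return [(i, len(set(text) & set(c))) for i, c in enumerate(candidates)]

    if __name__ == "__main__":
        texts = ["第一句", "第二句"]
        candidates = ["第一句完全相同", "毫不相关"]
        p = Pool(4)
        handles = [p.apply_async(worker, args=(t, candidates)) for t in texts]
        p.close()   # no further tasks may be submitted
        p.join()    # blocks until all workers finish
        results = [h.get() for h in handles]  # get() re-raises worker exceptions
        print(results)
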
@@ -132,7 +441,6 @@ def accurate_check_rouge(
     t1 = time.time()
     original_dict = []
 
-
     # find the indices of similar sentences
     bool_check_sentense = []
     # BERT method

@@ -142,12 +450,14 @@ def accurate_check_rouge(
 
     # ROUGE method
     for i in range(len(data_zong)):
-        if data_zong[i][0] > 0.47:
-            bool_check_sentense.append([i,data_zong[i][1]])
-    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+        for j in range(len(data_zong[i])):
+            if data_zong[i][j][1] > 0.47:
+                bool_check_sentense.append([i, data_zong[i][j][0]])
 
-    print("bert精确查重时间", t1-t0)
+    biao_red = biaohong(bool_check_sentense, data_zong,
+                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
+    print("bert精确查重时间", t1 - t0)
+    print(biao_red)
 
     sentence_0_list = []
     sentence_1_list = []

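Because rouge_pre_m returns a sorted list of (candidate_index, score) pairs, each entry of data_zong is now a list of pairs rather than a single score, which is why the threshold check gained the inner j loop. A shape sketch with invented numbers:

    data_zong = [
        [(17, 0.83), (4, 0.31)],  # sentence 0: candidate 17 scores 0.83 on ROUGE-L
        [(4, 0.52), (17, 0.12)],  # sentence 1
    ]
    bool_check_sentense = []
    for i in range(len(data_zong)):
        for j in range(len(data_zong[i])):
            if data_zong[i][j][1] > 0.47:  # same threshold as in the hunk above
                bool_check_sentense.append([i, data_zong[i][j][0]])
    print(bool_check_sentense)  # [[0, 17], [1, 4]]
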
@@ -156,7 +466,8 @@ def accurate_check_rouge(
     for i in biao_red:
         if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
             sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
-            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+            sentence_1_list.append(
+                "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
             sim_paper_name.append(recall_data_list[i[1][0]][1])
         else:
             continue

@@ -164,7 +475,6 @@ def accurate_check_rouge(
     sentence_0_list_new = []
     sentence_1_list_new = []
 
-
     for i in zip(sentence_0_list, sentence_1_list):
         if len(i[0]) + len(i[1]) < 1200:
             sentence_0_list_new.append(i[0])

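The len(i[0]) + len(i[1]) < 1200 guard caps the combined size of each sentence pair before it is sent to the BERT service, presumably to stay inside the model's input window (a standard 512-token BERT truncates longer inputs). A quick check with made-up pairs:

    sentence_0_list = ["短句。", "长" * 900]
    sentence_1_list = ["另一个短句。", "句" * 400]
    kept = [(a, b) for a, b in zip(sentence_0_list, sentence_1_list)
            if len(a) + len(b) < 1200]
    print(len(kept))  # 1, the 1300-character pair is dropped
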
@@ -183,141 +493,59 @@ def accurate_check_rouge(
     chongfuwendang = {}
 
-    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
-        print([sentence_0_dan, sentence_1_dan])
-        original_text_contrast_dict = {
-            "original_text": "",
-            "similar_content": [
-                {
-                    "content": "",
-                    "thesis_info": "",
-                    "title": "",
-                    "year": "",
-                    "degree": "",
-                    "author": "",
-                }
-            ]
-        }
-        try:
-            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
-        except:
-            print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
-            continue
-        # 9/0
-        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
-
-        if sentence_0_bool == False or sentence_1_bool == False:
-            continue
-
-        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
-        sentence_word_nums += dan_sentence_word_nums
-
-        original_text.append(sentence_0_dan_red)
-        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(dan_sentence_word_nums) + sentence_0_dan_red
-
-        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
-        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
-        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
-        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
-        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
-        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
-        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info
-
-        original_text_contrast.append(original_text_contrast_dict)
-
-        # for i in repeat_quote_info:
-        #     if
-
-        if thesis_info not in chongfuwendang:
-            chongfuwendang[thesis_info] = {
-                "quote": False,
-                "thesis_author": sim_paper_name_dan["author"],
-                "thesis_date" : sim_paper_name_dan["year"],
-                "thesis_info" : thesis_info,
-                "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100, #round(repetition_rate, 3) * 100
-                "thesis_title": sim_paper_name_dan["title"],
-                "thesis_link": "",
-                "thesis_publish": sim_paper_name_dan["degree"],
-                "thesis_repeat_word": dan_sentence_word_nums,
-                "thesis_teacher": "",
-                "paper_len_word": sim_paper_name_dan["paper_len_word"]
-            }
-        else:
-            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
-            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
-
-    chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-
-    for i in range(len(chongfuwendang)):
-        repeat_paper_one_info_dict = chongfuwendang[i][1]
-        repeat_paper_one_info_dict.pop("paper_len_word")
-        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
-        repeat_quote_info.append(repeat_paper_one_info_dict)
-
-    original_text = "。".join(original_text)
-
-    repetition_rate = sentence_word_nums/len(text_paper)
-    repetition_rate = round(repetition_rate, 3) * 100
+    print("paper_dict", paper_dict)
+    print("sentence_0_list_new", sentence_0_list_new)
+    print("sentence_1_list_new", sentence_1_list_new)
+    print("sim_paper_name", sim_paper_name)
+    similar_content_control = [[]]
+
+    with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
+        json.dump(paper_dict, f, ensure_ascii=False)
+
+    sentence_0_list_new_cursor = sentence_0_list_new[0]
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
+                                                                                     sentence_0_list_new,
+                                                                                     sentence_1_list_new,
+                                                                                     sim_paper_name):
+        if sentence_0_list_new_cursor != sentence_0_dan:
+            similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan  # advance the cursor so further sources of the same sentence join its group
+        else:
+            similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
+
+    data = [similar_content_control]
+
+    # simulate multiple sections
+    section_details_list = []
+    for data_dan in data:
+        data_section_dan = data_dan
+
+        # per-section detailed info
+        section_details = section_details_func(data_section_dan, paper_dict)
+        section_details_list.append(section_details)
+
+    # simulate multiple sections
+    section_data_list = []
+    for section_details in section_details_list:
+        section_data = section_data_func(section_details)
+        section_data_list.append(section_data)  # collect per-section stats for "section_data"
+
+    total_data = total_data_func(section_details_list)
 
     format = '%Y-%m-%d %H:%M:%S'
     value = time.localtime(int(time.time()))
     dt = time.strftime(format, value)
 
-    return {
+    paper_data = {
         "author": author,
         "check_time": dt,
-        "title": title,
         "time_range": "1900-01-01至2023-08-08",
-        "section_data": [
-            {
-                "oneself_repeat_words": sentence_word_nums,
-                "reference_repeat_words": sentence_word_nums,
-                "section_name": "第1部分",
-                "section_oneself_rate": "{}%".format(repetition_rate),
-                "section_repeat_rate": "{}%".format(repetition_rate),
-                "section_repeat_words": sentence_word_nums,
-                "section_words": len(text_paper)
-            }
-        ],
-        "section_details": [
-            {
-                "end_page_index": 0,
-                "name": "",
-                "repeat_rate": "",
-                "repeat_words": "",
-                "words": "",
-                "original_text": original_text,
-                "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast,
-                "repeat_quote_info": repeat_quote_info
-            }
-        ],
-        "total_data": {
-            "back_repeat_words": "",
-            "exclude_personal_rate": "{}%".format(repetition_rate),
-            "exclude_quote_rate": "{}%".format(repetition_rate),
-            "foot_end_note": "0",
-            "front_repeat_words": "",
-            "single_max_rate": "",
-            "single_max_repeat_words": "",
-            "suspected_paragraph": "1",
-            "suspected_paragraph_max_repeat_words": "",
-            "suspected_paragraph_min_repeat_words": "",
-            "tables": "0",
-            "total_paragraph": "1",
-            "total_repeat_rate": "{}%".format(repetition_rate),
-            "total_repeat_words": sentence_word_nums,
-            "total_words": len(text_paper)
-        }
+        "title": title,
+        "total_data": total_data,
+        "section_data": section_data_list,
+        "section_details": section_details_list
     }
+
+    return paper_data
 
 
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):

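similar_content_control groups consecutive zip entries that share the same original sentence, so one original sentence can accumulate several similar sources; the grouping only works if the cursor advances whenever the sentence changes (noted with a comment in the hunk above). A runnable sketch of the intended grouping with toy tuples:

    items = [(0, "甲句", "来源A", {}), (1, "甲句", "来源B", {}), (2, "乙句", "来源C", {})]
    groups = [[]]
    cursor = items[0][1]
    for item in items:
        if item[1] != cursor:        # a new original sentence starts a new group
            groups.append([item])
            cursor = item[1]
        else:
            groups[-1].append(item)  # same sentence, additional source
    print([len(g) for g in groups])  # [2, 1]
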
@@ -331,15 +559,16 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     start = -1
     end = -1
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+                + 1 >= len(df_train_nuoche):
             break
-        elif bool_check_sentense[i][0]-1 == start:
+        elif bool_check_sentense[i][0] - 1 == start:
             i += 1
             continue
         elif bool_check_sentense[i][0] == end:
             i += 1
             continue
-        elif bool_check_sentense[i][0]-1 == end:
+        elif bool_check_sentense[i][0] - 1 == end:
             i += 1
             continue
         else:

@@ -347,10 +576,10 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
             biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
             biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
             biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
-                             [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
-            start = bool_check_sentense[i][0]-1
-            end = bool_check_sentense[i][0]+1
+            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+            start = bool_check_sentense[i][0] - 1
+            end = bool_check_sentense[i][0] + 1
             i += 1
 
     return biao_red

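biaohong widens every flagged sentence into a window of three consecutive sentence indices on both the paper side and the candidate side (the -1/+1 bookkeeping), while start/end suppress windows overlapping one already emitted. The inline comment in an earlier hunk shows the resulting shape:

    biao_red = [
        [[0, 1, 2], [479, 480, 481]],  # paper sentences 0-2 vs. candidate sentences 479-481
        [[3, 4, 5], [481, 482, 483]],
        [[6, 7, 8], [484, 485, 486]],
    ]
    # downstream, each index triple is joined into one text block and the two
    # blocks of a pair are compared against each other
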
@@ -378,8 +607,8 @@ def dialog_line_parse(url, text):
         # "".format(url, response.status_code, response.text)
         # )
         print("【{}】 Failed to get a proper response from remote "
               "server. Status Code: {}. Response: {}"
               "".format(url, response.status_code, response.text))
         print(text)
         return {}

@@ -410,7 +639,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
     while True:
         if down_pointer >= len(bert_text_pre):
             break
-        elif down_pointer == len(bert_text_pre)-1:
+        elif down_pointer == len(bert_text_pre) - 1:
             if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                 pointer_list.append(up_pointer)
                 break

@@ -428,7 +657,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
             up_pointer += 1
             down_pointer += 1
         else:
-            if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+            if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                 up_pointer += 1
                 down_pointer += 5
                 pointer_list.append(up_pointer)

@@ -441,12 +670,11 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
             down_pointer = 0
             pointer_list = []
 
-
     start = pointer_list[0]
     end = pointer_list[-1]
     bert_text_list = list(bert_text)
     bert_text_list.insert(start, "<red>")
-    bert_text_list.insert(end + 2 , "</red>")
+    bert_text_list.insert(end + 2, "</red>")
 
     text_original_list = list(text_original)

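original_text_marked_red walks bert_text_pre (the red span, which may contain [UNK] placeholders) along bert_text to recover the character range, then wraps it in <red> tags. A minimal sketch of the tag-insertion step with an invented span:

    bert_text = "大型商业建筑的人员疏散设计"
    start, end = 2, 7                         # first and last matched character indices
    bert_text_list = list(bert_text)
    bert_text_list.insert(start, "<red>")
    bert_text_list.insert(end + 2, "</red>")  # +2 skips the tag inserted just above
    print("".join(bert_text_list))
    # 大型<red>商业建筑的人</red>员疏散设计
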
|
|
|
|
|
|
@ -482,30 +710,14 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list): |
|
|
:return: |
|
|
:return: |
|
|
''' |
|
|
''' |
|
|
|
|
|
|
|
|
# sentence_0_list = [] |
|
|
paper_dict = \ |
|
|
# sentence_1_list = [] |
|
|
dialog_line_parse("http://192.168.31.74:16003/", |
|
|
# sim_paper_name = [] |
|
|
{"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[ |
|
|
# |
|
|
"resilt"] |
|
|
# for i in biaohong_list: |
|
|
|
|
|
# sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]])) |
|
|
|
|
|
# sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]])) |
|
|
|
|
|
|
|
|
|
|
|
paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"] |
|
|
|
|
|
|
|
|
|
|
|
# paper_dict |
|
|
|
|
|
# print("原文:".format(i), paper_dict[i][0]) |
|
|
|
|
|
# print("原文标红:".format(i), paper_dict[i][1]) |
|
|
|
|
|
# print("相似:".format(i), paper_dict[i][2]) |
|
|
|
|
|
# print("相似标红:".format(i), paper_dict[i][3]) |
|
|
|
|
|
|
|
|
|
|
|
# original_text |
|
|
|
|
|
# |
|
|
|
|
|
# |
|
|
|
|
|
# for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list): |
|
|
|
|
|
# original_text_marked_red |
|
|
|
|
|
|
|
|
|
|
|
return paper_dict |
|
|
return paper_dict |
|
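biaohong_bert_predict delegates the pairwise comparison to the service at 192.168.31.74:16003; note that "resilt" is the key the service actually returns, so the misspelling must stay as-is. A hedged sketch of what dialog_line_parse is assumed to boil down to (the real helper also logs failures, as an earlier hunk shows):

    import requests

    def post_json(url, payload):
        response = requests.post(url, json=payload)  # assumption: the service accepts JSON
        if response.status_code == 200:
            return response.json()
        return {}

    # paper_dict = post_json("http://192.168.31.74:16003/",
    #                        {"sentence_0": [...], "sentence_1": [...]})["resilt"]
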
 def ulit_text(title, text):
     data = []
     try:
@@ -520,6 +732,7 @@ def ulit_text(title, text):
         data.append([i, title])
     return data
 
+
 def run_query(conn, sql, params):
     with conn.cursor() as cursor:
         cursor.execute(sql, params)

@@ -587,9 +800,8 @@ def ulit_recall_paper(recall_data_list_dict):
     # data.append([sentence, filename])
     # return data
 
-
     data = []
-    for i in list(recall_data_list_dict.items())[:5]:
+    for i in list(recall_data_list_dict.items())[:10]:
         data_one = processing_one_text(i[0])
         data.extend(data_one)

@@ -652,18 +864,18 @@ def uilt_content(content):
             key_word_bool = True
             break
 
-    if zhaiyao_bool== True and zhaiyao_en_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str)
+    if zhaiyao_bool == True and zhaiyao_en_bool == True:
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
     elif zhaiyao_bool == True and key_word_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str )
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
     elif zhaiyao_bool == True and mulu_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str)
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]

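A caveat on the abstract extraction above: the "{}(.*?){}" patterns are used with re.findall and no flags, and "." does not match newlines by default, so an abstract that spans lines is only found if the content was flattened first. Illustration with a hypothetical two-line document:

    import re

    content = "摘要 第一行\n第二行 Abstract"
    print(re.findall("摘要(.*?)Abstract", content))        # []
    print(re.findall("摘要(.*?)Abstract", content, re.S))  # [' 第一行\n第二行 ']
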
@@ -688,7 +900,6 @@ def ulit_request_file(file):
     return abst_zh, content
 
 
-
 # @app.route("/", methods=["POST"])
 # def handle_query():
 #     print(request.remote_addr)

@@ -761,7 +972,6 @@ def classify():  # invoke the model, set the maximum batch_size
     goodsId = data_dict['goodsId']
     callbackUrl = data_dict['callbackUrl']
 
-
     # call Yupeng's recall service for the ten most similar papers
     # recall_data_list_dict = recall_10(title, abst_zh, content)

@@ -805,10 +1015,10 @@ def handle_query():
     # request.form.get('prompt')
     dataBases = request.form.get("dataBases")
     minSimilarity = request.form.get("minSimilarity")  # txt
     minWords = request.form.get("minWords")
     title = request.form.get("title")
     author = request.form.get("author")  # txt
     file = request.files.get('file')
     token = request.form.get("token")
     account = request.form.get("account")

@@ -833,7 +1043,7 @@ def handle_query():
         'account': account,
         'goodsId': goodsId,
         'callbackUrl': callbackUrl
     }
 
     # bind the text to its query id
     print(d)

@@ -860,8 +1070,9 @@ def handle_query():
     return_text = {'code': 1}
     return jsonify(return_text)  # return the result
 
 
+
 t = Thread(target=classify)
 t.start()
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True, use_reloader=False)
+    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
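
Removing use_reloader=False while debug=True deserves a second look: Werkzeug's debug reloader runs the module twice (a monitor process plus the serving child), so the module-level classify thread would be started in both and both copies would drain the Redis queue. Keeping use_reloader=False is the simplest option; if the reloader is wanted, a common guard (a sketch, assuming the reloader is active) is:

    import os

    if os.environ.get("WERKZEUG_RUN_MAIN") == "true":
        # only the reloader's serving child starts the background worker
        t = Thread(target=classify)
        t.start()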
|