|
@@ -7,7 +7,7 @@ from rouge_chinese import Rouge |
|
|
from Rouge_w import Rouge_w,Rouge_l |
|
|
from Rouge_w import Rouge_w,Rouge_l |
|
|
import json |
|
|
import json |
|
|
import pymysql |
|
|
import pymysql |
|
|
|
|
|
import re |
|
|
import requests |
|
|
import requests |
|
|
from flask import Flask, jsonify |
|
|
from flask import Flask, jsonify |
|
|
from flask import request |
|
|
from flask import request |
|
@@ -105,6 +105,7 @@ def accurate_check_rouge(text_paper, recall_data_list): |
|
|
text_paper = str(text_paper).replace("。\n", "。") |
|
|
text_paper = str(text_paper).replace("。\n", "。") |
|
|
centent_list.extend(text_paper.split("。")) |
|
|
centent_list.extend(text_paper.split("。")) |
|
|
data_zong = [] |
|
|
data_zong = [] |
|
|
|
|
|
sentence_word_nums = 0 |
|
|
|
|
|
|
|
|
# rouge算法查重 |
|
|
# rouge算法查重 |
|
|
# for text in centent_list: |
|
|
# for text in centent_list: |
|
@@ -126,6 +127,7 @@ def accurate_check_rouge(text_paper, recall_data_list): |
|
|
bool_check_sentense.append([i,data_zong[i][1]]) |
|
|
bool_check_sentense.append([i,data_zong[i][1]]) |
|
|
biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]] |
|
|
biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sentence_0_list = [] |
|
|
sentence_0_list = [] |
|
|
sentence_1_list = [] |
|
|
sentence_1_list = [] |
|
|
sim_paper_name = [] |
|
|
sim_paper_name = [] |
|
@@ -169,41 +171,97 @@ def accurate_check_rouge(text_paper, recall_data_list): |
|
|
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name): |
|
|
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name): |
|
|
|
|
|
|
|
|
print([sentence_0_dan, sentence_1_dan]) |
|
|
print([sentence_0_dan, sentence_1_dan]) |
|
|
original_text_contrast_dict = {} |
|
|
original_text_contrast_dict = { |
|
|
|
|
|
"original_text": "", |
|
|
|
|
|
"similar_content": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"content": "", |
|
|
|
|
|
"thesis_info": "", |
|
|
|
|
|
"title": "", |
|
|
|
|
|
"year": "", |
|
|
|
|
|
"degree": "", |
|
|
|
|
|
"author": "", |
|
|
|
|
|
} |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
similar_content = {"author": ""} |
|
|
similar_content = {"author": ""} |
|
|
try: |
|
|
try: |
|
|
sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre |
|
|
sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre |
|
|
except: |
|
|
except: |
|
|
print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]]) |
|
|
print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]]) |
|
|
9/0 |
|
|
continue |
|
|
|
|
|
# 9/0 |
|
|
sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3]) # text_original, bert_text, bert_text_pre |
|
|
sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3]) # text_original, bert_text, bert_text_pre |
|
|
|
|
|
|
|
|
if sentence_0_bool == False or sentence_1_bool == False: |
|
|
if sentence_0_bool == False or sentence_1_bool == False: |
|
|
continue |
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1]) |
|
|
|
|
|
sentence_word_nums += dan_sentence_word_nums |
|
|
|
|
|
|
|
|
original_text.append(sentence_0_dan_red) |
|
|
original_text.append(sentence_0_dan_red) |
|
|
original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format( |
|
|
original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format( |
|
|
len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red |
|
|
dan_sentence_word_nums) + sentence_0_dan_red |
|
|
|
|
|
|
|
|
|
|
|
# similar_content["content"] = sentence_1_dan_red |
|
|
|
|
|
# similar_content["title"] = sim_paper_name_dan |
|
|
|
|
|
# original_text_contrast_dict["similar_content"][0] = similar_content |
|
|
|
|
|
|
|
|
similar_content["content"] = sentence_1_dan_red |
|
|
original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red |
|
|
similar_content["title"] = sim_paper_name_dan |
|
|
original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan |
|
|
original_text_contrast_dict["similar_content"] = similar_content |
|
|
|
|
|
|
|
|
|
|
|
original_text_contrast.append(original_text_contrast_dict) |
|
|
original_text_contrast.append(original_text_contrast_dict) |
|
|
|
|
|
|
|
|
original_text = "。".join(original_text) |
|
|
original_text = "。".join(original_text) |
|
|
|
|
|
|
|
|
|
|
|
repetition_rate = sentence_word_nums/len(text_paper) |
|
|
|
|
|
repetition_rate = round(repetition_rate, 3) *100 |
|
|
|
|
|
|
|
|
return { |
|
|
return { |
|
|
"author": "", |
|
|
"author": "", |
|
|
"check_time": "", |
|
|
"check_time": "", |
|
|
"section_data": "", |
|
|
"section_data": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"oneself_repeat_words": sentence_word_nums, |
|
|
|
|
|
"reference_repeat_words": sentence_word_nums, |
|
|
|
|
|
"section_name": "第1部分", |
|
|
|
|
|
"section_oneself_rate": "{}%".format(repetition_rate), |
|
|
|
|
|
"section_repeat_rate": "{}%".format(repetition_rate), |
|
|
|
|
|
"section_repeat_words": sentence_word_nums, |
|
|
|
|
|
"section_words": len(text_paper) |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"section_details": [ |
|
|
"section_details": [ |
|
|
{ |
|
|
{ |
|
|
"end_page_index": 0, |
|
|
"end_page_index": 0, |
|
|
"name": "", |
|
|
"name": "", |
|
|
|
|
|
"repeat_rate": "", |
|
|
|
|
|
"repeat_words": "", |
|
|
|
|
|
"words": "", |
|
|
"original_text": original_text, |
|
|
"original_text": original_text, |
|
|
|
|
|
"original_text_oneself": original_text, |
|
|
"original_text_contrast": original_text_contrast |
|
|
"original_text_contrast": original_text_contrast |
|
|
} |
|
|
} |
|
|
] |
|
|
], |
|
|
|
|
|
"time_range": "1900-01-01至2023-08-08", |
|
|
|
|
|
"title": "3", |
|
|
|
|
|
"total_data": { |
|
|
|
|
|
"back_repeat_words": "", |
|
|
|
|
|
"exclude_personal_rate": "{}%".format(repetition_rate), |
|
|
|
|
|
"exclude_quote_rate": "{}%".format(repetition_rate), |
|
|
|
|
|
"foot_end_note": "0", |
|
|
|
|
|
"front_repeat_words": "", |
|
|
|
|
|
"single_max_rate": "", |
|
|
|
|
|
"single_max_repeat_words": "", |
|
|
|
|
|
"suspected_paragraph": "1", |
|
|
|
|
|
"suspected_paragraph_max_repeat_words": "", |
|
|
|
|
|
"suspected_paragraph_min_repeat_words": "", |
|
|
|
|
|
"tables": "0", |
|
|
|
|
|
"total_paragraph": "1", |
|
|
|
|
|
"total_repeat_rate": "{}%".format(repetition_rate), |
|
|
|
|
|
"total_repeat_words": sentence_word_nums, |
|
|
|
|
|
"total_words": len(text_paper) |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -489,21 +547,111 @@ def recall_10(title, abst_zh, content) -> list: |
|
|
"abst_zh": abst_zh, |
|
|
"abst_zh": abst_zh, |
|
|
"content": content |
|
|
"content": content |
|
|
} |
|
|
} |
|
|
paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json) |
|
|
paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json) |
|
|
|
|
|
|
|
|
return paper_dict |
|
|
return paper_dict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def uilt_content(content):
    """Extract the Chinese abstract (摘要) section from a paper's full text.

    The abstract is taken as the text between the "摘要" marker and the
    first present end marker, tried in priority order: the English abstract
    marker ("Abstract"/"abstract"), then the keywords marker ("关键词"),
    then the table-of-contents marker ("目录").

    Args:
        content: The whole paper as a single string (newlines are expected
            to have been stripped by the caller).

    Returns:
        The abstract text, or "" when no usable marker pair is found.
        (The original code crashed with IndexError when both markers were
        present but the end marker appeared before "摘要"; now "" / the
        next candidate is used instead.)
    """

    def _first_present(markers):
        # Return the first marker string that occurs in content, else None.
        for marker in markers:
            if marker in content:
                return marker
        return None

    start_marker = _first_present(["摘要"])
    if start_marker is None:
        return ""

    # End-marker candidates in the same priority order as the original
    # elif chain: English abstract > keywords > table of contents.
    end_candidates = (
        _first_present(["Abstract", "abstract"]),
        _first_present(["关键词"]),
        _first_present(["目录"]),
    )

    for end_marker in end_candidates:
        if end_marker is None:
            continue
        # re.escape keeps this robust even if a marker ever contains a
        # regex metacharacter (current markers contain none).
        pattern = "{}(.*?){}".format(re.escape(start_marker), re.escape(end_marker))
        match = re.search(pattern, content)
        if match:
            return match.group(1)
        # Markers exist but not in start-before-end order; try the next
        # candidate rather than raising IndexError as the old code did.

    return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ulit_request_file(file):
    """Persist an uploaded .txt file and return its abstract and full text.

    Args:
        file: An uploaded-file object (Flask/werkzeug FileStorage: exposes
            ``.filename`` and ``.save(path)``).

    Returns:
        Tuple ``(abst_zh, content)``: the Chinese abstract extracted by
        ``uilt_content`` and the whole text with newlines and spaces
        removed.

    Raises:
        ValueError: when the upload is not a .txt file. (The original code
            silently fell through and crashed later with NameError on the
            undefined ``content`` variable.)
    """
    file_name = file.filename
    if file_name.split(".")[-1] != "txt":
        # Fail fast with an explicit error instead of the old NameError.
        raise ValueError("unsupported file type: {}".format(file_name))

    # NOTE(review): assumes the data/request/ directory already exists —
    # confirm, or create it at service start-up.
    file_name_save = "data/request/{}".format(file_name)
    file.save(file_name_save)

    # Clients may upload either GBK- or UTF-8-encoded text; try the legacy
    # encoding first and fall back only on a decode failure. The narrowed
    # except no longer hides unrelated I/O errors (the old bare except did).
    try:
        with open(file_name_save, encoding="gbk") as f:
            content = f.read()
    except UnicodeDecodeError:
        with open(file_name_save, encoding="utf-8") as f:
            content = f.read()

    content = content.strip().replace("\n", "").replace(" ", "")
    abst_zh = uilt_content(content)

    return abst_zh, content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/", methods=["POST"]) |
|
|
@app.route("/", methods=["POST"]) |
|
|
def handle_query(): |
|
|
def handle_query(): |
|
|
print(request.remote_addr) |
|
|
print(request.remote_addr) |
|
|
title = request.json["title"] |
|
|
|
|
|
abst_zh = request.json["abst_zh"] # txt |
|
|
|
|
|
content = request.json["content"] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# request.form.get('prompt') |
|
|
|
|
|
dataBases = request.form.get("dataBases") |
|
|
|
|
|
minSimilarity = request.form.get("minSimilarity") # txt |
|
|
|
|
|
minWords = request.form.get("minWords") |
|
|
|
|
|
title = request.form.get("title") |
|
|
|
|
|
author = request.form.get("author") # txt |
|
|
|
|
|
file = request.files.get('file') |
|
|
|
|
|
token = request.form.get("token") |
|
|
|
|
|
account = request.form.get("account") |
|
|
|
|
|
goodsId = request.form.get("goodsId") |
|
|
|
|
|
callbackUrl = request.form.get("callbackUrl") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
abst_zh, content = ulit_request_file(file) |
|
|
# 调用宇鹏查询相似十篇 |
|
|
# 调用宇鹏查询相似十篇 |
|
|
# recall_data_list_dict = recall_10(title, abst_zh, content) |
|
|
recall_data_list_dict = recall_10(title, abst_zh, content) |
|
|
with open("data/rell_json.txt") as f: |
|
|
# with open("data/rell_json.txt") as f: |
|
|
recall_data_list_dict = eval(f.read()) |
|
|
# recall_data_list_dict = eval(f.read()) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 读取文章转化成格式数据 |
|
|
# 读取文章转化成格式数据 |
|
|