diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py
index b5a96b9..263e3ba 100644
--- a/flask_check_bert_test.py
+++ b/flask_check_bert_test.py
@@ -20,12 +20,14 @@ from multiprocessing import Pool
 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False
 
-pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=8, password="zhicheng123*")
 redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
 
-db_key_query = 'query'
-db_key_querying = 'querying'
-db_key_queryset = 'queryset'
+db_key_query = 'query_check_task'
+db_key_querying = 'querying_check_task'
+db_key_queryset = 'queryset_check_task'
+db_key_query_recall = 'query_recall'
+
 nums_cpus = 24
 rouge = Rouge()
 
@@ -101,6 +103,9 @@ def rouge_pre(text, df_train_nuoche):
 
 
 def rouge_pre_m(text, df_train_nuoche):
+
+
+
     return_list = []
     index_rouge_list = []
 
@@ -266,14 +271,17 @@ def total_data_func(section_data_list):
         repeat_words += i["repeat_words"]
         words += i["words"]
 
-    exclude_personal_rate = str(repeat_words / words * 100) + "%"
-    exclude_quote_rate = str(repeat_words / words * 100) + "%"
+    baifenbi = (repeat_words / words) * 100
+    exclude_personal_rate = str(round(baifenbi, 1)) + "%"
+    exclude_quote_rate = str(round(baifenbi, 1)) + "%"
     single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
     single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
-    total_repeat_rate = str(repeat_words / words * 100) + "%"
+    total_repeat_rate = str(round(baifenbi, 1)) + "%"
     total_repeat_words = repeat_words
     total_words = words
 
+    print(exclude_personal_rate)
+
     return {
         "back_repeat_words": "",
         "exclude_personal_rate": exclude_personal_rate,
@@ -353,7 +361,7 @@ def section_data_func(section_details):
     }
 
 
-def section_details_func(data_section_dan, paper_dict):
+def section_details_func(data_section_dan, paper_dict, num_words):
     '''
     Section-level details
     :param original_text_contrast:
@@ -363,21 +371,22 @@ def section_details_func(data_section_dan, paper_dict):
     :return:
     '''
     original_text_contrast = []
     section_repeat_rate = ""
     repeat_words = 0
-    section_words = 0
+    section_words = num_words
     oneself_repeat_words = ""
     reference_repeat_words = ""
     section_oneself_rate = ""
     original_text_list = []
     for sentence_dan in data_section_dan:
+        print("sentence_dan", sentence_dan)
         original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
         original_text_contrast.append(original_text_contrast_dan)
         repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
         original_text_list.append(original_text_contrast_dan["original_text"])
-        section_words += len(sentence_dan[0][1])
 
     original_text = "。".join(original_text_list)
-    repeat_rate = repeat_words / section_words
+    repeat_rate = (repeat_words / section_words) * 100
+    repeat_rate = str(round(repeat_rate, 1)) + "%"
 
     repeat_quote_info = repeat_quote_info_func(original_text_contrast)
 
@@ -395,6 +404,108 @@ def section_details_func(data_section_dan, paper_dict):
     }
 
 
+def check_dict(similar_content_control, paper_dict, num_words, title, author):
+    '''
+    Build the response dict
+    :param similar_content_control:
+    :param paper_dict:
+    :param num_words:
+    :param title:
+    :param author:
+    :return:
+    '''
+    if paper_dict != []:
+        data = [similar_content_control]
+
+        # simulate multiple sections
+        section_details_list = []
+        for data_dan in data:
+            data_section_dan = data_dan
+
+            # section details
+            section_details = section_details_func(data_section_dan, paper_dict, num_words)
+            section_details_list.append(section_details)
+
+        # simulate multiple sections
+        section_data_list = []
+        for section_details in section_details_list:
+            section_data = section_data_func(section_details)
+            section_data_list.append(section_data)
+
+        total_data = total_data_func(section_details_list)
+
+        format = '%Y-%m-%d %H:%M:%S'
+        value = time.localtime(int(time.time()))
+        dt = time.strftime(format, value)
+
+        paper_data = {
+            "author": author,
+            "check_time": dt,
+            "time_range": "1900-01-01至2023-08-08",
+            "title": title,
+            "total_data": total_data,
+            "section_data": section_data_list,
+            "section_details": section_details_list
+        }
+    else:
+        total_data = {
+            "back_repeat_words": "",
+            "exclude_personal_rate": 0,
+            "exclude_quote_rate": 0,
+            "front_repeat_words": "",
+            "single_max_rate": 0,
+            "single_max_repeat_words": 0,
+            "suspected_paragraph": "",
+            "suspected_paragraph_max_repeat_words": "",
+            "suspected_paragraph_min_repeat_words": "",
+            "total_paragraph": "",
+            "total_repeat_rate": 0,
+            "total_repeat_words": 0,
+            "total_words": num_words,
+            "tables": 0
+        }
+
+        section_data_list = [{
+            "section_name": "第一部分",
+            "section_repeat_rate": 0,
+            "section_repeat_words": 0,
+            "section_words": num_words,
+            "oneself_repeat_words": 0,
+            "reference_repeat_words": 0,
+            "section_oneself_rate": 0
+        }]
+
+        section_details_list = [
+            {
+                "end_page_index": 0,
+                "name": "第1部分",
+                "repeat_rate": 0,
+                "repeat_words": 0,
+                "start_page_index": 0,
+                "words": num_words,
+                "original_text": "",
+                "original_text_oneself": "",
+                "original_text_contrast": [],
+                "repeat_quote_info": []
+            }
+        ]
+
+        format = '%Y-%m-%d %H:%M:%S'
+        value = time.localtime(int(time.time()))
+        dt = time.strftime(format, value)
+
+        paper_data = {
+            "author": author,
+            "check_time": dt,
+            "time_range": "1900-01-01至2023-08-08",
+            "title": title,
+            "total_data": total_data,
+            "section_data": section_data_list,
+            "section_details": section_details_list
+        }
+    return paper_data
+
+
 def accurate_check_rouge(
         title,
         author,
@@ -408,18 +519,27 @@ def accurate_check_rouge(
     :return:
     '''
     # text preprocessing
-    centent_list = []
+    # centent_list = []
+    print("text_paper", len(text_paper))
     text_paper = str(text_paper).replace("。\n", "。")
-    centent_list.extend(text_paper.split("。"))
+    centent_list_old = text_paper.split("。")
     data_zong = []
     sentence_word_nums = 0
 
     # duplicate check via the rouge algorithm
     rst = []
-    p = Pool(nums_cpus) # the process pool holds n worker processes
+    p = Pool(nums_cpus)  # the process pool holds n worker processes
 
-    print("centent_list", centent_list)
+    # print("centent_list", centent_list)
+
+    num_words = 0
+    centent_list = []
+    for i in centent_list_old:
+        num_words += len(i)
+        if len(i) < 300:
+            centent_list.append(i)
+
+    print("num_words", num_words)
 
     for i in range(len(centent_list)):
         text = centent_list[i]
         a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
@@ -430,7 +550,7 @@ def accurate_check_rouge(
     rst = [i.get() for i in rst]
 
     for i in range(len(rst)):
-        print(rst[i])
+        # print(rst[i])
         data_zong.append(rst[i])
 
     t0 = time.time()
@@ -453,6 +573,7 @@ def accurate_check_rouge(
         for j in range(len(data_zong[i])):
             if data_zong[i][j][1] > 0.47:
                 bool_check_sentense.append([i, data_zong[i][j][0]])
+
     biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
 
@@ -483,7 +604,11 @@ def accurate_check_rouge(
             print(len(i[0]) + len(i[1]))
             continue
     t2 = time.time()
-    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
+
+    if sentence_0_list_new == sentence_1_list_new == []:
+        paper_dict = []
+    else:
+        paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
     t3 = time.time()
     print("标红时间", t3 - t2)
 
@@ -498,9 +623,8 @@ def accurate_check_rouge(
     print("sentence_1_list_new", sentence_1_list_new)
     print("sim_paper_name", sim_paper_name)
     similar_content_control = [[]]
-
-    with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
-        json.dump(paper_dict, f, ensure_ascii=False)
+    # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
+    #     json.dump(paper_dict, f, ensure_ascii=False)
 
     sentence_0_list_new_cursor = sentence_0_list_new[0]
     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
@@ -513,38 +637,40 @@ def accurate_check_rouge(
         else:
             similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
 
-    data = [similar_content_control]
-
-    # simulate multiple sections
-    section_details_list = []
-    for data_dan in data:
-        data_section_dan = data_dan
-
-        # section details
-        section_details = section_details_func(data_section_dan, paper_dict)
-        section_details_list.append(section_details)
-
-    # simulate multiple sections
-
-    section_data_list = []
-    for section_details in section_details_list:
-        section_data = section_data_func(section_details)
-
-    total_data = total_data_func(section_details_list)
-
-    format = '%Y-%m-%d %H:%M:%S'
-    value = time.localtime(int(time.time()))
-    dt = time.strftime(format, value)
-
-    paper_data = {
-        "author": author,
-        "check_time": dt,
-        "time_range": "1900-01-01至2023-08-08",
-        "title": title,
-        "total_data": total_data,
-        "section_data": section_data_list,
-        "section_details": section_details_list
-    }
+    paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author)
+    # data = [similar_content_control]
+    #
+    # # simulate multiple sections
+    # section_details_list = []
+    # for data_dan in data:
+    #     data_section_dan = data_dan
+    #
+    #     # section details
+    #     section_details = section_details_func(data_section_dan, paper_dict, num_words)
+    #     section_details_list.append(section_details)
+    #
+    # # simulate multiple sections
+    #
+    # section_data_list = []
+    # for section_details in section_details_list:
+    #     section_data = section_data_func(section_details)
+    #     section_data_list.append(section_data)
+    #
+    # total_data = total_data_func(section_details_list)
+    #
+    # format = '%Y-%m-%d %H:%M:%S'
+    # value = time.localtime(int(time.time()))
+    # dt = time.strftime(format, value)
+    #
+    # paper_data = {
+    #     "author": author,
+    #     "check_time": dt,
+    #     "time_range": "1900-01-01至2023-08-08",
+    #     "title": title,
+    #     "total_data": total_data,
+    #     "section_data": section_data_list,
+    #     "section_details": section_details_list
+    # }
 
     return paper_data
 
@@ -801,14 +927,14 @@ def ulit_recall_paper(recall_data_list_dict):
     #     return data
 
     data = []
-    for i in list(recall_data_list_dict.items())[:10]:
+    for i in list(recall_data_list_dict.items()):
         data_one = processing_one_text(i[0])
         data.extend(data_one)
 
     return data
 
 
-def recall_10(queue_uuid, title, abst_zh, content) -> dict:
+def recall_10(queue_uuid, title, abst_zh, content):
     '''
     Yupeng's recall API
    :param paper_name:
@@ -821,9 +947,9 @@ def recall_10(queue_uuid, title, abst_zh, content):
         "abst_zh": abst_zh,
         "content": content
     }
-    paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
+    print(request_json)
+    da = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
 
-    return paper_dict
 
 
 def uilt_content(content):
@@ -831,6 +957,7 @@ def uilt_content(content):
     zhaiyao_en_list = ["Abstract", "abstract"]
     mulu_list = ["目录"]
     key_word_list = ["关键词"]
+    caikanwenxian = ["参考文献"]
     key_word_bool = False
     key_word_str = ""
     zhaiyao_bool = False
@@ -880,6 +1007,10 @@ def uilt_content(content):
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
+    if zhaiyao_text == "":
+        content = str(content).replace("。\n", "。")
+        content_list = content.split("。")
+        zhaiyao_text = "".join(content_list[:15])
     return zhaiyao_text
 
 
@@ -895,7 +1026,7 @@ def ulit_request_file(file):
     with open(file_name_save, encoding="utf-8") as f:
         content = f.read()
 
-    content = content.strip().replace("\n", "").replace(" ", "")
+    content = " ".join([i for i in content.split("\n") if i != ""])
     abst_zh = uilt_content(content)
 
     return abst_zh, content
@@ -945,22 +1076,61 @@ def ulit_request_file(file):
 #         return jsonify(return_text)  # return the result
 
 
-def classify():  # invoke the model; set the maximum batch_size
+# def classify_recall():  # invoke the model; set the maximum batch_size
+#     while True:
+#         if redis_.llen(db_key_query) == 0:  # keep polling while the queue is empty
+#             time.sleep(3)
+#             continue
+#         query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
+#         data_dict_path = json.loads(query)
+#         path = data_dict_path['path']
+#         # text_type = data_dict["text_type"]
+#
+#         with open(path, encoding='utf8') as f1:
+#             # load the object from the file
+#             data_dict = json.load(f1)
+#
+#         queue_uuid = data_dict['id']
+#         print(queue_uuid)
+#         dataBases = data_dict['dataBases']
+#         minSimilarity = data_dict['minSimilarity']
+#         minWords = data_dict['minWords']
+#         title = data_dict['title']
+#         author = data_dict['author']
+#         abst_zh = data_dict['abst_zh']
+#         content = data_dict['content']
+#         token = data_dict['token']
+#         account = data_dict['account']
+#         goodsId = data_dict['goodsId']
+#         callbackUrl = data_dict['callbackUrl']
+#
+#         # call Yupeng's service to fetch the ten most similar papers
+#         recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
+#
+#         # print("查找相似的50篇完成")
+#         # with open("data/rell_json.txt") as f:
+#         #     recall_data_list_dict = eval(f.read())
+#
+#         # read the articles and convert the format
+
+
+def classify_accurate_check():
     while True:
-        if redis_.llen(db_key_query) == 0:  # keep polling while the queue is empty
+        if redis_.llen(db_key_query_recall) == 0:  # keep polling while the queue is empty
             time.sleep(3)
             continue
-        query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
-        data_dict_path = json.loads(query)
-        path = data_dict_path['path']
-        # text_type = data_dict["text_type"]
 
-        with open(path, encoding='utf8') as f1:
-            # load the object from the file
-            data_dict = json.load(f1)
+        query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8')  # fetch the query text
+        query_recall_dict = json.loads(query_recall)
+        query_recall_uuid = query_recall_dict["uuid"]
+        recall_data_list_dict = json.loads(query_recall_dict["data"])
+        recall_data_list = ulit_recall_paper(recall_data_list_dict)
+        data_dict_path = redis_.get(query_recall_uuid + "_request_check")
+        with open(data_dict_path, encoding='utf8') as f:
+            data_dict = json.loads(f.read())
 
         queue_uuid = data_dict['id']
-        print(queue_uuid)
         dataBases = data_dict['dataBases']
         minSimilarity = data_dict['minSimilarity']
         minWords = data_dict['minWords']
@@ -973,21 +1143,22 @@ def classify():  # invoke the model; set the maximum batch_size
         goodsId = data_dict['goodsId']
         callbackUrl = data_dict['callbackUrl']
 
-        # call Yupeng's service to fetch the ten most similar papers
-        recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
 
-        # print("查找相似的50篇完成")
+        print("查找相似的50篇完成")
+        print(len(content))
+
         # with open("data/rell_json.txt") as f:
         #     recall_data_list_dict = eval(f.read())
+        # recall_data_list = ulit_recall_paper(recall_data_list_dict)
 
-        # read the articles and convert them into structured data
-        recall_data_list = ulit_recall_paper(recall_data_list_dict)
         print("文章格式转化完成")
 
         # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
 
         # enter the precise duplicate-check stage
         print("进入精确查重系统")
+
         return_list = accurate_check_rouge(title, author, content, recall_data_list)
 
         return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
@@ -1028,7 +1199,7 @@ def handle_query():
             abst_zh, content = ulit_request_file(file)
 
             id_ = str(uuid.uuid1())  # generate a unique id for the query
-            print("uuid: ", uuid)
+            print("uuid: ", id_)
             print(id_)
             d = {
                 'id': id_,
@@ -1044,17 +1215,21 @@ def handle_query():
                 'goodsId': goodsId,
                 'callbackUrl': callbackUrl
             }
-
-            # bind the text to the query id
             print(d)
+            # bind the text to the query id
+            # recall_10(id_, title, abst_zh, content)
+
+            Thread_recall = Thread(target=recall_10, args=(id_, title, abst_zh, content,))
+            Thread_recall.start()
+
             load_request_path = './request_data_logs/{}.json'.format(id_)
-            with open(load_request_path, 'w', encoding='utf8') as f2:
-                # ensure_ascii=False is required to write Chinese characters (otherwise they are Unicode-escaped)
-                # indent=2 indents the JSON data for readability
+            with open(load_request_path, 'w', encoding='utf8') as f2:  # ensure_ascii=False is required to write Chinese characters (otherwise they are Unicode-escaped); indent pretty-prints the JSON
                 json.dump(d, f2, ensure_ascii=False, indent=4)
-            redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push onto redis
-            redis_.sadd(db_key_querying, id_)
-            redis_.sadd(db_key_queryset, id_)
+            # redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push onto redis
+            # redis_.sadd(db_key_querying, id_)
+            # redis_.sadd(db_key_queryset, id_)
+
+            redis_.set(id_ + "_request_check", load_request_path)
             return_text = {
                 'code': 0,
                 'msg': "请求成功",
@@ -1070,9 +1245,11 @@ def handle_query():
         return_text = {'code': 1}
     return jsonify(return_text)  # return the result
 
+t1 = Thread(target=classify_accurate_check)
+t1.start()
 
-t = Thread(target=classify)
-t.start()
+# t = Thread(target=classify_recall)
+# t.start()
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
+    app.run(host="0.0.0.0", port=16001, threaded=True)
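
Reviewer note on the new flow: handle_query now writes the request JSON to disk, stores its path in Redis under '<id>_request_check', and fires recall_10 on a background thread; the external recall service is then expected to answer by pushing a message of the form {"uuid": ..., "data": <json string>} onto the 'query_recall' list, which classify_accurate_check polls. Below is a minimal sketch of that handshake, assuming a local Redis with the credentials from this patch; the fake_recall_service and fake_worker_step helpers are illustrative stand-ins, not functions from this codebase:

    import json
    import redis

    r = redis.Redis(host="localhost", port=63179, db=8, password="zhicheng123*")

    def fake_recall_service(task_id, recall_dict):
        # What the recall side must publish when it finishes: the task uuid
        # plus the recall payload serialized as a string, pushed onto the
        # 'query_recall' list that classify_accurate_check polls.
        message = {"uuid": task_id, "data": json.dumps(recall_dict)}
        r.rpush("query_recall", json.dumps(message))

    def fake_worker_step():
        # One iteration of the consumer loop, mirroring classify_accurate_check:
        # pop a recall result, then load the original request file whose path
        # handle_query stored under '<uuid>_request_check'.
        raw = r.lpop("query_recall")
        if raw is None:
            return None
        message = json.loads(raw.decode("UTF-8"))
        request_path = r.get(message["uuid"] + "_request_check").decode("UTF-8")
        with open(request_path, encoding="utf8") as f:
            data_dict = json.load(f)
        return data_dict, json.loads(message["data"])

Passing only the file path through Redis keeps full paper texts out of the queue, at the cost of requiring both processes to share ./request_data_logs/ on the same filesystem.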