From 70e151d68793fad0852037f32c19ceb623fc0d2d Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Tue, 9 Dec 2025 18:15:45 +0800 Subject: [PATCH] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= =?UTF-8?q?=EF=BC=8C=E8=87=AA=E5=BB=BA=E5=BA=93=E6=9F=A5=E8=AF=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 0 Rouge_w.py | 192 ++++++ flask_check_bert_test.py | 1590 +++++++++++++++++++++++++++++++++++++++++++ redis_check_uuid_mistral.py | 92 +++ 连接数据库.py | 49 ++ 5 files changed, 1923 insertions(+) create mode 100644 README.md create mode 100644 Rouge_w.py create mode 100644 flask_check_bert_test.py create mode 100644 redis_check_uuid_mistral.py create mode 100644 连接数据库.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/Rouge_w.py b/Rouge_w.py new file mode 100644 index 0000000..9a57d13 --- /dev/null +++ b/Rouge_w.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +""" +@Time : 2023/3/16 11:03 +@Author : +@FileName: +@Software: +@Describe: +""" +from rouge import Rouge +rouge = Rouge() +from copy import deepcopy + +class Rouge_w: + def __init__(self): + self.k = 0.1 + self.ki = 1.2 + self.p = 1.0 + + def fi_(self,a): + return a * self.ki + + def f(self, a): + return self.k * (a ** 2) + + def WLCS(self, X, Y, f): + m = len(X) + n = len(Y) + c = [[0 for j in range(n+1)] for i in range(m+1)] + w = [[0 for j in range(n+1)] for i in range(m+1)] + + for i in range(1, m+1): + for j in range(1, n+1): + if X[i-1] == Y[j-1]: + k = w[i-1][j-1] + c[i][j] = c[i-1][j-1] + 10.0 * (f(k+1) - f(k)) + w[i][j] = k+1 + else: + if c[i-1][j] > c[i][j-1]: + c[i][j] = c[i-1][j] + w[i][j] = 0 + else: + c[i][j] = c[i][j-1] + w[i][j] = 0 + + return c[m][n] + + def f_1(self, k): + return k ** 0.5 + + def f_(self, k): + return k ** 2 + +# print(WLCS([1,2,5], [1,2,5],f)) + + def score(self, p, r): + m = len(p) + n = len(r) + wlcs = self.WLCS(p, r, self.f) + p_wlcs = self.f_1(wlcs/self.f_(m)) + r_wlcs = self.f_1(wlcs/self.f_(n)) + f_lcs = (1 + self.p **2) * ((p_wlcs * r_wlcs) / (p_wlcs + ((self.p ** 2) *r_wlcs) + 1e-8)) + return f_lcs + +class Rouge_l: + def __init__(self): + self.b = 3 + + def LCS(self, X, Y): + m = len(X) + n = len(Y) + # 创建一个二维数组来存储中间结果 + dp = [[0] * (n + 1) for _ in range(m + 1)] + + # 使用动态规划填充dp数组 + for i in range(1, m + 1): + for j in range(1, n + 1): + if X[i - 1] == Y[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + return dp[m][n] + +# print(WLCS([1,2,5], [1,2,5],f)) + + def score(self, p, r): + m = len(p) + n = len(r) + lcs = self.LCS(p, r) + p_lcs = lcs/m + r_lcs = lcs/n + f_lcs = ((1 + self.b ** 2) * (p_lcs * r_lcs) / (p_lcs + self.b ** 2 * r_lcs + 1e-8)) + return f_lcs + + +# class Ngrams(object): +# """ +# Ngrams datastructure based on `set` or `list` +# depending in `exclusive` +# """ +# +# def __init__(self, ngrams={}, exclusive=True): +# if exclusive: +# self._ngrams = set(ngrams) +# else: +# self._ngrams = list(ngrams) +# self.exclusive = exclusive +# +# def add(self, o): +# if self.exclusive: +# self._ngrams.add(o) +# else: +# self._ngrams.append(o) +# +# def __len__(self): +# return len(self._ngrams) +# +# def intersection(self, o): +# if self.exclusive: +# inter_set = self._ngrams.intersection(o._ngrams) +# return Ngrams(inter_set, exclusive=True) +# else: +# other_list = deepcopy(o._ngrams) +# inter_list = [] +# +# for e in self._ngrams: +# try: +# i = other_list.index(e) +# except ValueError: +# continue +# 
other_list.pop(i) +# inter_list.append(e) +# return Ngrams(inter_list, exclusive=False) +# +# def union(self, *ngrams): +# if self.exclusive: +# union_set = self._ngrams +# for o in ngrams: +# union_set = union_set.union(o._ngrams) +# return Ngrams(union_set, exclusive=True) +# else: +# union_list = deepcopy(self._ngrams) +# for o in ngrams: +# union_list.extend(o._ngrams) +# return Ngrams(union_list, exclusive=False) +# +# class Rouge_l: +# def __init__(self): +# +# def score(self, evaluated_sentences, reference_sentences, raw_results=False, exclusive=True, **_): +# if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: +# raise ValueError("Collections must contain at least 1 sentence.") +# +# # total number of words in reference sentences +# m = len( +# Ngrams( +# _split_into_words(reference_sentences), +# exclusive=exclusive)) +# +# # total number of words in evaluated sentences +# n = len( +# Ngrams( +# _split_into_words(evaluated_sentences), +# exclusive=exclusive)) +# +# # print("m,n %d %d" % (m, n)) +# union_lcs_sum_across_all_references = 0 +# union = Ngrams(exclusive=exclusive) +# for ref_s in reference_sentences: +# lcs_count, union = _union_lcs(evaluated_sentences, +# ref_s, +# prev_union=union, +# exclusive=exclusive) +# union_lcs_sum_across_all_references += lcs_count +# +# llcs = union_lcs_sum_across_all_references +# r_lcs = llcs / m +# p_lcs = llcs / n +# +# f_lcs = 2.0 * ((p_lcs * r_lcs) / (p_lcs + r_lcs + 1e-8)) + +if __name__ == '__main__': + + rouge_model = Rouge_l() + X = ["A", "B", "C", "D", "u", "u", "u", "u", "u", "u"] + Y1 = ["A", "B", "C", "D", "H", "I", "K", "K", "K", "K", "K", "K"] + Y2 = ["A", "H", "B", "K", "C", "I", "K", "K", "K", "K", "K", "K"] + # X = "我爱你" + # Y = "我他爱" + print(rouge_model.score(X, Y1)) + # print(WLCS([1,2,5], [1,2,5],f)) \ No newline at end of file diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py new file mode 100644 index 0000000..a837a40 --- /dev/null +++ b/flask_check_bert_test.py @@ -0,0 +1,1590 @@ +# -*- coding:utf-8 -*- +import os +import numpy as np +from numpy.linalg import norm +import pandas as pd +# from rouge import Rouge +from rouge_chinese import Rouge +from Rouge_w import Rouge_w, Rouge_l +import json +# import pymysql +import re +import requests +from flask import Flask, jsonify +from flask import request +import uuid +import time +import redis +from threading import Thread +from multiprocessing import Pool + +app = Flask(__name__) +app.config["JSON_AS_ASCII"] = False + +# pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=16, password="zhicheng123*") + +pool = redis.ConnectionPool(host='192.168.31.74', port=63179, max_connections=100, db=17, password="zhicheng123*") +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +db_key_querying = 'querying_check_task' +db_key_queryset = 'queryset_check_task' +db_key_query_recall = 'query_recall' + + +nums_cpus = 24 +rouge = Rouge() +rouge_model = Rouge_w() +rouge_l_model = Rouge_l() + +def jaccard_similarity(s1, s2): + set1 = set(s1) + set2 = set(s2) + intersection = set1 & set2 + union = set1 | set2 + return len(intersection) / len(union) + +# def bert_check(text, recall_data_list): +# ''' +# bert 查重 +# :return: +# ''' +# +# sen_0 = [text] * len(recall_data_list) +# sen_1 = [i[0] for i in recall_data_list] +# +# return_list = [] +# request_json = { +# "texts": [sen_0, sen_1], +# } +# paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json) +# score_list = paper_dict["res"] +# +# # 后期要改 +# 
# return_list.append(re1[0][1]) +# # return_list.append(re1[0][0]) +# if 1 in score_list: +# index_score = score_list.index(1) +# else: +# index_score = "NaN" +# +# if index_score == "NaN": +# return_list.append(0) +# return_list.append("") +# else: +# return_list.append(1) +# return_list.append(index_score) +# +# return return_list + + +def rouge_value_self(data_1, data_2): + data_1 = [' '.join(i) for i in data_1] + data_2 = [' '.join(i) for i in data_2] + rouge_l_list = [] + + for sen_1, sen_2 in zip(data_1, data_2): + sen_1 = sen_1.split(" ") + sen_2 = sen_2.split(" ") + rouge_l_score = rouge_l_model.score(sen_1, sen_2) + rouge_l_list.append(rouge_l_score) + + return "", "", rouge_l_list + + +def strsim_value(data_1, data_2): + data_1 = [' '.join(i) for i in data_1] + data_2 = [' '.join(i) for i in data_2] + rouge_l_list = [] + + for sen_1, sen_2 in zip(data_1, data_2): + sen_1 = sen_1.split(" ") + sen_2 = sen_2.split(" ") + rouge_l_score = jaccard_similarity(sen_1, sen_2) + rouge_l_list.append(rouge_l_score) + + return "", "", rouge_l_list + + +def rouge_pre(text, df_train_nuoche): + return_list = [] + index_rouge_list = [] + text_list = [text] * len(df_train_nuoche) + + data_list = [] + for data_dan in df_train_nuoche: + data_list.append(data_dan[0]) + rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list) + index_rouge_list.extend(rouge_l) + + re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)] + + return_list.append(re1[0][1]) + return_list.append(re1[0][0]) + + return return_list + + +def rouge_pre_m(text, df_train_nuoche): + + return_list = [] + index_rouge_list = [] + + text_list = [text] * len(df_train_nuoche) + + data_list = [] + for data_dan in df_train_nuoche: + data_list.append(data_dan[0]) + rouge_1, rouge_2, rouge_l = strsim_value(text_list, data_list) + index_rouge_list.extend(rouge_l) + + re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)] + + return_list.extend(re1) + + return return_list + + +def rouge_pre_m_1(bool_check_sentense, content_list, recall_data_list): + # bool_check_sentense [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + + bool_check_sentense_new = [] + for bool_check_sentense_dan in bool_check_sentense: + bool_check_sentense_new_dan = [] + + text_list = [] + data_list = [] + linshi = [] + for i in bool_check_sentense_dan: + text1 = content_list[i[0]] + text2 = recall_data_list[i[1]][0] + linshi.append([i[0], i[1]]) + text_list.append(text1) + data_list.append(text2) + _, _, rouge_l_list = rouge_value_self(text_list, data_list) + for i in range(len(rouge_l_list)): + if rouge_l_list[i] > 0.47: + bool_check_sentense_new_dan.append(linshi[i]) + if bool_check_sentense_new_dan != []: + bool_check_sentense_new.append(bool_check_sentense_new_dan) + return bool_check_sentense_new + +# 以单个章节为例 +def similar_content_func(): + ''' + 重复文章 + :return: + ''' + return [{ + "content": "重复的内容标红", + "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01", + "title": "标题", + "year": "日期", + "degree": "来源", + "author": "作者" + }] + + +def original_text_contrast_func(data_sentence_dan, paper_dict, content_list): + ''' + 重复的对比详细信息 + :param similar_content: + :return: + ''' + + if data_sentence_dan != []: + original_text = "" + start = len(data_sentence_dan[0][1]) + end = 0 + similar_content = [] + for i in data_sentence_dan: # 可能有很多个暂且确定是一个 + + similar_content_dan = { + "paper_red_len_word": "", + 
"content": "重复的内容标红", + "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01", + "title": "标题", + "year": "日期", + "degree": "来源", + "author": "作者", + "paper_len_word": "" + } + + sentence_0_bool, sentence_0_dan_red = original_text_marked_red(i[1], paper_dict[i[0]][0], + paper_dict[i[0]][4][0], + paper_dict[i[0]][4][1]) # text_original, bert_text, bert_text_pre + + sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2], + paper_dict[i[0]][4][2], + paper_dict[i[0]][4][3]) # text_original, bert_text, bert_text_pre + + if sentence_0_bool == False or sentence_1_bool == False: + continue + + start_dan = sentence_0_dan_red.index("") + end_dan = sentence_0_dan_red.index("") - len("") + + if start_dan < start: + start = start_dan + if end_dan > end: + end = end_dan + + similar_content_dan["content"] = sentence_1_dan_red + similar_content_dan["title"] = i[3] + similar_content_dan["author"] = "" + similar_content_dan["degree"] = "" + similar_content_dan["year"] = "" + similar_content_dan["paper_len_word"] = "" + similar_content_dan["paper_red_len_word"] = end_dan - start_dan + + thesis_info = " ".join( + [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"], + similar_content_dan["year"]]) + similar_content_dan["thesis_info"] = thesis_info + + similar_content.append(similar_content_dan) + + original_text_list = list(data_sentence_dan[0][1]) + # original_text_list.insert(end, "\n") + # original_text_list.insert(start, "\n") + target_text_str = "".join(["\n"] + original_text_list[start: end] + ["\n"]) + + original_text_start = "".join(original_text_list[:start]) + original_text_end = "".join(original_text_list[end:]) + + print(data_sentence_dan) + if data_sentence_dan[0][4][0]-1 < 0: + start_sen = "" + else: + start_sen = content_list[data_sentence_dan[0][4][0]-1] + + if data_sentence_dan[0][4][-1]+1 >= len(content_list): + end_sen = "" + else: + end_sen = content_list[data_sentence_dan[0][4][-1]+1] + + start_sen = start_sen + original_text_start + end_sen = original_text_end + end_sen + original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60] + else: + original_text = "" + end = 0 + start = 0 + similar_content = [] + return_info = { + "original_text": original_text, + "dan_sentence_word_nums": end - start, + "similar_content": similar_content + } + return return_info + + +def repeat_quote_info_func(original_text_contrast, section_words): + ''' + 重复的引用信息 + :return: + ''' + chongfuwendang = {} + + for sentence_dan in original_text_contrast: + for i in sentence_dan["similar_content"]: + thesis_info = i["thesis_info"] + if thesis_info not in chongfuwendang: + chongfuwendang[thesis_info] = { + "quote": False, + "thesis_author": i["author"], + "thesis_date": i["year"], + "thesis_info": thesis_info, + "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100, # str(round(repeat_rate, 1)) + "%" + # round(repetition_rate, 3) * 100 + "thesis_title": i["title"], + "thesis_link": "", + "thesis_publish": i["degree"], + "thesis_repeat_word": i["paper_red_len_word"], + "thesis_teacher": "", + "paper_len_word": i["paper_len_word"] + } + else: + chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"] + chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / + section_words) * 100 + chongfuwendang = sorted(chongfuwendang.items(), + key=lambda x: x[1]["thesis_repeat_rate"], 
reverse=False) + + + chongfuwendang_list = [] + + for i in chongfuwendang: + chongfuwendang_dan = i[1] + print(chongfuwendang_dan) + chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%" + chongfuwendang_list.append(chongfuwendang_dan) + + + return chongfuwendang_list + + +def total_data_func(section_data_list): + ''' + 总体数据 + :return: + ''' + # "end_page_index": 0, + # "name": "第1部分", + # "repeat_rate": repeat_rate, + # "repeat_words": repeat_words, + # "start_page_index": 0, + # "words": section_words, + # "original_text": original_text, + # "original_text_oneself": original_text, + # "original_text_contrast/重复的对比详细信息": original_text_contrast, + # "repeat_quote_info/重复的引用信息": repeat_quote_info + + repeat_words = 0 + words = 0 + + for i in section_data_list: + repeat_words += i["repeat_words"] + words += i["words"] + + baifenbi = (repeat_words / words) *100 + exclude_personal_rate = str(round(baifenbi, 1)) + "%" + exclude_quote_rate = str(round(baifenbi, 1)) + "%" + single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"] + single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"] + total_repeat_rate = str(round(baifenbi, 1)) + "%" + total_repeat_words = repeat_words + total_words = words + + print(exclude_personal_rate) + + return { + "back_repeat_words": "", + "exclude_personal_rate": exclude_personal_rate, + "exclude_quote_rate": exclude_quote_rate, + "front_repeat_words": "", + "single_max_rate": single_max_rate, + "single_max_repeat_words": single_max_repeat_words, + "suspected_paragraph": "", + "suspected_paragraph_max_repeat_words": "", + "suspected_paragraph_min_repeat_words": "", + "total_paragraph": "", + "total_repeat_rate": total_repeat_rate, + "total_repeat_words": total_repeat_words, + "total_words": total_words, + "tables": 0 + } + + +def section_data_func_dan(): + ''' + 章节信息单个 + :return: + ''' + # { + # "section_name": "章节名称", + # "section_repeat_rate": "重复率", + # "section_repeat_words": "重复字数", + # "section_words": "章节字数", + # "oneself_repeat_words": "去除本人后重复字数", + # "reference_repeat_words": "去除引用后重复字数", + # "section_oneself_rate": "去除本人后重复率" + # } + + return { + "section_name": "", + "section_repeat_rate": "", + "section_repeat_words": "", + "section_words": "", + "oneself_repeat_words": "", + "reference_repeat_words": "", + "section_oneself_rate": "" + } + + +def section_data_func(section_details): + ''' + 章节信息 + :return: + ''' + # "end_page_index": 0, + # "name": "第1部分", + # "repeat_rate": repeat_rate, + # "repeat_words": repeat_words, + # "start_page_index": 0, + # "words": section_words, + # "original_text": original_text, + # "original_text_oneself": original_text, + # "original_text_contrast/重复的对比详细信息": original_text_contrast, + # "repeat_quote_info/重复的引用信息": repeat_quote_info + + section_name = section_details["name"] + section_repeat_rate = section_details["repeat_rate"] + section_repeat_words = section_details["repeat_words"] + section_words = section_details["words"] + oneself_repeat_words = section_details["repeat_words"] + reference_repeat_words = section_details["repeat_words"] + section_oneself_rate = section_details["repeat_rate"] + + return { + "section_name": section_name, + "section_repeat_rate": section_repeat_rate, + "section_repeat_words": section_repeat_words, + "section_words": section_words, + "oneself_repeat_words": oneself_repeat_words, + "reference_repeat_words": reference_repeat_words, + "section_oneself_rate": 
section_oneself_rate + } + + +def section_details_func(data_section_dan, paper_dict, num_words, content_list, index_content_list_dan): + ''' + 章节详细信息 + :param original_text_contrast: + :param repeat_quote_info: + :return: + ''' + original_text_contrast = [] + section_repeat_rate = "" + repeat_words = 0 + section_words = num_words + oneself_repeat_words = "" + reference_repeat_words = "" + section_oneself_rate = "" + original_text_list = [] + + for sentence_dan in data_section_dan: + original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict, content_list) + original_text_contrast.append(original_text_contrast_dan) + repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] + original_text_list.append(original_text_contrast_dan["original_text"]) + + original_text = "".join(original_text_list) + repeat_rate = (repeat_words / section_words) * 100 + repeat_rate = str(round(repeat_rate, 1)) + "%" + + repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words) + + return { + "end_page_index": 0, + "name": "第{}部分".format(str(index_content_list_dan)), + "repeat_rate": repeat_rate, + "repeat_words": repeat_words, + "start_page_index": 0, + "words": section_words, + "original_text": original_text, + "original_text_oneself": original_text, + "original_text_contrast": original_text_contrast, + "repeat_quote_info": repeat_quote_info + } + + +def check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, chapter_data, index_content_list): + # similar_content_control, paper_dict, num_words, title, author, content_list + ''' + 生成返回字典 + :param similar_content_control: + :param paper_dict: + :param num_words: + :param title: + :param author: + :return: + ''' + if paper_dict_zong != []: + + # 模拟多个章节 + section_details_list = [] + for data_section_dan, paper_dict, num_words, content_list, index_content_list_dan in zip(similar_content_control_zong, paper_dict_zong, num_words_zong, chapter_data, index_content_list): + + # 章节详细信息 + section_details = section_details_func(data_section_dan, paper_dict, num_words, content_list, index_content_list_dan) + section_details_list.append(section_details) + + # 模拟多个章节 + + section_data_list = [] + for section_details in section_details_list: + section_data = section_data_func(section_details) + section_data_list.append(section_data) + + total_data = total_data_func(section_details_list) + + format = '%Y-%m-%d %H:%M:%S' + value = time.localtime(int(time.time())) + dt = time.strftime(format, value) + + paper_data = { + "author": "", + "check_time": dt, + "time_range": "1900-01-01至2023-08-08", + "title": "", + "total_data": total_data, + "section_data": section_data_list, + "section_details": section_details_list + } + else: + total_data = { + "back_repeat_words": "", + "exclude_personal_rate": 0, + "exclude_quote_rate": 0, + "front_repeat_words": "", + "single_max_rate": 0, + "single_max_repeat_words": 0, + "suspected_paragraph": "", + "suspected_paragraph_max_repeat_words": "", + "suspected_paragraph_min_repeat_words": "", + "total_paragraph": "", + "total_repeat_rate": 0, + "total_repeat_words": 0, + "total_words": 0, + "tables": 0 + } + + section_data_list = [{ + "section_name": "第一部分", + "section_repeat_rate": 0, + "section_repeat_words": 0, + "section_words": 0, + "oneself_repeat_words": 0, + "reference_repeat_words": 0, + "section_oneself_rate": 0 + }] + + + section_details_list = [ + { + "end_page_index": 0, + "name": "第1部分", + "repeat_rate": 0, + "repeat_words": 0, + "start_page_index": 0, + "words": 
0, + "original_text": "", + "original_text_oneself": "", + "original_text_contrast": [], + "repeat_quote_info": [] + } + ] + format = '%Y-%m-%d %H:%M:%S' + value = time.localtime(int(time.time())) + dt = time.strftime(format, value) + + paper_data = { + "author": "", + "check_time": dt, + "time_range": "1900-01-01至2023-08-08", + "title": "", + "total_data": total_data, + "section_data": section_data_list, + "section_details": section_details_list + } + return paper_data + + +def split_chapter(content_list): + ''' + + :param content_list: + :return: [[[sentence, sentence, ... sentence], 2000], [[sentence, sentence, ... sentence], 2000]] + ''' + + content_list_new = [] + zishu = 9000 + dangqianzishu = 0 + + i = 0 + content_list_dan = [] + while True: + if i >= len(content_list): + if content_list_dan != []: + content_list_new.append([content_list_dan, dangqianzishu]) + break + content_list_dan.append(content_list[i]) + dangqianzishu += len(content_list[i]) + if dangqianzishu > zishu: + content_list_new.append([content_list_dan, dangqianzishu]) + dangqianzishu = 0 + content_list_dan = [] + i += 1 + + return content_list_new + + +# def biahong_rule(s1, s2): +# ''' +# +# :param s1: +# :param s2: +# :return: +# { +# "probabilities": null, +# "resilt": [ +# [ +# "而且受大型火灾事件的影响,会对公众心理造成相应的伤害,火灾发生后让人们产生恐惧、烦躁等心理现象,这也会妨碍对火灾的管理。1大型商业建筑火灾的特点大型企业的商业建筑一般设计为商业综合体,其中包括办公,餐饮,商铺,娱乐等活动场所,具有建筑市场规模大、人数多、火灾荷载大、消防救援难等特点,其建筑火灾具有以下特点。1.1可燃物种类多,火灾荷载集中大型企业综合商业建筑设计一般主要包括室内步行街和大量的商铺柜台等部分。", +# "火灾的管理。1大型商业建筑火灾的特点大型企业的商业建筑一般设计为商业综合体,其中包括办公,餐饮,商铺,娱乐等活动场所,具有建筑市场规模大、人数多、火灾荷载大、消防救援难等特点,其建筑火灾具有以下特点。1.1可燃物种类多,火灾荷载集中大型企业综合商业建筑设计一般主要包括室内步行街和大量的商铺柜台等部分", +# "1大型商业建筑火灾的特点大型商业建筑一般为商业综合体,其中包含商铺、娱乐等多种场所,具有建筑规模大、人员多、火灾荷载大、扑救困难等特点,其建筑火灾有以下特点。1.1可燃物种类多,火灾荷载集中大型综合体商业建筑中一般包含室内步行街与大量百货专柜等部分。1111", +# "1大型商业建筑火灾的特点大型商业建筑一般为商业综合体,其中包含商铺、娱乐等多种场所,具有建筑规模大、人员多、火灾荷载大、扑救困难等特点,其建筑火灾有以下特点。1.1可燃物种类多,火灾荷载集中大型综合体商业建筑中一般包含室内步行街与大量百货专柜等部分。1111", +# [ +# 54, +# 204, +# 0, +# 126 +# ] +# ], +# ] +# } +# ''' +# +# id_start_1_dan_best = 0 +# id_end_1_dan_best = len(s1) -1 +# id_start_2_dan_best = 0 +# id_end_2_dan_best = len(s2) -1 +# sim_score_best_best = 0 +# +# while True: +# if sim_score_best >= 0.75: +# break +# else: +# id_start_1_dan = 0 +# id_end_1_dan = len(s1) - 1 +# id_start_2_dan = 0 +# id_end_2_dan = len(s2) - 1 +# +# sen_list = [ +# [s1[id_start_1_dan_best+1:id_end_1_dan_best +1], s2[id_start_2_dan_best:id_end_2_dan_best +1]], +# [s1[id_start_1_dan_best:id_end_1_dan_best], s2[id_start_2_dan_best:id_end_2_dan_best +1]], +# [s1[id_start_1_dan_best :id_end_1_dan_best + 1], s2[id_start_2_dan_best + 1:id_end_2_dan_best + 1]], +# [s1[id_start_1_dan_best:id_end_1_dan_best + 1], s2[id_start_2_dan_best:id_end_2_dan_best]] +# ] +# for i in sen_list: +# sim_score = jaccard_similarity(i[0], i[1]) +# +# +# return + +def chapter_check(dan_chapter_data, recall_data_list): + # ============================================================================================= + # 多进程算法 + # rouge算法查重 + # t1_0 = time.time() + # rst = [] + # p = Pool(nums_cpus) # 进程池中含有n个子进程 + # + # print("num_words", num_words) + # for i in range(len(content_list)): + # text = content_list[i] + # a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,)) + # rst.append(a) + # p.close() + # p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。 + # + # print("筛选句子完成") + # rst = [i.get() for i in rst] + # + # t2_0 = time.time() + # print(t2_0- t1_0) + + # ========================================================================================================= + + 
rst = [] + for i in range(len(dan_chapter_data)): + text = dan_chapter_data[i] + rst.append(rouge_pre_m(text, recall_data_list)) + + # ======================================================================================================== + + data_zong = [] + for i in range(len(rst)): + # print(rst[i]) + data_zong.append(rst[i]) + + t0 = time.time() + # bert算法查重 + # for text in content_list: + # bert_pre_list = bert_check(text, recall_data_list) + # data_zong.append(bert_pre_list) + t1 = time.time() + original_dict = [] + + # 找出相似的句子序号 + bool_check_sentense = [] # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + + + # rouge算法 + for i in range(len(data_zong)): + bool_check_sentense_dan = [] # [[1, 223],[1, 226], [1, 562]] + for j in range(len(data_zong[i])): + if data_zong[i][j][1] > 0.3: + # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0]) + bool_check_sentense_dan.append([i, data_zong[i][j][0]]) + if bool_check_sentense_dan != []: + bool_check_sentense.append(bool_check_sentense_dan) + + # 继续用rouge方法筛选 + + if bool_check_sentense == []: + pass + bool_check_sentense = rouge_pre_m_1(bool_check_sentense, dan_chapter_data, + recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + + print("bool_check_sentense", bool_check_sentense) + print("找出相似的句子序号完成") + + # print("data_zong", data_zong) + biao_red = biaohong(bool_check_sentense, data_zong, + recall_data_list) # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] + + print("biao_red", str(biao_red)) + + original_sentence_index = [] + # for i in biao_red: + # for j in i: + # original_sentence_index.append(j[0]) + + sentence_0_list = [] + sentence_1_list = [] + sim_paper_name = [] + + for i in range(len(biao_red)): + for j in range(len(biao_red[i])): + print("i,j",i, j) + # if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: + # sentence_0_list.append("".join([content_list[biao_red[i][j][0][0]], content_list[biao_red[i][j][0][1]], content_list[biao_red[i][j][0][2]]])) + # sentence_1_list.append( + # "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]])) + # sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1]) + # else: + # continue + + file_name = recall_data_list[biao_red[i][j][1][1]][1] + sentence_0_list_dan = [] + sentence_1_list_dan = [] + sentence_0_list_dan_index = [] + # houxuna_file_list = [ + # [recall_data_list[biao_red[i][j][1][0]][1], dan_chapter_data[biao_red[i][j][0][0]], + # recall_data_list[biao_red[i][j][1][0]][0]], + # [recall_data_list[biao_red[i][j][1][1]][1], dan_chapter_data[biao_red[i][j][0][1]], + # recall_data_list[biao_red[i][j][1][1]][0]], + # [recall_data_list[biao_red[i][j][1][2]][1], dan_chapter_data[biao_red[i][j][0][2]], + # recall_data_list[biao_red[i][j][1][2]][0]] + # ] + + sentence_0_list_dan = [dan_chapter_data[biao_red[i][j][0][index_simsentence]] for index_simsentence in range(len(biao_red[i][j][0]))] + houxuna_file_list = [[recall_data_list[biao_red[i][j][1][index_simsentence]][1], recall_data_list[biao_red[i][j][1][index_simsentence]][0]] for index_simsentence in range(len(biao_red[i][j][0]))] + + for dan_sen_info in houxuna_file_list: + if dan_sen_info[0] == file_name: + sentence_1_list_dan.append(dan_sen_info[1]) + if sentence_0_list_dan != [] and sentence_1_list_dan != []: + 
sentence_0_list.append("".join(sentence_0_list_dan)) + sentence_1_list.append("".join(sentence_1_list_dan)) + original_sentence_index.append(biao_red[i][j][0]) + sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1]) + + print("待标红句子筛选完成") + sentence_0_list_new = [] + sentence_1_list_new = [] + + for i in zip(sentence_0_list, sentence_1_list): + if len(i[0]) + len(i[1]) < 1200: + sentence_0_list_new.append(i[0]) + sentence_1_list_new.append(i[1]) + else: + print(len(i[0]) + len(i[1])) + continue + t2 = time.time() + + print() + for i in sentence_0_list_new: + print("sentence_0_list_new", i) + if sentence_0_list_new == sentence_1_list_new == []: + paper_dict = [] + else: + print("sentence_0_list_new", len(sentence_0_list_new)) + print("sentence_1_list_new", len(sentence_1_list_new)) + + # ================================================================================================ + # 深度学习标红 + paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new) + + # 策略标红 + + + + t3 = time.time() + print("标红完成") + print("标红时间", t3 - t2) + + print("paper_dict", paper_dict) + print("sentence_0_list_new", sentence_0_list_new) + print("sentence_1_list_new", sentence_1_list_new) + print("sim_paper_name", sim_paper_name) + similar_content_control = [[]] + # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f: + # json.dump(paper_dict, f, ensure_ascii=False) + if sentence_0_list_new != []: + sentence_0_list_new_cursor = sentence_0_list_new[0] + for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip( + range(len(paper_dict)), + sentence_0_list_new, + sentence_1_list_new, + sim_paper_name, + original_sentence_index): + + if sentence_0_list_new_cursor != sentence_0_dan: + similar_content_control.append( + [[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]]) + sentence_0_list_new_cursor = sentence_0_dan + else: + similar_content_control[-1].append( + [paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]) + return similar_content_control, paper_dict + + +def accurate_check_rouge( + text_paper, + recall_data_list +): + ''' + 精确查重出相似句子 + :param text: + :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]] + :return: + ''' + # 文本处理 + # content_list = [] + print("text_paper", len(text_paper)) + text_paper = str(text_paper).replace("。\n", "。") + content_list_old = text_paper.split("。") + + sentence_word_nums = 0 + + # 前处理,筛选句子 + content_list = [] + for i in content_list_old: + if len(i) <= 7: + continue + elif len(i) < 300: + content_list.append(i + "。") + if i == "。": + continue + # 分章 + content_list_zong = split_chapter(content_list) + + similar_content_control_zong = [] + paper_dict_zong = [] + num_words_zong = [] + chapter_data = [] + index_content_list = [] + + for index_content_list_zong in range(len(content_list_zong)): + dan_chapter_data, dan_chapter_num_words = content_list_zong[index_content_list_zong][0], content_list_zong[index_content_list_zong][1] + + similar_content_control, paper_dict = chapter_check(dan_chapter_data, recall_data_list) + similar_content_control_zong.append(similar_content_control) + paper_dict_zong.append(paper_dict) + num_words_zong.append(dan_chapter_num_words) + chapter_data.append(dan_chapter_data) + index_content_list.append(index_content_list_zong) + + paper_data = check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, 
chapter_data, index_content_list) + # data = [similar_content_control] + # + # # 模拟多个章节 + # section_details_list = [] + # for data_dan in data: + # data_section_dan = data_dan + # + # # 章节详细信息 + # section_details = section_details_func(data_section_dan, paper_dict, num_words) + # section_details_list.append(section_details) + # + # # 模拟多个章节 + # + # section_data_list = [] + # for section_details in section_details_list: + # section_data = section_data_func(section_details) + # section_data_list.append(section_data) + # + # total_data = total_data_func(section_details_list) + # + # format = '%Y-%m-%d %H:%M:%S' + # value = time.localtime(int(time.time())) + # dt = time.strftime(format, value) + # + # paper_data = { + # "author": author, + # "check_time": dt, + # "time_range": "1900-01-01至2023-08-08", + # "title": title, + # "total_data": total_data, + # "section_data": section_data_list, + # "section_details": section_details_list + # } + return paper_data + + +def biaohong(bool_check_sentense, data_zong, df_train_nuoche): + ''' + 标红的序号 [[0,1,2],[3,4,5]] + :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] + ''' + + # print("bool_check_sentense", bool_check_sentense) + biao_red = [] + i = 0 + start = -1 + end = -1 + tiaochu = False + while True: + # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \ + # + 1 >= len(df_train_nuoche): + # break + + if i >= len(bool_check_sentense): + break + + for j in bool_check_sentense[i]: + # print("j", j) + if j[0] + 1 > len(data_zong): + tiaochu = True + break + + # if bool_check_sentense[i][0][0] + 1 >= len(data_zong): + # if bool_check_sentense[] + # bool_check_sentense[i][0][0] + 1 = bool_check_sentense[i + 1][0][0] + # break + + for j in bool_check_sentense[i]: + if j[1] + 1 >= len(df_train_nuoche): + tiaochu = True + break + + if tiaochu == True: + break + + # elif bool_check_sentense[i-1][0][0] == start: + # biao_red_dan = [] + # for j in range(len(bool_check_sentense[i-1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + # biao_red_dan.append([[bool_check_sentense[i-1][j][0], bool_check_sentense[i-1][j][0]+ 1, bool_check_sentense[i-1][j][0] + 2], + # [bool_check_sentense[i-1][j][1] - 1, bool_check_sentense[i-1][j][1], bool_check_sentense[i+1][j][1] + 1]]) + # biao_red.append(biao_red_dan) + # + # elif bool_check_sentense[i+1][0][0] == end: + # biao_red_dan = [] + # for j in range(len(bool_check_sentense[i+1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + # biao_red_dan.append([[bool_check_sentense[i+1][j][0]-2, bool_check_sentense[i+1][j][0]-1, bool_check_sentense[i+1][j][0]], + # [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]]) + # biao_red.append(biao_red_dan) + + elif i == len(bool_check_sentense)-1: + if end == bool_check_sentense[i][0][0]: + i += 1 + break + elif bool_check_sentense[i][0][0]-1 == end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1: + index_list = [ii for ii in range(bool_check_sentense[i][0][0]-1, bool_check_sentense[i][0][0] + 1)] + elif bool_check_sentense[i][0][0]-1 == end and bool_check_sentense[i][0][0] == len(data_zong) -1: + index_list = [ii for ii in range(bool_check_sentense[i][0][0], bool_check_sentense[i][0][0] + 1)] + elif 
bool_check_sentense[i][0][0]-1 > end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1: + index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)] + else: + index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 2)] + + biaohongset = set() + biao_red_dan = [] + for j in range(len(bool_check_sentense[ + i])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + if bool_check_sentense[i][j][1] not in biaohongset: + biao_red_dan.append([index_list, + [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], + bool_check_sentense[i][j][1] + 1]]) + biaohongset.add(bool_check_sentense[i][j][1] - 1) + biaohongset.add(bool_check_sentense[i][j][1]) + biaohongset.add(bool_check_sentense[i][j][1] + 1) + else: + continue + + i += 1 + biao_red.append(biao_red_dan) + break + + elif bool_check_sentense[i][0][0] - 1 == start: + i += 1 + continue + elif bool_check_sentense[i][0][0] == end: + i += 1 + continue + elif bool_check_sentense[i][0][0] - 1 == end: + i += 1 + continue + else: + biaohongset = set() + biao_red_dan = [] + for j in range(len(bool_check_sentense[i])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + if bool_check_sentense[i][j][1] not in biaohongset: + biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1], + [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]]) + biaohongset.add(bool_check_sentense[i][j][1] - 1) + biaohongset.add(bool_check_sentense[i][j][1]) + biaohongset.add(bool_check_sentense[i][j][1] + 1) + else: + continue + + start = bool_check_sentense[i][0][0] - 1 + end = bool_check_sentense[i][0][0] + 1 + + if bool_check_sentense[i-1][0][0] == start: + for j in range(len(bool_check_sentense[i-1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + if bool_check_sentense[i - 1][j][1] not in biaohongset: + biao_red_dan.append([[bool_check_sentense[i-1][j][0], bool_check_sentense[i-1][j][0] + 1, bool_check_sentense[i-1][j][0] + 2], + [bool_check_sentense[i-1][j][1] - 1, bool_check_sentense[i-1][j][1], bool_check_sentense[i-1][j][1] + 1]]) + biaohongset.add(bool_check_sentense[i-1][j][1] - 1) + biaohongset.add(bool_check_sentense[i-1][j][1]) + biaohongset.add(bool_check_sentense[i-1][j][1] + 1) + else: + continue + + if bool_check_sentense[i+1][0][0] == end: + for j in range(len(bool_check_sentense[i+1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + if bool_check_sentense[i + 1][j][1] not in biaohongset: + biao_red_dan.append([[bool_check_sentense[i+1][j][0]-2, bool_check_sentense[i+1][j][0]-1, bool_check_sentense[i+1][j][0]], + [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]]) + biaohongset.add(bool_check_sentense[i+1][j][1] - 1) + biaohongset.add(bool_check_sentense[i+1][j][1]) + biaohongset.add(bool_check_sentense[i+1][j][1] + 1) + else: + continue + + i += 1 + biao_red.append(biao_red_dan) + + return biao_red # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] + + +def dialog_line_parse(url, text): + """ + 将数据输入模型进行分析并输出结果 + :param url: 模型url + :param text: 进入模型的数据 + :return: 模型返回结果 + """ + + response = requests.post( + url, + json=text, + timeout=100000 + ) + if response.status_code == 
200: + return response.json() + else: + # logger.error( + # "【{}】 Failed to get a proper response from remote " + # "server. Status Code: {}. Response: {}" + # "".format(url, response.status_code, response.text) + # ) + print("【{}】 Failed to get a proper response from remote " + "server. Status Code: {}. Response: {}" + "".format(url, response.status_code, response.text)) + print(text) + return {} + + +def is_english_char(char): + code = ord(char) + return 32 <= code <= 126 + + +def original_text_marked_red(text_original, bert_text, start, end): + ''' + 把原文标红字段找到 + :param text_original: + :param bert_text: + :param bert_text_pre: + :return: + ''' + try: + fuhao = ["\n"] + up_pointer = 0 + down_pointer = 0 + + pointer_list = [] + + bert_text_list = list(bert_text) + bert_text_list.insert(start, "") + bert_text_list.insert(end + 2, "") + + text_original_list = list(text_original) + + up = 0 + down = 0 + + while True: + if up == len(text_original_list): + break + + if text_original_list[up] == bert_text_list[down]: + up += 1 + down += 1 + + else: + if bert_text_list[down] == "": + down += 1 + elif bert_text_list[down] == "": + down += 1 + else: + bert_text_list.insert(down, text_original_list[up]) + up += 1 + down += 1 + + bert_text = "".join(bert_text_list) + return True, bert_text + except: + print("句子标红报错") + print(text_original, bert_text) + return False, "" + + +def biaohong_bert_predict(sentence_0_list, sentence_1_list): + ''' + 找出标红字符 + :param bool_check_sentense: + :return: + ''' + + paper_dict = \ + dialog_line_parse("http://192.168.31.74:16003/", + {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[ + "resilt"] + + return paper_dict + + +def ulit_text(title, text): + data = [] + try: + text = json.loads(text)["content"] + except: + pass + + text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n") + text_list = text.split("\n") + + for i in text_list: + data.append([i, title]) + return data + + +def run_query(conn, sql, params): + with conn.cursor() as cursor: + cursor.execute(sql, params) + result = cursor.fetchall() + return result + + +# def processing_one_text(paper_id): +# conn = pymysql.connect( +# host='192.168.31.145', +# port=3306, +# user='root', +# password='123456', +# db='zhiwang_db', +# charset='utf8mb4', +# cursorclass=pymysql.cursors.DictCursor +# ) +# +# sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s' +# params = (paper_id,) +# +# result = run_query(conn, sql, params) +# +# conn.close() +# print(result[0]['title'], result[0]['author']) +# title = result[0]['title'] +# author = result[0]['author'] +# degree = result[0]['degree'] +# year = result[0]['content'].split("/")[5] +# content_path = result[0]['content'] +# +# try: +# with open(content_path, encoding="utf-8") as f: +# text = f.read() +# except: +# with open(content_path, encoding="gbk") as f: +# text = f.read() +# +# paper_info = { +# "title": title, +# "author": author, +# "degree": degree, +# "year": year, +# "paper_len_word": len(text) +# } +# data = ulit_text(paper_info, text) +# return data + + +from clickhouse_driver import Client + +class PureClient: + def __init__(self, database='mini_check'): + # 只需要写本地地址 + self.client = Client(host=f'{"192.168.31.74"}', port=9000, user='default', + password='zhicheng123*', database=database) + + def run(self, sql): + client = self.client + collection = client.query_dataframe(sql) + return collection + +def processing_one_text(user_uuid): + + pureclient = PureClient() + print("paper_id", user_uuid) + sql = f"SELECT * 
FROM user_table WHERE user_uuid='{user_uuid}'" + result = pureclient.run(sql) + return result + + +def ulit_recall_paper(uuid_uesr): + ''' + 对返回的十篇文章路径读取并解析 + :param recall_data_list_path: + :return data: list [[sentence, filename],[sentence, filename],[sentence, filename]] + ''' + + # data = [] + # for path in recall_data_list_path: + # filename = path.split("/")[-1] + # with open(path, encoding="gbk") as f: + # text = f.read() + # text_list = text.split("\n") + # for sentence in text_list: + # if sentence != "": + # data.append([sentence, filename]) + # return data + res = processing_one_text(uuid_uesr) + res_list = res.values.tolist() + + data = [] + for res_dan in res_list: + user_uuid = res_dan[0] + file_path = res_dan[1] + is_delete = res_dan[2] + if is_delete == 1: + try: + with open(file_path, encoding="gbk") as f: + text = f.read() + except: + with open(file_path, encoding="utf-8") as f: + text = f.read() + text_list = text.split("。") + + filename = file_path.split("/")[-1].split(".")[0] + for sentence in text_list: + if sentence != "": + data.append([sentence.strip("\n"), filename]) + + return data + + +def recall_10(queue_uuid, title, abst_zh, content): + ''' + 宇鹏召回接口 + :param paper_name: + :return: + ''' + + request_json = { + "uuid": queue_uuid, + "title": title, + "abst_zh": abst_zh, + "content": content + } + print(request_json) + dialog_line_parse("http://192.168.31.145:50004/check1", request_json) + + + +def uilt_content(content): + zhaiyao_list = ["摘要"] + zhaiyao_en_list = ["Abstract", "abstract"] + mulu_list = ["目录"] + key_word_list = ["关键词"] + caikanwenxian = ["参考文献"] + key_word_bool = False + key_word_str = "" + zhaiyao_bool = False + zhaiyao_en_bool = False + zhaiyao_str = "" + zhaiyao_en_str = "" + mulu_str = "" + zhaiyao_text = "" + mulu_bool = False + + pantten_zhaiyao = '(摘\s*要)' + result_biaoti_list = re.findall(pantten_zhaiyao, content) + if len(result_biaoti_list) != 0: + zhaiyao_str = result_biaoti_list[0] + zhaiyao_bool = True + else: + zhaiyao_bool = False + + for i in zhaiyao_en_list: + if i in content: + zhaiyao_en_bool = True + zhaiyao_en_str = i + break + + for i in mulu_list: + if i in content: + mulu_str = i + mulu_bool = True + break + + for i in key_word_list: + if i in content: + key_word_str = i + key_word_bool = True + break + + if zhaiyao_bool == True and key_word_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str) + result_biaoti_list = re.findall(pantten_zhaiyao, content) + zhaiyao_text = result_biaoti_list[0] + + elif zhaiyao_bool == True and zhaiyao_en_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str) + result_biaoti_list = re.findall(pantten_zhaiyao, content) + zhaiyao_text = result_biaoti_list[0] + + elif zhaiyao_bool == True and mulu_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str) + result_biaoti_list = re.findall(pantten_zhaiyao, content) + zhaiyao_text = result_biaoti_list[0] + + if zhaiyao_text == "": + content = str(content).replace("。\n", "。") + content_list = content.split("。") + zhaiyao_text = "".join(content_list[:15]) + return zhaiyao_text + + +def ulit_request_file(file): + file_name = file.filename + if file_name.split(".")[-1] == "txt": + file_name_save = "data/request/{}".format(file_name) + file.save(file_name_save) + try: + with open(file_name_save, encoding="gbk") as f: + content = f.read() + except: + with open(file_name_save, encoding="utf-8") as f: + content = f.read() + + content = " ".join([i for i in content.split("\n") if i != 
""]) + abst_zh = uilt_content(content) + + return abst_zh, content + + +# @app.route("/", methods=["POST"]) +# def handle_query(): +# print(request.remote_addr) +# +# # request.form.get('prompt') +# dataBases = request.form.get("dataBases") +# minSimilarity = request.form.get("minSimilarity") # txt +# minWords = request.form.get("minWords") +# title = request.form.get("title") +# author = request.form.get("author") # txt +# file = request.files.get('file') +# token = request.form.get("token") +# account = request.form.get("account") +# goodsId = request.form.get("goodsId") +# callbackUrl = request.form.get("callbackUrl") +# +# +# t0 = time.time() +# abst_zh, content = ulit_request_file(file) +# +# # 调用宇鹏查询相似十篇 +# # recall_data_list_dict = recall_10(title, abst_zh, content) +# +# t1 = time.time() +# print("查找相似的50篇完成") +# with open("data/rell_json.txt") as f: +# recall_data_list_dict = eval(f.read()) +# +# # 读取文章转化成格式数据 +# recall_data_list = ulit_recall_paper(recall_data_list_dict) +# print("文章格式转化完成") +# +# # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() +# +# # 进入精确查重系统 +# print("进入精确查重系统") +# return_list = accurate_check_rouge(title, author, content, recall_data_list) +# +# print("召回50篇", t1 - t0) +# +# return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} +# return jsonify(return_text) # 返回结果 + + +# def classify_recall(): # 调用模型,设置最大batch_size +# while True: +# if redis_.llen(db_key_query) == 0: # 若队列中没有元素就继续获取 +# time.sleep(3) +# continue +# query = redis_.lpop(db_key_query).decode('UTF-8') # 获取query的text +# data_dict_path = json.loads(query) +# path = data_dict_path['path'] +# # text_type = data_dict["text_type"] +# +# with open(path, encoding='utf8') as f1: +# # 加载文件的对象 +# data_dict = json.load(f1) +# +# queue_uuid = data_dict['id'] +# print(queue_uuid) +# dataBases = data_dict['dataBases'] +# minSimilarity = data_dict['minSimilarity'] +# minWords = data_dict['minWords'] +# title = data_dict['title'] +# author = data_dict['author'] +# abst_zh = data_dict['abst_zh'] +# content = data_dict['content'] +# token = data_dict['token'] +# account = data_dict['account'] +# goodsId = data_dict['goodsId'] +# callbackUrl = data_dict['callbackUrl'] +# +# # 调用宇鹏查询相似十篇 +# recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content) +# +# # print("查找相似的50篇完成") +# # with open("data/rell_json.txt") as f: +# # recall_data_list_dict = eval(f.read()) +# +# # 读取文章转化成格式 + + +def classify_accurate_check(): + while True: + if redis_.llen(db_key_query_recall) == 0: # 若队列中没有元素就继续获取 + time.sleep(3) + continue + + query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8') # 获取query的text + query_recall_dict = json.loads(query_recall) # db_key_query_recall json.dumps({"id": id_, "path": load_request_path}) + query_recall_uuid = query_recall_dict["id"] + data_dict_path = query_recall_dict["path"] + print(data_dict_path) + + # d = { + # "uuid_uesr": uuid_uesr, + # "content": content + # } + with open(data_dict_path, encoding='utf8') as f: + data_dict = json.loads(f.read()) + + + uuid_uesr = data_dict['uuid_uesr'] + abstract = data_dict['content'][0] + content = data_dict['content'][1] + # try: + recall_data_list = ulit_recall_paper(uuid_uesr) + + print("查找相似的50篇完成") + + # with open("data/rell_json.txt") as f: + # recall_data_list_dict = eval(f.read()) + # recall_data_list = ulit_recall_paper(recall_data_list_dict) + + print("文章格式转化完成") + + # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", 
encoding="utf-8").values.tolist() + + # 进入精确查重系统 + print("进入精确查重系统") + + return_list = accurate_check_rouge(content, recall_data_list) + + + return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} + + load_result_path = "./new_data_logs/{}.json".format(query_recall_uuid) + + print("queue_uuid: ", query_recall_uuid) + print("load_result_path: ", load_result_path) + + with open(load_result_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(return_text, f2, ensure_ascii=False, indent=4) + + print(query_recall_uuid) + print(load_result_path) + redis_.set(query_recall_uuid, load_result_path, 86400) + redis_.srem(db_key_querying, query_recall_uuid) + # except: + # return_text = {"resilt": "", "probabilities": None, "status_code": 401} + # load_result_path = "./new_data_logs/{}.json".format(queue_uuid) + # + # print("queue_uuid: ", queue_uuid) + # print("load_result_path: ", load_result_path) + # + # with open(load_result_path, 'w', encoding='utf8') as f2: + # # ensure_ascii=False才能输入中文,否则是Unicode字符 + # # indent=2 JSON数据的缩进,美观 + # json.dump(return_text, f2, ensure_ascii=False, indent=4) + # + # print(queue_uuid) + # print(load_result_path) + # redis_.set(queue_uuid, load_result_path, 86400) + # redis_.srem(db_key_querying, queue_uuid) + + +@app.route("/", methods=["POST"]) +def handle_query(): + try: + print(request.remote_addr) + + uuid_uesr = request.form.get("uuid") + file = request.files.get('file') + + content = ulit_request_file(file) + + id_ = str(uuid.uuid1()) # 为query生成唯一标识 + # 绑定文本和query id + # recall_10(id_, title, abst_zh, content) + d = { + "uuid_uesr": uuid_uesr, + "content": content + } + load_request_path = './request_data_logs/{}.json'.format(id_) + with open(load_request_path, 'w', encoding='utf8') as f2: + json.dump(d, f2, ensure_ascii=False, indent=4) + + redis_.rpush(db_key_query_recall, json.dumps({"id": id_, "path": load_request_path})) # 加入redis + return_text = { + 'code': 0, + 'msg': "请求成功", + 'data': { + 'balances': "", + 'orderId': id_, + 'consumeNum': "" + } + } + + print("ok") + except: + return_text = {'code': 1} + return jsonify(return_text) # 返回结果 + +t1 = Thread(target=classify_accurate_check) +t1.start() + + +if __name__ == "__main__": + app.run(host="0.0.0.0", port=20000, threaded=True) diff --git a/redis_check_uuid_mistral.py b/redis_check_uuid_mistral.py new file mode 100644 index 0000000..79f16f1 --- /dev/null +++ b/redis_check_uuid_mistral.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +""" +@Time : 2023/3/2 19:31 +@Author : +@FileName: +@Software: +@Describe: +""" +# +# import redis +# +# redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='', db=0) +# redis_conn = redis.Redis(connection_pool=redis_pool) +# +# +# name_dict = { +# 'name_4' : 'Zarten_4', +# 'name_5' : 'Zarten_5' +# } +# redis_conn.mset(name_dict) + +import flask +import redis +import uuid +import json +from threading import Thread +import time + +app = flask.Flask(__name__) +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=6, password="zhicheng123*") +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +db_key_query = 'queryset_check_task' +db_key_querying = 'querying_check_task' +db_key_error = 'error' + +@app.route("/search", methods=["POST"]) +def handle_query(): + id_ = flask.request.json['id'] # 获取用户query中的文本 例如"I love you" + result = redis_.get(id_) # 获取该query的模型结果 + if result is not None: + # redis_.delete(id_) + result_path = 
result.decode('UTF-8') + with open(result_path, encoding='utf8') as f1: + # 加载文件的对象 + result_dict = json.load(f1) + code = result_dict["status_code"] + resilt = result_dict["resilt"] + probabilities = result_dict["probabilities"] + if str(code) == 400: + redis_.rpush(db_key_error, json.dumps({"id": id_})) + return False + result_text = {'code': code, 'resilt': resilt, 'probabilities': probabilities} + else: + querying_list = list(redis_.smembers("querying")) + querying_set = set() + for i in querying_list: + querying_set.add(i.decode()) + + querying_bool = False + if id_ in querying_set: + querying_bool = True + + query_list_json = redis_.lrange(db_key_query, 0, -1) + query_set_ids = set() + for i in query_list_json: + data_dict = json.loads(i) + query_id = data_dict['id'] + query_set_ids.add(query_id) + + query_bool = False + if id_ in query_set_ids: + query_bool = True + + if querying_bool == True and query_bool == True: + result_text = {'code': "201", 'text': "", 'probabilities': None} + elif querying_bool == True and query_bool == False: + result_text = {'code': "202", 'text': "", 'probabilities': None} + else: + result_text = {'code': "203", 'text': "", 'probabilities': None} + load_request_path = './request_data_logs_203/{}.json'.format(id_) + with open(load_request_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(result_text, f2, ensure_ascii=False, indent=4) + + return flask.jsonify(result_text) # 返回结果 + + +if __name__ == "__main__": + app.run(debug=False, host='0.0.0.0', port=14001) diff --git a/连接数据库.py b/连接数据库.py new file mode 100644 index 0000000..cea783f --- /dev/null +++ b/连接数据库.py @@ -0,0 +1,49 @@ +from clickhouse_driver import Client + +# 连接到ClickHouse +client = Client( + host='192.168.31.74', + port=9000, + user='default', + password='zhicheng123*', + database='mini_check' +) + + +# 2. 使用新数据库 +client.execute('USE mini_check') + +# 3. 创建简单的表 +# create_table_sql = """ +# CREATE TABLE IF NOT EXISTS user_table ( +# user_uuid String, +# file_path String, +# is_delete UInt32, +# ) ENGINE = MergeTree() +# """ + +# create_table_sql = """ +# CREATE TABLE IF NOT EXISTS user_table ( +# user_uuid String, +# file_path String, +# is_delete UInt32, +# ) ENGINE = MergeTree() +# ORDER BY (user_uuid) -- 必须指定 ORDER BY +# SETTINGS index_granularity = 8192; +# """ +# +# client.execute(create_table_sql) +# +# 4. 插入数据 +data = [ + ("113", '/home/zyp/mnt/8T_disk/program/docx_deal/deal_finish_txt/2023-04-08/14397246.txt', 1), + ("113", '/home/zyp/mnt/8T_disk/program/docx_deal/deal_finish_txt/2023-04-08/14397314.txt', 1), + ("113", '/home/zyp/mnt/8T_disk/program/docx_deal/deal_finish_txt/2023-04-08/14397321.txt', 1) +] +client.execute('INSERT INTO user_table (user_uuid, file_path, is_delete) VALUES', data) +# +# 5. 查询数据 +result = client.query_dataframe('SELECT * FROM user_table') +print(result) + +
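
A quick worked check of the LCS-based F-measure that Rouge_l.score in Rouge_w.py computes, using the X/Y1 lists from that file's __main__ block (a minimal sketch run against the file as added above):

    from Rouge_w import Rouge_l

    rouge_l = Rouge_l()
    X = ["A", "B", "C", "D", "u", "u", "u", "u", "u", "u"]
    Y1 = ["A", "B", "C", "D", "H", "I", "K", "K", "K", "K", "K", "K"]

    # LCS(X, Y1) = 4  ("A", "B", "C", "D")
    # precision = 4 / len(X)  = 0.4
    # recall    = 4 / len(Y1) ≈ 0.333
    # with b = 3: f = (1 + 3**2) * P * R / (P + 3**2 * R) ≈ 0.392
    print(rouge_l.score(X, Y1))  # ≈ 0.392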
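
split_chapter in flask_check_bert_test.py packs consecutive sentences into chunks of roughly 9,000 characters, flushing any remainder as a final chunk. A minimal illustration of the expected grouping (the sentence contents here are arbitrary placeholders):

    sentences = ["甲" * 4000 + "。", "乙" * 4000 + "。", "丙" * 2000 + "。", "丁" * 100 + "。"]
    # 4001 + 4001 + 2001 = 10003 > 9000, so the first three sentences close chunk 1;
    # the last sentence is flushed as its own chunk when the list is exhausted.
    # split_chapter(sentences) -> [[sentences[:3], 10003], [sentences[3:], 101]]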
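
For the user_table lookup (连接数据库.py and processing_one_text in flask_check_bert_test.py), a minimal sketch of the same query written with clickhouse_driver's %(name)s parameter substitution instead of an f-string, assuming the connection settings from the patch and the sample uuid "113" inserted above:

    from clickhouse_driver import Client

    client = Client(host='192.168.31.74', port=9000, user='default',
                    password='zhicheng123*', database='mini_check')

    # the driver substitutes %(uid)s from the params dict and quotes the value itself
    rows = client.execute(
        'SELECT user_uuid, file_path, is_delete FROM user_table WHERE user_uuid = %(uid)s',
        {'uid': '113'},
    )
    print(rows)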