From a963fc354ac39e57ab7b94f2a6707cfe13832e97 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Wed, 4 Feb 2026 14:56:07 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E7=89=88=E8=87=AA=E5=BB=BA=E5=BA=93?= =?UTF-8?q?=E6=9F=A5=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_check_2.py | 152 ++++++++++------ flask_operate_sql.py | 410 ++++++++++++++++++++++++++++++++++++++++++++ redis_check_uuid_mistral.py | 7 +- 3 files changed, 509 insertions(+), 60 deletions(-) create mode 100644 flask_operate_sql.py diff --git a/flask_check_2.py b/flask_check_2.py index 7bc49f1..e9c0ca0 100644 --- a/flask_check_2.py +++ b/flask_check_2.py @@ -187,10 +187,10 @@ def similar_content_func(): }] -def original_text_contrast_func(sentence_dan, content_list): +def original_text_contrast_func(sentence_dan): ''' - :param data_sentence_dan: section_dan[0] = [原句子序号, 第一组句子, 第一组句子相似句子, 第一组句子的标红部分,第一组句子相似句子标红部分,相似句子文件名] + :param data_sentence_dan: sentence_dan[0] = [原句子序号, 第一组句子, 第一组句子相似句子, 第一组句子的标红部分,第一组句子相似句子标红部分,相似句子文件名] :param content_list: :return: ''' @@ -222,9 +222,8 @@ def original_text_contrast_func(sentence_dan, content_list): ''' if sentence_dan != []: - original_text = "" - start = len(sentence_dan[0][1]) - end = 0 + start = 0 + end = len(sentence_dan[0][1]) similar_content = [] for dan_sen_info in sentence_dan: # 可能有很多个暂且确定是一个 @@ -265,27 +264,8 @@ def original_text_contrast_func(sentence_dan, content_list): similar_content.append(similar_content_dan) original_text_list = list(sentence_dan[0][1]) - # original_text_list.insert(end, "\n") - # original_text_list.insert(start, "\n") - target_text_str = "".join(["\n"] + original_text_list[start: end] + ["\n"]) - - original_text_start = "".join(original_text_list[:start]) - original_text_end = "".join(original_text_list[end:]) - - print(sentence_dan) - if sentence_dan[0][4][0] - 1 < 0: - start_sen = "" - else: - start_sen = content_list[data_sentence_dan[0][4][0] - 1] - - if data_sentence_dan[0][4][-1] + 1 >= len(content_list): - end_sen = "" - else: - end_sen = content_list[data_sentence_dan[0][4][-1] + 1] - - start_sen = start_sen + original_text_start - end_sen = original_text_end + end_sen - original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60] + target_text_str = "".join([""] + original_text_list + ["\n"]) + original_text = "此处有 {} 字相似\n".format(str(len(sentence_dan[0][1]))) + target_text_str else: original_text = "" end = 0 @@ -346,6 +326,14 @@ def repeat_quote_info_func(original_text_contrast, section_words): def total_data_func(section_data_list): ''' 总体数据 + section_data_list[0] = { + "section_name": "第{}部分".format(str(index_content_list_dan)), + "section_repeat_rate": repeat_rate, + "section_repeat_words": repeat_words, + "section_words": section_words, + "section_original_text_contrast": original_text_contrast, + "section_similar_paper_word_list": paper_similar_word_dict_new + } :return: ''' # "end_page_index": 0, @@ -361,23 +349,48 @@ def total_data_func(section_data_list): repeat_words = 0 words = 0 + single_max_rate = "" + single_max_repeat_words = "" + repeat_paper_info_words = {} + repeat_paper_info = [] + # 相似文档信息汇总 for i in section_data_list: - repeat_words += i["repeat_words"] - words += i["words"] + print("==============================") + print(i) + print("==============================") + repeat_words += i["section_repeat_words"] + words += i["section_words"] + for j in i['section_original_text_contrast']: + for z in j['similar_content']: + print(z) + if z['title'] not in repeat_paper_info_words: + repeat_paper_info_words[z['title']] = z['paper_red_len_word'] + else: + repeat_paper_info_words[z['title']] += z['paper_red_len_word'] baifenbi = (repeat_words / words) * 100 exclude_personal_rate = str(round(baifenbi, 1)) + "%" exclude_quote_rate = str(round(baifenbi, 1)) + "%" - single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"] - single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"] + total_repeat_rate = str(round(baifenbi, 1)) + "%" total_repeat_words = repeat_words total_words = words + repeat_paper_info_words = sorted(repeat_paper_info_words.items(), key=lambda x: x[1], reverse=True) + for i in repeat_paper_info_words: + repeat_paper_info.append({ + "title": i[0], + "words": i[1], + "rate": str(round(i[1]/total_words, 1)) + "%" + }) + + single_max_rate = repeat_paper_info[0]["rate"] + single_max_repeat_words = repeat_paper_info[0]["words"] print(exclude_personal_rate) return { + "repeat_paper_info": repeat_paper_info, "back_repeat_words": "", "exclude_personal_rate": exclude_personal_rate, "exclude_quote_rate": exclude_quote_rate, @@ -456,7 +469,7 @@ def section_data_func(section_details): } -def section_details_func(data_section_dan, num_words, content_list, index_content_list_dan): +def section_details_func(data_section_dan, num_words, index_content_list_dan): ''' 章节详细信息 :param data_section_dan: 章节的每一个内容的相似句子的信息 data_section_dan[0][0] = [原句子序号, 第一组句子, 第一组句子相似句子, 第一组句子的标红部分,第一组句子相似句子标红部分,相似句子文件名] @@ -475,28 +488,56 @@ def section_details_func(data_section_dan, num_words, content_list, index_conten original_text_list = [] for sentence_dan in data_section_dan: - original_text_contrast_dan = original_text_contrast_func(sentence_dan, content_list) + original_text_contrast_dan = original_text_contrast_func(sentence_dan) original_text_contrast.append(original_text_contrast_dan) repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] original_text_list.append(original_text_contrast_dan["original_text"]) - original_text = "".join(original_text_list) repeat_rate = (repeat_words / section_words) * 100 repeat_rate = str(round(repeat_rate, 1)) + "%" - repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words) + + # 计算章节相似文章排序: + paper_similar_word_dict = {} + for i in original_text_contrast: + for j in i['similar_content']: + if j['title'] not in paper_similar_word_dict: + paper_similar_word_dict[j['title']] = j['paper_red_len_word'] + sorted_by_score = sorted(paper_similar_word_dict.items(), key=lambda item: item[1], reverse=True) + paper_similar_word_dict_new = [] + for i in range(len(sorted_by_score)): + paper_similar_word_dict_new.append({ + "similar_content_red_len_word": sorted_by_score[i][1], + "similar_content_rate": str(round((sorted_by_score[i][1]/ section_words)* 100, 1)) + "%", + "similar_content_title": [sorted_by_score[i][0]] + }) + + + # section_name = section_details["name"] + # section_repeat_rate = section_details["repeat_rate"] + # section_repeat_words = section_details["repeat_words"] + # section_words = section_details["words"] + # oneself_repeat_words = section_details["repeat_words"] + # reference_repeat_words = section_details["repeat_words"] + # section_oneself_rate = section_details["repeat_rate"] + # + # return { + # "section_name": section_name, + # "section_repeat_rate": section_repeat_rate, + # "section_repeat_words": section_repeat_words, + # "section_words": section_words, + # "oneself_repeat_words": oneself_repeat_words, + # "reference_repeat_words": reference_repeat_words, + # "section_oneself_rate": section_oneself_rate + # } return { - "end_page_index": 0, - "name": "第{}部分".format(str(index_content_list_dan)), - "repeat_rate": repeat_rate, - "repeat_words": repeat_words, - "start_page_index": 0, - "words": section_words, - "original_text": original_text, - "original_text_oneself": original_text, - "original_text_contrast": original_text_contrast, - "repeat_quote_info": repeat_quote_info + "section_name": "第{}部分".format(str(index_content_list_dan)), + "section_repeat_rate": repeat_rate, + "section_repeat_words": repeat_words, + "section_words": section_words, + "section_original_text_contrast": original_text_contrast, + "section_similar_paper_word_list": paper_similar_word_dict_new } @@ -517,16 +558,11 @@ def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index num_words_zong, chapter_data, index_content_list): # 章节详细信息 - section_details = section_details_func(data_section_dan, num_words, content_list, index_content_list_dan) + section_details = section_details_func(data_section_dan, num_words, index_content_list_dan) section_details_list.append(section_details) # 模拟多个章节 - section_data_list = [] - for section_details in section_details_list: - section_data = section_data_func(section_details) - section_data_list.append(section_data) - total_data = total_data_func(section_details_list) format = '%Y-%m-%d %H:%M:%S' @@ -539,8 +575,7 @@ def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index "time_range": "1900-01-01至2023-08-08", "title": "", "total_data": total_data, - "section_data": section_data_list, - "section_details": section_details_list + "section_data": section_details_list, } return paper_data @@ -1220,7 +1255,7 @@ class PureClient: def processing_one_text(user_uuid): pureclient = PureClient() print("paper_id", user_uuid) - sql = f"SELECT * FROM user_table WHERE user_uuid='{user_uuid}'" + sql = f"SELECT * FROM user_table_1 WHERE user_uuid='{user_uuid}'" result = pureclient.run(sql) return result @@ -1247,10 +1282,13 @@ def ulit_recall_paper(uuid_uesr): data = [] for res_dan in res_list: - user_uuid = res_dan[0] - file_path = res_dan[1] - is_delete = res_dan[2] - if is_delete == 1: + records_uuid = res_dan[0] + user_uuid = res_dan[1] + file_path = res_dan[2] + is_delete = res_dan[3] + created_at = res_dan[4] + updated_at = res_dan[5] + if is_delete == 0: try: with open(file_path, encoding="gbk") as f: text = f.read() diff --git a/flask_operate_sql.py b/flask_operate_sql.py new file mode 100644 index 0000000..871c72d --- /dev/null +++ b/flask_operate_sql.py @@ -0,0 +1,410 @@ +import os +from flask import Flask, request, render_template_string, redirect, url_for, jsonify +from werkzeug.utils import secure_filename +from flask_cors import CORS +from clickhouse_client import clickhouse_client +from config import Config +from datetime import datetime + + +app = Flask(__name__) +CORS(app) # 允许跨域请求 +app.config.from_object(Config) + +# 配置 +UPLOAD_FOLDER = 'uploads' +ALLOWED_EXTENSIONS = {'txt'} + +# 确保上传目录存在 +if not os.path.exists(UPLOAD_FOLDER): + os.makedirs(UPLOAD_FOLDER) + +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER + +def upload_file(): + message = '' + filename = '' + + # 检查是否有文件部分 + if 'file' not in request.files: + message = '没有文件部分' + return_info = { + "code": 203, + "message": message + } + return jsonify(return_info) + + file = request.files['file'] + + # 如果用户没有选择文件 + if file.filename == '': + message = '没有选择文件' + return_info = { + "code": 203, + "message": message + } + + return jsonify(return_info) + + if file and allowed_file(file.filename): + # 安全处理文件名 + filename = secure_filename(file.filename) + + # 处理同名文件:添加时间戳 + name, ext = os.path.splitext(filename) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"{name}_{timestamp}{ext}" + + # 保存文件 + filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename) + file.save(filepath) + + message = '文件上传成功!' + return_info = { + "code": 203, + "message": message + } + + return jsonify(return_info) + else: + message = '文件类型不允许' + +def allowed_file(filename): + """检查文件扩展名是否允许""" + return '.' in filename and \ + filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + +# ========== API路由 ========== + +@app.route('/') +def index(): + """首页""" + return jsonify({ + 'message': 'ClickHouse Flask API', + 'endpoints': { + 'GET /health': '检查服务健康状态', + 'POST /users//records': '创建用户记录', + 'POST /users//records/batch': '批量创建记录', + 'GET /users//records': '获取用户所有记录', + 'GET /users//records/': '获取单条记录', + 'PUT /users//records/': '更新记录', + 'DELETE /users//records/': '删除单条记录', + 'DELETE /users//records': '删除用户所有记录', + 'PUT /users//records//restore': '恢复已删除记录', + 'GET /users//search': '搜索用户记录', + 'GET /users//stats': '获取用户统计信息' + } + }) + + +@app.route('/health', methods=['GET']) +def health_check(): + """健康检查""" + try: + # 测试ClickHouse连接 + clickhouse_client.client.execute('SELECT 1') + return jsonify({'status': 'healthy', 'database': 'connected'}) + except Exception as e: + return jsonify({'status': 'unhealthy', 'error': str(e)}), 500 + + +# ========== 用户记录操作 ========== + +@app.route('/users//records', methods=['POST']) +def create_record(user_uuid): + """创建用户记录""" + try: + data = request.get_json() + if not data: + return jsonify({'error': 'No data provided'}), 400 + + file_path = data.get('file_path') + is_delete = data.get('is_delete', 0) + + if not file_path: + return jsonify({'error': 'file_path is required'}), 400 + + # 调用更新后的方法,返回record_id + record_id = clickhouse_client.create_user_record(user_uuid, file_path, is_delete) + + if record_id: + return jsonify({ + 'message': 'Record created successfully', + 'user_uuid': user_uuid, + 'record_id': record_id, + 'file_path': file_path + }), 201 + else: + return jsonify({'error': 'Failed to create record'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records/batch', methods=['POST']) +def batch_create_records(user_uuid): + """批量创建用户记录""" + try: + data = request.get_json() + if not data or not isinstance(data, list): + return jsonify({'error': 'Data must be a list of records'}), 400 + + # 为每条记录添加user_uuid + records = [] + for item in data: + if not isinstance(item, dict): + continue + + record = { + 'user_uuid': user_uuid, + 'file_path': item.get('file_path', ''), + 'is_delete': item.get('is_delete', 0) + } + + if record['file_path']: # 只添加有文件路径的记录 + records.append(record) + + if not records: + return jsonify({'error': 'No valid records provided'}), 400 + + # 调用更新后的方法,返回record_ids + record_ids = clickhouse_client.batch_create_records(records) + + if record_ids: + return jsonify({ + 'message': f'{len(record_ids)} records created successfully', + 'user_uuid': user_uuid, + 'record_ids': record_ids, + 'count': len(record_ids) + }), 201 + else: + return jsonify({'error': 'Failed to create records'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records', methods=['GET']) +def get_records(user_uuid): + """获取用户的所有记录""" + try: + include_deleted = request.args.get('include_deleted', 'false').lower() == 'true' + + records = clickhouse_client.get_user_records(user_uuid, include_deleted) + + return jsonify({ + 'user_uuid': user_uuid, + 'count': len(records), + 'records': records + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records/', methods=['GET']) +def get_single_record(user_uuid, record_id): + """获取单条记录""" + try: + record = clickhouse_client.get_user_record_by_id(record_id) + + if not record: + return jsonify({'error': 'Record not found'}), 404 + + if record['user_uuid'] != user_uuid: + return jsonify({'error': 'Record does not belong to this user'}), 403 + + return jsonify(record) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records/', methods=['PUT']) +def update_record(user_uuid, record_id): + """更新记录""" + try: + # 先检查记录是否存在且属于该用户 + record = clickhouse_client.get_user_record_by_id(record_id) + if not record: + return jsonify({'error': 'Record not found'}), 404 + + if record['user_uuid'] != user_uuid: + return jsonify({'error': 'Record does not belong to this user'}), 403 + + data = request.get_json() + if not data: + return jsonify({'error': 'No data provided'}), 400 + + file_path = data.get('file_path') + is_delete = data.get('is_delete') + + success = clickhouse_client.update_user_record(record_id, file_path, is_delete) + + if success: + return jsonify({ + 'message': 'Record updated successfully', + 'record_id': record_id, + 'user_uuid': user_uuid + }) + else: + return jsonify({'error': 'Failed to update record'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records/', methods=['DELETE']) +def delete_record(user_uuid, record_id): + """删除单条记录""" + try: + # 先检查记录是否存在且属于该用户 + record = clickhouse_client.get_user_record_by_id(record_id) + if not record: + return jsonify({'error': 'Record not found'}), 404 + + if record['user_uuid'] != user_uuid: + return jsonify({'error': 'Record does not belong to this user'}), 403 + + # 获取删除类型参数 + soft_delete = request.args.get('soft_delete', 'true').lower() == 'true' + + success = clickhouse_client.delete_user_record(record_id, soft_delete) + + if success: + action = 'soft deleted' if soft_delete else 'permanently deleted' + return jsonify({ + 'message': f'Record {action} successfully', + 'record_id': record_id, + 'user_uuid': user_uuid + }) + else: + return jsonify({'error': 'Failed to delete record'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records', methods=['DELETE']) +def delete_all_records(user_uuid): + """删除用户的所有记录""" + try: + # 获取删除类型参数 + soft_delete = request.args.get('soft_delete', 'true').lower() == 'true' + + success = clickhouse_client.delete_all_user_records(user_uuid, soft_delete) + + if success: + action = 'soft deleted' if soft_delete else 'permanently deleted' + return jsonify({ + 'message': f'All records for user {user_uuid} have been {action}', + 'user_uuid': user_uuid + }) + else: + return jsonify({'error': 'Failed to delete records'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//records//restore', methods=['PUT']) +def restore_record(user_uuid, record_id): + """恢复已删除的记录""" + try: + # 先检查记录是否存在 + record = clickhouse_client.get_user_record_by_id(record_id) + if not record: + return jsonify({'error': 'Record not found'}), 404 + + if record['user_uuid'] != user_uuid: + return jsonify({'error': 'Record does not belong to this user'}), 403 + + success = clickhouse_client.restore_user_record(record_id) + + if success: + return jsonify({ + 'message': 'Record restored successfully', + 'record_id': record_id, + 'user_uuid': user_uuid + }) + else: + return jsonify({'error': 'Failed to restore record'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//search', methods=['GET']) +def search_records(user_uuid): + """搜索用户记录""" + try: + keyword = request.args.get('keyword') + is_delete_param = request.args.get('is_delete') + + is_delete = None + if is_delete_param is not None: + is_delete = int(is_delete_param) + + records = clickhouse_client.search_user_records(user_uuid, keyword, is_delete) + + return jsonify({ + 'user_uuid': user_uuid, + 'count': len(records), + 'records': records + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/users//stats', methods=['GET']) +def get_user_stats(user_uuid): + """获取用户统计信息""" + try: + stats = clickhouse_client.get_user_stats(user_uuid) + + if stats: + return jsonify({ + 'user_uuid': user_uuid, + 'stats': stats + }) + else: + return jsonify({ + 'user_uuid': user_uuid, + 'stats': { + 'total_count': 0, + 'active_count': 0, + 'deleted_count': 0, + 'first_created': None, + 'last_created': None + } + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +# ========== 错误处理 ========== + +@app.errorhandler(404) +def not_found(error): + return jsonify({'error': 'Not found'}), 404 + + +@app.errorhandler(500) +def internal_error(error): + return jsonify({'error': 'Internal server error'}), 500 + + +# ========== 主程序 ========== + +if __name__ == '__main__': + # 创建上传目录(如果需要) + os.makedirs('uploads', exist_ok=True) + + # 启动Flask应用 + app.run( + host='0.0.0.0', + port=28002, + debug=Config.DEBUG + ) \ No newline at end of file diff --git a/redis_check_uuid_mistral.py b/redis_check_uuid_mistral.py index 79f16f1..0513194 100644 --- a/redis_check_uuid_mistral.py +++ b/redis_check_uuid_mistral.py @@ -28,11 +28,12 @@ from threading import Thread import time app = flask.Flask(__name__) -pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=6, password="zhicheng123*") +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=17, password="zhicheng123*") redis_ = redis.Redis(connection_pool=pool, decode_responses=True) -db_key_query = 'queryset_check_task' db_key_querying = 'querying_check_task' +db_key_queryset = 'queryset_check_task' +db_key_query = 'query_recall' db_key_error = 'error' @app.route("/search", methods=["POST"]) @@ -89,4 +90,4 @@ def handle_query(): if __name__ == "__main__": - app.run(debug=False, host='0.0.0.0', port=14001) + app.run(debug=False, host='0.0.0.0', port=28001)