新版自建库查重

3 days ago · a963fc354a
3 changed files with 509 additions and 60 deletions
--- a/flask_check_2.py
+++ b/flask_check_2.py
@ -187,10 +187,10 @@ def similar_content_func():
    }]


-def original_text_contrast_func(sentence_dan, content_list):
+def original_text_contrast_func(sentence_dan):
    '''

-    :param data_sentence_dan: section_dan[0] = [原句子序号， 第一组句子， 第一组句子相似句子， 第一组句子的标红部分，第一组句子相似句子标红部分，相似句子文件名]
+    :param data_sentence_dan: sentence_dan[0] = [原句子序号， 第一组句子， 第一组句子相似句子， 第一组句子的标红部分，第一组句子相似句子标红部分，相似句子文件名]
    :param content_list:
    :return:
    '''
@ -222,9 +222,8 @@ def original_text_contrast_func(sentence_dan, content_list):
    '''

    if sentence_dan != []:
-        original_text = ""
-        start = len(sentence_dan[0][1])
-        end = 0
+        start = 0
+        end = len(sentence_dan[0][1])
        similar_content = []
        for dan_sen_info in sentence_dan:  # 可能有很多个暂且确定是一个

@ -265,27 +264,8 @@ def original_text_contrast_func(sentence_dan, content_list):
            similar_content.append(similar_content_dan)

        original_text_list = list(sentence_dan[0][1])
-        # original_text_list.insert(end, "</red>\n")
-        # original_text_list.insert(start, "\n<red>")
-        target_text_str = "".join(["\n<red>"] + original_text_list[start: end] + ["</red>\n"])
-
-        original_text_start = "".join(original_text_list[:start])
-        original_text_end = "".join(original_text_list[end:])
-
-        print(sentence_dan)
-        if sentence_dan[0][4][0] - 1 < 0:
-            start_sen = ""
-        else:
-            start_sen = content_list[data_sentence_dan[0][4][0] - 1]
-
-        if data_sentence_dan[0][4][-1] + 1 >= len(content_list):
-            end_sen = ""
-        else:
-            end_sen = content_list[data_sentence_dan[0][4][-1] + 1]
-
-        start_sen = start_sen + original_text_start
-        end_sen = original_text_end + end_sen
-        original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60]
+        target_text_str = "".join(["<red>"] + original_text_list + ["</red>\n"])
+        original_text = "此处有 {} 字相似\n".format(str(len(sentence_dan[0][1]))) + target_text_str
    else:
        original_text = ""
        end = 0
@ -346,6 +326,14 @@ def repeat_quote_info_func(original_text_contrast, section_words):
 def total_data_func(section_data_list):
    '''
    总体数据
+    section_data_list[0] = {
+        "section_name": "第{}部分".format(str(index_content_list_dan)),
+        "section_repeat_rate": repeat_rate,
+        "section_repeat_words": repeat_words,
+        "section_words": section_words,
+        "section_original_text_contrast": original_text_contrast,
+        "section_similar_paper_word_list": paper_similar_word_dict_new
+    }
    :return:
    '''
    # "end_page_index": 0,
@ -361,23 +349,48 @@ def total_data_func(section_data_list):

    repeat_words = 0
    words = 0
+    single_max_rate = ""
+    single_max_repeat_words = ""
+    repeat_paper_info_words = {}
+    repeat_paper_info = []

+    # 相似文档信息汇总
    for i in section_data_list:
-        repeat_words += i["repeat_words"]
-        words += i["words"]
+        print("==============================")
+        print(i)
+        print("==============================")
+        repeat_words += i["section_repeat_words"]
+        words += i["section_words"]
+        for j in i['section_original_text_contrast']:
+            for z in j['similar_content']:
+                print(z)
+                if z['title'] not in repeat_paper_info_words:
+                    repeat_paper_info_words[z['title']] = z['paper_red_len_word']
+                else:
+                    repeat_paper_info_words[z['title']] += z['paper_red_len_word']

    baifenbi = (repeat_words / words) * 100
    exclude_personal_rate = str(round(baifenbi, 1)) + "%"
    exclude_quote_rate = str(round(baifenbi, 1)) + "%"
-    single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
-    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
+
    total_repeat_rate = str(round(baifenbi, 1)) + "%"
    total_repeat_words = repeat_words
    total_words = words

+    repeat_paper_info_words = sorted(repeat_paper_info_words.items(), key=lambda x: x[1], reverse=True)
+    for i in repeat_paper_info_words:
+        repeat_paper_info.append({
+            "title": i[0],
+            "words": i[1],
+            "rate": str(round(i[1]/total_words, 1)) + "%"
+        })
+
+    single_max_rate = repeat_paper_info[0]["rate"]
+    single_max_repeat_words = repeat_paper_info[0]["words"]
    print(exclude_personal_rate)

    return {
+        "repeat_paper_info": repeat_paper_info,
        "back_repeat_words": "",
        "exclude_personal_rate": exclude_personal_rate,
        "exclude_quote_rate": exclude_quote_rate,
@ -456,7 +469,7 @@ def section_data_func(section_details):
    }


-def section_details_func(data_section_dan, num_words, content_list, index_content_list_dan):
+def section_details_func(data_section_dan, num_words, index_content_list_dan):
    '''
    章节详细信息
    :param data_section_dan: 章节的每一个内容的相似句子的信息  data_section_dan[0][0] = [原句子序号， 第一组句子， 第一组句子相似句子， 第一组句子的标红部分，第一组句子相似句子标红部分，相似句子文件名]
@ -475,28 +488,56 @@ def section_details_func(data_section_dan, num_words, content_list, index_conten
    original_text_list = []

    for sentence_dan in data_section_dan:
-        original_text_contrast_dan = original_text_contrast_func(sentence_dan, content_list)
+        original_text_contrast_dan = original_text_contrast_func(sentence_dan)
        original_text_contrast.append(original_text_contrast_dan)
        repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
        original_text_list.append(original_text_contrast_dan["original_text"])

-    original_text = "".join(original_text_list)
    repeat_rate = (repeat_words / section_words) * 100
    repeat_rate = str(round(repeat_rate, 1)) + "%"

-    repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)
+
+    # 计算章节相似文章排序：
+    paper_similar_word_dict = {}
+    for i in original_text_contrast:
+        for j in i['similar_content']:
+            if j['title'] not in paper_similar_word_dict:
+                paper_similar_word_dict[j['title']] = j['paper_red_len_word']
+    sorted_by_score = sorted(paper_similar_word_dict.items(), key=lambda item: item[1], reverse=True)
+    paper_similar_word_dict_new = []
+    for i in range(len(sorted_by_score)):
+        paper_similar_word_dict_new.append({
+            "similar_content_red_len_word": sorted_by_score[i][1],
+            "similar_content_rate": str(round((sorted_by_score[i][1]/ section_words)* 100, 1)) + "%",
+            "similar_content_title": [sorted_by_score[i][0]]
+        })
+
+
+    # section_name = section_details["name"]
+    # section_repeat_rate = section_details["repeat_rate"]
+    # section_repeat_words = section_details["repeat_words"]
+    # section_words = section_details["words"]
+    # oneself_repeat_words = section_details["repeat_words"]
+    # reference_repeat_words = section_details["repeat_words"]
+    # section_oneself_rate = section_details["repeat_rate"]
+    #
+    # return {
+    #     "section_name": section_name,
+    #     "section_repeat_rate": section_repeat_rate,
+    #     "section_repeat_words": section_repeat_words,
+    #     "section_words": section_words,
+    #     "oneself_repeat_words": oneself_repeat_words,
+    #     "reference_repeat_words": reference_repeat_words,
+    #     "section_oneself_rate": section_oneself_rate
+    # }

    return {
-        "end_page_index": 0,
-        "name": "第{}部分".format(str(index_content_list_dan)),
-        "repeat_rate": repeat_rate,
-        "repeat_words": repeat_words,
-        "start_page_index": 0,
-        "words": section_words,
-        "original_text": original_text,
-        "original_text_oneself": original_text,
-        "original_text_contrast": original_text_contrast,
-        "repeat_quote_info": repeat_quote_info
+        "section_name": "第{}部分".format(str(index_content_list_dan)),
+        "section_repeat_rate": repeat_rate,
+        "section_repeat_words": repeat_words,
+        "section_words": section_words,
+        "section_original_text_contrast": original_text_contrast,
+        "section_similar_paper_word_list": paper_similar_word_dict_new
    }


@ -517,16 +558,11 @@ def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index
                                                                                 num_words_zong, chapter_data,
                                                                                 index_content_list):
        # 章节详细信息
-        section_details = section_details_func(data_section_dan, num_words, content_list, index_content_list_dan)
+        section_details = section_details_func(data_section_dan, num_words, index_content_list_dan)
        section_details_list.append(section_details)

    # 模拟多个章节

-    section_data_list = []
-    for section_details in section_details_list:
-        section_data = section_data_func(section_details)
-        section_data_list.append(section_data)
-
    total_data = total_data_func(section_details_list)

    format = '%Y-%m-%d %H:%M:%S'
@ -539,8 +575,7 @@ def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index
        "time_range": "1900-01-01至2023-08-08",
        "title": "",
        "total_data": total_data,
-        "section_data": section_data_list,
-        "section_details": section_details_list
+        "section_data": section_details_list,
    }

    return paper_data
@ -1220,7 +1255,7 @@ class PureClient:
 def processing_one_text(user_uuid):
    pureclient = PureClient()
    print("paper_id", user_uuid)
-    sql = f"SELECT * FROM user_table WHERE user_uuid='{user_uuid}'"
+    sql = f"SELECT * FROM user_table_1 WHERE user_uuid='{user_uuid}'"
    result = pureclient.run(sql)
    return result

@ -1247,10 +1282,13 @@ def ulit_recall_paper(uuid_uesr):

    data = []
    for res_dan in res_list:
-        user_uuid = res_dan[0]
-        file_path = res_dan[1]
-        is_delete = res_dan[2]
-        if is_delete == 1:
+        records_uuid = res_dan[0]
+        user_uuid = res_dan[1]
+        file_path = res_dan[2]
+        is_delete = res_dan[3]
+        created_at = res_dan[4]
+        updated_at = res_dan[5]
+        if is_delete == 0:
            try:
                with open(file_path, encoding="gbk") as f:
                    text = f.read()
--- a/flask_operate_sql.py
+++ b/flask_operate_sql.py
@ -0,0 +1,410 @@
+import os
+from flask import Flask, request, render_template_string, redirect, url_for, jsonify
+from werkzeug.utils import secure_filename
+from flask_cors import CORS
+from clickhouse_client import clickhouse_client
+from config import Config
+from datetime import datetime
+
+
+app = Flask(__name__)
+CORS(app)  # allow cross-origin requests
+app.config.from_object(Config)
+
+# Upload configuration
+UPLOAD_FOLDER = 'uploads'
+ALLOWED_EXTENSIONS = {'txt'}
+
+# Make sure the upload directory exists. exist_ok=True avoids the
+# check-then-create race of a separate os.path.exists() test and matches
+# the call made in the __main__ block of this file.
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+
+def upload_file():
+    """Store the uploaded ``file`` part of the current request in the upload folder.
+
+    The file is accepted only when ``allowed_file`` approves its name. The
+    stored name is the sanitised stem plus a ``%Y%m%d_%H%M%S`` timestamp so
+    that uploads with the same name do not overwrite each other.
+
+    NOTE(review): no ``@app.route`` decorator is attached to this function in
+    the visible source; presumably it is registered elsewhere - confirm.
+
+    :return: a ``jsonify`` response with ``code`` and ``message``; ``code`` is
+             200 on success and 203 for every rejected request.
+    """
+    # The request carries no ``file`` part at all.
+    if 'file' not in request.files:
+        return jsonify({"code": 203, "message": '没有文件部分'})
+
+    file = request.files['file']
+
+    # The part exists but the user did not pick a file.
+    if not file.filename:
+        return jsonify({"code": 203, "message": '没有选择文件'})
+
+    # Previously this case only assigned a message and fell off the end of
+    # the function, returning None, which is not a valid view response.
+    if not allowed_file(file.filename):
+        return jsonify({"code": 203, "message": '文件类型不允许'})
+
+    # Split off the extension BEFORE sanitising: secure_filename() drops
+    # non-ASCII characters, so a name such as "论文.txt" would collapse to
+    # "txt" and the saved file would lose its extension. allowed_file() has
+    # already confirmed the extension is one of ALLOWED_EXTENSIONS.
+    stem, _, ext = file.filename.rpartition('.')
+    safe_stem = secure_filename(stem) or 'upload'
+
+    # Disambiguate same-named uploads with a timestamp.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"{safe_stem}_{timestamp}.{ext}"
+
+    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+    file.save(filepath)
+
+    # Success used to report the same code (203) as the failures above,
+    # which made the outcomes indistinguishable to the client.
+    return jsonify({"code": 200, "message": '文件上传成功！'})
+
+def allowed_file(filename):
+    """Return True when *filename* has an extension listed in ``ALLOWED_EXTENSIONS``.
+
+    The extension is the text after the last ``.`` and is compared
+    case-insensitively; a name without any ``.`` is rejected.
+    """
+    _, dot, extension = filename.rpartition('.')
+    if not dot:
+        return False
+    return extension.lower() in ALLOWED_EXTENSIONS
+
+# ========== API路由 ==========
+
+@app.route('/')
+def index():
+    """Landing page: return the service name and a catalogue of endpoints as JSON."""
+    # Maps "<METHOD> <path>" to a short human-readable description.
+    endpoints = {
+        'GET /health': '检查服务健康状态',
+        'POST /users/<user_uuid>/records': '创建用户记录',
+        'POST /users/<user_uuid>/records/batch': '批量创建记录',
+        'GET /users/<user_uuid>/records': '获取用户所有记录',
+        'GET /users/<user_uuid>/records/<record_id>': '获取单条记录',
+        'PUT /users/<user_uuid>/records/<record_id>': '更新记录',
+        'DELETE /users/<user_uuid>/records/<record_id>': '删除单条记录',
+        'DELETE /users/<user_uuid>/records': '删除用户所有记录',
+        'PUT /users/<user_uuid>/records/<record_id>/restore': '恢复已删除记录',
+        'GET /users/<user_uuid>/search': '搜索用户记录',
+        'GET /users/<user_uuid>/stats': '获取用户统计信息',
+    }
+    overview = {'message': 'ClickHouse Flask API', 'endpoints': endpoints}
+    return jsonify(overview)
+
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Health probe: report whether a trivial ClickHouse query succeeds.
+
+    :return: ``{'status': 'healthy', ...}`` when the query runs, otherwise
+             ``{'status': 'unhealthy', 'error': ...}`` with HTTP 500.
+    """
+    try:
+        # A ``SELECT 1`` round-trip serves as the connectivity test.
+        clickhouse_client.client.execute('SELECT 1')
+        healthy = {'status': 'healthy', 'database': 'connected'}
+        return jsonify(healthy)
+    except Exception as exc:
+        unhealthy = {'status': 'unhealthy', 'error': str(exc)}
+        return jsonify(unhealthy), 500
+
+
+# ========== 用户记录操作 ==========
+
+@app.route('/users/<user_uuid>/records', methods=['POST'])
+def create_record(user_uuid):
+    """Create one record for *user_uuid* from a JSON object body.
+
+    Body fields: ``file_path`` (required) and ``is_delete`` (optional,
+    defaults to 0).
+
+    :return: 201 with the new ``record_id`` on success; 400 when the body is
+             missing, not valid JSON, not a JSON object, or lacks
+             ``file_path``; 500 when the client reports failure or raises.
+    """
+    try:
+        # silent=True makes a malformed or non-JSON body come back as None,
+        # so it is answered with 400 below instead of raising into the
+        # blanket handler and being reported as a 500.
+        data = request.get_json(silent=True)
+        if not data:
+            return jsonify({'error': 'No data provided'}), 400
+
+        # A JSON array or scalar has no .get(); reject it as a client error.
+        if not isinstance(data, dict):
+            return jsonify({'error': 'Data must be a JSON object'}), 400
+
+        file_path = data.get('file_path')
+        is_delete = data.get('is_delete', 0)
+
+        if not file_path:
+            return jsonify({'error': 'file_path is required'}), 400
+
+        # The client call returns the id of the created record (falsy on failure).
+        record_id = clickhouse_client.create_user_record(user_uuid, file_path, is_delete)
+
+        if record_id:
+            return jsonify({
+                'message': 'Record created successfully',
+                'user_uuid': user_uuid,
+                'record_id': record_id,
+                'file_path': file_path
+            }), 201
+        else:
+            return jsonify({'error': 'Failed to create record'}), 500
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+@app.route('/users/<user_uuid>/records/batch', methods=['POST'])
+def batch_create_records(user_uuid):
+    """Create several records for *user_uuid* from a JSON array body.
+
+    Each array element should be an object with ``file_path`` and an optional
+    ``is_delete`` (default 0). Elements that are not objects or that have an
+    empty ``file_path`` are skipped.
+
+    :return: 201 with the created ``record_ids`` on success; 400 when the
+             body is not a non-empty JSON array or contains no usable
+             element; 500 when the client reports failure or raises.
+    """
+    try:
+        # silent=True: a malformed body yields None and is rejected with 400
+        # below rather than surfacing as a 500 from the blanket handler.
+        data = request.get_json(silent=True)
+        if not data or not isinstance(data, list):
+            return jsonify({'error': 'Data must be a list of records'}), 400
+
+        # Attach user_uuid to every usable element; drop the rest.
+        records = [
+            {
+                'user_uuid': user_uuid,
+                'file_path': item['file_path'],
+                'is_delete': item.get('is_delete', 0)
+            }
+            for item in data
+            if isinstance(item, dict) and item.get('file_path')
+        ]
+
+        if not records:
+            return jsonify({'error': 'No valid records provided'}), 400
+
+        # The client call returns the ids of the created records.
+        record_ids = clickhouse_client.batch_create_records(records)
+
+        if record_ids:
+            return jsonify({
+                'message': f'{len(record_ids)} records created successfully',
+                'user_uuid': user_uuid,
+                'record_ids': record_ids,
+                'count': len(record_ids)
+            }), 201
+        else:
+            return jsonify({'error': 'Failed to create records'}), 500
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+@app.route('/users/<user_uuid>/records', methods=['GET'])
+def get_records(user_uuid):
+    """List the records of *user_uuid*.
+
+    The ``include_deleted`` query parameter (default ``false``) is treated as
+    true only when it equals ``true`` case-insensitively, and is passed on to
+    the client.
+
+    :return: JSON with ``user_uuid``, ``count`` and ``records``; 500 on any error.
+    """
+    try:
+        raw_flag = request.args.get('include_deleted', 'false')
+        include_deleted = raw_flag.lower() == 'true'
+
+        found = clickhouse_client.get_user_records(user_uuid, include_deleted)
+
+        body = {
+            'user_uuid': user_uuid,
+            'count': len(found),
+            'records': found
+        }
+        return jsonify(body)
+
+    except Exception as exc:
+        return jsonify({'error': str(exc)}), 500
+
+
+@app.route('/users/<user_uuid>/records/<record_id>', methods=['GET'])
+def get_single_record(user_uuid, record_id):
+    """Return the record *record_id* if it exists and belongs to *user_uuid*.
+
+    :return: the record as JSON; 404 when it is not found; 403 when its
+             ``user_uuid`` differs from the one in the URL; 500 on any error.
+    """
+    try:
+        found = clickhouse_client.get_user_record_by_id(record_id)
+
+        # Happy path first: the record exists and is owned by this user.
+        if found and found['user_uuid'] == user_uuid:
+            return jsonify(found)
+
+        if not found:
+            return jsonify({'error': 'Record not found'}), 404
+
+        return jsonify({'error': 'Record does not belong to this user'}), 403
+
+    except Exception as exc:
+        return jsonify({'error': str(exc)}), 500
+
+
+@app.route('/users/<user_uuid>/records/<record_id>', methods=['PUT'])
+def update_record(user_uuid, record_id):
+    """Update ``file_path`` and/or ``is_delete`` of a record owned by *user_uuid*.
+
+    :return: a success message as JSON; 404 when the record is not found;
+             403 when it belongs to another user; 400 when the body is
+             missing, not valid JSON or not a JSON object; 500 when the
+             client reports failure or raises.
+    """
+    try:
+        # The record must exist and belong to this user before it is touched.
+        record = clickhouse_client.get_user_record_by_id(record_id)
+        if not record:
+            return jsonify({'error': 'Record not found'}), 404
+
+        if record['user_uuid'] != user_uuid:
+            return jsonify({'error': 'Record does not belong to this user'}), 403
+
+        # silent=True: a malformed body yields None and is answered with 400
+        # instead of raising into the blanket handler (500).
+        data = request.get_json(silent=True)
+        if not data:
+            return jsonify({'error': 'No data provided'}), 400
+
+        # A JSON array or scalar has no .get(); reject it as a client error.
+        if not isinstance(data, dict):
+            return jsonify({'error': 'Data must be a JSON object'}), 400
+
+        # Either field may be absent; None is passed through to the client.
+        file_path = data.get('file_path')
+        is_delete = data.get('is_delete')
+
+        success = clickhouse_client.update_user_record(record_id, file_path, is_delete)
+
+        if success:
+            return jsonify({
+                'message': 'Record updated successfully',
+                'record_id': record_id,
+                'user_uuid': user_uuid
+            })
+        else:
+            return jsonify({'error': 'Failed to update record'}), 500
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+@app.route('/users/<user_uuid>/records/<record_id>', methods=['DELETE'])
+def delete_record(user_uuid, record_id):
+    """Delete one record owned by *user_uuid*.
+
+    The ``soft_delete`` query parameter (default ``true``) is treated as true
+    only when it equals ``true`` case-insensitively, and is passed on to the
+    client.
+
+    :return: a success message as JSON; 404 when the record is not found;
+             403 when it belongs to another user; 500 when the client
+             reports failure or raises.
+    """
+    try:
+        # Ownership check: the record must exist and belong to this user.
+        existing = clickhouse_client.get_user_record_by_id(record_id)
+        if not existing:
+            return jsonify({'error': 'Record not found'}), 404
+        if existing['user_uuid'] != user_uuid:
+            return jsonify({'error': 'Record does not belong to this user'}), 403
+
+        soft_delete = request.args.get('soft_delete', 'true').lower() == 'true'
+
+        if not clickhouse_client.delete_user_record(record_id, soft_delete):
+            return jsonify({'error': 'Failed to delete record'}), 500
+
+        action = 'soft deleted' if soft_delete else 'permanently deleted'
+        return jsonify({
+            'message': f'Record {action} successfully',
+            'record_id': record_id,
+            'user_uuid': user_uuid
+        })
+
+    except Exception as exc:
+        return jsonify({'error': str(exc)}), 500
+
+
+@app.route('/users/<user_uuid>/records', methods=['DELETE'])
+def delete_all_records(user_uuid):
+    """Delete every record of *user_uuid*.
+
+    The ``soft_delete`` query parameter (default ``true``) is treated as true
+    only when it equals ``true`` case-insensitively, and is passed on to the
+    client.
+
+    :return: a success message as JSON; 500 when the client reports failure
+             or raises.
+    """
+    try:
+        soft_delete = request.args.get('soft_delete', 'true').lower() == 'true'
+
+        if not clickhouse_client.delete_all_user_records(user_uuid, soft_delete):
+            return jsonify({'error': 'Failed to delete records'}), 500
+
+        action = 'soft deleted' if soft_delete else 'permanently deleted'
+        return jsonify({
+            'message': f'All records for user {user_uuid} have been {action}',
+            'user_uuid': user_uuid
+        })
+
+    except Exception as exc:
+        return jsonify({'error': str(exc)}), 500
+
+
+@app.route('/users/<user_uuid>/records/<record_id>/restore', methods=['PUT'])
+def restore_record(user_uuid, record_id):
+    """Restore a previously deleted record owned by *user_uuid*.
+
+    :return: a success message as JSON; 404 when the record is not found;
+             403 when it belongs to another user; 500 when the client
+             reports failure or raises.
+    """
+    try:
+        # Ownership check: the record must exist and belong to this user.
+        existing = clickhouse_client.get_user_record_by_id(record_id)
+        if not existing:
+            return jsonify({'error': 'Record not found'}), 404
+        if existing['user_uuid'] != user_uuid:
+            return jsonify({'error': 'Record does not belong to this user'}), 403
+
+        if not clickhouse_client.restore_user_record(record_id):
+            return jsonify({'error': 'Failed to restore record'}), 500
+
+        return jsonify({
+            'message': 'Record restored successfully',
+            'record_id': record_id,
+            'user_uuid': user_uuid
+        })
+
+    except Exception as exc:
+        return jsonify({'error': str(exc)}), 500
+
+
+@app.route('/users/<user_uuid>/search', methods=['GET'])
+def search_records(user_uuid):
+    """Search the records of *user_uuid*.
+
+    Query parameters: ``keyword`` (optional) and ``is_delete`` (optional
+    integer). Both are passed on to the client; an absent ``is_delete`` is
+    passed as ``None``.
+
+    :return: JSON with ``user_uuid``, ``count`` and ``records``; 400 when
+             ``is_delete`` is not an integer; 500 on any other error.
+    """
+    try:
+        keyword = request.args.get('keyword')
+        is_delete_param = request.args.get('is_delete')
+
+        is_delete = None
+        if is_delete_param is not None:
+            # A non-numeric value is a client mistake: answer 400 rather than
+            # letting the ValueError reach the blanket handler as a 500.
+            try:
+                is_delete = int(is_delete_param)
+            except ValueError:
+                return jsonify({'error': 'is_delete must be an integer'}), 400
+
+        records = clickhouse_client.search_user_records(user_uuid, keyword, is_delete)
+
+        return jsonify({
+            'user_uuid': user_uuid,
+            'count': len(records),
+            'records': records
+        })
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+@app.route('/users/<user_uuid>/stats', methods=['GET'])
+def get_user_stats(user_uuid):
+    """Return record statistics for *user_uuid*.
+
+    When the client returns nothing, a zeroed statistics object is sent
+    instead.
+
+    :return: JSON with ``user_uuid`` and ``stats``; 500 on any error.
+    """
+    try:
+        stats = clickhouse_client.get_user_stats(user_uuid)
+
+        if not stats:
+            # Fallback payload used when the client has no statistics.
+            stats = {
+                'total_count': 0,
+                'active_count': 0,
+                'deleted_count': 0,
+                'first_created': None,
+                'last_created': None
+            }
+
+        return jsonify({
+            'user_uuid': user_uuid,
+            'stats': stats
+        })
+
+    except Exception as exc:
+        return jsonify({'error': str(exc)}), 500
+
+
+# ========== 错误处理 ==========
+
+@app.errorhandler(404)
+def not_found(error):
+    """Answer 404 errors with a JSON body."""
+    body = {'error': 'Not found'}
+    return jsonify(body), 404
+
+
+@app.errorhandler(500)
+def internal_error(error):
+    """Answer 500 errors with a generic JSON body."""
+    body = {'error': 'Internal server error'}
+    return jsonify(body), 500
+
+
+# ========== 主程序 ==========
+
+if __name__ == '__main__':
+    # Create the upload directory if needed. UPLOAD_FOLDER is used instead of
+    # a repeated 'uploads' literal so this always matches the folder that
+    # app.config['UPLOAD_FOLDER'] points at.
+    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+    # Start the Flask development server.
+    # NOTE(review): this listens on all interfaces; if Config.DEBUG is true
+    # the interactive debugger would be reachable from other hosts - confirm
+    # DEBUG is off wherever this is deployed.
+    app.run(
+        host='0.0.0.0',
+        port=28002,
+        debug=Config.DEBUG
+    )
--- a/redis_check_uuid_mistral.py
+++ b/redis_check_uuid_mistral.py
@ -28,11 +28,12 @@ from threading import Thread
 import time

 app = flask.Flask(__name__)
-pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=6, password="zhicheng123*")
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=17, password="zhicheng123*")
 redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

-db_key_query = 'queryset_check_task'
 db_key_querying = 'querying_check_task'
+db_key_queryset = 'queryset_check_task'
+db_key_query = 'query_recall'
 db_key_error = 'error'

@app.route("/search", methods=["POST"])
@ -89,4 +90,4 @@ def handle_query():


 if __name__ == "__main__":
-    app.run(debug=False, host='0.0.0.0', port=14001)
+    app.run(debug=False, host='0.0.0.0', port=28001)