Browse Source

新版自建库查重

master
majiahui@haimaqingfan.com 3 days ago
parent
commit
a963fc354a
  1. 152
      flask_check_2.py
  2. 410
      flask_operate_sql.py
  3. 7
      redis_check_uuid_mistral.py

152
flask_check_2.py

@ -187,10 +187,10 @@ def similar_content_func():
}]
def original_text_contrast_func(sentence_dan, content_list):
def original_text_contrast_func(sentence_dan):
'''
:param data_sentence_dan: section_dan[0] = [原句子序号 第一组句子 第一组句子相似句子 第一组句子的标红部分第一组句子相似句子标红部分相似句子文件名]
:param data_sentence_dan: sentence_dan[0] = [原句子序号 第一组句子 第一组句子相似句子 第一组句子的标红部分第一组句子相似句子标红部分相似句子文件名]
:param content_list:
:return:
'''
@ -222,9 +222,8 @@ def original_text_contrast_func(sentence_dan, content_list):
'''
if sentence_dan != []:
original_text = ""
start = len(sentence_dan[0][1])
end = 0
start = 0
end = len(sentence_dan[0][1])
similar_content = []
for dan_sen_info in sentence_dan: # 可能有很多个暂且确定是一个
@ -265,27 +264,8 @@ def original_text_contrast_func(sentence_dan, content_list):
similar_content.append(similar_content_dan)
original_text_list = list(sentence_dan[0][1])
# original_text_list.insert(end, "</red>\n")
# original_text_list.insert(start, "\n<red>")
target_text_str = "".join(["\n<red>"] + original_text_list[start: end] + ["</red>\n"])
original_text_start = "".join(original_text_list[:start])
original_text_end = "".join(original_text_list[end:])
print(sentence_dan)
if sentence_dan[0][4][0] - 1 < 0:
start_sen = ""
else:
start_sen = content_list[data_sentence_dan[0][4][0] - 1]
if data_sentence_dan[0][4][-1] + 1 >= len(content_list):
end_sen = ""
else:
end_sen = content_list[data_sentence_dan[0][4][-1] + 1]
start_sen = start_sen + original_text_start
end_sen = original_text_end + end_sen
original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60]
target_text_str = "".join(["<red>"] + original_text_list + ["</red>\n"])
original_text = "此处有 {} 字相似\n".format(str(len(sentence_dan[0][1]))) + target_text_str
else:
original_text = ""
end = 0
@ -346,6 +326,14 @@ def repeat_quote_info_func(original_text_contrast, section_words):
def total_data_func(section_data_list):
'''
总体数据
section_data_list[0] = {
"section_name": "{}部分".format(str(index_content_list_dan)),
"section_repeat_rate": repeat_rate,
"section_repeat_words": repeat_words,
"section_words": section_words,
"section_original_text_contrast": original_text_contrast,
"section_similar_paper_word_list": paper_similar_word_dict_new
}
:return:
'''
# "end_page_index": 0,
@ -361,23 +349,48 @@ def total_data_func(section_data_list):
repeat_words = 0
words = 0
single_max_rate = ""
single_max_repeat_words = ""
repeat_paper_info_words = {}
repeat_paper_info = []
# 相似文档信息汇总
for i in section_data_list:
repeat_words += i["repeat_words"]
words += i["words"]
print("==============================")
print(i)
print("==============================")
repeat_words += i["section_repeat_words"]
words += i["section_words"]
for j in i['section_original_text_contrast']:
for z in j['similar_content']:
print(z)
if z['title'] not in repeat_paper_info_words:
repeat_paper_info_words[z['title']] = z['paper_red_len_word']
else:
repeat_paper_info_words[z['title']] += z['paper_red_len_word']
baifenbi = (repeat_words / words) * 100
exclude_personal_rate = str(round(baifenbi, 1)) + "%"
exclude_quote_rate = str(round(baifenbi, 1)) + "%"
single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
total_repeat_rate = str(round(baifenbi, 1)) + "%"
total_repeat_words = repeat_words
total_words = words
repeat_paper_info_words = sorted(repeat_paper_info_words.items(), key=lambda x: x[1], reverse=True)
for i in repeat_paper_info_words:
repeat_paper_info.append({
"title": i[0],
"words": i[1],
"rate": str(round(i[1]/total_words, 1)) + "%"
})
single_max_rate = repeat_paper_info[0]["rate"]
single_max_repeat_words = repeat_paper_info[0]["words"]
print(exclude_personal_rate)
return {
"repeat_paper_info": repeat_paper_info,
"back_repeat_words": "",
"exclude_personal_rate": exclude_personal_rate,
"exclude_quote_rate": exclude_quote_rate,
@ -456,7 +469,7 @@ def section_data_func(section_details):
}
def section_details_func(data_section_dan, num_words, content_list, index_content_list_dan):
def section_details_func(data_section_dan, num_words, index_content_list_dan):
'''
章节详细信息
:param data_section_dan: 章节的每一个内容的相似句子的信息 data_section_dan[0][0] = [原句子序号 第一组句子 第一组句子相似句子 第一组句子的标红部分第一组句子相似句子标红部分相似句子文件名]
@ -475,28 +488,56 @@ def section_details_func(data_section_dan, num_words, content_list, index_conten
original_text_list = []
for sentence_dan in data_section_dan:
original_text_contrast_dan = original_text_contrast_func(sentence_dan, content_list)
original_text_contrast_dan = original_text_contrast_func(sentence_dan)
original_text_contrast.append(original_text_contrast_dan)
repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
original_text_list.append(original_text_contrast_dan["original_text"])
original_text = "".join(original_text_list)
repeat_rate = (repeat_words / section_words) * 100
repeat_rate = str(round(repeat_rate, 1)) + "%"
repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)
# 计算章节相似文章排序:
paper_similar_word_dict = {}
for i in original_text_contrast:
for j in i['similar_content']:
if j['title'] not in paper_similar_word_dict:
paper_similar_word_dict[j['title']] = j['paper_red_len_word']
sorted_by_score = sorted(paper_similar_word_dict.items(), key=lambda item: item[1], reverse=True)
paper_similar_word_dict_new = []
for i in range(len(sorted_by_score)):
paper_similar_word_dict_new.append({
"similar_content_red_len_word": sorted_by_score[i][1],
"similar_content_rate": str(round((sorted_by_score[i][1]/ section_words)* 100, 1)) + "%",
"similar_content_title": [sorted_by_score[i][0]]
})
# section_name = section_details["name"]
# section_repeat_rate = section_details["repeat_rate"]
# section_repeat_words = section_details["repeat_words"]
# section_words = section_details["words"]
# oneself_repeat_words = section_details["repeat_words"]
# reference_repeat_words = section_details["repeat_words"]
# section_oneself_rate = section_details["repeat_rate"]
#
# return {
# "section_name": section_name,
# "section_repeat_rate": section_repeat_rate,
# "section_repeat_words": section_repeat_words,
# "section_words": section_words,
# "oneself_repeat_words": oneself_repeat_words,
# "reference_repeat_words": reference_repeat_words,
# "section_oneself_rate": section_oneself_rate
# }
return {
"end_page_index": 0,
"name": "{}部分".format(str(index_content_list_dan)),
"repeat_rate": repeat_rate,
"repeat_words": repeat_words,
"start_page_index": 0,
"words": section_words,
"original_text": original_text,
"original_text_oneself": original_text,
"original_text_contrast": original_text_contrast,
"repeat_quote_info": repeat_quote_info
"section_name": "{}部分".format(str(index_content_list_dan)),
"section_repeat_rate": repeat_rate,
"section_repeat_words": repeat_words,
"section_words": section_words,
"section_original_text_contrast": original_text_contrast,
"section_similar_paper_word_list": paper_similar_word_dict_new
}
@ -517,16 +558,11 @@ def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index
num_words_zong, chapter_data,
index_content_list):
# 章节详细信息
section_details = section_details_func(data_section_dan, num_words, content_list, index_content_list_dan)
section_details = section_details_func(data_section_dan, num_words, index_content_list_dan)
section_details_list.append(section_details)
# 模拟多个章节
section_data_list = []
for section_details in section_details_list:
section_data = section_data_func(section_details)
section_data_list.append(section_data)
total_data = total_data_func(section_details_list)
format = '%Y-%m-%d %H:%M:%S'
@ -539,8 +575,7 @@ def check_dict(similar_content_control_zong, num_words_zong, chapter_data, index
"time_range": "1900-01-01至2023-08-08",
"title": "",
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
"section_data": section_details_list,
}
return paper_data
@ -1220,7 +1255,7 @@ class PureClient:
def processing_one_text(user_uuid):
pureclient = PureClient()
print("paper_id", user_uuid)
sql = f"SELECT * FROM user_table WHERE user_uuid='{user_uuid}'"
sql = f"SELECT * FROM user_table_1 WHERE user_uuid='{user_uuid}'"
result = pureclient.run(sql)
return result
@ -1247,10 +1282,13 @@ def ulit_recall_paper(uuid_uesr):
data = []
for res_dan in res_list:
user_uuid = res_dan[0]
file_path = res_dan[1]
is_delete = res_dan[2]
if is_delete == 1:
records_uuid = res_dan[0]
user_uuid = res_dan[1]
file_path = res_dan[2]
is_delete = res_dan[3]
created_at = res_dan[4]
updated_at = res_dan[5]
if is_delete == 0:
try:
with open(file_path, encoding="gbk") as f:
text = f.read()

410
flask_operate_sql.py

@ -0,0 +1,410 @@
import os
from flask import Flask, request, render_template_string, redirect, url_for, jsonify
from werkzeug.utils import secure_filename
from flask_cors import CORS
from clickhouse_client import clickhouse_client
from config import Config
from datetime import datetime
app = Flask(__name__)
CORS(app)  # allow cross-origin requests
app.config.from_object(Config)
# Upload configuration
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'txt'}
# Make sure the upload directory exists. exist_ok=True replaces the former
# "check, then create" sequence, which could fail if another process created
# the directory between the check and the makedirs call.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
def upload_file():
    """Validate an uploaded file, store it in the upload folder, reply in JSON.

    Reads the multipart part named ``file`` from the current request, checks
    its extension with ``allowed_file``, and saves it under
    ``app.config['UPLOAD_FOLDER']`` with a timestamp appended to the name so
    that files uploaded under the same name do not overwrite each other.

    The stored name is built from the sanitised stem plus the (lower-cased)
    extension of the original name. Sanitising the whole name at once loses
    the extension when the stem has no ASCII characters (for example
    ``"中文.txt"`` becomes ``"txt"``), so the two parts are handled separately
    and an empty stem falls back to ``"upload"``.

    NOTE(review): no ``@app.route`` decorator is visible on this function, so
    as shown it is not registered as an endpoint -- confirm how it is wired up.
    NOTE(review): the success reply uses the same ``code`` (203) as the error
    replies, so clients can only tell them apart by ``message`` -- confirm
    whether that is intended.

    Returns:
        A ``jsonify`` response whose body has ``code`` and ``message`` keys.
    """
    def _reply(message):
        # Every outcome is reported with the same envelope and code.
        return jsonify({"code": 203, "message": message})

    # The request must contain a "file" part at all.
    if 'file' not in request.files:
        return _reply('没有文件部分')
    file = request.files['file']
    # An empty filename means the user submitted the form without a file.
    if file.filename == '':
        return _reply('没有选择文件')
    if not file or not allowed_file(file.filename):
        # This branch used to fall off the end of the function and return
        # None, which is not a valid view response.
        return _reply('文件类型不允许')
    # allowed_file() guarantees a dot is present and that the lower-cased
    # extension is one of ALLOWED_EXTENSIONS, so it is safe to reuse here.
    raw_stem, _, raw_ext = file.filename.rpartition('.')
    stem = secure_filename(raw_stem) or 'upload'
    ext = raw_ext.lower()
    # Timestamp suffix keeps same-named uploads from colliding.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{stem}_{timestamp}.{ext}"
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)
    return _reply('文件上传成功!')
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name that contains no dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
# ========== API路由 ==========
@app.route('/')
def index():
    """Home page: return the API name and a map of endpoints to descriptions."""
    endpoints = {
        'GET /health': '检查服务健康状态',
        'POST /users/<user_uuid>/records': '创建用户记录',
        'POST /users/<user_uuid>/records/batch': '批量创建记录',
        'GET /users/<user_uuid>/records': '获取用户所有记录',
        'GET /users/<user_uuid>/records/<record_id>': '获取单条记录',
        'PUT /users/<user_uuid>/records/<record_id>': '更新记录',
        'DELETE /users/<user_uuid>/records/<record_id>': '删除单条记录',
        'DELETE /users/<user_uuid>/records': '删除用户所有记录',
        'PUT /users/<user_uuid>/records/<record_id>/restore': '恢复已删除记录',
        'GET /users/<user_uuid>/search': '搜索用户记录',
        'GET /users/<user_uuid>/stats': '获取用户统计信息'
    }
    overview = {
        'message': 'ClickHouse Flask API',
        'endpoints': endpoints
    }
    return jsonify(overview)
@app.route('/health', methods=['GET'])
def health_check():
    """Health probe: report whether a trivial ClickHouse query succeeds.

    Returns a JSON body with ``status`` ``healthy`` on success, or
    ``unhealthy`` plus the error text and HTTP 500 when the query raises.
    """
    try:
        # A constant SELECT is enough to exercise the ClickHouse connection.
        clickhouse_client.client.execute('SELECT 1')
        healthy = {'status': 'healthy', 'database': 'connected'}
        return jsonify(healthy)
    except Exception as exc:
        unhealthy = {'status': 'unhealthy', 'error': str(exc)}
        return jsonify(unhealthy), 500
# ========== 用户记录操作 ==========
@app.route('/users/<user_uuid>/records', methods=['POST'])
def create_record(user_uuid):
    """Create one record for ``user_uuid`` from a JSON object body.

    The body must be a JSON object with a non-empty ``file_path``;
    ``is_delete`` is optional and defaults to 0.

    Returns:
        201 with ``user_uuid``, ``record_id`` and ``file_path`` when
        ``clickhouse_client.create_user_record`` returns a truthy id;
        400 when the body is missing, not a JSON object, or lacks
        ``file_path``; 500 when the client returns a falsy id or raises.
    """
    try:
        # silent=True makes an unparsable body or wrong Content-Type come
        # back as None, so it is answered with 400 below. Without it the
        # HTTP error raised by get_json() was caught by the broad handler
        # at the bottom and reported as a 500.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({'error': 'No data provided'}), 400
        if not isinstance(data, dict):
            # A JSON list or scalar has no .get(); reject it as a client error.
            return jsonify({'error': 'Request body must be a JSON object'}), 400
        file_path = data.get('file_path')
        is_delete = data.get('is_delete', 0)
        if not file_path:
            return jsonify({'error': 'file_path is required'}), 400
        # The client call returns the id of the new record.
        record_id = clickhouse_client.create_user_record(user_uuid, file_path, is_delete)
        if not record_id:
            return jsonify({'error': 'Failed to create record'}), 500
        return jsonify({
            'message': 'Record created successfully',
            'user_uuid': user_uuid,
            'record_id': record_id,
            'file_path': file_path
        }), 201
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/users/<user_uuid>/records/batch', methods=['POST'])
def batch_create_records(user_uuid):
    """Create several records for ``user_uuid`` from a JSON list body.

    Each list item that is a JSON object with a non-empty ``file_path``
    becomes one record (``is_delete`` defaults to 0); other items are
    ignored.

    Returns:
        201 with ``record_ids`` and ``count`` when
        ``clickhouse_client.batch_create_records`` returns a truthy list;
        400 when the body is missing, not a list, or yields no valid record;
        500 when the client returns a falsy result or raises.
    """
    try:
        # silent=True: an unparsable body becomes None and is rejected with
        # 400 below instead of surfacing as a 500 from the broad handler.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, list):
            return jsonify({'error': 'Data must be a list of records'}), 400
        # Stamp every usable item with the user's uuid; skip non-objects and
        # items without a file path.
        records = [
            {
                'user_uuid': user_uuid,
                'file_path': item.get('file_path', ''),
                'is_delete': item.get('is_delete', 0)
            }
            for item in data
            if isinstance(item, dict) and item.get('file_path', '')
        ]
        if not records:
            return jsonify({'error': 'No valid records provided'}), 400
        # The client call returns the ids of the new records.
        record_ids = clickhouse_client.batch_create_records(records)
        if not record_ids:
            return jsonify({'error': 'Failed to create records'}), 500
        return jsonify({
            'message': f'{len(record_ids)} records created successfully',
            'user_uuid': user_uuid,
            'record_ids': record_ids,
            'count': len(record_ids)
        }), 201
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/users/<user_uuid>/records', methods=['GET'])
def get_records(user_uuid):
    """List the records of ``user_uuid``.

    The ``include_deleted`` query parameter (default ``false``) is passed to
    ``clickhouse_client.get_user_records`` as a boolean. Any exception is
    reported as HTTP 500 with the error text.
    """
    try:
        raw_flag = request.args.get('include_deleted', 'false')
        include_deleted = raw_flag.lower() == 'true'
        records = clickhouse_client.get_user_records(user_uuid, include_deleted)
        payload = {
            'user_uuid': user_uuid,
            'count': len(records),
            'records': records
        }
        return jsonify(payload)
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
@app.route('/users/<user_uuid>/records/<record_id>', methods=['GET'])
def get_single_record(user_uuid, record_id):
    """Return one record by id.

    Responds 404 when the lookup returns nothing, 403 when the record's
    ``user_uuid`` differs from the one in the URL, and 500 with the error
    text when an exception is raised.
    """
    try:
        found = clickhouse_client.get_user_record_by_id(record_id)
        if found:
            if found['user_uuid'] == user_uuid:
                return jsonify(found)
            return jsonify({'error': 'Record does not belong to this user'}), 403
        return jsonify({'error': 'Record not found'}), 404
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
@app.route('/users/<user_uuid>/records/<record_id>', methods=['PUT'])
def update_record(user_uuid, record_id):
    """Update ``file_path`` and/or ``is_delete`` of one record.

    The record must exist and belong to ``user_uuid``. The body must be a
    JSON object; ``file_path`` and ``is_delete`` are read from it (missing
    keys are passed to the client as ``None``).

    Returns:
        200 with ``record_id`` and ``user_uuid`` when
        ``clickhouse_client.update_user_record`` reports success;
        404 when the record is not found; 403 when it belongs to another
        user; 400 when the body is missing or not a JSON object; 500 when
        the client reports failure or raises.
    """
    try:
        # Check existence and ownership before touching the body.
        record = clickhouse_client.get_user_record_by_id(record_id)
        if not record:
            return jsonify({'error': 'Record not found'}), 404
        if record['user_uuid'] != user_uuid:
            return jsonify({'error': 'Record does not belong to this user'}), 403
        # silent=True: an unparsable body becomes None and is rejected with
        # 400 below instead of surfacing as a 500 from the broad handler.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({'error': 'No data provided'}), 400
        if not isinstance(data, dict):
            # A JSON list or scalar has no .get(); reject it as a client error.
            return jsonify({'error': 'Request body must be a JSON object'}), 400
        file_path = data.get('file_path')
        is_delete = data.get('is_delete')
        success = clickhouse_client.update_user_record(record_id, file_path, is_delete)
        if not success:
            return jsonify({'error': 'Failed to update record'}), 500
        return jsonify({
            'message': 'Record updated successfully',
            'record_id': record_id,
            'user_uuid': user_uuid
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/users/<user_uuid>/records/<record_id>', methods=['DELETE'])
def delete_record(user_uuid, record_id):
    """Delete one record of ``user_uuid``.

    The ``soft_delete`` query parameter (default ``true``) is forwarded to
    ``clickhouse_client.delete_user_record`` as a boolean. Responds 404 when
    the record is not found, 403 when it belongs to another user, and 500
    when the client reports failure or an exception is raised.
    """
    try:
        # The record must exist and be owned by the user in the URL.
        existing = clickhouse_client.get_user_record_by_id(record_id)
        if not existing:
            return jsonify({'error': 'Record not found'}), 404
        if existing['user_uuid'] != user_uuid:
            return jsonify({'error': 'Record does not belong to this user'}), 403
        # Deletion mode comes from the query string.
        mode_flag = request.args.get('soft_delete', 'true')
        soft_delete = mode_flag.lower() == 'true'
        deleted = clickhouse_client.delete_user_record(record_id, soft_delete)
        if not deleted:
            return jsonify({'error': 'Failed to delete record'}), 500
        if soft_delete:
            action = 'soft deleted'
        else:
            action = 'permanently deleted'
        body = {
            'message': f'Record {action} successfully',
            'record_id': record_id,
            'user_uuid': user_uuid
        }
        return jsonify(body)
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
@app.route('/users/<user_uuid>/records', methods=['DELETE'])
def delete_all_records(user_uuid):
    """Delete every record of ``user_uuid``.

    The ``soft_delete`` query parameter (default ``true``) is forwarded to
    ``clickhouse_client.delete_all_user_records`` as a boolean. Responds 500
    when the client reports failure or an exception is raised.
    """
    try:
        # Deletion mode comes from the query string.
        mode_flag = request.args.get('soft_delete', 'true')
        soft_delete = mode_flag.lower() == 'true'
        deleted = clickhouse_client.delete_all_user_records(user_uuid, soft_delete)
        if not deleted:
            return jsonify({'error': 'Failed to delete records'}), 500
        if soft_delete:
            action = 'soft deleted'
        else:
            action = 'permanently deleted'
        body = {
            'message': f'All records for user {user_uuid} have been {action}',
            'user_uuid': user_uuid
        }
        return jsonify(body)
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
@app.route('/users/<user_uuid>/records/<record_id>/restore', methods=['PUT'])
def restore_record(user_uuid, record_id):
    """Restore a previously deleted record of ``user_uuid``.

    Responds 404 when the record is not found, 403 when it belongs to
    another user, and 500 when ``clickhouse_client.restore_user_record``
    reports failure or an exception is raised.
    """
    try:
        # The record must exist and be owned by the user in the URL.
        existing = clickhouse_client.get_user_record_by_id(record_id)
        if not existing:
            return jsonify({'error': 'Record not found'}), 404
        if existing['user_uuid'] != user_uuid:
            return jsonify({'error': 'Record does not belong to this user'}), 403
        restored = clickhouse_client.restore_user_record(record_id)
        if not restored:
            return jsonify({'error': 'Failed to restore record'}), 500
        body = {
            'message': 'Record restored successfully',
            'record_id': record_id,
            'user_uuid': user_uuid
        }
        return jsonify(body)
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
@app.route('/users/<user_uuid>/search', methods=['GET'])
def search_records(user_uuid):
    """Search the records of ``user_uuid``.

    Query parameters:
        keyword: passed through to the client unchanged (``None`` if absent).
        is_delete: optional; converted to ``int`` when present.

    Returns:
        200 with ``user_uuid``, ``count`` and ``records``; 400 when
        ``is_delete`` is present but not an integer; 500 with the error text
        when ``clickhouse_client.search_user_records`` raises.
    """
    keyword = request.args.get('keyword')
    is_delete_param = request.args.get('is_delete')
    is_delete = None
    if is_delete_param is not None:
        try:
            is_delete = int(is_delete_param)
        except ValueError:
            # A malformed filter is the caller's mistake, so answer 400; it
            # used to reach the broad handler and come back as a 500.
            return jsonify({'error': 'is_delete must be an integer'}), 400
    try:
        records = clickhouse_client.search_user_records(user_uuid, keyword, is_delete)
        return jsonify({
            'user_uuid': user_uuid,
            'count': len(records),
            'records': records
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/users/<user_uuid>/stats', methods=['GET'])
def get_user_stats(user_uuid):
    """Return statistics for ``user_uuid``.

    When ``clickhouse_client.get_user_stats`` returns a falsy value, a
    zero-filled placeholder is returned instead. Any exception is reported
    as HTTP 500 with the error text.
    """
    try:
        stats = clickhouse_client.get_user_stats(user_uuid)
        if not stats:
            # Nothing came back for this user: report empty statistics.
            stats = {
                'total_count': 0,
                'active_count': 0,
                'deleted_count': 0,
                'first_created': None,
                'last_created': None
            }
        return jsonify({
            'user_uuid': user_uuid,
            'stats': stats
        })
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
# ========== 错误处理 ==========
@app.errorhandler(404)
def not_found(error):
    """Answer 404 errors with a JSON body."""
    body = jsonify({'error': 'Not found'})
    return body, 404
@app.errorhandler(500)
def internal_error(error):
    """Answer 500 errors with a JSON body."""
    body = jsonify({'error': 'Internal server error'})
    return body, 500
# ========== 主程序 ==========
if __name__ == '__main__':
    # Create the upload directory if it is not there yet.
    os.makedirs('uploads', exist_ok=True)
    # Start the Flask application, listening on every interface.
    server_options = {
        'host': '0.0.0.0',
        'port': 28002,
        'debug': Config.DEBUG
    }
    app.run(**server_options)

7
redis_check_uuid_mistral.py

@ -28,11 +28,12 @@ from threading import Thread
import time
app = flask.Flask(__name__)
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=6, password="zhicheng123*")
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=17, password="zhicheng123*")
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
db_key_query = 'queryset_check_task'
db_key_querying = 'querying_check_task'
db_key_queryset = 'queryset_check_task'
db_key_query = 'query_recall'
db_key_error = 'error'
@app.route("/search", methods=["POST"])
@ -89,4 +90,4 @@ def handle_query():
if __name__ == "__main__":
app.run(debug=False, host='0.0.0.0', port=14001)
app.run(debug=False, host='0.0.0.0', port=28001)

Loading…
Cancel
Save