From 823923c92761f82d710d3ef00e71facc84a988ef Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Mon, 14 Apr 2025 17:16:19 +0800 Subject: [PATCH 01/13] =?UTF-8?q?=E5=8D=95=E6=9D=A1=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 341 ++++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 248 insertions(+), 93 deletions(-) diff --git a/main.py b/main.py index 306b1bf..87f8823 100644 --- a/main.py +++ b/main.py @@ -2,21 +2,37 @@ # 按 Shift+F10 执行或将其替换为您的代码。 # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。 +import os import faiss import numpy as np from tqdm import tqdm from sentence_transformers import SentenceTransformer import requests import time -from flask import Flask, jsonify -from flask import request +from flask import Flask, jsonify, Response, request +from openai import OpenAI +from flask_cors import CORS import pandas as pd +import concurrent.futures +import json app = Flask(__name__) +CORS(app) app.config["JSON_AS_ASCII"] = False -model = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') +openai_api_key = "token-abc123" +openai_api_base = "http://127.0.0.1:12011/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +# model = models.data[0].id +model = "1" +model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下: {} 根据这些症状,我通过查找资料,{} @@ -26,7 +42,7 @@ propmt_connect_ziliao = '''在“{}”资料中,有如下相关内容: {}''' -def dialog_line_parse(url, text): +def dialog_line_parse(text): """ 将数据输入模型进行分析并输出结果 :param url: 模型url @@ -34,8 +50,9 @@ def dialog_line_parse(url, text): :return: 模型返回结果 """ + url_predict = "http://118.178.228.101:12004/predict" response = requests.post( - url, + url_predict, json=text, timeout=100000 ) @@ -49,46 +66,167 @@ def dialog_line_parse(url, text): # ) print("【{}】 Failed to get a proper response from remote " "server. Status Code: {}. 
Response: {}" - "".format(url, response.status_code, response.text)) + "".format(url_predict, response.status_code, response.text)) return {} + # ['choices'][0]['message']['content'] + # + # text = text['messages'][0]['content'] + # return_text = { + # 'code': 200, + # 'id': "1", + # 'object': 0, + # 'created': 0, + # 'model': 0, + # 'choices': [ + # { + # 'index': 0, + # 'message': { + # 'role': 'assistant', + # 'content': text + # }, + # 'logprobs': None, + # 'finish_reason': 'stop' + # } + # ], + # 'usage': 0, + # 'system_fingerprint': 0 + # } + # return return_text + def shengcehng_array(data): - embs = model.encode(data, normalize_embeddings=True) + embs = model_encode.encode(data, normalize_embeddings=True) return embs -def Building_vector_database(type, name, df): - data_ndarray = np.empty((0, 1024)) - for sen in df: - data_ndarray = np.concatenate((data_ndarray, shengcehng_array([sen[0]]))) - print("data_ndarray.shape", data_ndarray.shape) - print("data_ndarray.shape", data_ndarray.shape) - np.save(f'data_np/{name}.npy', data_ndarray) +def Building_vector_database(title, df): + # 加载需要处理的数据(有效且未向量化) + to_process = df[(df["有效"] == True) & (df["已向量化"] == False)] + if len(to_process) == 0: + print("无新增数据需要向量化") + return -def ulit_request_file(file, title): - file_name = file.filename - file_name_save = "data_file/{}.csv".format(title) - file.save(file_name_save) + # 生成向量数组 + new_vectors = shengcehng_array(to_process["总结"].tolist()) # 假设这是你的向量生成函数 - # try: - # with open(file_name_save, encoding="gbk") as f: - # content = f.read() - # except: - # with open(file_name_save, encoding="utf-8") as f: - # content = f.read() - # elif file_name.split(".")[-1] == "docx": - # content = docx2txt.process(file_name_save) + # 加载现有向量库和索引 + vector_path = f"data_np/{title}.npy" + index_path = f"data_np/{title}_index.json" - # content_list = [i for i in content.split("\n")] - df = pd.read_csv(file_name_save, sep="\t", encoding="utf-8").values.tolist() + vectors = np.load(vector_path) if os.path.exists(vector_path) else np.empty((0, 1024)) + index_data = {} + if os.path.exists(index_path): + with open(index_path, "r") as f: + index_data = json.load(f) - return df + # 更新索引和向量库 + start_idx = len(vectors) + vectors = np.vstack([vectors, new_vectors]) + + for i, (_, row) in enumerate(to_process.iterrows()): + index_data[row["ID"]] = { + "row": start_idx + i, + "valid": True + } + + # 保存数据 + np.save(vector_path, vectors) + with open(index_path, "w") as f: + json.dump(index_data, f) + # 标记已向量化 + df.loc[to_process.index, "已向量化"] = True + df.to_csv(f"data_file_res/{title}.csv", sep="\t", index=False) + + +def delete_data(title, data_id): + # 更新CSV标记 + csv_path = f"data_file_res/{title}.csv" + df = pd.read_csv(csv_path, sep="\t", dtype={"ID": str}) + df.loc[df["ID"] == data_id, "有效"] = False + df.to_csv(csv_path, sep="\t", index=False) + + # 更新索引标记 + index_path = f"data_np/{title}_index.json" + if os.path.exists(index_path): + with open(index_path, "r+") as f: + index_data = json.load(f) + if data_id in index_data: + index_data[data_id]["valid"] = False + f.seek(0) + json.dump(index_data, f) + f.truncate() + + +def check_file_exists(file_path): + """ + 检查文件是否存在 -def main(question, db_type, top): + 参数: + file_path (str): 要检查的文件路径 + + 返回: + bool: 文件存在返回True,否则返回False + """ + return os.path.isfile(file_path) + + +def ulit_request_file(new_id, sentence, title): + file_name_res_save = f"data_file_res/{title}.csv" + + # 初始化或读取CSV文件 + if os.path.exists(file_name_res_save): + df = pd.read_csv(file_name_res_save, sep="\t") 
+ # 检查是否已存在相同正文 + if sentence in df["正文"].values: + print("正文已存在,跳过处理") + return df + else: + df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"]) + + # 添加新数据(生成唯一ID) + new_row = { + "ID": str(new_id), + "正文": sentence, + "总结": None, + "有效": True, + "已向量化": False + } + df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) + + # 筛选需要处理的记录 + to_process = df[(df["总结"].isna()) & (df["有效"] == True)] + + # 调用API生成总结(示例保留原有逻辑) + data_list = [] + for _, row in to_process.iterrows(): + data_list.append({ + "model": "gpt-4-turbo", + "messages": [{ + "role": "user", + "content": f"{row['正文']}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" + }], + "top_p": 0.9, + "temperature": 0.6 + }) + + # 并发处理请求 + with concurrent.futures.ThreadPoolExecutor(200) as executor: + results = list(executor.map(dialog_line_parse, data_list)) + + # 更新总结字段 + for idx, result in zip(to_process.index, results): + summary = result['choices'][0]['message']['content'] + df.at[idx, "总结"] = summary + + # 保存更新后的CSV + df.to_csv(file_name_res_save, sep="\t", index=False) + return df + +def main(question, title, top): db_dict = { "1": "yetianshi" } @@ -114,30 +252,43 @@ def main(question, db_type, top): 根据提问匹配上下文 ''' d = 1024 - db_type_list = db_type.split(",") + db_type_list = title.split(",") paper_list_str = "" - for i in db_type_list: + for title_dan in db_type_list: embs = shengcehng_array([question]) index = faiss.IndexFlatIP(d) # buid the index - data_np = np.load(f"data_np/{i}.npy") - # data_str = open(f"data_file/{i}.txt").read().split("\n") - data_str = pd.read_csv(f"data_file/{i}.csv", sep="\t", encoding="utf-8").values.tolist() - index.add(data_np) + + # 查找向量 + vector_path = f"data_np/{title_dan}.npy" + index_path = f"data_np/{title_dan}_index.json" + + if not os.path.exists(vector_path) or not os.path.exists(index_path): + return np.empty((0, 1024)) + + vectors = np.load(vector_path) + with open(index_path, "r") as f: + index_data = json.load(f) + + data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + index.add(vectors) D, I = index.search(embs, int(top)) print(I) reference_list = [] for i,j in zip(I[0], D[0]): - reference_list.append([data_str[i], j]) + if data_str[i][3] == True: + reference_list.append([data_str[i], j]) for i,j in enumerate(reference_list): - paper_list_str += "第{}篇\n{},此篇文章的转发数为{},评论数为{},点赞数为{}\n,此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[0][1], j[0][2], j[0][3], j[1]) + paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1]) + ''' 构造prompt ''' print("paper_list_str", paper_list_str) + 9/0 propmt_connect_ziliao_input = [] for i in db_type_list: propmt_connect_ziliao_input.append(propmt_connect_ziliao.format(i, paper_list_str)) @@ -147,61 +298,70 @@ def main(question, db_type, top): ''' 生成回答 ''' - url_predict = "http://192.168.31.74:26000/predict" - url_search = "http://192.168.31.74:26000/search" - - # data = { - # "content": propmt_connect_input - # } - data = { - "content": propmt_connect_input, - "model": "qwq-32", - "top_p": 0.9, - "temperature": 0.6 - } - - res = dialog_line_parse(url_predict, data) - id_ = res["texts"]["id"] - - data = { - "id": id_ - } - - while True: - res = dialog_line_parse(url_search, data) - if res["code"] == 200: - break - else: - time.sleep(1) - spilt_str = "" - think, response = str(res["text"]).split(spilt_str) - return think, response - - -@app.route("/upload_file", methods=["POST"]) -def upload_file(): - 
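# A minimal sketch of the retrieval step main() above performs. Because the
# encoder runs with normalize_embeddings=True, inner product over
# IndexFlatIP is exactly cosine similarity: D comes back in [-1, 1] and I
# holds row offsets into whatever table the vectors were built from. The
# random corpus below is purely illustrative.
import faiss
import numpy as np

d = 1024  # embedding width of bge-large-zh-v1.5
rng = np.random.default_rng(0)
corpus = rng.normal(size=(100, d)).astype("float32")
corpus /= np.linalg.norm(corpus, axis=1, keepdims=True)  # L2-normalize rows

index = faiss.IndexFlatIP(d)  # exact inner-product (here: cosine) index
index.add(corpus)

query = corpus[:1]  # an already-normalized query vector
D, I = index.search(query, 5)  # top-5 scores and their row ids
print(I[0], D[0])  # I[0][0] == 0 and D[0][0] ~= 1.0: the query matches itself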
print(request.remote_addr) - file = request.files.get('file') - title = request.form.get("title") - df = ulit_request_file(file, title) - Building_vector_database("1", title, df) - return_json = { - "code": 200, - "info": "上传完成" - } - return jsonify(return_json) # 返回结果 + return model_generate_stream(propmt_connect_input) + + +def model_generate_stream(prompt): + messages = [ + {"role": "user", "content": prompt} + ] + + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + reasoning_content = None + content = None + # Check the content is reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + # print(content, end="", flush=True) + print(content) + yield content @app.route("/upload_file_check", methods=["POST"]) def upload_file_check(): print(request.remote_addr) - file = request.files.get('file') + sentence = request.form.get('sentence') title = request.form.get("title") - df = ulit_request_file(file, title) - Building_vector_database("1", title, df) + new_id = request.form.get("id") + state = request.form.get("state") + ''' + { + "1": "csv", + "2": "xlsx", + "3": "txt", + "4": "pdf" + } + ''' + state_res = "" + if state == "1": + df = ulit_request_file(new_id, sentence, title) + Building_vector_database(title, df) + state_res = "上传完成" + elif state == "2": + delete_data(title, new_id) + state_res = "删除完成" return_json = { "code": 200, - "info": "上传完成" + "info": state_res } return jsonify(return_json) # 返回结果 @@ -210,15 +370,10 @@ def upload_file_check(): def search(): print(request.remote_addr) texts = request.json["texts"] - text_type = request.json["text_type"] + title = request.json["title"] top = request.json["top"] - think, response = main(texts, text_type, top) - return_json = { - "code": 200, - "think": think, - "response": response - } - return jsonify(return_json) # 返回结果 + response = main(texts, title, top) + return Response(response, mimetype='text/plain; charset=utf-8') # 返回结果 if __name__ == "__main__": -- 1.8.3.1 From 1e9375725481639a4840656976e4f9dbae00d5bd Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Tue, 15 Apr 2025 14:45:31 +0800 Subject: [PATCH 02/13] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=89=B9=E9=87=8F?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=87=BA=E9=94=99=E9=97=AE=E9=A2=98=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E5=8F=91=E6=89=BF=E8=BD=BD=E5=8A=9B=E6=9B=B4=E5=BC=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 59 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/main.py b/main.py index 87f8823..c38157d 100644 --- a/main.py +++ b/main.py @@ -15,21 +15,29 @@ from flask_cors import CORS import pandas as pd import concurrent.futures import json - +from threading import Thread +import redis app = Flask(__name__) CORS(app) app.config["JSON_AS_ASCII"] = False -openai_api_key = 
"token-abc123" -openai_api_base = "http://127.0.0.1:12011/v1" +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=1, password="zhicheng123*") +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +db_key_query = 'query' +db_key_querying = 'querying' +batch_size = 32 -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) +# openai_api_key = "token-abc123" +# openai_api_base = "http://127.0.0.1:12011/v1" +# +# client = OpenAI( +# api_key=openai_api_key, +# base_url=openai_api_base, +# ) -models = client.models.list() +# models = client.models.list() # model = models.data[0].id model = "1" model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') @@ -288,7 +296,6 @@ def main(question, title, top): 构造prompt ''' print("paper_list_str", paper_list_str) - 9/0 propmt_connect_ziliao_input = [] for i in db_type_list: propmt_connect_ziliao_input.append(propmt_connect_ziliao.format(i, paper_list_str)) @@ -300,6 +307,20 @@ def main(question, title, top): ''' return model_generate_stream(propmt_connect_input) +def classify(): # 调用模型,设置最大batch_size + while True: + if redis_.llen(db_key_query) == 0: # 若队列中没有元素就继续获取 + time.sleep(3) + continue + query = redis_.lpop(db_key_query).decode('UTF-8') # 获取query的text + data_dict = json.loads(query) + if data_dict["state"] == "1": + new_id = data_dict["id"] + sentence = data_dict["sentence"] + title = data_dict["title"] + df = ulit_request_file(new_id, sentence, title) + Building_vector_database(title, df) + def model_generate_stream(prompt): messages = [ @@ -339,10 +360,10 @@ def model_generate_stream(prompt): @app.route("/upload_file_check", methods=["POST"]) def upload_file_check(): print(request.remote_addr) - sentence = request.form.get('sentence') - title = request.form.get("title") - new_id = request.form.get("id") - state = request.form.get("state") + sentence = request.json['sentence'] + title = request.json["title"] + new_id = request.json["id"] + state = request.json["state"] ''' { "1": "csv", @@ -353,8 +374,14 @@ def upload_file_check(): ''' state_res = "" if state == "1": - df = ulit_request_file(new_id, sentence, title) - Building_vector_database(title, df) + # df = ulit_request_file(new_id, sentence, title) + # Building_vector_database(title, df) + redis_.rpush(db_key_query, json.dumps({ + "id": new_id, + "sentence": sentence, + "state": state, + "title": title + })) # 加入redis state_res = "上传完成" elif state == "2": delete_data(title, new_id) @@ -375,6 +402,8 @@ def search(): response = main(texts, title, top) return Response(response, mimetype='text/plain; charset=utf-8') # 返回结果 +t = Thread(target=classify) +t.start() if __name__ == "__main__": app.run(host="0.0.0.0", port=27000, threaded=True, debug=False) -- 1.8.3.1 From 90af48046f88c9f99353cfe05ba35cd610b2b252 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Fri, 18 Apr 2025 15:52:38 +0800 Subject: [PATCH 03/13] =?UTF-8?q?=E5=90=91=E9=87=8F=E8=9E=8D=E5=90=88?= =?UTF-8?q?=E5=88=B0csv=E4=B8=AD=EF=BC=8C=E5=B9=B6=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E5=A2=9E=E5=88=A0=E6=94=B9=E6=9F=A5=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 175 +++++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 124 insertions(+), 51 deletions(-) diff --git a/main.py b/main.py index c38157d..486d827 100644 --- a/main.py +++ b/main.py @@ -29,17 +29,17 @@ db_key_query = 'query' db_key_querying = 'querying' batch_size = 32 
-# openai_api_key = "token-abc123" -# openai_api_base = "http://127.0.0.1:12011/v1" -# -# client = OpenAI( -# api_key=openai_api_key, -# base_url=openai_api_base, -# ) - -# models = client.models.list() -# model = models.data[0].id -model = "1" +openai_api_key = "token-abc123" +openai_api_base = "http://127.0.0.1:12011/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id +# model = "1" model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下: {} @@ -157,17 +157,6 @@ def delete_data(title, data_id): df.loc[df["ID"] == data_id, "有效"] = False df.to_csv(csv_path, sep="\t", index=False) - # 更新索引标记 - index_path = f"data_np/{title}_index.json" - if os.path.exists(index_path): - with open(index_path, "r+") as f: - index_data = json.load(f) - if data_id in index_data: - index_data[data_id]["valid"] = False - f.seek(0) - json.dump(index_data, f) - f.truncate() - def check_file_exists(file_path): """ @@ -188,20 +177,20 @@ def ulit_request_file(new_id, sentence, title): # 初始化或读取CSV文件 if os.path.exists(file_name_res_save): df = pd.read_csv(file_name_res_save, sep="\t") - # 检查是否已存在相同正文 - if sentence in df["正文"].values: - print("正文已存在,跳过处理") - return df + # # 检查是否已存在相同正文 + # if sentence in df["正文"].values: + # print("正文已存在,跳过处理") + # return df else: - df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"]) + df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化", "向量"]) - # 添加新数据(生成唯一ID) new_row = { - "ID": str(new_id), + "ID": new_id, "正文": sentence, "总结": None, "有效": True, - "已向量化": False + "已向量化": False, + "向量": None, } df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) @@ -230,9 +219,19 @@ def ulit_request_file(new_id, sentence, title): summary = result['choices'][0]['message']['content'] df.at[idx, "总结"] = summary - # 保存更新后的CSV + # df.loc[df.index[2], "总结"] = None + # df.loc[df.index[3], "总结"] = None + # df.loc[df.index[4], "总结"] = None + # df.loc[df.index[5], "总结"] = None + + df_ce = df[(df["有效"] == True) & (df["总结"].notnull())] + for idx in df_ce.index: + a = shengcehng_array([df_ce.at[idx, "总结"]]) + df.at[idx, "向量"] = json.dumps(a[0].tolist()) + df.at[idx, "已向量化"] = True + df.to_csv(file_name_res_save, sep="\t", index=False) - return df + def main(question, title, top): db_dict = { @@ -268,28 +267,28 @@ def main(question, title, top): index = faiss.IndexFlatIP(d) # buid the index # 查找向量 - vector_path = f"data_np/{title_dan}.npy" - index_path = f"data_np/{title_dan}_index.json" + file_name_res_save = f"data_file_res/{title_dan}.csv" + df = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8") + df_ce = df[df["有效"] == True] - if not os.path.exists(vector_path) or not os.path.exists(index_path): - return np.empty((0, 1024)) + print(df_ce.shape) + data_np = [] + for idx in df_ce.index: + data_np.append(json.loads(df.loc[idx, "向量"])) - vectors = np.load(vector_path) - with open(index_path, "r") as f: - index_data = json.load(f) + vectors = np.array(data_np, dtype=object) - data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + # data_str = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8").values.tolist() index.add(vectors) D, I = index.search(embs, int(top)) print(I) reference_list = [] for i,j in zip(I[0], D[0]): - if data_str[i][3] == True: - reference_list.append([data_str[i], j]) + reference_list.append([df_ce.loc[df_ce.index[i], "正文"], j]) 
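# Patch 03 folds each embedding into the CSV itself as a JSON string in the
# 向量 column and rebuilds the matrix per query, roughly as below. One caveat
# worth flagging: faiss only accepts contiguous float32 input, so the
# dtype=object array assembled above would need an explicit float32
# conversion before index.add(); the sketch performs that conversion.
import json
import numpy as np
import pandas as pd

def load_valid_vectors(csv_path):
    df = pd.read_csv(csv_path, sep="\t")
    df_ce = df[df["有效"] == True]  # keep only rows not soft-deleted
    rows = [json.loads(s) for s in df_ce["向量"]]
    return df_ce, np.asarray(rows, dtype="float32")  # faiss-ready matrix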
for i,j in enumerate(reference_list): - paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1]) + paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0], j[1]) ''' @@ -318,8 +317,48 @@ def classify(): # 调用模型,设置最大batch_size new_id = data_dict["id"] sentence = data_dict["sentence"] title = data_dict["title"] - df = ulit_request_file(new_id, sentence, title) - Building_vector_database(title, df) + ulit_request_file(new_id, sentence, title) + + +def add_dan_data(new_id, sentence, title): + file_name_res_save = f"data_file_res/{title}.csv" + + # 初始化或读取CSV文件 + + df = pd.read_csv(file_name_res_save, sep="\t") + + if sentence in df["正文"].values: + print("正文已存在,跳过处理") + return False + else: + ulit_request_file(new_id, sentence, title) + return True + +def updata_dan_data(new_id, sentence, title): + file_name_res_save = f"data_file_res/{title}.csv" + df = pd.read_csv(file_name_res_save, sep="\t") + + + # 筛选需要处理的记录 + propmt_connect = { + "model": "gpt-4-turbo", + "messages": [{ + "role": "user", + "content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" + }], + "top_p": 0.9, + "temperature": 0.6 + } + result = dialog_line_parse(propmt_connect) + print(result) + summary = result['choices'][0]['message']['content'] + # 更新总结,正文字段 + df.loc[df["ID"] == new_id, "总结"] = summary + df.loc[df["ID"] == new_id, "正文"] = sentence + + a = shengcehng_array([summary]) + df.loc[df["ID"] == new_id, "向量"] = json.dumps(a[0].tolist()) + df.to_csv(file_name_res_save, sep="\t", index=False) def model_generate_stream(prompt): @@ -357,13 +396,13 @@ def model_generate_stream(prompt): yield content -@app.route("/upload_file_check", methods=["POST"]) -def upload_file_check(): +@app.route("/upload_file", methods=["POST"]) +def upload_file(): print(request.remote_addr) sentence = request.json['sentence'] title = request.json["title"] new_id = request.json["id"] - state = request.json["state"] + state = request.json["state"] # 1: 批量新增 2:单条新增 3:单挑修改 4: 单条删除 ''' { "1": "csv", @@ -374,18 +413,28 @@ def upload_file_check(): ''' state_res = "" if state == "1": - # df = ulit_request_file(new_id, sentence, title) - # Building_vector_database(title, df) redis_.rpush(db_key_query, json.dumps({ "id": new_id, "sentence": sentence, "state": state, "title": title })) # 加入redis - state_res = "上传完成" + state_res = "上传完成,正在排队处理数据" elif state == "2": + info_bool = add_dan_data(new_id, sentence, title) + if info_bool == True: + state_res = "上传完成" + else: + state_res = "上传失败,库中有重复数据" + + elif state == "3": + updata_dan_data(new_id, sentence, title) + state_res = "修改完成" + + elif state == "4": delete_data(title, new_id) state_res = "删除完成" + return_json = { "code": 200, "info": state_res @@ -393,6 +442,30 @@ def upload_file_check(): return jsonify(return_json) # 返回结果 +@app.route("/upload_file_check", methods=["POST"]) +def upload_file_check(): + print(request.remote_addr) + new_id = request.json["id"] + data_list = redis_.lrange(db_key_query, 0, -1) # 0 表示开始,-1 表示结束(全部) + # 解析 JSON 数据 + data_list_id_ = [] + for item in data_list: + data = json.loads(item.decode("utf-8")) # Redis 返回的是 bytes,需要 decode + json.loads + data_list_id_.append(data["id"]) + if new_id in data_list_id_: + return_json = { + "code": 200, + "info": "上传中" + } + return jsonify(return_json) + else: + return_json = { + "code": 200, + "info": "已入库" + } + return jsonify(return_json) + + @app.route("/search", methods=["POST"]) def search(): print(request.remote_addr) -- 1.8.3.1 From 
9c1cc4c768c9a44d26557b52ca988030c9d834be Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Sat, 19 Apr 2025 03:07:29 +0800 Subject: [PATCH 04/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0scokt=E8=AF=B7=E6=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main_scokt.py | 241 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 main_scokt.py diff --git a/main_scokt.py b/main_scokt.py new file mode 100644 index 0000000..bf75674 --- /dev/null +++ b/main_scokt.py @@ -0,0 +1,241 @@ +# 这是一个示例 Python 脚本。 + +# 按 Shift+F10 执行或将其替换为您的代码。 +# 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。 +import os +import faiss +import numpy as np +from tqdm import tqdm +from sentence_transformers import SentenceTransformer +import requests +import time +from flask import Flask, jsonify, Response, request +from openai import OpenAI +from flask_cors import CORS +import pandas as pd +import concurrent.futures +import json +from threading import Thread +import redis +import asyncio +import websockets + +app = Flask(__name__) +CORS(app) +app.config["JSON_AS_ASCII"] = False + +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=1, password="zhicheng123*") +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +db_key_query = 'query' +db_key_querying = 'querying' +batch_size = 32 + +openai_api_key = "token-abc123" +openai_api_base = "http://127.0.0.1:12011/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id +# model = "1" +model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') +propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下: +{} +根据这些症状,我通过查找资料,{} +请根据上面的这些资料和方子,并根据每篇文章的转发数确定文章的重要程度,转发数越高的文章,最终答案的参考度越高,反之越低。根据患者的症状和上面的文章的资料的重要程度以及文章和症状的匹配程度,帮我开出正确的药方和治疗方案''' + +propmt_connect_ziliao = '''在“{}”资料中,有如下相关内容: +{}''' + + +def dialog_line_parse(text): + """ + 将数据输入模型进行分析并输出结果 + :param url: 模型url + :param text: 进入模型的数据 + :return: 模型返回结果 + """ + + url_predict = "http://118.178.228.101:12004/predict" + response = requests.post( + url_predict, + json=text, + timeout=100000 + ) + if response.status_code == 200: + return response.json() + else: + # logger.error( + # "【{}】 Failed to get a proper response from remote " + # "server. Status Code: {}. Response: {}" + # "".format(url, response.status_code, response.text) + # ) + print("【{}】 Failed to get a proper response from remote " + "server. Status Code: {}. Response: {}" + "".format(url_predict, response.status_code, response.text)) + return {} + + # ['choices'][0]['message']['content'] + # + # text = text['messages'][0]['content'] + # return_text = { + # 'code': 200, + # 'id': "1", + # 'object': 0, + # 'created': 0, + # 'model': 0, + # 'choices': [ + # { + # 'index': 0, + # 'message': { + # 'role': 'assistant', + # 'content': text + # }, + # 'logprobs': None, + # 'finish_reason': 'stop' + # } + # ], + # 'usage': 0, + # 'system_fingerprint': 0 + # } + # return return_text + + +def shengcehng_array(data): + embs = model_encode.encode(data, normalize_embeddings=True) + return embs + + +def main(question, title, top): + db_dict = { + "1": "yetianshi" + } + ''' + 定义文件路径 + ''' + + ''' + 加载文件 + ''' + + ''' + 文本分割 + ''' + + ''' + 构建向量数据库 + 1. 正常匹配 + 2. 
把文本使用大模型生成一个问题之后再进行匹配 + ''' + + ''' + 根据提问匹配上下文 + ''' + d = 1024 + db_type_list = title.split(",") + + paper_list_str = "" + for title_dan in db_type_list: + embs = shengcehng_array([question]) + index = faiss.IndexFlatIP(d) # buid the index + + # 查找向量 + vector_path = f"data_np/{title_dan}.npy" + vectors = np.load(vector_path) + + data_str = pd.read_csv(f"data_file/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + index.add(vectors) + D, I = index.search(embs, int(top)) + print(I) + + reference_list = [] + for i, j in zip(I[0], D[0]): + reference_list.append([data_str[i], j]) + + for i, j in enumerate(reference_list): + paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i + 1), j[0][1], j[1]) + + + ''' + 构造prompt + ''' + print("paper_list_str", paper_list_str) + propmt_connect_ziliao_input = [] + for i in db_type_list: + propmt_connect_ziliao_input.append(propmt_connect_ziliao.format(i, paper_list_str)) + + propmt_connect_ziliao_input_str = ",".join(propmt_connect_ziliao_input) + propmt_connect_input = propmt_connect.format(question, propmt_connect_ziliao_input_str) + ''' + 生成回答 + ''' + return model_generate_stream(propmt_connect_input) + + +def model_generate_stream(prompt): + messages = [ + {"role": "user", "content": prompt} + ] + + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + reasoning_content = None + content = None + # Check the content is reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + # print(content, end="", flush=True) + print(content) + yield content + + +async def handle_websocket(websocket): + print("客户端已连接") + try: + while True: + message = await websocket.recv() + data = json.loads(message) + texts = data.get("texts") + title = data.get("title") + top = data.get("top") + print("收到消息:", message) + + response = main(texts, title, top) + # response = message + "111" + for char in response: + await websocket.send(char) + # await asyncio.sleep(0.3) + await websocket.send("[DONE]") + except websockets.exceptions.ConnectionClosed: + print("客户端断开连接") + + +async def main_api(): + async with websockets.serve(handle_websocket, "0.0.0.0", 27001): + print("WebSocket 服务器已启动,监听端口 27001") + await asyncio.Future() # 永久运行 + +if __name__ == "__main__": + asyncio.run(main_api()) # 正确启动事件循环 -- 1.8.3.1 From a83c265f4eca900773495828207fb068c6e360c9 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Mon, 21 Apr 2025 10:16:15 +0800 Subject: [PATCH 05/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ceshi_scokt.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 ceshi_scokt.py diff --git a/ceshi_scokt.py b/ceshi_scokt.py new file mode 100644 index 0000000..ba14324 --- /dev/null +++ 
b/ceshi_scokt.py @@ -0,0 +1,85 @@ +from openai import OpenAI + +openai_api_key = "token-abc123" +openai_api_base = "http://127.0.0.1:12011/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + + +def model_generate_stream(prompt): + messages = [ + {"role": "user", "content": prompt} + ] + + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + reasoning_content = None + content = None + # Check the content is reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + # print(content, end="", flush=True) + print(content) + yield content +# if __name__ == '__main__': +# for i in model_generate_stream("你好"): +# print(i) + + +import asyncio +import websockets +import json + + +async def handle_websocket(websocket): + print("客户端已连接") + try: + while True: + message = await websocket.recv() + print("收到消息:", message) + + data = json.loads(message) + texts = data.get("texts") + title = data.get("title") + top = data.get("top") + + response = model_generate_stream(texts) + # response = message + "111" + for char in response: + await websocket.send(char) + # await asyncio.sleep(0.3) + await websocket.send("[DONE]") + except websockets.exceptions.ConnectionClosed: + print("客户端断开连接") + +async def main(): + async with websockets.serve(handle_websocket, "0.0.0.0", 5500): + print("WebSocket 服务器已启动,监听端口 5500") + await asyncio.Future() # 永久运行 + +if __name__ == "__main__": + asyncio.run(main()) # 正确启动事件循环 \ No newline at end of file -- 1.8.3.1 From 2c91b46a6651cda200e1e082cd7eab485399c94c Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Mon, 18 Aug 2025 15:45:37 +0800 Subject: [PATCH 06/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=EF=BC=8C=E5=B9=B6=E6=9B=B4=E6=94=B9=E6=B5=81?= =?UTF-8?q?=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 184 ++++++++++++---------------------------------------------- main_scokt.py | 2 + 2 files changed, 39 insertions(+), 147 deletions(-) diff --git a/main.py b/main.py index 486d827..0b30e43 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ # 按 Shift+F10 执行或将其替换为您的代码。 # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。 import os +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" import faiss import numpy as np from tqdm import tqdm @@ -15,20 +16,12 @@ from flask_cors import CORS import pandas as pd import concurrent.futures import json -from threading import Thread -import redis + app = Flask(__name__) CORS(app) app.config["JSON_AS_ASCII"] = False -pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=1, password="zhicheng123*") -redis_ = redis.Redis(connection_pool=pool, decode_responses=True) - -db_key_query = 'query' -db_key_querying = 'querying' -batch_size = 32 - openai_api_key = "token-abc123" 
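# A compact sketch of the streaming pattern model_generate_stream uses with
# this client: iterate the chunked completion, route reasoning_content
# (emitted by reasoning models behind a vLLM-style /v1 endpoint) to the
# console, and yield only answer tokens to the caller. getattr with a
# default collapses the hasattr/elif pair used above.
from openai import OpenAI

client = OpenAI(api_key="token-abc123", base_url="http://127.0.0.1:12011/v1")

def stream_answer(model: str, prompt: str):
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    for chunk in stream:
        delta = chunk.choices[0].delta
        reasoning = getattr(delta, "reasoning_content", None)
        if reasoning:  # thinking tokens: log them, do not yield
            print(reasoning, end="", flush=True)
        elif delta.content:  # answer tokens: hand to the caller
            yield delta.content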
openai_api_base = "http://127.0.0.1:12011/v1" @@ -157,6 +150,17 @@ def delete_data(title, data_id): df.loc[df["ID"] == data_id, "有效"] = False df.to_csv(csv_path, sep="\t", index=False) + # 更新索引标记 + index_path = f"data_np/{title}_index.json" + if os.path.exists(index_path): + with open(index_path, "r+") as f: + index_data = json.load(f) + if data_id in index_data: + index_data[data_id]["valid"] = False + f.seek(0) + json.dump(index_data, f) + f.truncate() + def check_file_exists(file_path): """ @@ -177,20 +181,20 @@ def ulit_request_file(new_id, sentence, title): # 初始化或读取CSV文件 if os.path.exists(file_name_res_save): df = pd.read_csv(file_name_res_save, sep="\t") - # # 检查是否已存在相同正文 - # if sentence in df["正文"].values: - # print("正文已存在,跳过处理") - # return df + # 检查是否已存在相同正文 + if sentence in df["正文"].values: + print("正文已存在,跳过处理") + return df else: - df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化", "向量"]) + df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"]) + # 添加新数据(生成唯一ID) new_row = { - "ID": new_id, + "ID": str(new_id), "正文": sentence, "总结": None, "有效": True, - "已向量化": False, - "向量": None, + "已向量化": False } df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) @@ -219,19 +223,9 @@ def ulit_request_file(new_id, sentence, title): summary = result['choices'][0]['message']['content'] df.at[idx, "总结"] = summary - # df.loc[df.index[2], "总结"] = None - # df.loc[df.index[3], "总结"] = None - # df.loc[df.index[4], "总结"] = None - # df.loc[df.index[5], "总结"] = None - - df_ce = df[(df["有效"] == True) & (df["总结"].notnull())] - for idx in df_ce.index: - a = shengcehng_array([df_ce.at[idx, "总结"]]) - df.at[idx, "向量"] = json.dumps(a[0].tolist()) - df.at[idx, "已向量化"] = True - + # 保存更新后的CSV df.to_csv(file_name_res_save, sep="\t", index=False) - + return df def main(question, title, top): db_dict = { @@ -267,28 +261,20 @@ def main(question, title, top): index = faiss.IndexFlatIP(d) # buid the index # 查找向量 - file_name_res_save = f"data_file_res/{title_dan}.csv" - df = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8") - df_ce = df[df["有效"] == True] + vector_path = f"data_np/{title_dan}.npy" + vectors = np.load(vector_path) - print(df_ce.shape) - data_np = [] - for idx in df_ce.index: - data_np.append(json.loads(df.loc[idx, "向量"])) - - vectors = np.array(data_np, dtype=object) - - # data_str = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8").values.tolist() + data_str = pd.read_csv(f"data_file/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() index.add(vectors) D, I = index.search(embs, int(top)) print(I) reference_list = [] for i,j in zip(I[0], D[0]): - reference_list.append([df_ce.loc[df_ce.index[i], "正文"], j]) + reference_list.append([data_str[i], j]) for i,j in enumerate(reference_list): - paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0], j[1]) + paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[1]) ''' @@ -306,60 +292,6 @@ def main(question, title, top): ''' return model_generate_stream(propmt_connect_input) -def classify(): # 调用模型,设置最大batch_size - while True: - if redis_.llen(db_key_query) == 0: # 若队列中没有元素就继续获取 - time.sleep(3) - continue - query = redis_.lpop(db_key_query).decode('UTF-8') # 获取query的text - data_dict = json.loads(query) - if data_dict["state"] == "1": - new_id = data_dict["id"] - sentence = data_dict["sentence"] - title = data_dict["title"] - ulit_request_file(new_id, sentence, title) - - -def add_dan_data(new_id, sentence, title): - file_name_res_save = f"data_file_res/{title}.csv" - - # 
初始化或读取CSV文件 - - df = pd.read_csv(file_name_res_save, sep="\t") - - if sentence in df["正文"].values: - print("正文已存在,跳过处理") - return False - else: - ulit_request_file(new_id, sentence, title) - return True - -def updata_dan_data(new_id, sentence, title): - file_name_res_save = f"data_file_res/{title}.csv" - df = pd.read_csv(file_name_res_save, sep="\t") - - - # 筛选需要处理的记录 - propmt_connect = { - "model": "gpt-4-turbo", - "messages": [{ - "role": "user", - "content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" - }], - "top_p": 0.9, - "temperature": 0.6 - } - result = dialog_line_parse(propmt_connect) - print(result) - summary = result['choices'][0]['message']['content'] - # 更新总结,正文字段 - df.loc[df["ID"] == new_id, "总结"] = summary - df.loc[df["ID"] == new_id, "正文"] = sentence - - a = shengcehng_array([summary]) - df.loc[df["ID"] == new_id, "向量"] = json.dumps(a[0].tolist()) - df.to_csv(file_name_res_save, sep="\t", index=False) - def model_generate_stream(prompt): messages = [ @@ -396,13 +328,13 @@ def model_generate_stream(prompt): yield content -@app.route("/upload_file", methods=["POST"]) -def upload_file(): +@app.route("/upload_file_check", methods=["POST"]) +def upload_file_check(): print(request.remote_addr) - sentence = request.json['sentence'] - title = request.json["title"] - new_id = request.json["id"] - state = request.json["state"] # 1: 批量新增 2:单条新增 3:单挑修改 4: 单条删除 + sentence = request.form.get('sentence') + title = request.form.get("title") + new_id = request.form.get("id") + state = request.form.get("state") ''' { "1": "csv", @@ -413,28 +345,12 @@ def upload_file(): ''' state_res = "" if state == "1": - redis_.rpush(db_key_query, json.dumps({ - "id": new_id, - "sentence": sentence, - "state": state, - "title": title - })) # 加入redis - state_res = "上传完成,正在排队处理数据" + df = ulit_request_file(new_id, sentence, title) + Building_vector_database(title, df) + state_res = "上传完成" elif state == "2": - info_bool = add_dan_data(new_id, sentence, title) - if info_bool == True: - state_res = "上传完成" - else: - state_res = "上传失败,库中有重复数据" - - elif state == "3": - updata_dan_data(new_id, sentence, title) - state_res = "修改完成" - - elif state == "4": delete_data(title, new_id) state_res = "删除完成" - return_json = { "code": 200, "info": state_res @@ -442,30 +358,6 @@ def upload_file(): return jsonify(return_json) # 返回结果 -@app.route("/upload_file_check", methods=["POST"]) -def upload_file_check(): - print(request.remote_addr) - new_id = request.json["id"] - data_list = redis_.lrange(db_key_query, 0, -1) # 0 表示开始,-1 表示结束(全部) - # 解析 JSON 数据 - data_list_id_ = [] - for item in data_list: - data = json.loads(item.decode("utf-8")) # Redis 返回的是 bytes,需要 decode + json.loads - data_list_id_.append(data["id"]) - if new_id in data_list_id_: - return_json = { - "code": 200, - "info": "上传中" - } - return jsonify(return_json) - else: - return_json = { - "code": 200, - "info": "已入库" - } - return jsonify(return_json) - - @app.route("/search", methods=["POST"]) def search(): print(request.remote_addr) @@ -475,8 +367,6 @@ def search(): response = main(texts, title, top) return Response(response, mimetype='text/plain; charset=utf-8') # 返回结果 -t = Thread(target=classify) -t.start() if __name__ == "__main__": app.run(host="0.0.0.0", port=27000, threaded=True, debug=False) diff --git a/main_scokt.py b/main_scokt.py index bf75674..0dbad79 100644 --- a/main_scokt.py +++ b/main_scokt.py @@ -154,6 +154,8 @@ def main(question, title, top): reference_list = [] for i, j in zip(I[0], 
D[0]): + print("i", i) + print("data_str[i]", data_str[i]) reference_list.append([data_str[i], j]) for i, j in enumerate(reference_list): -- 1.8.3.1 From 8d7708f7b045e9c8c06ed0e0c621e20e9e8dd4e4 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Fri, 22 Aug 2025 14:19:20 +0800 Subject: [PATCH 07/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=EF=BC=8C=E5=B9=B6=E6=9B=B4=E6=94=B9=E6=B5=81?= =?UTF-8?q?=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 0b30e43..af2b5c7 100644 --- a/main.py +++ b/main.py @@ -211,6 +211,55 @@ def ulit_request_file(new_id, sentence, title): "content": f"{row['正文']}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" }], "top_p": 0.9, + "temperature": 0.3 + }) + + # 并发处理请求 + with concurrent.futures.ThreadPoolExecutor(200) as executor: + results = list(executor.map(dialog_line_parse, data_list)) + + # 更新总结字段 + for idx, result in zip(to_process.index, results): + summary = result['choices'][0]['message']['content'] + df.at[idx, "总结"] = summary + + # 保存更新后的CSV + df.to_csv(file_name_res_save, sep="\t", index=False) + return df + + +def ulit_request_file_zongjie(new_id, sentence, zongjie, title): + file_name_res_save = f"data_file_res/{title}.csv" + + # 初始化或读取CSV文件 + if os.path.exists(file_name_res_save): + df = pd.read_csv(file_name_res_save, sep="\t") + else: + df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"]) + + # # 添加新数据(生成唯一ID) + # new_row = { + # "ID": str(new_id), + # "正文": sentence, + # "总结": zongjie, + # "有效": True, + # "已向量化": False + # } + # df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) + + # 筛选需要处理的记录 + to_process = df[df["有效"] == True] + + # 调用API生成总结(示例保留原有逻辑) + data_list = [] + for _, row in to_process.iterrows(): + data_list.append({ + "model": "gpt-4-turbo", + "messages": [{ + "role": "user", + "content": f"{row['正文']}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" + }], + "top_p": 0.9, "temperature": 0.6 }) @@ -227,6 +276,7 @@ def ulit_request_file(new_id, sentence, title): df.to_csv(file_name_res_save, sep="\t", index=False) return df + def main(question, title, top): db_dict = { "1": "yetianshi" @@ -264,7 +314,7 @@ def main(question, title, top): vector_path = f"data_np/{title_dan}.npy" vectors = np.load(vector_path) - data_str = pd.read_csv(f"data_file/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() index.add(vectors) D, I = index.search(embs, int(top)) print(I) @@ -274,7 +324,7 @@ def main(question, title, top): reference_list.append([data_str[i], j]) for i,j in enumerate(reference_list): - paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[1]) + paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1]) ''' @@ -334,23 +384,39 @@ def upload_file_check(): sentence = request.form.get('sentence') title = request.form.get("title") new_id = request.form.get("id") + zongjie = request.form.get("zongjie") state = request.form.get("state") ''' { - "1": "csv", + "1": "csv", "2": "xlsx", "3": "txt", "4": "pdf" } ''' + # 增 state_res = "" if state == "1": df = ulit_request_file(new_id, sentence, title) 
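# A minimal sketch of the fan-out ulit_request_file uses above:
# executor.map preserves input order, so the i-th result can be written
# straight back onto the i-th pending DataFrame row. call_llm stands in for
# dialog_line_parse and build_payload for the prompt constructor; the
# 200-worker pool matches the code above, though any small pool works.
import concurrent.futures

def summarize_rows(df, to_process, call_llm, build_payload):
    payloads = [build_payload(row["正文"]) for _, row in to_process.iterrows()]
    with concurrent.futures.ThreadPoolExecutor(max_workers=200) as executor:
        results = list(executor.map(call_llm, payloads))  # order-preserving
    for idx, result in zip(to_process.index, results):
        df.at[idx, "总结"] = result["choices"][0]["message"]["content"]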
Building_vector_database(title, df) state_res = "上传完成" + + # 删 elif state == "2": delete_data(title, new_id) state_res = "删除完成" + + # 改 + elif state == "3": + df = ulit_request_file(new_id, sentence, title) + Building_vector_database(title, df) + state_res = "修改完成" + + # 查 + elif state == "4": + df = ulit_request_file(new_id, sentence, title) + state_res = "" + return_json = { "code": 200, "info": state_res -- 1.8.3.1 From 2f48a11ac8dddc99218466c9cdfb630793121ad9 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Mon, 1 Sep 2025 10:17:49 +0800 Subject: [PATCH 08/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=EF=BC=8C=E5=B9=B6=E6=9B=B4=E6=94=B9=E6=B5=81?= =?UTF-8?q?=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main_scokt.py | 148 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 114 insertions(+), 34 deletions(-) diff --git a/main_scokt.py b/main_scokt.py index 0dbad79..3437755 100644 --- a/main_scokt.py +++ b/main_scokt.py @@ -10,15 +10,15 @@ from sentence_transformers import SentenceTransformer import requests import time from flask import Flask, jsonify, Response, request -from openai import OpenAI from flask_cors import CORS import pandas as pd -import concurrent.futures -import json -from threading import Thread import redis +from openai import OpenAI import asyncio import websockets +import json +import ssl +import pathlib app = Flask(__name__) CORS(app) @@ -144,24 +144,30 @@ def main(question, title, top): index = faiss.IndexFlatIP(d) # buid the index # 查找向量 - vector_path = f"data_np/{title_dan}.npy" - vectors = np.load(vector_path) + # vector_path = f"data_np/{title_dan}.npy" + # vectors = np.load(vector_path) - data_str = pd.read_csv(f"data_file/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + + data_str_valid = [] + for i in data_str: + if i[3] == True: + data_str_valid.append(i) + + data_str_vectors_list = [] + for i in data_str_valid: + data_str_vectors_list.append(eval(i[-1])) + vectors = np.array(data_str_vectors_list) index.add(vectors) D, I = index.search(embs, int(top)) print(I) reference_list = [] - for i, j in zip(I[0], D[0]): - print("i", i) - print("data_str[i]", data_str[i]) - reference_list.append([data_str[i], j]) - - for i, j in enumerate(reference_list): - paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i + 1), j[0][1], j[1]) - + for i,j in zip(I[0], D[0]): + reference_list.append([data_str_valid[i], j]) + for i,j in enumerate(reference_list): + paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1]) ''' 构造prompt ''' @@ -216,28 +222,102 @@ def model_generate_stream(prompt): async def handle_websocket(websocket): print("客户端已连接") try: - while True: - message = await websocket.recv() - data = json.loads(message) - texts = data.get("texts") - title = data.get("title") - top = data.get("top") - print("收到消息:", message) - - response = main(texts, title, top) - # response = message + "111" - for char in response: - await websocket.send(char) - # await asyncio.sleep(0.3) - await websocket.send("[DONE]") + async for message in websocket: + try: + data = json.loads(message) + texts = data.get("texts") + title = data.get("title") + top = data.get("top") + print(f"收到消息: {texts}") + + # 获取响应 + response = main(texts, title, top) + + # 发送响应 + for char in response: + await websocket.send(char) + 
await asyncio.sleep(0.001) # 小延迟避免发送过快 + + # 发送完成标记 + await websocket.send("[DONE]") + print("消息发送完成") + + except json.JSONDecodeError: + await websocket.send('{"error": "Invalid JSON format"}') + except Exception as e: + print(f"处理消息时发生错误: {e}") + await websocket.send('{"error": "Internal server error"}') + except websockets.exceptions.ConnectionClosed: print("客户端断开连接") - + except Exception as e: + print(f"WebSocket处理异常: {e}") + finally: + print("连接处理结束") async def main_api(): - async with websockets.serve(handle_websocket, "0.0.0.0", 27001): - print("WebSocket 服务器已启动,监听端口 27001") - await asyncio.Future() # 永久运行 + try: + ssl_context = None + + # 检查证书文件是否存在 + ssl_cert = "server.crt" + ssl_key = "server.key" + + if os.path.exists(ssl_cert) and os.path.exists(ssl_key): + try: + # 创建SSL上下文 + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + # 加载证书链 + ssl_context.load_cert_chain(ssl_cert, ssl_key) + # 禁用主机名验证(对于自签名证书) + ssl_context.check_hostname = False + ssl_context.verify_mode = ssl.CERT_NONE + print("SSL证书已加载,使用WSS协议") + except Exception as e: + print(f"SSL证书加载失败: {e}") + print("将使用WS协议") + ssl_context = None + else: + print("警告:SSL证书文件未找到,将使用WS协议") + ssl_context = None + + # 创建服务器 + server = await websockets.serve( + handle_websocket, + "0.0.0.0", + 27001, + ssl=ssl_context, + ping_interval=30, # 添加ping间隔 + ping_timeout=30, # 添加ping超时 + close_timeout=30 # 添加关闭超时 + ) + + if ssl_context: + print("WSS服务器已启动: wss://0.0.0.0:27001") + else: + print("WS服务器已启动: ws://0.0.0.0:27001") + + # 保持服务器运行 + await server.wait_closed() + + except Exception as e: + print(f"服务器启动失败: {e}") + import traceback + traceback.print_exc() + if __name__ == "__main__": - asyncio.run(main_api()) # 正确启动事件循环 + # 设置更详细的事件循环调试 + import logging + + logging.basicConfig(level=logging.INFO) + + try: + asyncio.run(main_api()) + except KeyboardInterrupt: + print("服务器被用户中断") + except Exception as e: + print(f"服务器运行错误: {e}") + import traceback + + traceback.print_exc() -- 1.8.3.1 From cfb587a02ecfbd9e6e4510c8907efa0de5c290eb Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Tue, 9 Sep 2025 12:03:13 +0800 Subject: [PATCH 09/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=EF=BC=8C=E5=B9=B6=E6=9B=B4=E6=94=B9=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=EF=BC=8C=E7=BA=BF=E4=B8=8A=E8=B7=91=E9=80=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main_scokt.py | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/main_scokt.py b/main_scokt.py index 3437755..72a99b6 100644 --- a/main_scokt.py +++ b/main_scokt.py @@ -260,26 +260,31 @@ async def main_api(): ssl_context = None # 检查证书文件是否存在 - ssl_cert = "server.crt" - ssl_key = "server.key" - - if os.path.exists(ssl_cert) and os.path.exists(ssl_key): - try: - # 创建SSL上下文 - ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) - # 加载证书链 - ssl_context.load_cert_chain(ssl_cert, ssl_key) - # 禁用主机名验证(对于自签名证书) - ssl_context.check_hostname = False - ssl_context.verify_mode = ssl.CERT_NONE - print("SSL证书已加载,使用WSS协议") - except Exception as e: - print(f"SSL证书加载失败: {e}") - print("将使用WS协议") - ssl_context = None - else: - print("警告:SSL证书文件未找到,将使用WS协议") - ssl_context = None + ssl_cert = "yizherenxin.cn.crt" + ssl_key = "yizherenxin.cn.key" + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ssl_context.load_cert_chain(ssl_cert, ssl_key) + ssl_context.check_hostname = False # 必须禁用主机名验证 + ssl_context.verify_mode = ssl.CERT_NONE # 不验证证书 + + # if 
os.path.exists(ssl_cert) and os.path.exists(ssl_key): + # try: + # # 创建SSL上下文 + # ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + # # 加载证书链 + # ssl_context.load_cert_chain(ssl_cert, ssl_key) + # # 禁用主机名验证(对于自签名证书) + # ssl_context.check_hostname = False + # ssl_context.verify_mode = ssl.CERT_NONE + # print("SSL证书已加载,使用WSS协议") + # except Exception as e: + # print(f"SSL证书加载失败: {e}") + # print("将使用WS协议") + # ssl_context = None + # else: + # print("警告:SSL证书文件未找到,将使用WS协议") + # ssl_context = None # 创建服务器 server = await websockets.serve( @@ -317,7 +322,4 @@ if __name__ == "__main__": except KeyboardInterrupt: print("服务器被用户中断") except Exception as e: - print(f"服务器运行错误: {e}") - import traceback - - traceback.print_exc() + print(f"服务器运行错误: {e}") \ No newline at end of file -- 1.8.3.1 From dab07a99e4b3ab22a4526adf48f1d6ae8512eb42 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Tue, 9 Sep 2025 14:55:04 +0800 Subject: [PATCH 10/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=EF=BC=8C=E5=B9=B6=E6=9B=B4=E6=94=B9=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=EF=BC=8C=E7=BA=BF=E4=B8=8A=E8=B7=91=E9=80=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main_scokt.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/main_scokt.py b/main_scokt.py index 72a99b6..f3756a5 100644 --- a/main_scokt.py +++ b/main_scokt.py @@ -3,6 +3,8 @@ # 按 Shift+F10 执行或将其替换为您的代码。 # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。 import os +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" import faiss import numpy as np from tqdm import tqdm @@ -260,8 +262,12 @@ async def main_api(): ssl_context = None # 检查证书文件是否存在 - ssl_cert = "yizherenxin.cn.crt" - ssl_key = "yizherenxin.cn.key" + + ssl_cert = "yitongtang66.com.crt" + ssl_key = "yitongtang66.com.key" + + # ssl_cert = "yizherenxin.cn.crt" + # ssl_key = "yizherenxin.cn.key" ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) ssl_context.load_cert_chain(ssl_cert, ssl_key) -- 1.8.3.1 From c751f371a79ee36f88508785f81eab65dcd1dcbf Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Mon, 29 Sep 2025 14:16:49 +0800 Subject: [PATCH 11/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95scokt?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=EF=BC=8C=E5=B9=B6=E6=9B=B4=E6=94=B9=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=EF=BC=8C=E7=BA=BF=E4=B8=8A=E8=B7=91=E9=80=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 19 +++++ main.py | 251 +++++++++++++++++++++++++++++++++++----------------------- main_scokt.py | 80 ++++++++++++------- 3 files changed, 226 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index e69de29..ac056fb 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,19 @@ +### 知识库增删改查 +``` +python main.py +``` + +### 知识库流式问答socket +``` +python main_scokt.py +``` + +### 仿deepseek流式问答 socket (非 rag相关内容) +``` +python main_scoket_deepspeek.py +``` + +### 具体接口可以参考接口文档 + 接口文档.docx + https://console-docs.apipost.cn/preview/55b7d541588142d1/f4645422856c695a + https://console-docs.apipost.cn/preview/f03a79d844523711/2f4079d715d28b32 \ No newline at end of file diff --git a/main.py b/main.py index af2b5c7..5afd6d1 100644 --- a/main.py +++ b/main.py @@ -16,8 +16,10 @@ from flask_cors import CORS import pandas as pd import concurrent.futures import json +import torch +import uuid - +# flask配置 app = Flask(__name__) CORS(app) app.config["JSON_AS_ASCII"] = False @@ -30,10 +32,13 @@ client = 
OpenAI( base_url=openai_api_base, ) +# 模型配置 models = client.models.list() model = models.data[0].id # model = "1" model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') + +# 提示配置 propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下: {} 根据这些症状,我通过查找资料,{} @@ -97,12 +102,23 @@ def dialog_line_parse(text): def shengcehng_array(data): + ''' + 模型生成向量 + :param data: + :return: + ''' embs = model_encode.encode(data, normalize_embeddings=True) return embs def Building_vector_database(title, df): + ''' + 次函数暂时弃用 + :param title: + :param df: + :return: + ''' # 加载需要处理的数据(有效且未向量化) to_process = df[(df["有效"] == True) & (df["已向量化"] == False)] @@ -143,23 +159,21 @@ def Building_vector_database(title, df): df.to_csv(f"data_file_res/{title}.csv", sep="\t", index=False) -def delete_data(title, data_id): +def delete_data(title, new_id): + ''' + 假删除,只是标记有效无效 + :param title: + :param new_id: + :return: + ''' + new_id = str(new_id) # 更新CSV标记 csv_path = f"data_file_res/{title}.csv" - df = pd.read_csv(csv_path, sep="\t", dtype={"ID": str}) - df.loc[df["ID"] == data_id, "有效"] = False + df = pd.read_csv(csv_path, sep="\t") + # df.loc[df["ID"] == new_id, "有效"] = False + df.loc[df['ID'] == new_id, "有效"] = False df.to_csv(csv_path, sep="\t", index=False) - - # 更新索引标记 - index_path = f"data_np/{title}_index.json" - if os.path.exists(index_path): - with open(index_path, "r+") as f: - index_data = json.load(f) - if data_id in index_data: - index_data[data_id]["valid"] = False - f.seek(0) - json.dump(index_data, f) - f.truncate() + return "删除完成" def check_file_exists(file_path): @@ -175,106 +189,141 @@ def check_file_exists(file_path): return os.path.isfile(file_path) -def ulit_request_file(new_id, sentence, title): +def ulit_request_file(sentence, title, zongjie): + ''' + 上传文件,生成固定内容,"ID", "正文", "总结", "有效", "向量" + :param sentence: + :param title: + :param zongjie: + :return: + ''' file_name_res_save = f"data_file_res/{title}.csv" - # 初始化或读取CSV文件 + # 初始化或读取CSV文件,如果存在文件,读取文件,并添加行, + # 如果不存在文件,新建DataFrame if os.path.exists(file_name_res_save): df = pd.read_csv(file_name_res_save, sep="\t") # 检查是否已存在相同正文 if sentence in df["正文"].values: - print("正文已存在,跳过处理") - return df + if zongjie == None: + return "正文已存在,跳过处理" + else: + result = df[df['正文'] == sentence] + id_ = result['ID'].values[0] + print(id_) + return ulit_request_file_zongjie(id_, sentence, zongjie, title) else: - df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"]) + df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "向量"]) # 添加新数据(生成唯一ID) - new_row = { - "ID": str(new_id), - "正文": sentence, - "总结": None, - "有效": True, - "已向量化": False - } - df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) - - # 筛选需要处理的记录 - to_process = df[(df["总结"].isna()) & (df["有效"] == True)] + if zongjie == None: + id_ = str(uuid.uuid1()) + new_row = { + "ID": id_, + "正文": sentence, + "总结": None, + "有效": True, + "向量": None + } + df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) - # 调用API生成总结(示例保留原有逻辑) - data_list = [] - for _, row in to_process.iterrows(): - data_list.append({ + # 需要根据不同的项目修改提示,目的是精简内容,为了方便匹配 + data_dan = { "model": "gpt-4-turbo", "messages": [{ "role": "user", - "content": f"{row['正文']}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" + "content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" }], "top_p": 0.9, "temperature": 0.3 - }) + } + results = dialog_line_parse(data_dan) + summary = 
+        summary = results['choices'][0]['message']['content']

-    # Process the requests concurrently
-    with concurrent.futures.ThreadPoolExecutor(200) as executor:
-        results = list(executor.map(dialog_line_parse, data_list))
+        # Embed the generated summary with the vector helper
+        new_vectors = shengcehng_array([summary])
+        df.loc[df['ID'] == id_, '总结'] = summary
+        df.loc[df['ID'] == id_, '向量'] = str(new_vectors[0].tolist())

-    # Update the summary column
-    for idx, result in zip(to_process.index, results):
-        summary = result['choices'][0]['message']['content']
-        df.at[idx, "总结"] = summary
+    else:
+        id_ = str(uuid.uuid1())
+        new_row = {
+            "ID": id_,
+            "正文": sentence,
+            "总结": zongjie,
+            "有效": True,
+            "向量": None
+        }
+        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+        new_vectors = shengcehng_array([zongjie])  # embed the caller-supplied summary
+        df.loc[df['ID'] == id_, '总结'] = zongjie
+        df.loc[df['ID'] == id_, '向量'] = str(new_vectors[0].tolist())

     # Save the updated CSV
     df.to_csv(file_name_res_save, sep="\t", index=False)
-    return df
+    return "上传完成"


 def ulit_request_file_zongjie(new_id, sentence, zongjie, title):
+    new_id = str(new_id)
+    print(new_id)
+    print(type(new_id))
     file_name_res_save = f"data_file_res/{title}.csv"

     # Initialize or read the CSV file
-    if os.path.exists(file_name_res_save):
-        df = pd.read_csv(file_name_res_save, sep="\t")
+
+    df = pd.read_csv(file_name_res_save, sep="\t")
+    df.loc[df['ID'] == new_id, '正文'] = sentence
+    if zongjie is None:
+        pass
     else:
-        df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"])
-
-    # # Add the new record (generate a unique ID)
-    # new_row = {
-    #     "ID": str(new_id),
-    #     "正文": sentence,
-    #     "总结": zongjie,
-    #     "有效": True,
-    #     "已向量化": False
-    # }
-    # df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+        df.loc[df['ID'] == new_id, '总结'] = zongjie
+        new_vectors = shengcehng_array([zongjie])  # embed the new summary
+        df.loc[df['ID'] == new_id, '向量'] = str(new_vectors[0].tolist())

-    # Select the records that still need processing
-    to_process = df[df["有效"] == True]
+    # Save the updated CSV
+    df.to_csv(file_name_res_save, sep="\t", index=False)
+    return "修改完成"

-    # Call the API to generate summaries (keeps the original logic)
-    data_list = []
-    for _, row in to_process.iterrows():
-        data_list.append({
-            "model": "gpt-4-turbo",
-            "messages": [{
-                "role": "user",
-                "content": f"{row['正文']}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇"
-            }],
-            "top_p": 0.9,
-            "temperature": 0.6
-        })

-    # Process the requests concurrently
-    with concurrent.futures.ThreadPoolExecutor(200) as executor:
-        results = list(executor.map(dialog_line_parse, data_list))
+def ulit_request_file_check(title):
+    file_name_res_save = f"data_file_res/{title}.csv"

-    # Update the summary column
-    for idx, result in zip(to_process.index, results):
-        summary = result['choices'][0]['message']['content']
-        df.at[idx, "总结"] = summary
+    # Initialize or read the CSV file
+    if os.path.exists(file_name_res_save):
+        df = pd.read_csv(file_name_res_save, sep="\t").values.tolist()
+        data_new = []
+        for i in df:
+            if i[3] == True:
+                data_new.append([i[0], i[1], i[2]])
+        return data_new
+    else:
+        return "无可展示文件"

-    # Save the updated CSV
-    df.to_csv(file_name_res_save, sep="\t", index=False)
-    return df
+
+def ulit_request_file_check_dan(new_id, title):
+    new_id = str(new_id)
+    file_name_res_save = f"data_file_res/{title}.csv"
+
+    # Initialize or read the CSV file
+    if os.path.exists(file_name_res_save):
+        df = pd.read_csv(file_name_res_save, sep="\t")
+        zhengwen = df.loc[df['ID'] == new_id, '正文'].values
+        zongjie = df.loc[df['ID'] == new_id, '总结'].values
+        # Return the result
+        if len(zhengwen) > 0:
+            if df.loc[df['ID'] == new_id, '有效'].values == True:
+                return [new_id, zhengwen[0], zongjie[0]]
+            else:
+                return "未找到对应的ID"
+        else:
+            return "未找到对应的ID"
+    else:
return "无可展示文件" def main(question, title, top): @@ -311,22 +360,30 @@ def main(question, title, top): index = faiss.IndexFlatIP(d) # buid the index # 查找向量 - vector_path = f"data_np/{title_dan}.npy" - vectors = np.load(vector_path) + # vector_path = f"data_np/{title_dan}.npy" + # vectors = np.load(vector_path) data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() + + data_str_valid = [] + for i in data_str: + if i[3] == True: + data_str_valid.append(i) + + data_str_vectors_list = [] + for i in data_str_valid: + data_str_vectors_list.append(eval(i[-1])) + vectors = np.array(data_str_vectors_list) index.add(vectors) D, I = index.search(embs, int(top)) print(I) reference_list = [] for i,j in zip(I[0], D[0]): - reference_list.append([data_str[i], j]) + reference_list.append([data_str_valid[i], j]) for i,j in enumerate(reference_list): paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1]) - - ''' 构造prompt ''' @@ -374,7 +431,7 @@ def model_generate_stream(prompt): print("\ncontent:", end="", flush=True) # Extract and print the content # print(content, end="", flush=True) - print(content) + print(content, end="") yield content @@ -397,24 +454,24 @@ def upload_file_check(): # 增 state_res = "" if state == "1": - df = ulit_request_file(new_id, sentence, title) - Building_vector_database(title, df) - state_res = "上传完成" + state_res = ulit_request_file(sentence, title, zongjie) + # 删 elif state == "2": - delete_data(title, new_id) - state_res = "删除完成" + state_res = delete_data(title, new_id) # 改 elif state == "3": - df = ulit_request_file(new_id, sentence, title) - Building_vector_database(title, df) - state_res = "修改完成" + state_res = ulit_request_file_zongjie(new_id, sentence, zongjie,title) # 查 elif state == "4": - df = ulit_request_file(new_id, sentence, title) + state_res = ulit_request_file_check(title) + + # 通过uuid查单条数据 + elif state == "5": + ulit_request_file_check_dan(new_id, title) state_res = "" return_json = { diff --git a/main_scokt.py b/main_scokt.py index f3756a5..1ac0300 100644 --- a/main_scokt.py +++ b/main_scokt.py @@ -44,7 +44,7 @@ client = OpenAI( models = client.models.list() model = models.data[0].id # model = "1" -model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5') +model_encode = SentenceTransformer('/home/zhangbaoxun/project/models-llm/bge-large-zh-v1.5') propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下: {} 根据这些症状,我通过查找资料,{} @@ -113,6 +113,13 @@ def shengcehng_array(data): def main(question, title, top): + ''' + 主函数,用来匹配句子放到prompt中,生成回答 + :param question: + :param title: + :param top: + :return: + ''' db_dict = { "1": "yetianshi" } @@ -149,23 +156,36 @@ def main(question, title, top): # vector_path = f"data_np/{title_dan}.npy" # vectors = np.load(vector_path) + # 读取向量文件 csv文件结构: + # ID + # 正文 + # 总结 + # 有效 + # 向量 data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() data_str_valid = [] for i in data_str: + # i[3] == True 说明数据没有被删除,如果是false说明被删除 if i[3] == True: data_str_valid.append(i) + # 把有效数据的向量汇总出来 data_str_vectors_list = [] for i in data_str_valid: data_str_vectors_list.append(eval(i[-1])) + + # 拼接成向量矩阵 vectors = np.array(data_str_vectors_list) index.add(vectors) + + # 使用faiss找到最相似向量 D, I = index.search(embs, int(top)) print(I) reference_list = [] for i,j in zip(I[0], D[0]): + # 添加 csv对应的数据 data_str_valid[i]表示 csv中一行的所有数据 ID 正文 总结 有效 向量 以及 j表示相关度是多少 reference_list.append([data_str_valid[i], j]) for i,j in 
@@ -178,10 +198,12 @@ def main(question, title, top):
     for i in db_type_list:
         propmt_connect_ziliao_input.append(propmt_connect_ziliao.format(i, paper_list_str))

+    # Build the final input question, surfacing everything gathered above
     propmt_connect_ziliao_input_str = ",".join(propmt_connect_ziliao_input)
     propmt_connect_input = propmt_connect.format(question, propmt_connect_ziliao_input_str)
+
     '''
-    Generate the answer
+    Generate the answer; model_generate_stream can be pointed at whichever model is needed
     '''
     return model_generate_stream(propmt_connect_input)
@@ -259,38 +281,40 @@ async def main_api():
     try:
+        # SSL context; stays None when serving plain WS
         ssl_context = None
-
+        # WSS toggle: True turns the WSS service on
+        wss_bool = False
         # Check that the certificate files exist
-        ssl_cert = "yitongtang66.com.crt"
+        ssl_cert = "yitongtang66.com_ca_chains.crt"
         ssl_key = "yitongtang66.com.key"

         # ssl_cert = "yizherenxin.cn.crt"
         # ssl_key = "yizherenxin.cn.key"

-        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
-        ssl_context.load_cert_chain(ssl_cert, ssl_key)
-        ssl_context.check_hostname = False  # hostname verification must be disabled
-        ssl_context.verify_mode = ssl.CERT_NONE  # do not verify the certificate
-
+        # ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+        # ssl_context.load_cert_chain(ssl_cert, ssl_key)
+        # ssl_context.check_hostname = False  # hostname verification must be disabled
+        # ssl_context.verify_mode = ssl.CERT_NONE  # do not verify the certificate
+
+        if wss_bool == True:
+            try:
+                # Create the SSL context
+                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+                # Load the certificate chain
+                ssl_context.load_cert_chain(ssl_cert, ssl_key)
+                # Disable hostname verification (for self-signed certificates)
+                ssl_context.check_hostname = False
+                ssl_context.verify_mode = ssl.CERT_NONE
+                print("SSL证书已加载,使用WSS协议")
+            except Exception as e:
+                print(f"SSL证书加载失败: {e}")
+                print("将使用WS协议")
+                ssl_context = None
+        else:
+            print("警告:WSS未启用,将使用WS协议")
+            ssl_context = None

         # Create the server
         server = await websockets.serve(
@@ -303,6 +327,7 @@ async def main_api():
             close_timeout=30  # add a close timeout
         )

+        # Announce the listener on port 27001
         if ssl_context:
             print("WSS服务器已启动: wss://0.0.0.0:27001")
         else:
@@ -323,9 +348,10 @@ if __name__ == "__main__":

     logging.basicConfig(level=logging.INFO)

+    # Start the service
     try:
         asyncio.run(main_api())
     except KeyboardInterrupt:
         print("服务器被用户中断")
     except Exception as e:
-        print(f"服务器运行错误: {e}")
\ No newline at end of file
+        print(f"服务器运行错误: {e}")
-- 
1.8.3.1
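The patch above retires the `.npy` sidecar files: each embedding is now serialized into the CSV's 向量 column with `str(vector.tolist())`, rehydrated with `eval()`, filtered by the 有效 soft-delete flag, and searched with `faiss.IndexFlatIP`. A self-contained sketch of that flow follows; the toy 4-dim vectors stand in for the 1024-dim bge embeddings, and `json.dumps`/`json.loads` is swapped in for the `str()`/`eval()` round trip as a safer equivalent, not what main.py currently does:

```python
import json

import faiss
import numpy as np
import pandas as pd

d = 4  # toy dimension; the repo's bge-large-zh-v1.5 embeddings are 1024-dim
rng = np.random.default_rng(0)

# Mirror the CSV schema introduced by the patch: ID / 正文 / 总结 / 有效 / 向量
df = pd.DataFrame({
    "ID": ["a", "b", "c"],
    "正文": ["原文A", "原文B", "原文C"],
    "总结": ["摘要A", "摘要B", "摘要C"],
    "有效": [True, False, True],  # soft-delete flag; False rows are skipped
    "向量": [json.dumps(rng.random(d).tolist()) for _ in range(3)],
})

# Keep only live rows, then rehydrate their vectors into a float32 matrix
valid = df[df["有效"] == True]
vectors = np.array([json.loads(v) for v in valid["向量"]], dtype="float32")
faiss.normalize_L2(vectors)  # normalize so inner product == cosine similarity

index = faiss.IndexFlatIP(d)
index.add(vectors)

query = vectors[:1].copy()     # query with the first live row's own vector
D, I = index.search(query, 2)  # top-2 neighbours
for i, score in zip(I[0], D[0]):
    print(valid.iloc[i]["总结"], score)
```

The detail the patch relies on is that FAISS returns positions within the filtered valid subset, not CSV row numbers, which is exactly why `main()` indexes into `data_str_valid` rather than `data_str`.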
From c07cc61c8bd8414414451bf48e967755db2f6433 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com"
Date: Tue, 30 Sep 2025 16:24:18 +0800
Subject: [PATCH 12/13] Add a socket test request, rework the flow; verified
 end-to-end in production
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ac056fb..d9a9f88 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ python main_scokt.py
 python main_scoket_deepspeek.py
 ```

-### See the API docs for endpoint details
+### See the API docs for endpoint details1
     接口文档.docx
     https://console-docs.apipost.cn/preview/55b7d541588142d1/f4645422856c695a
     https://console-docs.apipost.cn/preview/f03a79d844523711/2f4079d715d28b32
\ No newline at end of file
-- 
1.8.3.1

From 634fd049d5af6ff8133f520b7cdef244501137b6 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com"
Date: Tue, 30 Sep 2025 16:26:55 +0800
Subject: [PATCH 13/13] Add a socket test request, rework the flow; verified
 end-to-end in production
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d9a9f88..1e2c494 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ python main_scokt.py
 python main_scoket_deepspeek.py
 ```

-### See the API docs for endpoint details1
+### See the API docs for endpoint details11
     接口文档.docx
     https://console-docs.apipost.cn/preview/55b7d541588142d1/f4645422856c695a
     https://console-docs.apipost.cn/preview/f03a79d844523711/2f4079d715d28b32
\ No newline at end of file
-- 
1.8.3.1
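For reference, a hypothetical client for the five `state` codes wired into `upload_file_check` by PATCH 11. The field names (`state`, `title`, `sentence`, `new_id`, `zongjie`) are read straight from the handler, but the route path and port are assumptions to verify against the Flask app and the linked API docs:

```python
import requests

# Assumed endpoint; check the Flask route decorator and port in main.py before use.
URL = "http://127.0.0.1:27000/upload_file_check"

def call(state, **fields):
    """POST one CRUD request; `state` selects the branch in upload_file_check."""
    return requests.post(URL, json={"state": state, **fields}, timeout=30).json()

call("1", title="demo", sentence="患者头痛发热", zongjie=None)                 # create (summary auto-generated)
call("3", title="demo", new_id="<uuid>", sentence="患者头痛", zongjie="头痛")  # update an existing row
call("2", title="demo", new_id="<uuid>")                                       # soft delete (flags 有效 = False)
call("4", title="demo")                                                        # list all valid rows
call("5", title="demo", new_id="<uuid>")                                       # fetch one row by UUID
```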