From 9938da6680dd50cd5febe98c47b09ff7b01a9dc7 Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Thu, 3 Apr 2025 16:43:26 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 58 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index 46cee45..306b1bf 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,8 @@ import requests import time from flask import Flask, jsonify from flask import request +import pandas as pd + app = Flask(__name__) app.config["JSON_AS_ASCII"] = False @@ -18,7 +20,7 @@ model = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5 propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下: {} 根据这些症状,我通过查找资料,{} -请根据上面的这些资料和方子,根据患者的症状帮我开出正确的药方和治疗方案''' +请根据上面的这些资料和方子,并根据每篇文章的转发数确定文章的重要程度,转发数越高的文章,最终答案的参考度越高,反之越低。根据患者的症状和上面的文章的资料的重要程度以及文章和症状的匹配程度,帮我开出正确的药方和治疗方案''' propmt_connect_ziliao = '''在“{}”资料中,有如下相关内容: {}''' @@ -55,10 +57,10 @@ def shengcehng_array(data): embs = model.encode(data, normalize_embeddings=True) return embs -def Building_vector_database(type, name, data): +def Building_vector_database(type, name, df): data_ndarray = np.empty((0, 1024)) - for sen in data: - data_ndarray = np.concatenate((data_ndarray, shengcehng_array([sen]))) + for sen in df: + data_ndarray = np.concatenate((data_ndarray, shengcehng_array([sen[0]]))) print("data_ndarray.shape", data_ndarray.shape) print("data_ndarray.shape", data_ndarray.shape) @@ -68,21 +70,22 @@ def Building_vector_database(type, name, data): def ulit_request_file(file, title): file_name = file.filename - file_name_save = "data_file/{}.txt".format(title) + file_name_save = "data_file/{}.csv".format(title) file.save(file_name_save) - try: - with open(file_name_save, encoding="gbk") as f: - content = f.read() - except: - with open(file_name_save, encoding="utf-8") as f: - content = f.read() + # try: + # with open(file_name_save, encoding="gbk") as f: + # content = f.read() + # except: + # with open(file_name_save, encoding="utf-8") as f: + # content = f.read() # elif file_name.split(".")[-1] == "docx": # content = docx2txt.process(file_name_save) - content_list = [i for i in content.split("\n")] + # content_list = [i for i in content.split("\n")] + df = pd.read_csv(file_name_save, sep="\t", encoding="utf-8").values.tolist() - return content_list + return df def main(question, db_type, top): @@ -116,19 +119,20 @@ def main(question, db_type, top): paper_list_str = "" for i in db_type_list: embs = shengcehng_array([question]) - index = faiss.IndexFlatL2(d) # buid the index + index = faiss.IndexFlatIP(d) # buid the index data_np = np.load(f"data_np/{i}.npy") - data_str = open(f"data_file/{i}.txt").read().split("\n") + # data_str = open(f"data_file/{i}.txt").read().split("\n") + data_str = pd.read_csv(f"data_file/{i}.csv", sep="\t", encoding="utf-8").values.tolist() index.add(data_np) D, I = index.search(embs, int(top)) print(I) reference_list = [] - for i in I[0]: - reference_list.append(data_str[i]) + for i,j in zip(I[0], D[0]): + reference_list.append([data_str[i], j]) for i,j in enumerate(reference_list): - paper_list_str += "第{}篇\n{}\n".format(str(i+1), j) + paper_list_str += "第{}篇\n{},此篇文章的转发数为{},评论数为{},点赞数为{}\n,此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[0][1], j[0][2], j[0][3], j[1]) ''' 构造prompt @@ -179,8 +183,22 @@ def upload_file(): print(request.remote_addr) file = request.files.get('file') title = request.form.get("title") - data = ulit_request_file(file, title) - Building_vector_database("1", title, data) + df = ulit_request_file(file, title) + Building_vector_database("1", title, df) + return_json = { + "code": 200, + "info": "上传完成" + } + return jsonify(return_json) # 返回结果 + + +@app.route("/upload_file_check", methods=["POST"]) +def upload_file_check(): + print(request.remote_addr) + file = request.files.get('file') + title = request.form.get("title") + df = ulit_request_file(file, title) + Building_vector_database("1", title, df) return_json = { "code": 200, "info": "上传完成"