From 7650bf7f3eec5d2ff59aea7d7aa66ab0bcfa670b Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Thu, 1 Aug 2024 15:33:22 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=A3=80=E6=B5=8B=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=EF=BC=8C=E5=A2=9E=E5=8A=A0=E8=BE=B9=E7=95=8C=E6=9D=A1?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chatgpt_detector_model_predict.py | 31 ++++++++++---- flask_api.py | 59 +++++++++++++++++++++++++- flask_chatgpt-detector_predict_redis_search.py | 2 +- 3 files changed, 82 insertions(+), 10 deletions(-) diff --git a/chatgpt_detector_model_predict.py b/chatgpt_detector_model_predict.py index c3a3f05..8b69dda 100644 --- a/chatgpt_detector_model_predict.py +++ b/chatgpt_detector_model_predict.py @@ -22,14 +22,17 @@ import json import docx2txt -pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=13, password="zhicheng123*") +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=12, password="zhicheng123*") redis_ = redis.Redis(connection_pool=pool, decode_responses=True) db_key_query = 'query' db_key_querying = 'querying' db_key_queryset = 'queryset' batch_size = 32 +# model_name = "AIGC_detector_zhv2" model_name = "drop_aigc_model_2" +# model_name = "drop_aigc_model_3" + tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSequenceClassification.from_pretrained(model_name).cpu() @@ -56,10 +59,22 @@ def model_preidct(text): output = torch.sigmoid(output[0]).tolist() print(output) - return_list = { - "humen": output[0][0], - "robot": output[0][1] - } + if model_name == "drop_aigc_model_2": + return_list = { + "humen": output[0][1], + "robot": output[0][0] + } + elif model_name == "AIGC_detector_zhv2": + return_list = { + "humen": output[0][0], + "robot": output[0][1] + } + else: + return_list = { + "humen": output[0][0], + "robot": output[0][1] + } + return return_list @@ -93,12 +108,12 @@ def main(content_list: list): gpt_score_list.append(res["robot"]) sim_word += len(content_list[i]) gpt_content.append( - "".format(str(i)) + content_list[i] + "。\n" + "") - elif 0.9 > res["robot"] > 0.5: + "".format(str(i)) + content_list[i] + "\n" + "") + elif 0.9 >= res["robot"] > 0.5: gpt_score_list.append(res["robot"]) sim_word_5_9 += len(content_list[i]) gpt_content.append( - "".format(str(i)) + content_list[i] + "。\n" + "") + "".format(str(i)) + content_list[i] + "\n" + "") else: gpt_score_list.append(0) gpt_content.append(content_list[i] + "\n") diff --git a/flask_api.py b/flask_api.py index 3c85041..b40613b 100644 --- a/flask_api.py +++ b/flask_api.py @@ -20,15 +20,64 @@ import uuid import time import json import docx2txt +import re -pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=13, password="zhicheng123*") +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=12, password="zhicheng123*") redis_ = redis.Redis(connection_pool=pool, decode_responses=True) db_key_query = 'query' db_key_querying = 'querying' db_key_queryset = 'queryset' batch_size = 32 +RE_DIALOG = re.compile(r"\".*?\"|\'.*?\'|“.*?”") + +def get_dialogs_index(line: str): + """ + 获取对话及其索引 + :param line 文本 + :return dialogs 对话内容 + dialogs_index: 对话位置索引 + other_index: 其他内容位置索引 + """ + dialogs = re.finditer(RE_DIALOG, line) + dialogs_text = re.findall(RE_DIALOG, line) + dialogs_index = [] + for dialog in dialogs: + all_ = [i for i in range(dialog.start(), dialog.end())] + dialogs_index.extend(all_) + other_index = [i for i in range(len(line)) if i not in dialogs_index] + + return dialogs_text, dialogs_index, other_index + + +def chulichangju_1(text, chulipangban_return_list): + fuhao = ["。"] + dialogs_text, dialogs_index, other_index = get_dialogs_index(text) + text_1 = text[:500] + text_2 = text[500:] + text_1_new = "" + if text_2 == "": + chulipangban_return_list.append(text_1) + return chulipangban_return_list + for i in range(len(text_1) - 1, -1, -1): + if text_1[i] in fuhao: + if i in dialogs_index: + continue + text_1_new = text_1[:i] + text_1_new += text_1[i] + chulipangban_return_list.append(text_1_new) + if text_2 != "": + if i + 1 != 500: + text_2 = text_1[i + 1:] + text_2 + break + # else: + # chulipangban_return_list.append(text_1) + if text_1_new == "": + chulipangban_return_list.append(text_1) + if text_2 != "": + chulipangban_return_list = chulichangju_1(text_2, chulipangban_return_list) + return chulipangban_return_list def ulit_request_file(file): file_name = file.filename @@ -47,6 +96,14 @@ def ulit_request_file(file): content_list = [i for i in content.split("\n")] print(content_list) + + content_list_new = [] + for sen in content_list: + if len(sen) < 500: + content_list_new.append(sen) + else: + content_list_new.extend(chulichangju_1(sen, [])) + return content_list diff --git a/flask_chatgpt-detector_predict_redis_search.py b/flask_chatgpt-detector_predict_redis_search.py index cd38788..76f6668 100644 --- a/flask_chatgpt-detector_predict_redis_search.py +++ b/flask_chatgpt-detector_predict_redis_search.py @@ -28,7 +28,7 @@ from threading import Thread import time app = flask.Flask(__name__) -pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=13, password="zhicheng123*") +pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=12, password="zhicheng123*") redis_ = redis.Redis(connection_pool=pool, decode_responses=True)