From 23ed82e6fa72b2500da9ea8e55b6716000306109 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com" <majiahui@haimaqingfan.com>
Date: Fri, 14 Jun 2024 18:30:16 +0800
Subject: [PATCH] =?UTF-8?q?=E6=AE=B5=E8=90=BD=E6=A3=80=E6=B5=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 chatgpt_detector_model_predict.py | 50 +++++++++++++--------------------------
 flask_api.py                      |  2 +-
 2 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/chatgpt_detector_model_predict.py b/chatgpt_detector_model_predict.py
index dbe9789..0242104 100644
--- a/chatgpt_detector_model_predict.py
+++ b/chatgpt_detector_model_predict.py
@@ -29,10 +29,6 @@ db_key_query = 'query'
 db_key_querying = 'querying'
 db_key_queryset = 'queryset'
 batch_size = 32
-# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# tokenizer = AutoTokenizer.from_pretrained("chatgpt-detector-roberta-chinese")
-# model = AutoModelForSequenceClassification.from_pretrained("chatgpt-detector-roberta-chinese").cuda()
-# model = AutoModelForSequenceClassification.from_pretrained("chatgpt-detector-roberta-chinese").cpu()
 
 model_name = "AIGC_detector_zhv2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -84,42 +80,28 @@ def main(content_list: list):
     sim_word = 0
     sim_word_5_9 = 0
     total_words = 0
-    print(content_list)
     total_paragraph = len(content_list)
-
-    for i in range(0, len(content_list), 3):
-        if i + 2 <= len(content_list)-1:
-            sen_nums = 3
-            content_str = "。".join([content_list[i], content_list[i+1], content_list[i+2]])
-        elif i + 1 <= len(content_list)-1:
-            sen_nums = 2
-            content_str = "。".join([content_list[i], content_list[i + 1]])
-        else:
-            sen_nums = 1
-            content_str = content_list[i]
-        total_words += len(content_str)
-        res = model_preidct(content_str)
+    for i in range(len(content_list)):
+        total_words += len(content_list[i])
+        res = model_preidct(content_list[i])
         # return_list = {
         #     "humen": output[0][0],
         #     "robot": output[0][1]
         # }
         if res["robot"] > 0.9:
-            for ci in range(sen_nums):
-                gpt_score_list.append(res["robot"])
-                sim_word += len(content_list[i + ci])
-                gpt_content.append(
-                    "".format(str(i + ci)) + content_list[i + ci] + "。\n" + "")
+            gpt_score_list.append(res["robot"])
+            sim_word += len(content_list[i])
+            gpt_content.append(
+                "".format(str(i)) + content_list[i] + "。\n" + "")
         elif 0.9 > res["robot"] > 0.5:
-            for ci in range(sen_nums):
-                gpt_score_list.append(res["robot"])
-                sim_word_5_9 += len(content_list[i + ci])
-                gpt_content.append(
-                    "".format(str(i + ci)) + content_list[i + ci] + "。\n" + "")
+            gpt_score_list.append(res["robot"])
+            sim_word_5_9 += len(content_list[i])
+            gpt_content.append(
+                "".format(str(i)) + content_list[i] + "。\n" + "")
         else:
-            for ci in range(sen_nums):
-                gpt_score_list.append(0)
-                gpt_content.append(content_list[i + ci] + "。\n")
+            gpt_score_list.append(0)
+            gpt_content.append(content_list[i] + "。\n")
 
     return_list["gpt_content"] = "".join(gpt_content)
     return_list["gpt_score_list"] = str(gpt_score_list)
@@ -132,6 +114,7 @@ def main(content_list: list):
 
 def classify():  # 调用模型,设置最大batch_size
     while True:
+        try:
         if redis_.llen(db_key_query) == 0:  # 若队列中没有元素就继续获取
             time.sleep(3)
             continue
@@ -189,9 +172,10 @@ def classify():  # 调用模型,设置最大batch_size
             json.dump(return_text, f2, ensure_ascii=False, indent=4)
         redis_.set(queue_uuid, load_result_path, 86400)
         redis_.srem(db_key_querying, queue_uuid)
-
+        except:
+            continue
 
 if __name__ == '__main__':
     t = Thread(target=classify)
-    t.start()
\ No newline at end of file
+    t.start()
diff --git a/flask_api.py b/flask_api.py
index 7656f63..3c85041 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -45,7 +45,7 @@ def ulit_request_file(file):
     # elif file_name.split(".")[-1] == "docx":
     #     content = docx2txt.process(file_name_save)
 
-    content_list = [i.strip("\n") for i in content.split("。")]
+    content_list = [i for i in content.split("\n")]
     print(content_list)
     return content_list
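
For reference, below is a minimal sketch (not part of the patch) of the paragraph-level flow the change moves to: ulit_request_file now splits the uploaded text on newlines, and main() scores each paragraph on its own instead of joining sentences in groups of three. The predict_fn parameter here is a stand-in for the module's model_preidct and is only assumed to return a dict with a "robot" probability, as in the patched code.

from typing import Callable, Dict, List


def split_paragraphs(content: str) -> List[str]:
    # Mirrors the patched ulit_request_file: one list element per newline-delimited paragraph.
    return [i for i in content.split("\n")]


def score_paragraphs(content_list: List[str],
                     predict_fn: Callable[[str], Dict[str, float]]) -> List[float]:
    # Mirrors the patched main(): each paragraph is scored individually with the
    # same 0.9 / 0.5 thresholds; paragraphs scoring 0.5 or below are recorded as 0.
    gpt_score_list = []
    for paragraph in content_list:
        res = predict_fn(paragraph)
        if res["robot"] > 0.9:
            gpt_score_list.append(res["robot"])
        elif 0.9 > res["robot"] > 0.5:
            gpt_score_list.append(res["robot"])
        else:
            gpt_score_list.append(0)
    return gpt_score_list


if __name__ == "__main__":
    # Stub classifier for illustration; the service itself loads the AIGC_detector_zhv2 model.
    stub = lambda text: {"humen": 0.2, "robot": 0.8}
    print(score_paragraphs(split_paragraphs("第一段\n第二段"), stub))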