diff --git a/chatgpt_detector_model_predict.py b/chatgpt_detector_model_predict.py
index 533e686..1f8038e 100644
--- a/chatgpt_detector_model_predict.py
+++ b/chatgpt_detector_model_predict.py
@@ -34,13 +34,60 @@ batch_size = 32
 # model_name = "drop_aigc_model_2"
 # model_name = "drop_aigc_model_3"
 # model_name = "/home/majiahui/project/models-llm/aigc_check_10"
-model_name = "/home/majiahui/project/models-llm/weipu_aigc_512_3"
-
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name).cpu()
-
-def model_preidct(text):
+model_name_sentence = "/home/majiahui/project/models-llm/weipu_aigc_512_11"
+model_name_short = "/home/majiahui/project/models-llm/weipu_aigc_512_5"
+pantten_biaoti_0 = '^[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_1 = '^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_2 = '^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_3 = '^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_4 = '(摘要)'
+pantten_biaoti_5 = '(致谢)'
+pantten_ref_1 = r'^\[\d+\]'
+
+tokenizer = AutoTokenizer.from_pretrained(model_name_sentence)
+model_sentence = AutoModelForSequenceClassification.from_pretrained(model_name_sentence).cpu()
+model_short = AutoModelForSequenceClassification.from_pretrained(model_name_short).cpu()
+
+def has_chinese(text):
+    """Check whether the string contains any Chinese characters."""
+    for char in text:
+        if '\u4e00' <= char <= '\u9fff':
+            return True
+    return False
+
+
+def ulit_data(sentence:str):
+    if_change = True
+    max_length = 25
+    sentence = sentence.strip(" ")
+    result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
+    result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
+    result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
+    result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
+    result_biaoti_list_4 = re.findall(pantten_biaoti_4, sentence)
+    result_biaoti_list_5 = re.findall(pantten_biaoti_5, sentence)
+    result_ref_list_1 = re.findall(pantten_ref_1, sentence)
+
+    if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []:
+        if len(sentence) < max_length:
+            if_change = False
+
+    if list(set(result_biaoti_list_4 + result_biaoti_list_5)) != []:
+        if_change = False
+
+    if result_ref_list_1 != []:
+        if_change = False
+
+    if "。" not in sentence:
+        if_change = False
+
+    if has_chinese(sentence) == False:
+        if_change = False
+
+    return if_change
+
+
+def model_preidct(model, text):
     tokenized_text = tokenizer.encode_plus(text, max_length=512,
                                            add_special_tokens=True, truncation=True,
                                            return_offsets_mapping=True)
@@ -62,12 +109,12 @@ def model_preidct(text):
     output = torch.sigmoid(output[0]).tolist()
     print(output)
-    if model_name == "drop_aigc_model_2":
+    if model_name_sentence == "drop_aigc_model_2":
         return_list = {
             "humen": output[0][1],
             "robot": output[0][0]
         }
-    elif model_name == "AIGC_detector_zhv2":
+    elif model_name_sentence == "AIGC_detector_zhv2":
         return_list = {
             "humen": output[0][0],
             "robot": output[0][1]
         }
@@ -116,40 +163,46 @@ def main(content_list: list):
     total_paragraph = len(content_list)
 
     for i in range(len(content_list)):
-        total_words += len(content_list[i])
-        res = model_preidct(content_list[i])
+        sentence = content_list[i]
+        total_words += len(sentence)
+
+        bool_ = ulit_data(sentence)
+        if bool_ == True:
+            res = model_preidct(model_sentence, sentence)
+        else:
+            res = model_preidct(model_short, sentence)
 
         # return_list = {
         #     "humen": output[0][0],
         #     "robot": output[0][1]
         # }
-        reference_bool = is_reference_sentence(content_list[i])
+        reference_bool = is_reference_sentence(sentence)
 
         if reference_bool == False:
             if res["robot"] > 0.9:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(res["robot"])
                 gpt_score_sentence_list.append(res["robot"])
-                sim_word += len(content_list[i])
+                sim_word += len(sentence)
                 gpt_content.append(
-                    "".format(str(i)) + content_list[i] + "\n" + "")
+                    "".format(str(i)) + sentence + "\n" + "")
             elif 0.9 >= res["robot"] > 0.5:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(res["robot"])
                 gpt_score_sentence_list.append(res["robot"])
-                sim_word_5_9 += len(content_list[i])
+                sim_word_5_9 += len(sentence)
                 gpt_content.append(
-                    "".format(str(i)) + content_list[i] + "\n" + "")
+                    "".format(str(i)) + sentence + "\n" + "")
             else:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(0)
                 gpt_score_sentence_list.append(0)
-                gpt_content.append(content_list[i] + "\n")
+                gpt_content.append(sentence + "\n")
         else:
-            for _ in range(len(content_list[i])):
+            for _ in range(len(sentence)):
                 gpt_score_list.append(0)
             gpt_score_sentence_list.append(0)
-            gpt_content.append(content_list[i] + "\n")
+            gpt_content.append(sentence + "\n")
 
     return_list["gpt_content"] = "".join(gpt_content)
     return_list["gpt_score_sentence_list"] = str(gpt_score_sentence_list)
diff --git a/flask_api.py b/flask_api.py
index b40613b..2a18e11 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -21,6 +21,7 @@ import time
 import json
 import docx2txt
 import re
+from datetime import datetime
 
 
 pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=12, password="zhicheng123*")
@@ -94,7 +95,7 @@ def ulit_request_file(file):
     # elif file_name.split(".")[-1] == "docx":
     #     content = docx2txt.process(file_name_save)
 
-    content_list = [i for i in content.split("\n")]
+    content_list = [i for i in content.split("\n") if i!= ""]
     print(content_list)
 
     content_list_new = []
@@ -145,7 +146,13 @@ def handle_query_predict():
     # 绑定文本和query id
     # recall_10(id_, title, abst_zh, content)
-    load_request_path = './request_data_logs/{}.json'.format(id_)
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    dir_path = "./request_data_logs/{}".format(date_str)
+    # Create the directory if it does not exist
+    os.makedirs(dir_path, exist_ok=True)
+    load_request_path = dir_path + '/{}.json'.format(id_)
+    # load_request_path = './request_data_logs/{}.json'.format(id_)
+
     with open(load_request_path, 'w', encoding='utf8') as f2:
         # ensure_ascii=False才能输入中文,否则是Unicode字符 indent=2 JSON数据的缩进,美观
         json.dump(d, f2, ensure_ascii=False, indent=4)
     redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # 加入redis