diff --git a/chatgpt_detector_model_predict.py b/chatgpt_detector_model_predict.py
index 533e686..1f8038e 100644
--- a/chatgpt_detector_model_predict.py
+++ b/chatgpt_detector_model_predict.py
@@ -34,13 +34,60 @@ batch_size = 32
# model_name = "drop_aigc_model_2"
# model_name = "drop_aigc_model_3"
# model_name = "/home/majiahui/project/models-llm/aigc_check_10"
-model_name = "/home/majiahui/project/models-llm/weipu_aigc_512_3"
-
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name).cpu()
-
-def model_preidct(text):
+model_name_sentence = "/home/majiahui/project/models-llm/weipu_aigc_512_11"
+model_name_short = "/home/majiahui/project/models-llm/weipu_aigc_512_5"
+pantten_biaoti_0 = r'^[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_1 = r'^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_2 = r'^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_3 = r'^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_4 = '(摘要)'
+pantten_biaoti_5 = '(致谢)'
+pantten_ref_1 = r'^\[\d+\]'
+
+tokenizer = AutoTokenizer.from_pretrained(model_name_sentence)
+model_sentence = AutoModelForSequenceClassification.from_pretrained(model_name_sentence).cpu()
+model_short = AutoModelForSequenceClassification.from_pretrained(model_name_short).cpu()
+
+def has_chinese(text):
+    """Return True if the string contains at least one Chinese character."""
+ for char in text:
+ if '\u4e00' <= char <= '\u9fff':
+ return True
+ return False
+
+
+def ulit_data(sentence: str):
+    if_change = True  # True: score with the full-sentence model; False: use the short-text model
+    max_length = 25   # heading-like lines shorter than this are not treated as full sentences
+ sentence = sentence.strip(" ")
+ result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
+ result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
+ result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
+ result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
+ result_biaoti_list_4 = re.findall(pantten_biaoti_4, sentence)
+ result_biaoti_list_5 = re.findall(pantten_biaoti_5, sentence)
+ result_ref_list_1 = re.findall(pantten_ref_1, sentence)
+
+    if result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3:  # numbered or chapter-style heading
+ if len(sentence) < max_length:
+ if_change = False
+
+    if result_biaoti_list_4 + result_biaoti_list_5:  # "摘要" / "致谢" section titles
+ if_change = False
+
+    if result_ref_list_1:  # bibliography entry such as "[12] ..."
+ if_change = False
+
+    if "。" not in sentence:  # no full stop: a title or fragment, not a complete sentence
+ if_change = False
+
+    if not has_chinese(sentence):
+ if_change = False
+
+ return if_change
+
+
+def model_preidct(model, text):
tokenized_text = tokenizer.encode_plus(text, max_length=512, add_special_tokens=True,
truncation=True, return_offsets_mapping=True)
@@ -62,12 +109,12 @@ def model_preidct(text):
output = torch.sigmoid(output[0]).tolist()
print(output)
- if model_name == "drop_aigc_model_2":
+ if model_name_sentence == "drop_aigc_model_2":
return_list = {
"humen": output[0][1],
"robot": output[0][0]
}
- elif model_name == "AIGC_detector_zhv2":
+ elif model_name_sentence == "AIGC_detector_zhv2":
return_list = {
"humen": output[0][0],
"robot": output[0][1]
@@ -116,40 +163,46 @@ def main(content_list: list):
total_paragraph = len(content_list)
for i in range(len(content_list)):
- total_words += len(content_list[i])
- res = model_preidct(content_list[i])
+ sentence = content_list[i]
+ total_words += len(sentence)
+
+        use_sentence_model = ulit_data(sentence)
+        if use_sentence_model:
+            res = model_preidct(model_sentence, sentence)
+        else:
+            res = model_preidct(model_short, sentence)
# return_list = {
# "humen": output[0][0],
# "robot": output[0][1]
# }
- reference_bool = is_reference_sentence(content_list[i])
+ reference_bool = is_reference_sentence(sentence)
if reference_bool == False:
if res["robot"] > 0.9:
- for _ in range(len(content_list[i])):
+ for _ in range(len(sentence)):
gpt_score_list.append(res["robot"])
gpt_score_sentence_list.append(res["robot"])
- sim_word += len(content_list[i])
+ sim_word += len(sentence)
gpt_content.append(
- "".format(str(i)) + content_list[i] + "\n" + "")
+ "".format(str(i)) + sentence + "\n" + "")
elif 0.9 >= res["robot"] > 0.5:
- for _ in range(len(content_list[i])):
+ for _ in range(len(sentence)):
gpt_score_list.append(res["robot"])
gpt_score_sentence_list.append(res["robot"])
- sim_word_5_9 += len(content_list[i])
+ sim_word_5_9 += len(sentence)
gpt_content.append(
- "".format(str(i)) + content_list[i] + "\n" + "")
+ "".format(str(i)) + sentence + "\n" + "")
else:
- for _ in range(len(content_list[i])):
+ for _ in range(len(sentence)):
gpt_score_list.append(0)
gpt_score_sentence_list.append(0)
- gpt_content.append(content_list[i] + "\n")
+ gpt_content.append(sentence + "\n")
else:
- for _ in range(len(content_list[i])):
+ for _ in range(len(sentence)):
gpt_score_list.append(0)
gpt_score_sentence_list.append(0)
- gpt_content.append(content_list[i] + "\n")
+ gpt_content.append(sentence + "\n")
return_list["gpt_content"] = "".join(gpt_content)
return_list["gpt_score_sentence_list"] = str(gpt_score_sentence_list)
diff --git a/flask_api.py b/flask_api.py
index b40613b..2a18e11 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -21,6 +21,7 @@ import time
import json
import docx2txt
import re
+from datetime import datetime
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=12, password="zhicheng123*")
@@ -94,7 +95,7 @@ def ulit_request_file(file):
# elif file_name.split(".")[-1] == "docx":
# content = docx2txt.process(file_name_save)
- content_list = [i for i in content.split("\n")]
+    content_list = [i for i in content.split("\n") if i != ""]
print(content_list)
content_list_new = []
@@ -145,7 +146,13 @@ def handle_query_predict():
     # Bind the text to its query id
# recall_10(id_, title, abst_zh, content)
- load_request_path = './request_data_logs/{}.json'.format(id_)
+ date_str = datetime.now().strftime("%Y-%m-%d")
+ dir_path = "./request_data_logs/{}".format(date_str)
+    # Create the directory if it does not already exist
+ os.makedirs(dir_path, exist_ok=True)
+ load_request_path = dir_path + '/{}.json'.format(id_)
+ # load_request_path = './request_data_logs/{}.json'.format(id_)
+
     with open(load_request_path, 'w', encoding='utf8') as f2:  # ensure_ascii=False is needed to write Chinese text instead of Unicode escapes; indent=4 pretty-prints the JSON
json.dump(d, f2, ensure_ascii=False, indent=4)
     redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push the task onto the Redis queue
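
Note (not part of the patch): a minimal sketch of the new per-day request-log layout, assuming the same ./request_data_logs base directory as above. The helper name save_request_log is hypothetical; it only mirrors the os.makedirs / json.dump calls added in flask_api.py.

import json
import os
from datetime import datetime

def save_request_log(request_id: str, payload: dict, base_dir: str = "./request_data_logs") -> str:
    # One subdirectory per day, e.g. ./request_data_logs/2024-01-31/<id>.json
    date_dir = os.path.join(base_dir, datetime.now().strftime("%Y-%m-%d"))
    os.makedirs(date_dir, exist_ok=True)  # create the dated folder if it is missing
    path = os.path.join(date_dir, "{}.json".format(request_id))
    with open(path, "w", encoding="utf8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)
    return path

print(save_request_log("demo-id", {"texts": ["示例文本。"]}))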