
Update the detection models

dev_2
majiahui@haimaqingfan.com committed 3 weeks ago
parent commit 6fa529d312

2 changed files:
  1. chatgpt_detector_model_predict.py (91 changed lines)
  2. flask_api.py (11 changed lines)

chatgpt_detector_model_predict.py (91 changed lines)
@@ -34,13 +34,60 @@ batch_size = 32
 # model_name = "drop_aigc_model_2"
 # model_name = "drop_aigc_model_3"
 # model_name = "/home/majiahui/project/models-llm/aigc_check_10"
-model_name = "/home/majiahui/project/models-llm/weipu_aigc_512_3"
+model_name_sentence = "/home/majiahui/project/models-llm/weipu_aigc_512_11"
+model_name_short = "/home/majiahui/project/models-llm/weipu_aigc_512_5"
+
+pantten_biaoti_0 = '^[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_1 = '^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_2 = '^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_3 = '^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_4 = '(摘要)'
+pantten_biaoti_5 = '(致谢)'
+pantten_ref_1 = r'^\[\d+\]'

-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name).cpu()
+tokenizer = AutoTokenizer.from_pretrained(model_name_sentence)
+model_sentence = AutoModelForSequenceClassification.from_pretrained(model_name_sentence).cpu()
+model_short = AutoModelForSequenceClassification.from_pretrained(model_name_short).cpu()

-def model_preidct(text):
+
+def has_chinese(text):
+    """Check whether the string contains any Chinese characters."""
+    for char in text:
+        if '\u4e00' <= char <= '\u9fff':
+            return True
+    return False
+
+
+def ulit_data(sentence: str):
+    if_change = True
+    max_length = 25
+    sentence = sentence.strip(" ")
+    result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
+    result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
+    result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
+    result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
+    result_biaoti_list_4 = re.findall(pantten_biaoti_4, sentence)
+    result_biaoti_list_5 = re.findall(pantten_biaoti_5, sentence)
+    result_ref_list_1 = re.findall(pantten_ref_1, sentence)
+
+    if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []:
+        if len(sentence) < max_length:
+            if_change = False
+    if list(set(result_biaoti_list_4 + result_biaoti_list_5)) != []:
+        if_change = False
+    if result_ref_list_1 != []:
+        if_change = False
+    if "" not in sentence:
+        if_change = False
+    if has_chinese(sentence) == False:
+        if_change = False
+    return if_change
+
+
+def model_preidct(model, text):
     tokenized_text = tokenizer.encode_plus(text, max_length=512, add_special_tokens=True,
                                            truncation=True, return_offsets_mapping=True)
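
Note on the new gating: `ulit_data` returns False for anything that should not go to the full-sentence classifier: numbered or chaptered headings shorter than 25 characters, 摘要 (abstract) and 致谢 (acknowledgements) section titles, reference entries opening with a bracketed number, and text with no Chinese characters at all. As written, the `"" not in sentence` guard is a no-op, since every string contains the empty string. A minimal routing sketch, assuming the definitions from this hunk are in scope (the sample strings are illustrative, not from the repo):

import re  # plus the pattern/function definitions from the hunk above

samples = [
    "第一章 绪论",                                      # short chapter heading -> short model
    "[12] 张三. 某论文[J]. 某期刊, 2020.",               # reference entry -> short model
    "Abstract only, no Chinese here.",                   # no Chinese characters -> short model
    "本文提出了一种基于预训练语言模型的AIGC检测方法。",    # ordinary body text -> sentence model
]
for s in samples:
    target = "model_sentence" if ulit_data(s) else "model_short"
    print(target, "<-", s)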
@@ -62,12 +109,12 @@ def model_preidct(text):
     output = torch.sigmoid(output[0]).tolist()
     print(output)

-    if model_name == "drop_aigc_model_2":
+    if model_name_sentence == "drop_aigc_model_2":
         return_list = {
             "humen": output[0][1],
             "robot": output[0][0]
         }
-    elif model_name == "AIGC_detector_zhv2":
+    elif model_name_sentence == "AIGC_detector_zhv2":
         return_list = {
             "humen": output[0][0],
             "robot": output[0][1]
@@ -116,40 +163,46 @@ def main(content_list: list):
     total_paragraph = len(content_list)
     for i in range(len(content_list)):
-        total_words += len(content_list[i])
-        res = model_preidct(content_list[i])
+        sentence = content_list[i]
+        total_words += len(sentence)
+        bool_ = ulit_data(sentence)
+        if bool_ == True:
+            res = model_preidct(model_sentence, sentence)
+        else:
+            res = model_preidct(model_short, sentence)
         # return_list = {
         #     "humen": output[0][0],
         #     "robot": output[0][1]
         # }
-        reference_bool = is_reference_sentence(content_list[i])
+        reference_bool = is_reference_sentence(sentence)
         if reference_bool == False:
             if res["robot"] > 0.9:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(res["robot"])
                     gpt_score_sentence_list.append(res["robot"])
-                sim_word += len(content_list[i])
+                sim_word += len(sentence)
                 gpt_content.append(
-                    "<em class=\"similar\" id='score_{}'>".format(str(i)) + content_list[i] + "\n" + "</em>")
+                    "<em class=\"similar\" id='score_{}'>".format(str(i)) + sentence + "\n" + "</em>")
             elif 0.9 >= res["robot"] > 0.5:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(res["robot"])
                     gpt_score_sentence_list.append(res["robot"])
-                sim_word_5_9 += len(content_list[i])
+                sim_word_5_9 += len(sentence)
                 gpt_content.append(
-                    "<em class=\"color-gold\" id='score_{}'>".format(str(i)) + content_list[i] + "\n" + "</em>")
+                    "<em class=\"color-gold\" id='score_{}'>".format(str(i)) + sentence + "\n" + "</em>")
             else:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(0)
                     gpt_score_sentence_list.append(0)
-                gpt_content.append(content_list[i] + "\n")
+                gpt_content.append(sentence + "\n")
         else:
-            for _ in range(len(content_list[i])):
+            for _ in range(len(sentence)):
                 gpt_score_list.append(0)
                 gpt_score_sentence_list.append(0)
-            gpt_content.append(content_list[i] + "\n")
+            gpt_content.append(sentence + "\n")

     return_list["gpt_content"] = "".join(gpt_content)
     return_list["gpt_score_sentence_list"] = str(gpt_score_sentence_list)
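
Within `main`, the routed score is then banded per sentence: above 0.9 the sentence is wrapped as "similar", between 0.5 and 0.9 as "color-gold", otherwise left plain, with one score appended per character so the downstream ratios are length-weighted. A condensed, hypothetical helper showing the same banding (the repo inlines this logic rather than factoring it out):

def highlight(sentence: str, robot_score: float, idx: int) -> str:
    # Thresholds mirror main(): > 0.9 -> "similar", (0.5, 0.9] -> "color-gold".
    if robot_score > 0.9:
        return "<em class=\"similar\" id='score_{}'>".format(idx) + sentence + "\n" + "</em>"
    if robot_score > 0.5:
        return "<em class=\"color-gold\" id='score_{}'>".format(idx) + sentence + "\n" + "</em>"
    return sentence + "\n"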

flask_api.py (11 changed lines)
@@ -21,6 +21,7 @@ import time
 import json
 import docx2txt
 import re
+from datetime import datetime

 pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=12, password="zhicheng123*")
@@ -94,7 +95,7 @@ def ulit_request_file(file):
     # elif file_name.split(".")[-1] == "docx":
     #     content = docx2txt.process(file_name_save)

-    content_list = [i for i in content.split("\n")]
+    content_list = [i for i in content.split("\n") if i != ""]
     print(content_list)

     content_list_new = []
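
Filtering out empty strings here means blank lines from the uploaded file never reach the classifier or the word counts. For example:

content = "第一段\n\n第二段\n"
print([i for i in content.split("\n") if i != ""])  # ['第一段', '第二段']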
@@ -145,7 +146,13 @@ def handle_query_predict():
     # bind the text to the query id
     # recall_10(id_, title, abst_zh, content)

-    load_request_path = './request_data_logs/{}.json'.format(id_)
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    dir_path = "./request_data_logs/{}".format(date_str)
+    # create the directory if it does not already exist
+    os.makedirs(dir_path, exist_ok=True)
+    load_request_path = dir_path + '/{}.json'.format(id_)
+    # load_request_path = './request_data_logs/{}.json'.format(id_)
     with open(load_request_path, 'w', encoding='utf8') as f2:  # ensure_ascii=False is needed to write Chinese (otherwise Unicode escapes); indent pretty-prints the JSON
         json.dump(d, f2, ensure_ascii=False, indent=4)
     redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # enqueue in redis
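
The request log now lands in a per-day subdirectory instead of one flat folder, and `os.makedirs(..., exist_ok=True)` makes the write idempotent across requests. A minimal standalone sketch of the same pattern (the function name and the `root` default are illustrative):

import json
import os
from datetime import datetime

def save_request(d: dict, id_: str, root: str = "./request_data_logs") -> str:
    # One subdirectory per day, created on first use; exist_ok avoids races on repeat calls.
    dir_path = os.path.join(root, datetime.now().strftime("%Y-%m-%d"))
    os.makedirs(dir_path, exist_ok=True)
    path = os.path.join(dir_path, "{}.json".format(id_))
    with open(path, "w", encoding="utf8") as f:
        json.dump(d, f, ensure_ascii=False, indent=4)
    return path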
