|
|
@@ -34,13 +34,60 @@ batch_size = 32
-# model_name = "drop_aigc_model_2"
-# model_name = "drop_aigc_model_3"
-# model_name = "/home/majiahui/project/models-llm/aigc_check_10"
-model_name = "/home/majiahui/project/models-llm/weipu_aigc_512_3"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name).cpu()
-
-
-def model_preidct(text):
+model_name_sentence = "/home/majiahui/project/models-llm/weipu_aigc_512_11"
+model_name_short = "/home/majiahui/project/models-llm/weipu_aigc_512_5"
+
+pantten_biaoti_0 = '^[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_1 = '^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_2 = '^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_3 = '^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+pantten_biaoti_4 = '(摘要)'
+pantten_biaoti_5 = '(致谢)'
+pantten_ref_1 = r'^\[\d+\]'
+
+tokenizer = AutoTokenizer.from_pretrained(model_name_sentence)
+model_sentence = AutoModelForSequenceClassification.from_pretrained(model_name_sentence).cpu()
+model_short = AutoModelForSequenceClassification.from_pretrained(model_name_short).cpu()
+
+
+def has_chinese(text):
+    """Check whether the string contains any Chinese characters."""
+    for char in text:
+        if '\u4e00' <= char <= '\u9fff':
+            return True
+    return False
+
+
+def ulit_data(sentence:str):
+    if_change = True
+    max_length = 25
+    sentence = sentence.strip(" ")
+    result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
+    result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
+    result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
+    result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
+    result_biaoti_list_4 = re.findall(pantten_biaoti_4, sentence)
+    result_biaoti_list_5 = re.findall(pantten_biaoti_5, sentence)
+    result_ref_list_1 = re.findall(pantten_ref_1, sentence)
+
+    if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []:
+        if len(sentence) < max_length:
+            if_change = False
+
+    if list(set(result_biaoti_list_4 + result_biaoti_list_5)) != []:
+        if_change = False
+
+    if result_ref_list_1 != []:
+        if_change = False
+
+    if "。" not in sentence:
+        if_change = False
+
+    if has_chinese(sentence) == False:
+        if_change = False
+
+    return if_change
+
+
+def model_preidct(model, text):
     tokenized_text = tokenizer.encode_plus(text, max_length=512, add_special_tokens=True,
                                            truncation=True, return_offsets_mapping=True)
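Note on the new routing: ulit_data() returns True only for lines that look like full Chinese sentences, so model_sentence scores normal prose while headings, reference entries, and fragmentary lines fall through to model_short. A quick smoke test of that gate, as a hypothetical sketch (the predict_api module name is a placeholder, not part of this patch):

    from predict_api import ulit_data  # hypothetical import; use the patched module

    print(ulit_data("3.1 实验结果"))              # False: short numbered heading -> model_short
    print(ulit_data("[12] 王某某. 论文题目."))     # False: reference entry -> model_short
    print(ulit_data("本文提出了一种新的检测方法。"))  # True: full Chinese sentence -> model_sentence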
|
|
|
|
|
|
@@ -62,12 +109,12 @@ def model_preidct(text):
     output = torch.sigmoid(output[0]).tolist()
     print(output)

-    if model_name == "drop_aigc_model_2":
+    if model_name_sentence == "drop_aigc_model_2":
         return_list = {
             "humen": output[0][1],
             "robot": output[0][0]
         }
-    elif model_name == "AIGC_detector_zhv2":
+    elif model_name_sentence == "AIGC_detector_zhv2":
         return_list = {
             "humen": output[0][0],
             "robot": output[0][1]
|
|
@@ -116,40 +163,46 @@ def main(content_list: list):
     total_paragraph = len(content_list)

     for i in range(len(content_list)):
-        total_words += len(content_list[i])
-        res = model_preidct(content_list[i])
+        sentence = content_list[i]
+        total_words += len(sentence)
+
+        bool_ = ulit_data(sentence)
+        if bool_ == True:
+            res = model_preidct(model_sentence, sentence)
+        else:
+            res = model_preidct(model_short, sentence)
         # return_list = {
         #     "humen": output[0][0],
         #     "robot": output[0][1]
         # }

-        reference_bool = is_reference_sentence(content_list[i])
+        reference_bool = is_reference_sentence(sentence)

         if reference_bool == False:
             if res["robot"] > 0.9:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(res["robot"])
                 gpt_score_sentence_list.append(res["robot"])
-                sim_word += len(content_list[i])
+                sim_word += len(sentence)
                 gpt_content.append(
-                    "<em class=\"similar\" id='score_{}'>".format(str(i)) + content_list[i] + "\n" + "</em>")
+                    "<em class=\"similar\" id='score_{}'>".format(str(i)) + sentence + "\n" + "</em>")
             elif 0.9 >= res["robot"] > 0.5:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(res["robot"])
                 gpt_score_sentence_list.append(res["robot"])
-                sim_word_5_9 += len(content_list[i])
+                sim_word_5_9 += len(sentence)
                 gpt_content.append(
-                    "<em class=\"color-gold\" id='score_{}'>".format(str(i)) + content_list[i] + "\n" + "</em>")
+                    "<em class=\"color-gold\" id='score_{}'>".format(str(i)) + sentence + "\n" + "</em>")
             else:
-                for _ in range(len(content_list[i])):
+                for _ in range(len(sentence)):
                     gpt_score_list.append(0)
                 gpt_score_sentence_list.append(0)
-                gpt_content.append(content_list[i] + "\n")
+                gpt_content.append(sentence + "\n")
         else:
-            for _ in range(len(content_list[i])):
+            for _ in range(len(sentence)):
                 gpt_score_list.append(0)
             gpt_score_sentence_list.append(0)
-            gpt_content.append(content_list[i] + "\n")
+            gpt_content.append(sentence + "\n")

     return_list["gpt_content"] = "".join(gpt_content)
     return_list["gpt_score_sentence_list"] = str(gpt_score_sentence_list)
|
|
|