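"""Batch inference script for a fine-tuned BERT sequence classifier.

Summary (inferred from the code below): the script loads a classifier from the
local `aigc_check` checkpoint, reads (input, output) text pairs from
data/paperred_aigc_cls.json, scores both texts of each pair, and writes the
pairs that pass the threshold rule to data/paperred_aigc_acc3.json.
"""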
import os

# Disable Weights & Biases logging and expose only GPU 2
# (CUDA_VISIBLE_DEVICES must be set before CUDA is initialized).
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import logging
import random
import sys
import warnings
from dataclasses import dataclass, field
from typing import Optional
import json

import datasets
import evaluate
import numpy as np
from datasets import load_dataset

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version


def load_model(config_path: str, model_path: str):
    # config_path is accepted for compatibility with the caller below but is not
    # used: from_pretrained() reads the config bundled with the checkpoint.
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer


def seq_padding(tokenizer, X):
    # Right-pad a batch of token-id lists to the length of the longest sequence.
    pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    if len(X) <= 1:
        return torch.tensor(X)
    L = [len(x) for x in X]
    ML = max(L)
    X = torch.Tensor([x + [pad_id] * (ML - len(x)) if len(x) < ML else x for x in X])
    return X

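# Illustrative usage (not part of the original flow): with standard BERT
# vocabularies "[PAD]" maps to id 0, so two tokenized sequences of different
# lengths come back as a (2, max_len) float tensor padded with zeros.
# A minimal sketch, assuming the tokenizer loaded in the main block below:
#
#   ids = tokenizer(["first text", "a somewhat longer second text"], truncation=True)["input_ids"]
#   padded = seq_padding(tokenizer, ids)   # float tensor, shape (2, max_len)
#   padded = padded.long()                 # cast to integer ids before the model call
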
if __name__ == "__main__":
    model, tokenizer = load_model(config_path='chinese_bert_wwm_ext_pytorch/config.json',
                                  model_path='aigc_check')

    # Single-text (CPU) inference example, kept for reference:
    # text = "(1)经病理学或细胞学确诊的肺癌患者;"
    #
    # sen = [text]
    # result = tokenizer(sen, max_length=512, truncation=True)
    # print(result)
    #
    # input_ids = result['input_ids']
    # token_type_ids = result['token_type_ids']
    #
    # input_ids = seq_padding(tokenizer, input_ids)
    # token_type_ids = seq_padding(tokenizer, token_type_ids)
    #
    # result = model(input_ids=input_ids, token_type_ids=token_type_ids)  # labels are not needed here
    # output = torch.sigmoid(result[0][0]).tolist()
    # # result_ = result[0][0]
    # print(output)

    model.to("cuda")
    with open("data/paperred_aigc_cls.json", encoding='utf8') as f:
        data = json.loads(f.read())

    data_new = []

    zong = 0    # total pairs processed ("zong" = total)
    rel = 0     # leftover from the commented-out single-text version; unused below
    jishu = 0   # number of flagged pairs ("jishu" = count)

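    # Score each record's "input" (text1) and "output" (text2) together as one
    # two-sequence batch; index 1 of the per-text sigmoid output is used as the
    # positive-class score, and a pair is kept only when text1 scores above 0.5
    # while text2 scores below 0.5.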
    for index, i in enumerate(data[:10000]):
        print(zong)
        zong += 1
        text1 = i["input"]
        text2 = i["output"]

        sen = [text1, text2]
        result = tokenizer(sen, max_length=512, truncation=True)

        input_ids = result['input_ids']
        token_type_ids = result['token_type_ids']

        input_ids = seq_padding(tokenizer, input_ids)
        token_type_ids = seq_padding(tokenizer, token_type_ids)

        input_ids = input_ids.long()
        token_type_ids = token_type_ids.long()

        # Token id 0 is [PAD], so ids > 0 form the attention mask.
        batch_masks = input_ids.gt(0).to("cuda")
        input_ids, token_type_ids = input_ids.to("cuda"), token_type_ids.to("cuda")
        result = model(input_ids=input_ids, token_type_ids=token_type_ids,
                       attention_mask=batch_masks)  # labels are not needed here

        # Earlier single-text version, kept for reference:
        # output = torch.sigmoid(result[0][0]).tolist()
        # # result_ = result[0][0]
        # if output[1] > 0.5:
        #     rel += 1
        #
        # data_new.append({
        #     "index": index,
        #     "text": text,
        #     "acc": output,
        # })

        output = torch.sigmoid(result[0]).tolist()
        # Keep the pair when text1's positive score exceeds 0.5 and text2's does not.
        if output[0][1] > 0.5 and output[1][1] < 0.5:
            jishu += 1
            data_new.append({
                "index": index,
                "text1": text1,
                "text2": text2,
                "acc": [output[0][1], output[1][1]],
            })

    print(jishu)
    data_dict = {
        # "acc": rel / zong,
        "data": data_new
    }

    with open("data/paperred_aigc_acc3.json", "w", encoding='utf8') as f:
        json.dump(data_dict, f, ensure_ascii=False, indent=4)
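
    # Note: a sketch of an equivalent batching path, assuming the same tokenizer
    # and model as above; `padding=True` plus `return_tensors="pt"` would replace
    # the manual seq_padding / mask construction. Kept as a comment so the
    # original flow is unchanged.
    #
    #   enc = tokenizer(sen, max_length=512, truncation=True, padding=True,
    #                   return_tensors="pt").to("cuda")
    #   result = model(**enc)
    #   output = torch.sigmoid(result.logits).tolist()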