"""Score (input, output) text pairs with a fine-tuned AIGC-detection classifier.

For every record in ``data/paperred_aigc_cls.json`` the ``input`` and
``output`` texts are run through the sequence-classification model stored in
``aigc_check``.  The sigmoid score of label 1 for both texts is written to
``data/paperred_aigc_acc3.json``, and the number of pairs where the input
scores above 0.5 while the output scores below 0.5 is printed.
"""
import os

# These must be set before torch / transformers are imported so that they
# take effect when CUDA and the Trainer integrations are initialised.
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import logging
import random
import sys
import warnings
from dataclasses import dataclass, field
from typing import Optional
import json

import datasets
import evaluate
import numpy as np
from datasets import load_dataset
import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version


def load_model(config_path: str, model_path: str):
    """Load the classifier and its tokenizer from ``model_path``.

    Args:
        config_path: Unused; kept so existing callers that pass it keep
            working.  The configuration is read from ``model_path``.
        model_path: Directory or hub id passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer


def seq_padding(tokenizer, X, pad_id: Optional[int] = None):
    """Right-pad a batch of id sequences to equal length and return a tensor.

    Args:
        tokenizer: Tokenizer used to look up the id of ``"[PAD]"`` when
            ``pad_id`` is not given.
        X: Sequence of lists of ints (one list per example).
        pad_id: Value used for padding.  Defaults to the tokenizer's
            ``"[PAD]"`` id; pass ``0`` when padding segment ids.

    Returns:
        A ``torch.long`` tensor.  For two or more rows the shape is
        ``(len(X), max_len)``; for zero or one row ``X`` is converted as-is.
    """
    if pad_id is None:
        pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    if len(X) <= 1:
        # Nothing to align; convert directly.
        return torch.tensor(X, dtype=torch.long)
    max_len = max(len(row) for row in X)
    padded = [list(row) + [pad_id] * (max_len - len(row)) for row in X]
    # Ids are integers: build an integer tensor directly instead of going
    # through the float32 default of ``torch.Tensor(...)``.
    return torch.tensor(padded, dtype=torch.long)


def main() -> None:
    """Run the classifier over the dataset and dump per-pair scores to JSON."""
    model, tokenizer = load_model(
        config_path='chinese_bert_wwm_ext_pytorch/config.json',
        model_path='aigc_check',
    )

    # Fall back to CPU so the script still runs on hosts without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    with open("data/paperred_aigc_cls.json", encoding='utf8') as f:
        data = json.load(f)

    pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    data_new = []
    jishu = 0  # pairs where text1 scores > 0.5 and text2 scores < 0.5

    for index, item in enumerate(data[:10000]):
        print(index)  # progress indicator
        text1 = item["input"]
        text2 = item["output"]

        encoded = tokenizer([text1, text2], max_length=512, truncation=True)
        input_ids = seq_padding(tokenizer, encoded['input_ids'], pad_id=pad_id)
        # Segment ids are padded with 0, not with the vocabulary pad id.
        token_type_ids = seq_padding(tokenizer, encoded['token_type_ids'], pad_id=0)

        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        # Mask out exactly the positions that were filled with pad_id.
        batch_masks = input_ids.ne(pad_id)

        # Inference only: labels are not needed and no gradients are tracked.
        with torch.no_grad():
            result = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=batch_masks,
            )

        # result[0] holds the logits, one row per text in the pair.
        output = torch.sigmoid(result[0]).tolist()
        if output[0][1] > 0.5 and output[1][1] < 0.5:
            jishu += 1

        # NOTE(review): every pair is recorded, not only the counted ones;
        # the original layout was lost, so confirm this matches the intent.
        data_new.append({
            "index": index,
            "text1": text1,
            "text2": text2,
            "acc": [output[0][1], output[1][1]],
        })

    print(jishu)

    data_dict = {
        "data": data_new
    }
    with open("data/paperred_aigc_acc3.json", "w", encoding='utf8') as f:
        json.dump(data_dict, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()