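"""Filter a labelled CSV with a fine-tuned Chinese BERT sequence classifier.

The script loads a classification checkpoint, scores every row of
data/train_1_data_weipu.csv on the GPU, and writes the rows whose predicted
label matches the gold label to a new CSV.
"""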
import os

# Disable Weights & Biases logging and pin inference to GPU 1.
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def load_model(config_path: str, model_path: str):
    # NOTE: config_path is currently unused; from_pretrained resolves the
    # config from model_path itself.
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer


def seq_padding(tokenizer, X):
    # Right-pad every token-id list in X to the length of the longest one.
    pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    if len(X) <= 1:
        return torch.tensor(X)
    L = [len(x) for x in X]
    ML = max(L)
    # torch.Tensor(...) yields float32, so callers cast back with .long().
    X = torch.Tensor([x + [pad_id] * (ML - len(x)) if len(x) < ML else x for x in X])
    return X
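

# A more idiomatic alternative (sketch, assuming the standard Hugging Face
# fast-tokenizer API): tokenizer(..., padding=True, return_tensors="pt")
# pads the batch and builds the attention mask in one call, which would
# replace seq_padding() and the manual input_ids.gt(0) mask used below.
# encode_batch is a hypothetical helper, not called by the script as written.
def encode_batch(tokenizer, texts, device="cuda"):
    enc = tokenizer(texts, max_length=512, truncation=True,
                    padding=True, return_tensors="pt")
    return {k: v.to(device) for k, v in enc.items()}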


if __name__ == "__main__":
    model, tokenizer = load_model(config_path='chinese_bert_wwm_ext_pytorch/config.json',
                                  model_path='aigc_check_6')

    # Single-sentence smoke test (kept for reference):
    # text = "(1)经病理学或细胞学确诊的肺癌患者;"  # "(1) lung cancer patients confirmed by pathology or cytology;"
    # sen = [text]
    # result = tokenizer(sen, max_length=512, truncation=True)
    # print(result)
    # input_ids = result['input_ids']
    # token_type_ids = result['token_type_ids']
    # input_ids = seq_padding(tokenizer, input_ids)
    # token_type_ids = seq_padding(tokenizer, token_type_ids)
    # result = model(input_ids=input_ids, token_type_ids=token_type_ids)  # no labels are needed here
    # output = torch.sigmoid(result[0][0]).tolist()
    # print(output)

    model.to("cuda")
    model.eval()  # inference only: disable dropout

    data_list = pd.read_csv("data/train_1_data_weipu.csv").values.tolist()

    data_new = []  # rows whose prediction matches the gold label
    zong = 0       # total rows seen
    rel = 0        # never updated below
    jishu = 0      # rows predicted as label 1

    for i in tqdm(data_list):
        zong += 1
        text = i[0]
        lable = i[1]

        result = tokenizer([text], max_length=512, truncation=True)
        input_ids = result['input_ids']
        token_type_ids = result['token_type_ids']

        input_ids = seq_padding(tokenizer, input_ids)
        token_type_ids = seq_padding(tokenizer, token_type_ids)

        input_ids = input_ids.long()
        token_type_ids = token_type_ids.long()

        # Attention mask: 1 for real tokens, 0 for [PAD] (id 0 in BERT vocabs).
        batch_masks = input_ids.gt(0).to("cuda")
        input_ids, token_type_ids = input_ids.to("cuda"), token_type_ids.to("cuda")

        # No labels are needed here, so no gradients are needed either.
        with torch.no_grad():
            result = model(input_ids=input_ids, token_type_ids=token_type_ids,
                           attention_mask=batch_masks)

        # result[0] holds the raw logits; sigmoid turns them into per-class scores.
        output = torch.sigmoid(result[0]).tolist()
        # print(output)
        # Threshold the class-0 score at 0.5 to pick the predicted label.
        if output[0][0] > 0.50:
            predict_lable = 0
        else:
            predict_lable = 1
            jishu += 1

        # Keep only the rows the model classifies correctly.
        if predict_lable == lable:
            data_new.append([text, lable])

    pd.DataFrame(data_new, columns=["sentence", "label"]).to_csv(
        "data/train_2_2_data_weipu.csv", index=False)
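
    # Possible follow-up (sketch, not part of the original script): scoring one
    # sentence per forward pass under-uses the GPU; batching with the
    # hypothetical encode_batch() helper above would look roughly like:
    #
    #     for start in range(0, len(data_list), 32):
    #         batch = [row[0] for row in data_list[start:start + 32]]
    #         enc = encode_batch(tokenizer, batch)
    #         with torch.no_grad():
    #             logits = model(**enc).logits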