Layout recognition: heading levels vs. body text
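# Script overview (inferred from the code below, not from separate documentation):
# load a fine-tuned Chinese BERT sequence classifier from the local "aigc_check_6"
# checkpoint, score every (sentence, label) row in data/train_1_data_weipu.csv,
# and write back only the rows the model already predicts correctly, producing a
# filtered training set. The chinese_bert_wwm_ext_pytorch config path is passed in
# but not actually used by load_model().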
import os

os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def load_model(config_path: str, model_path: str):
    """Load the fine-tuned classifier and its tokenizer from a local checkpoint directory."""
    # Note: config_path is accepted but unused; the config is read from model_path.
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer
def seq_padding(tokenizer, X):
    """Right-pad a batch of token-id lists to the length of the longest sequence."""
    pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    if len(X) <= 1:
        # A single sequence (or an empty batch) needs no padding.
        return torch.tensor(X)
    L = [len(x) for x in X]
    ML = max(L)
    X = torch.Tensor([x + [pad_id] * (ML - len(x)) if len(x) < ML else x for x in X])
    return X
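# Illustrative example (not part of the original script): with a BERT tokenizer whose
# [PAD] id is 0, seq_padding(tokenizer, [[101, 2769, 102], [101, 102]]) returns the
# 2x3 float tensor [[101., 2769., 102.], [101., 102., 0.]]; callers cast it with
# .long() before feeding it to the model, as done in the loop below.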
if __name__ == "__main__":
    model, tokenizer = load_model(config_path='chinese_bert_wwm_ext_pytorch/config.json',
                                  model_path='aigc_check_6')
    # Single-sentence sanity check kept from development (commented out):
    # text = "(1)经病理学或细胞学确诊的肺癌患者;"
    # result = tokenizer([text], max_length=512, truncation=True)
    # input_ids = seq_padding(tokenizer, result['input_ids'])
    # token_type_ids = seq_padding(tokenizer, result['token_type_ids'])
    # result = model(input_ids=input_ids, token_type_ids=token_type_ids)  # no labels needed here
    # output = torch.sigmoid(result[0][0]).tolist()
    # print(output)
    model.to("cuda")

    data_list = pd.read_csv("data/train_1_data_weipu.csv").values.tolist()
    data_new = []
    zong = 0    # total rows processed
    jishu = 0   # rows predicted as label 1
    for i in tqdm(data_list):
        zong += 1
        text = i[0]
        label = i[1]

        # Tokenize a single-sentence batch and pad it (a no-op for batch size 1).
        result = tokenizer([text], max_length=512, truncation=True)
        input_ids = seq_padding(tokenizer, result['input_ids'])
        token_type_ids = seq_padding(tokenizer, result['token_type_ids'])
        input_ids = input_ids.long()
        token_type_ids = token_type_ids.long()

        # Attention mask: 1 for every non-pad (non-zero) token id.
        batch_masks = input_ids.gt(0).to("cuda")
        input_ids, token_type_ids = input_ids.to("cuda"), token_type_ids.to("cuda")

        result = model(input_ids=input_ids, token_type_ids=token_type_ids,
                       attention_mask=batch_masks)  # no labels needed here
        output = torch.sigmoid(result[0]).tolist()

        # Threshold the sigmoid of the first logit: > 0.5 means class 0, otherwise class 1.
        if output[0][0] > 0.50:
            predict_label = 0
        else:
            predict_label = 1
            jishu += 1

        # Keep only the rows the current model already classifies correctly.
        if predict_label == label:
            data_new.append([text, label])

    pd.DataFrame(data_new, columns=["sentence", "label"]).to_csv("data/train_2_2_data_weipu.csv", index=False)
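# Expected I/O, as an assumption based on how the columns are indexed above: the input CSV's
# first two columns hold a sentence string and an integer label (0 or 1); the output CSV,
# data/train_2_2_data_weipu.csv, keeps the columns "sentence" and "label" for the retained rows.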