@@ -22,7 +22,7 @@ import re

 os.environ["WANDB_DISABLED"] = "true"

 # Set the CUDA device
-os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

 import logging
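
Note: the old value '2' pinned the process to the third GPU (the variable is zero-indexed), while '-1' hides every GPU from PyTorch and forces the run onto the CPU. The variable must be set before torch initializes CUDA. A minimal standalone sketch to verify the effect (not part of the patch):

    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # must be set before the first CUDA call

    import torch
    print(torch.cuda.is_available())  # False: every GPU is hidden
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)  # cpu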

@@ -307,7 +307,6 @@ def main():
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
     send_example_telemetry("run_glue", model_args, data_args)
     print("model_args", model_args)
-    9/0

     # Setup logging
     logging.basicConfig(
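
The deleted 9/0 was a deliberate ZeroDivisionError, used as a crude breakpoint to abort the script right after argument parsing; the print of model_args stays. If an intentional early stop is still wanted, an explicit exit reads better (a hedged alternative, not part of the patch):

    import sys
    sys.exit("stopping after argument parsing for debugging")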

@@ -455,7 +454,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
     )
-    tokenizer = BertTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
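
Reverting BertTokenizer to AutoTokenizer restores the script's model-agnostic behavior: AutoTokenizer reads the checkpoint's config and instantiates the matching tokenizer class (returning the fast Rust-backed variant when use_fast is true), whereas a hard-coded BertTokenizer would break for non-BERT checkpoints. A minimal sketch of the dispatch (the checkpoint name is illustrative):

    from transformers import AutoTokenizer

    # The returned class depends on the checkpoint's config
    tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    print(type(tok).__name__)  # BertTokenizerFast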

@@ -463,7 +462,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
     )
-    model = BertForSequenceClassification.from_pretrained(
+    model = AutoModelForSequenceClassification.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
         config=config,
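
The same reasoning applies to the model: AutoModelForSequenceClassification picks the architecture-specific classification head that matches the config built earlier, and from_tf=True converts a TensorFlow checkpoint (a path containing ".ckpt") on the fly. A minimal standalone sketch (checkpoint name and label count are illustrative):

    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,  # must match the task's label set
    )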

@@ -530,20 +529,57 @@ def main():
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

     def preprocess_function(examples):
         # Tokenize the texts
         args = (
             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
         )
         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

         # Map labels to IDs (not necessary for GLUE tasks)
         if label_to_id is not None and "label" in examples:
             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]

+        # Token-ID sequence pairs to search for
+        sequence_pairs = [
+            ([133, 10906, 135], [133, 9931, 135])  # first sequence, second sequence
+        ]
+
+        # Process each sample
+        for i in range(len(result['input_ids'])):
+            input_ids = result['input_ids'][i]
+            token_type_ids = result['token_type_ids'][i]
+
+            # Handle each sequence pair
+            for seq1, seq2 in sequence_pairs:
+                seq1_len = len(seq1)
+                seq2_len = len(seq2)
+
+                # Find every occurrence of the first sequence
+                seq1_positions = []
+                for j in range(len(input_ids) - seq1_len + 1):
+                    if input_ids[j:j + seq1_len] == seq1:
+                        seq1_positions.append(j)
+
+                # Find every occurrence of the second sequence
+                seq2_positions = []
+                for j in range(len(input_ids) - seq2_len + 1):
+                    if input_ids[j:j + seq2_len] == seq2:
+                        seq2_positions.append(j)
+
+                # Handle every matched pair of occurrences
+                for pos1 in seq1_positions:
+                    for pos2 in seq2_positions:
+                        if pos1 < pos2:  # the first sequence must precede the second
+                            # Set token_type_ids to 1 between the two sequences, markers included
+                            start_idx = pos1
+                            end_idx = pos2 + seq2_len
+
+                            for k in range(start_idx, end_idx):
+                                token_type_ids[k] = 1
+
+            # Write the updated token_type_ids back
+            result['token_type_ids'][i] = token_type_ids
         return result

     with training_args.main_process_first(desc="dataset map pre-processing"):
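
The added block post-processes the tokenizer output: for every sample it scans input_ids for an opening marker sequence ([133, 10906, 135]) and a later closing marker sequence ([133, 9931, 135]), then sets token_type_ids to 1 across the enclosed span, markers included, so that BERT-style segment embeddings single out the marked region. The token IDs are vocabulary-specific and presumably encode special marker tokens in the author's tokenizer; note also that the nested position loops are quadratic in the number of matches, and that only models which actually consume token_type_ids (BERT-style, not RoBERTa-style) see any effect. A self-contained sketch of the same logic on a toy input (all IDs made up):

    def find_subsequence(ids, seq):
        """Return every start index at which seq occurs inside ids."""
        n = len(seq)
        return [j for j in range(len(ids) - n + 1) if ids[j:j + n] == seq]

    def mark_span(input_ids, token_type_ids, seq1, seq2):
        """Set token_type_ids to 1 from each seq1 match through the end
        of every later seq2 match, both markers included."""
        for pos1 in find_subsequence(input_ids, seq1):
            for pos2 in find_subsequence(input_ids, seq2):
                if pos1 < pos2:
                    for k in range(pos1, pos2 + len(seq2)):
                        token_type_ids[k] = 1
        return token_type_ids

    input_ids = [101, 7, 133, 10906, 135, 42, 43, 133, 9931, 135, 102]
    token_type_ids = [0] * len(input_ids)
    print(mark_span(input_ids, token_type_ids,
                    [133, 10906, 135], [133, 9931, 135]))
    # -> [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]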