
Change positional encoding

branch: master
majiahui@haimaqingfan.com, 4 days ago
commit 17b4fc802e

2 changed files:
  1. run_glue.py (56 lines changed)
  2. run_train.sh (14 lines changed)

run_glue.py (56 lines changed)

@@ -22,7 +22,7 @@ import re
 os.environ["WANDB_DISABLED"] = "true"
 # Set the CUDA device
-os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 import logging
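
Note on this hunk: setting CUDA_VISIBLE_DEVICES to '-1' names no existing device, so CUDA sees no GPUs and training falls back to the CPU. A minimal standalone sketch of the effect (not part of the commit; the variable must be set before torch first initializes CUDA):

import os

# '-1' matches no device, so no GPU is visible to CUDA.
# This must happen before CUDA is first initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import torch

print(torch.cuda.is_available())  # prints False: the run is CPU-only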
@@ -307,7 +307,6 @@ def main():
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
     send_example_telemetry("run_glue", model_args, data_args)
     print("model_args", model_args)
-    9/0

     # Setup logging
     logging.basicConfig(
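
Note on this hunk: the removed 9/0 raises ZeroDivisionError unconditionally; it looks like a deliberate crash point left in during debugging, deleted here so the script can run past this point.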
@@ -455,7 +454,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
     )
-    tokenizer = BertTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
@@ -463,7 +462,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
     )
-    model = BertForSequenceClassification.from_pretrained(
+    model = AutoModelForSequenceClassification.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
         config=config,
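
Note on the two hunks above: replacing the hard-coded BertTokenizer/BertForSequenceClassification with the Auto classes lets the script instantiate whatever architecture the checkpoint's config declares, which is what allows pointing it at the Longformer checkpoint used in run_train.sh below. A minimal standalone sketch of that dispatch (not part of the commit; the path comes from run_train.sh, and num_labels=2 is an assumption for illustration):

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

path = "/home/majiahui/project/models-llm/longformer-chinese-base-4096"

# AutoConfig reads config.json and dispatches on its model_type field,
# so the same three calls work for BERT, Longformer, and other architectures.
config = AutoConfig.from_pretrained(path, num_labels=2)  # num_labels=2 assumed
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(path, config=config)

print(type(model).__name__)  # e.g. LongformerForSequenceClassification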
@@ -530,20 +529,57 @@ def main():
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

     def preprocess_function(examples):
-        print(examples)
+        print(1)
+        print("1")
+        # print("examples[sentence1_key]", examples[sentence1_key])
+        # print("len(examples[sentence1_key])", len(examples[sentence1_key]))
+        # print("padding", padding)
         # Tokenize the texts
         args = (
             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
         )
         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
+        # result = tokenizer_ulit(tokenizer, examples[sentence1_key], padding, max_seq_length)

         # Map labels to IDs (not necessary for GLUE tasks)
         if label_to_id is not None and "label" in examples:
             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
+        # Token sequence pairs to look for
+        sequence_pairs = [
+            ([133, 10906, 135], [133, 9931, 135])  # the first sequence and the second sequence
+        ]
+        # Process each sample
+        for i in range(len(result['input_ids'])):
+            input_ids = result['input_ids'][i]
+            token_type_ids = result['token_type_ids'][i]
+            # Handle each sequence pair
+            for seq1, seq2 in sequence_pairs:
+                seq1_len = len(seq1)
+                seq2_len = len(seq2)
+                # Find every occurrence of the first sequence
+                seq1_positions = []
+                for j in range(len(input_ids) - seq1_len + 1):
+                    if input_ids[j:j + seq1_len] == seq1:
+                        seq1_positions.append(j)
+                # Find every occurrence of the second sequence
+                seq2_positions = []
+                for j in range(len(input_ids) - seq2_len + 1):
+                    if input_ids[j:j + seq2_len] == seq2:
+                        seq2_positions.append(j)
+                # Handle every matched pair of occurrences
+                for pos1 in seq1_positions:
+                    for pos2 in seq2_positions:
+                        if pos1 < pos2:  # the first sequence must come before the second
+                            # Set token_type_ids to 1 between the two sequences (inclusive of both)
+                            start_idx = pos1
+                            end_idx = pos2 + seq2_len
+                            for k in range(start_idx, end_idx):
+                                token_type_ids[k] = 1
+            # Write the updated token_type_ids back
+            result['token_type_ids'][i] = token_type_ids
         return result

     with training_args.main_process_first(desc="dataset map pre-processing"):
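
Note on the hunk above: the added loop rewrites token_type_ids so that everything from an opening marker sequence ([133, 10906, 135]) through a closing marker sequence ([133, 9931, 135]) gets segment id 1; the segment embedding is effectively repurposed to flag a marked span, in line with the commit message about changing the position/segment encoding. A standalone sketch of the same logic (not part of the commit; the toy token IDs below are made up for illustration):

def mark_span_token_types(input_ids, token_type_ids, seq1, seq2):
    # Set token_type_ids to 1 from the start of each seq1 occurrence
    # through the end of every later seq2 occurrence, as in the commit.
    def find_all(ids, pattern):
        n = len(pattern)
        return [j for j in range(len(ids) - n + 1) if ids[j:j + n] == pattern]

    for pos1 in find_all(input_ids, seq1):
        for pos2 in find_all(input_ids, seq2):
            if pos1 < pos2:  # seq1 must appear before seq2
                for k in range(pos1, pos2 + len(seq2)):
                    token_type_ids[k] = 1
    return token_type_ids

# Toy example with made-up IDs standing in for real vocabulary entries.
ids = [101, 7, 133, 10906, 135, 8, 9, 133, 9931, 135, 102]
tt = [0] * len(ids)
print(mark_span_token_types(ids, tt, [133, 10906, 135], [133, 9931, 135]))
# -> [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]

One caveat: whether segment id 1 is actually usable depends on the loaded checkpoint's type_vocab_size; some Longformer checkpoints only embed token type 0.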

run_train.sh (14 lines changed)

@@ -1,11 +1,11 @@
 python run_glue.py \
-    --model_name_or_path chinese_bert_wwm_ext_pytorch \
-    --train_file data/train_data_weipu.csv \
-    --validation_file data/dev_data_weipu.csv \
+    --model_name_or_path /home/majiahui/project/models-llm/longformer-chinese-base-4096 \
+    --train_file data/long_paper_train_3_1.csv \
+    --validation_file data/long_paper_dev_3_1.csv \
     --do_train \
     --do_eval \
-    --max_seq_length 512 \
-    --per_device_train_batch_size 32 \
+    --max_seq_length 4096 \
+    --per_device_train_batch_size 4 \
     --learning_rate 2e-5 \
-    --num_train_epochs 5 \
-    --output_dir aigc_check
+    --num_train_epochs 1 \
+    --output_dir long_paper_ceshi
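
Note on this file: the script now trains on a Longformer checkpoint with a 4096-token position budget, so --max_seq_length rises from 512 to 4096, while the per-device batch size drops from 32 to 4, presumably to fit the longer sequences in memory. A quick standalone check that the checkpoint and tokenizer actually support the new length (not part of the commit):

from transformers import AutoConfig, AutoTokenizer

path = "/home/majiahui/project/models-llm/longformer-chinese-base-4096"
config = AutoConfig.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

# run_glue.py clamps max_seq_length to tokenizer.model_max_length, so both
# values below should be at least 4096 for --max_seq_length 4096 to take effect.
print(config.max_position_embeddings, tokenizer.model_max_length)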