@@ -22,7 +22,7 @@ import re

 os.environ["WANDB_DISABLED"] = "true"

 # Set the CUDA device
-os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

 import logging
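
Note: the old value '2' pinned the process to the third GPU (the variable is zero-indexed), while '-1' hides every GPU from PyTorch and forces the run onto the CPU. The variable must be set before torch initializes CUDA. A minimal standalone sketch to verify the effect (not part of the patch):

    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # must be set before the first CUDA call

    import torch
    print(torch.cuda.is_available())  # False: every GPU is hidden
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)  # cpu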

@@ -307,7 +307,6 @@ def main():
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
     send_example_telemetry("run_glue", model_args, data_args)
     print("model_args", model_args)
-    9/0

     # Setup logging
     logging.basicConfig(
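
The deleted 9/0 was a deliberate ZeroDivisionError, used as a crude breakpoint to abort the script right after argument parsing; the print of model_args stays. If an intentional early stop is still wanted, an explicit exit reads better (a hedged alternative, not part of the patch):

    import sys
    sys.exit("stopping after argument parsing for debugging")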

@@ -455,7 +454,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
     )
-    tokenizer = BertTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
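
Reverting BertTokenizer to AutoTokenizer restores the script's model-agnostic behavior: AutoTokenizer reads the checkpoint's config and instantiates the matching tokenizer class (returning the fast Rust-backed variant when use_fast is true), whereas a hard-coded BertTokenizer would break for non-BERT checkpoints. A minimal sketch of the dispatch (the checkpoint name is illustrative):

    from transformers import AutoTokenizer

    # The returned class depends on the checkpoint's config
    tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    print(type(tok).__name__)  # BertTokenizerFast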

@@ -463,7 +462,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
     )
-    model = BertForSequenceClassification.from_pretrained(
+    model = AutoModelForSequenceClassification.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
         config=config,
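
The same reasoning applies to the model: AutoModelForSequenceClassification picks the architecture-specific classification head that matches the config built earlier, and from_tf=True converts a TensorFlow checkpoint (a path containing ".ckpt") on the fly. A minimal standalone sketch (checkpoint name and label count are illustrative):

    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,  # must match the task's label set
    )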

@@ -530,20 +529,57 @@ def main():
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

     def preprocess_function(examples):
         # Tokenize the texts
         args = (
             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
         )
         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

         # Map labels to IDs (not necessary for GLUE tasks)
         if label_to_id is not None and "label" in examples:
             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]

+        # Token-ID sequence pairs to search for
+        sequence_pairs = [
+            ([133, 10906, 135], [133, 9931, 135])  # first sequence, second sequence
+        ]
+
+        # Process each sample
+        for i in range(len(result['input_ids'])):
+            input_ids = result['input_ids'][i]
+            token_type_ids = result['token_type_ids'][i]
+
+            # Handle each sequence pair
+            for seq1, seq2 in sequence_pairs:
+                seq1_len = len(seq1)
+                seq2_len = len(seq2)
+
+                # Find every occurrence of the first sequence
+                seq1_positions = []
+                for j in range(len(input_ids) - seq1_len + 1):
+                    if input_ids[j:j + seq1_len] == seq1:
+                        seq1_positions.append(j)
+
+                # Find every occurrence of the second sequence
+                seq2_positions = []
+                for j in range(len(input_ids) - seq2_len + 1):
+                    if input_ids[j:j + seq2_len] == seq2:
+                        seq2_positions.append(j)
+
+                # Handle every matched pair of occurrences
+                for pos1 in seq1_positions:
+                    for pos2 in seq2_positions:
+                        if pos1 < pos2:  # the first sequence must precede the second
+                            # Set token_type_ids to 1 between the two sequences, markers included
+                            start_idx = pos1
+                            end_idx = pos2 + seq2_len
+
+                            for k in range(start_idx, end_idx):
+                                token_type_ids[k] = 1
+
+            # Write the updated token_type_ids back
+            result['token_type_ids'][i] = token_type_ids
         return result

     with training_args.main_process_first(desc="dataset map pre-processing"):
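
The added block post-processes the tokenizer output: for every sample it scans input_ids for an opening marker sequence ([133, 10906, 135]) and a later closing marker sequence ([133, 9931, 135]), then sets token_type_ids to 1 across the enclosed span, markers included, so that BERT-style segment embeddings single out the marked region. The token IDs are vocabulary-specific and presumably encode special marker tokens in the author's tokenizer; note also that the nested position loops are quadratic in the number of matches, and that only models which actually consume token_type_ids (BERT-style, not RoBERTa-style) see any effect. A self-contained sketch of the same logic on a toy input (all IDs made up):

    def find_subsequence(ids, seq):
        """Return every start index at which seq occurs inside ids."""
        n = len(seq)
        return [j for j in range(len(ids) - n + 1) if ids[j:j + n] == seq]

    def mark_span(input_ids, token_type_ids, seq1, seq2):
        """Set token_type_ids to 1 from each seq1 match through the end
        of every later seq2 match, both markers included."""
        for pos1 in find_subsequence(input_ids, seq1):
            for pos2 in find_subsequence(input_ids, seq2):
                if pos1 < pos2:
                    for k in range(pos1, pos2 + len(seq2)):
                        token_type_ids[k] = 1
        return token_type_ids

    input_ids = [101, 7, 133, 10906, 135, 42, 43, 133, 9931, 135, 102]
    token_type_ids = [0] * len(input_ids)
    print(mark_span(input_ids, token_type_ids,
                    [133, 10906, 135], [133, 9931, 135]))
    # -> [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]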