From 771f454ff1deee4929927c58feab7dcd3b854f9c Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Sat, 3 Jun 2023 18:19:01 +0800
Subject: [PATCH] use low_cpu_mem_usage to speed up loading

---
 src/export_model.py |  2 +-
 src/utils/common.py | 24 ++++++++++++++++--------
 src/utils/config.py | 10 +++++++---
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/export_model.py b/src/export_model.py
index 2d977ae..f202401 100644
--- a/src/export_model.py
+++ b/src/export_model.py
@@ -13,7 +13,7 @@ def main():
     model_args, training_args, finetuning_args = parser.parse_args_into_dataclasses()
     model, tokenizer = load_pretrained(model_args, finetuning_args)
 
-    model.save_pretrained(training_args.output_dir, max_shard_size="1GB")
+    model.save_pretrained(training_args.output_dir, max_shard_size="10GB")
     tokenizer.save_pretrained(training_args.output_dir)
 
     print("model and tokenizer have been saved at:", training_args.output_dir)
diff --git a/src/utils/common.py b/src/utils/common.py
index 65ae293..66fa4c3 100644
--- a/src/utils/common.py
+++ b/src/utils/common.py
@@ -143,15 +143,24 @@ def load_pretrained(
     assert stage in ["pt", "sft"] or finetuning_args.finetuning_type == "lora", \
         "RM and PPO training can only be performed with LoRA method."
 
+    config_kwargs = {
+        "trust_remote_code": True,
+        "cache_dir": model_args.cache_dir,
+        "revision": model_args.model_revision,
+        "use_auth_token": True if model_args.use_auth_token else None,
+    }
+
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.model_name_or_path,
         use_fast=model_args.use_fast_tokenizer,
-        padding_side="left"
+        padding_side="left",
+        **config_kwargs
     )
     tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id # set as the <unk> token
 
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+
     # Quantization configurations (using bitsandbytes library).
-    config_kwargs = {}
     if model_args.quantization_bit is not None:
         assert model_args.quantization_bit == 8, "We only accept 8-bit quantization."
         require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.1")
@@ -162,23 +171,19 @@ def load_pretrained(
         config_kwargs["load_in_8bit"] = True
         config_kwargs["device_map"] = "auto" # it should not be specified outside of load_in_8bit
 
-        logger.info("Quantized model to {} bit.".format(model_args.quantization_bit))
-
-    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+        logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))
 
     # Load and prepare pretrained models (without valuehead).
     model = AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
         config=config,
         torch_dtype=torch.float16, # the model weights are float16 type
+        low_cpu_mem_usage=True,
         **config_kwargs
     )
     model = prepare_model_for_training(model) if is_trainable else model
     model = init_adapter(model, model_args, finetuning_args, is_trainable)
 
-    if not is_trainable:
-        model.requires_grad_(False) # fix all model params
-
     if stage == "rm" or stage == "ppo": # add value head
         model = AutoModelForCausalLMWithValueHead.from_pretrained(model)
 
@@ -194,6 +199,9 @@ def load_pretrained(
     if model_args.quantization_bit is not None:
         model._is_int8_training_enabled = True
 
+    if not is_trainable:
+        model.requires_grad_(False) # fix all model params
+
     print_trainable_params(model)
 
     return model, tokenizer
diff --git a/src/utils/config.py b/src/utils/config.py
index 98d5907..deb13ad 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -38,13 +38,17 @@ class ModelArguments:
         metadata={"help": "Where to store the pretrained models downloaded from huggingface.co."}
     )
     use_fast_tokenizer: Optional[bool] = field(
-        default=True,
+        default=False,
         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}
     )
     use_auth_token: Optional[bool] = field(
         default=False,
         metadata={"help": "Will use the token generated when running `huggingface-cli login`."}
     )
+    model_revision: Optional[str] = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}
+    )
     quantization_bit: Optional[int] = field(
         default=None,
         metadata={"help": "The number of bits to quantize the model."}
@@ -59,7 +63,7 @@ class ModelArguments:
     )
     checkpoint_dir: Optional[str] = field(
         default=None,
-        metadata={"help": "Path to the directory containing the model checkpoints as well as the configurations."}
+        metadata={"help": "Path to the directory(s) containing the delta model checkpoints as well as the configurations."}
     )
     reward_model: Optional[str] = field(
         default=None,
@@ -75,7 +79,7 @@ class ModelArguments:
     )
 
     def __post_init__(self):
-        if self.checkpoint_dir is not None: # support merging lora weights
+        if self.checkpoint_dir is not None: # support merging multiple lora weights
             self.checkpoint_dir = [cd.strip() for cd in self.checkpoint_dir.split(",")]
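
Note: after this patch, the shared config_kwargs dict feeds all three from_pretrained calls (tokenizer, config, model), and low_cpu_mem_usage=True lets transformers initialize the model on the meta device and materialize weights shard by shard instead of first allocating a full randomly-initialized copy. A minimal standalone sketch of the resulting load path (the model id and flag values below are illustrative placeholders, not taken from this patch):

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    config_kwargs = {
        "trust_remote_code": True,
        "cache_dir": None,       # default HF cache location (--cache_dir)
        "revision": "main",      # branch, tag or commit id (--model_revision)
        "use_auth_token": None,  # or True after `huggingface-cli login`
    }

    tokenizer = AutoTokenizer.from_pretrained(
        "huggyllama/llama-7b",   # hypothetical model id, for illustration only
        use_fast=False,
        padding_side="left",
        **config_kwargs
    )
    config = AutoConfig.from_pretrained("huggyllama/llama-7b", **config_kwargs)
    model = AutoModelForCausalLM.from_pretrained(
        "huggyllama/llama-7b",
        config=config,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,  # skip the random init, load shard by shard
        **config_kwargs
    )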
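
Note: the reworded checkpoint_dir help string reflects that several delta (e.g. LoRA) checkpoints can be merged in order when passed as one comma-separated value, which __post_init__ splits and trims. A quick sketch of that normalization (the directory names here are made up):

    checkpoint_dir = "ckpt/lora-step1, ckpt/lora-step2"
    checkpoint_dirs = [cd.strip() for cd in checkpoint_dir.split(",")]
    assert checkpoint_dirs == ["ckpt/lora-step1", "ckpt/lora-step2"]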