from typing import List, Tuple, Union

from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

from vllm.logger import init_logger

logger = init_logger(__name__)

# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"


def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via Huggingface."""
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    # if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
    #     logger.info(
    #         "For some LLaMA-based models, initializing the fast tokenizer may "
    #         "take a long time. To eliminate the initialization time, consider "
    #         f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
    #         "tokenizer.")
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args,
                                                  **kwargs)
    except TypeError as e:
        # The LLaMA tokenizer causes a protobuf error in some environments.
        err_msg = (
            "Failed to load the tokenizer. If you are using a LLaMA-based "
            f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
            "tokenizer.")
        raise RuntimeError(err_msg) from e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead.")
    return tokenizer


def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    prev_output_tokens: List[str],
    new_token_id: int,
    skip_special_tokens: bool,
) -> Tuple[str, str]:
    """Detokenizes the new token in conjunction with the previous output
    tokens.

    NOTE: This function does not update prev_output_tokens.

    Returns:
        new_token: The new token as a string.
        output_text: The new output text as a string.
    """
    new_token = tokenizer.convert_ids_to_tokens(
        new_token_id, skip_special_tokens=skip_special_tokens)
    output_tokens = prev_output_tokens + [new_token]

    # Convert the tokens to a string.
    # Optimization: If the tokenizer does not have `added_tokens_encoder`,
    # then we can directly use `convert_tokens_to_string`.
    if not getattr(tokenizer, "added_tokens_encoder", {}):
        output_text = tokenizer.convert_tokens_to_string(output_tokens)
        return new_token, output_text

    # Adapted from
    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE(woosuk): The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts = []
    current_sub_text = []
    for token in output_tokens:
        # `token` is a string here, so it must be compared against the special
        # token strings (not the integer ids in `all_special_ids`).
        if skip_special_tokens and token in tokenizer.all_special_tokens:
            continue
        if token in tokenizer.added_tokens_encoder:
            if current_sub_text:
                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
                sub_texts.append(sub_text)
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
        sub_texts.append(sub_text)
    output_text = " ".join(sub_texts)
    return new_token, output_text
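

if __name__ == "__main__":
    # Illustrative sketch, not part of vLLM's public API: drive
    # detokenize_incrementally() token by token, the way a caller would while
    # streaming a generation. Assumes the "gpt2" tokenizer is reachable on the
    # Hugging Face Hub; any other model name works the same way. Here the ids
    # of an encoded prompt stand in for generated token ids.
    demo_tokenizer = get_tokenizer("gpt2")
    token_ids = demo_tokenizer.encode("Incremental detokenization demo.")

    prev_tokens: List[str] = []
    text = ""
    for token_id in token_ids:
        new_token, text = detokenize_incrementally(
            demo_tokenizer,
            prev_output_tokens=prev_tokens,
            new_token_id=token_id,
            skip_special_tokens=True,
        )
        # The caller appends the new token itself, since
        # detokenize_incrementally() does not mutate prev_output_tokens.
        prev_tokens.append(new_token)
    print(text)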