from typing import List, Tuple, Union

from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

from vllm.logger import init_logger

logger = init_logger(__name__)

# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"

def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via Hugging Face."""
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    # if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
    #     logger.info(
    #         "For some LLaMA-based models, initializing the fast tokenizer may "
    #         "take a long time. To eliminate the initialization time, consider "
    #         f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
    #         "tokenizer.")
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args,
                                                  **kwargs)
    except TypeError as e:
        # The LLaMA tokenizer causes a protobuf error in some environments.
        err_msg = (
            "Failed to load the tokenizer. If you are using a LLaMA-based "
            f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
            "tokenizer.")
        raise RuntimeError(err_msg) from e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead.")
    return tokenizer

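# Example usage (an illustrative sketch; the model name below is an
# assumption for demonstration, not something this module depends on):
#
#   tokenizer = get_tokenizer("facebook/opt-125m")
#   slow_tokenizer = get_tokenizer("facebook/opt-125m", tokenizer_mode="slow")
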
def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    prev_output_tokens: List[str],
    new_token_id: int,
    skip_special_tokens: bool,
) -> Tuple[str, str]:
    """Detokenizes the new token in conjunction with the previous output tokens.

    NOTE: This function does not update prev_output_tokens.

    Returns:
        new_token: The new token as a string.
        output_text: The new output text as a string.
    """
    new_token = tokenizer.convert_ids_to_tokens(
        new_token_id, skip_special_tokens=skip_special_tokens)
    output_tokens = prev_output_tokens + [new_token]

    # Convert the tokens to a string.
    # Optimization: If the tokenizer does not have `added_tokens_encoder`,
    # then we can directly use `convert_tokens_to_string`.
    if not getattr(tokenizer, "added_tokens_encoder", {}):
        output_text = tokenizer.convert_tokens_to_string(output_tokens)
        return new_token, output_text

    # Adapted from
    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE(woosuk): The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts = []
    current_sub_text = []
    for token in output_tokens:
        # NOTE: output_tokens holds token strings, so the membership check
        # must be against the special token strings (`all_special_tokens`),
        # not the integer ids (`all_special_ids`), which would never match.
        if skip_special_tokens and token in tokenizer.all_special_tokens:
            continue
        if token in tokenizer.added_tokens_encoder:
            if current_sub_text:
                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
                sub_texts.append(sub_text)
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
        sub_texts.append(sub_text)
    output_text = " ".join(sub_texts)
    return new_token, output_text
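
# Example: a streaming detokenization loop (a sketch; `sampled_token_ids` is
# a hypothetical iterable of token ids produced by the model). Because
# detokenize_incrementally does not update prev_output_tokens, the caller is
# responsible for appending each new_token after the call:
#
#   tokenizer = get_tokenizer("facebook/opt-125m")
#   prev_output_tokens: List[str] = []
#   output_text = ""
#   for new_token_id in sampled_token_ids:
#       new_token, output_text = detokenize_incrementally(
#           tokenizer, prev_output_tokens, new_token_id,
#           skip_special_tokens=True)
#       prev_output_tokens.append(new_token)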