https://huggingface.co/docs/tokenizers/api/added-tokens
from rich import print
from transformers import AutoTokenizer
def add_new_tokens(load_path: str, save_path: str, new_tokens: list[str]):
    """Load a tokenizer, register *new_tokens* as added tokens, and persist it.

    Args:
        load_path: directory (or hub id) to load the tokenizer from.
        save_path: directory the updated tokenizer files are written to.
        new_tokens: token strings to append to the vocabulary.
    """
    tok = AutoTokenizer.from_pretrained(load_path)
    tok.add_tokens(new_tokens)
    tok.save_pretrained(save_path)
def update_tokenizer(model_path: str):
    """Add the four tool-call marker tokens to the tokenizer at *model_path*.

    The tokenizer is updated in place: load path and save path are the same.
    """
    markers = [
        "<|call_start|>",
        "<|call_end|>",
        "<|toolcall_start|>",
        "<|toolcall_end|>",
    ]
    add_new_tokens(model_path, model_path, markers)
def test_tokenizer(model_path: str):
    """Smoke-test the updated tokenizer: show how the new markers tokenize/encode."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    samples = (
        "hello <|call_start|> world <|call_end|>",
        "hello <|toolcall_start|> world <|toolcall_end|>",
    )
    for text in samples:
        # Each marker should surface as a single added token with its own id.
        print(text, tokenizer.tokenize(text), tokenizer.encode(text))
if __name__ == "__main__":
    # Update and verify the same checkpoint's tokenizer.
    ckpt = "/mnt/jfs6/model_ok/qwen2vl-0811-1/checkpoint-6400"
    update_tokenizer(ckpt)
    test_tokenizer(ckpt)
added_tokens.json 改变了:
root@hello-9tgxb-1867836-worker-0:/mnt/jfs6/model_ok/qwen2vl-0811-1/checkpoint-6400# cat added_tokens.json
{
"<|box_end|>": 151649,
"<|box_start|>": 151648,
"<|call_end|>": 151660,
"<|call_start|>": 151659,
"<|endoftext|>": 151643,
"<|im_end|>": 151645,
"<|im_start|>": 151644,
"<|image_pad|>": 151655,
"<|object_ref_end|>": 151647,
"<|object_ref_start|>": 151646,
"<|quad_end|>": 151651,
"<|quad_start|>": 151650,
"<|toolcall_end|>": 151658,
"<|toolcall_start|>": 151657,
"<|video_pad|>": 151656,
"<|vision_end|>": 151653,
"<|vision_pad|>": 151654,
"<|vision_start|>": 151652
}
tokenizer.json 和 tokenizer_config.json 也相应更新
本文作者:Dong
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC。本作品采用《知识共享署名-非商业性使用 4.0 国际许可协议》进行许可。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。 许可协议。转载请注明出处!