tokenizer_config.json

分析 tokenizer_config.json 文件的设计，以及如何添加特殊token。结合代码来详细讲解 tokenizer_config.json 的设计原理以及如何添加特殊token。

tokenizer_config.json 的设计原理

tokenizer_config.json 是 HuggingFace Transformers 库中用于配置分词器的核心文件。从你的 Qwen2-VL 模型的配置文件中，我可以看到它包含了以下关键部分：

1. 特殊token的定义结构

json
展开代码
{
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false, 
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    // ... 更多token
  },
  "additional_special_tokens": ["<|im_start|>", "<|im_end|>", ...],
  // ... 其他配置
}

2. 各个字段的含义

added_tokens_decoder: 定义了每个特殊token的详细属性
- content: token的实际内容
- lstrip/rstrip: 匹配该token时，是否一并吞掉其左侧/右侧紧邻的空白字符
- normalized: 匹配该token之前，是否先对输入文本做标准化（normalization）处理
- single_word: 是否只在该token作为独立单词出现时才匹配（出现在其他单词内部时不匹配）
- special: 标记为特殊token
additional_special_tokens: 额外的特殊token列表
chat_template: 定义对话格式的Jinja2模板
tokenizer_class: 指定使用的分词器类
model_max_length: 模型的最大序列长度

Transformers 库的加载过程

根据我查看的源码，transformers 库的加载过程如下：

1. 配置文件加载

python
展开代码
# Excerpt from tokenization_utils_base.py
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# from_pretrained first tries to load the config file; when one was resolved,
# its parsed JSON content becomes the initial init_kwargs
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
if tokenizer_config_file is not None:
    with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
        init_kwargs = json.load(tokenizer_config_handle)

2. 特殊token的处理

从源码中可以看到，在 _from_pretrained 方法中，有专门的逻辑处理 additional_special_tokens：

python
展开代码
# 第2203-2211行的关键代码
elif key == "additional_special_tokens" and isinstance(value, list):
    additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
    for token in value:
        if isinstance(token, dict):
            token["special"] = True
            token = AddedToken(**token)
        if token not in additional_special_tokens:
            additional_special_tokens.append(token)
    value = additional_special_tokens

3. LLaMA-Factory 中的 qwen2vl 模板

在 LLaMA-Factory 中，qwen2vl 模板使用了这些特殊token：

python
展开代码
# Excerpt from template.py, line 1705
register_template(
    name="qwen2_vl",
    # A user turn is wrapped in <|im_start|>user ... <|im_end|> and immediately opens the assistant turn
    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
    # An assistant turn is closed with <|im_end|>
    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
    stop_words=["<|im_end|>"],
    # Placeholder tokens handed to the multimodal plugin for images and videos
    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
)

如何添加新的特殊token

现在让我演示如何添加 <toolcalls> 作为新的特殊token：

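方法1：修改 tokenizer_config.json

做法是在 added_tokens_decoder 中为新token追加一条记录（以下一个可用的ID作为键），并把该token加入 additional_special_tokens 列表；下方脚本中的 modify_tokenizer_config_directly 函数就是按这种方式处理的。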

方法2：使用 Python 代码添加特殊token

python
展开代码
#!/usr/bin/env python3
"""
演示如何向 tokenizer 添加特殊token的示例脚本
"""

from transformers import AutoTokenizer
import json

def add_special_tokens_to_tokenizer(
    model_path,
    new_tokens=None,
    output_path="./updated_tokenizer",
):
    """
    Load a tokenizer, register new special tokens on it, and save the result.

    Args:
        model_path: Location handed to ``AutoTokenizer.from_pretrained``.
        new_tokens: One token string or an iterable of token strings to add.
            Defaults to ``["<toolcalls>"]``.
        output_path: Directory handed to ``save_pretrained``.

    Returns:
        The tokenizer with the new tokens registered.
    """
    # Build the token list per call so no default list is shared between calls.
    if new_tokens is None:
        new_tokens = ["<toolcalls>"]
    elif isinstance(new_tokens, str):
        new_tokens = [new_tokens]
    else:
        # Drop repeated entries while keeping the caller's order.
        new_tokens = list(dict.fromkeys(new_tokens))

    print(f"正在从 {model_path} 加载 tokenizer...")

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    print(f"原始词汇表大小: {len(tokenizer)}")
    print(f"现有的特殊token: {tokenizer.special_tokens_map}")

    # By default add_special_tokens replaces the tokenizer's
    # additional_special_tokens list with the one passed in (see the
    # `replace_additional_special_tokens` parameter in the Transformers docs).
    # Passing only the new tokens would therefore drop entries such as
    # <|im_start|> from that list, so the existing entries are sent along.
    existing_tokens = list(getattr(tokenizer, "additional_special_tokens", None) or [])
    merged_tokens = list(dict.fromkeys([*existing_tokens, *new_tokens]))

    # The return value counts only tokens that were not in the vocabulary yet.
    num_added_tokens = tokenizer.add_special_tokens(
        {"additional_special_tokens": merged_tokens}
    )

    print(f"成功添加 {num_added_tokens} 个新token: {new_tokens}")
    print(f"新的词汇表大小: {len(tokenizer)}")

    # Show the ID assigned to each new token.
    for token in new_tokens:
        token_id = tokenizer.convert_tokens_to_ids(token)
        print(f"'{token}' 的 ID: {token_id}")

    # Sanity check: each new token should come back as a single piece.
    test_text = f"这是一个测试 {' '.join(new_tokens)} 文本"
    tokens = tokenizer.tokenize(test_text)
    print(f"测试文本分词结果: {tokens}")

    tokenizer.save_pretrained(output_path)
    print(f"更新后的tokenizer已保存到: {output_path}")

    return tokenizer

def modify_tokenizer_config_directly(config_path, new_tokens=None):
    """
    Add special tokens to a tokenizer_config.json file in place.

    The untouched file content is first copied to ``config_path + ".backup"``
    (an existing backup is kept, so the earliest copy survives repeated runs);
    the updated configuration is then written back to ``config_path``.

    Args:
        config_path: Path of the tokenizer_config.json file to edit.
        new_tokens: One token string or a list of token strings to add.
            Defaults to ``["<toolcalls>"]``.

    Returns:
        The updated configuration as a dict.
    """
    print(f"正在修改 {config_path}...")

    # Build the token list per call so no default list is shared between calls.
    if new_tokens is None:
        new_tokens = ["<toolcalls>"]
    elif isinstance(new_tokens, str):
        new_tokens = [new_tokens]

    # Keep the raw text so the backup is a byte-faithful copy of the original.
    with open(config_path, 'r', encoding='utf-8') as f:
        original_text = f.read()
    config = json.loads(original_text)

    if config.get("added_tokens_decoder") is None:
        config["added_tokens_decoder"] = {}
    decoder = config["added_tokens_decoder"]

    # Next free ID: one past the largest registered ID. The fallback 151657 is
    # presumably the first ID after Qwen2-VL's stock added tokens (the sample
    # config starts them at 151643) -- verify before using with other models.
    existing_ids = [int(key) for key in decoder]
    next_id = max(existing_ids) + 1 if existing_ids else 151657

    # Token strings that already own an ID must not be given a second one.
    known_contents = {
        entry.get("content") for entry in decoder.values() if isinstance(entry, dict)
    }

    added = []  # (token, token_id) pairs actually created by this call
    for token in new_tokens:
        if token in known_contents:
            continue
        decoder[str(next_id)] = {
            "content": token,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
            "special": True,
        }
        known_contents.add(token)
        added.append((token, next_id))
        next_id += 1

    # A missing or null list is replaced by an empty one before appending.
    if config.get("additional_special_tokens") is None:
        config["additional_special_tokens"] = []
    special_tokens = config["additional_special_tokens"]
    for token in new_tokens:
        if token not in special_tokens:
            special_tokens.append(token)

    # Back up the ORIGINAL content. Mode 'x' refuses to overwrite, so a backup
    # from an earlier run is never replaced by an already-modified file.
    backup_path = config_path + ".backup"
    try:
        with open(backup_path, 'x', encoding='utf-8') as f:
            f.write(original_text)
        print(f"配置已备份到: {backup_path}")
    except FileExistsError:
        print(f"备份已存在，保留原备份: {backup_path}")

    # Write the updated configuration to the real config file.
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    print(f"新token已添加:")
    for token, token_id in added:
        print(f"  '{token}' -> ID: {token_id}")

    return config

def demonstrate_usage():
    """Walk through registering four tool-call special tokens via the config file."""

    # The special tokens this demo registers
    tokens_to_register = [
        "<|toolcall_start|>",
        "<|toolcall_end|>",
        "<|call_start|>",
        "<|call_end|>",
    ]
    separator = "=" * 50

    # Example 1 (disabled): register the tokens through the tokenizer API.
    # Point model_path at a real checkpoint before enabling it.
    # print("=" * 50)
    # print("方法1: 使用 Python 代码添加多个特殊token")
    # print("=" * 50)
    # model_path = "/mnt/jfs/model/Qwen/Qwen2-VL-2B-Instruct"
    # try:
    #     tokenizer = add_special_tokens_to_tokenizer(model_path, new_tokens)
    # except Exception as e:
    #     print(f"加载模型失败: {e}")
    #     print("请确保模型路径正确")

    # Example 2: edit the config file directly
    print("\n" + separator)
    print("方法2: 直接修改 tokenizer_config.json")
    print(separator)

    target_config = "./tokenizer_config.json"
    try:
        modify_tokenizer_config_directly(target_config, tokens_to_register)
        print("配置修改成功！")
    except Exception as e:
        # Demo boundary: report the failure instead of crashing the script
        print(f"修改配置失败: {e}")


if __name__ == "__main__":
    demonstrate_usage()

重要注意事项

模型参数扩展

添加新的特殊token后，tokenizer 的词汇表会变大；如果新token的ID超出了模型 embedding 矩阵的行数，就必须在训练前扩展模型的 embedding 层：

python
展开代码
# Before training, make sure the embedding matrix can index every tokenizer ID
model = AutoModelForCausalLM.from_pretrained(model_path)
# Only ever grow the matrix: some checkpoints reportedly ship an embedding with
# more rows than len(tokenizer) (padding), and an unconditional resize would
# shrink it and discard those rows.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))

测试token

python
展开代码
#!/usr/bin/env python3
"""
测试 Qwen2-VL-2B-Instruct tokenizer 对特殊token的分词效果
"""

from transformers import AutoTokenizer

def test_tokenizer_special_tokens():
    """Print how the unmodified tokenizer splits, encodes and decodes four candidate tokens."""

    model_path = "/mnt/jfs/model/Qwen/Qwen2-VL-2B-Instruct"
    rule = "=" * 60

    print(rule)
    print("测试 Qwen2-VL-2B-Instruct Tokenizer 特殊Token分词效果")
    print(rule)

    # Load the tokenizer as shipped; give up (after reporting) if that fails
    print(f"正在加载 tokenizer: {model_path}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        print(f"✓ Tokenizer 加载成功，词汇表大小: {len(tokenizer)}")
    except Exception as e:
        print(f"✗ Tokenizer 加载失败: {e}")
        return

    # The four candidate special tokens under test
    candidates = [
        "<|toolcall_start|>",
        "<|toolcall_end|>",
        "<|call_start|>",
        "<|call_end|>",
    ]
    print(f"\n测试的特殊token: {candidates}")

    # Sample sentence with the candidates embedded, separated by spaces
    sample = "这是一个测试 " + " ".join(candidates) + " 文本"
    print(f"\n测试文本: {sample}")

    # Tokenization
    pieces = tokenizer.tokenize(sample)
    print(f"\n分词结果: {pieces}")
    print(f"总token数: {len(pieces)}")

    # A candidate counts as recognized when it appears verbatim among the pieces
    print("\n特殊token识别情况:")
    for candidate in candidates:
        if candidate not in pieces:
            print(f"  ✗ '{candidate}' 被分解为多个token")
            continue
        print(f"  ✓ '{candidate}' 被识别为单个token")

    # Encode without automatically added special tokens, then decode again
    ids = tokenizer.encode(sample, add_special_tokens=False)
    print(f"\nToken IDs: {ids}")

    roundtrip = tokenizer.decode(ids)
    print(f"\n解码结果: {roundtrip}")

    # Round-trip check: decoded text should equal the input text
    print("✓ 编码解码一致" if roundtrip == sample else "✗ 编码解码不一致")

def main():
    """Script entry point: run the special-token tokenization check."""
    test_tokenizer_special_tokens()

# Run the check only when this file is executed as a script
if __name__ == "__main__":
    main()

目录