EasyR1 项目最近更新了不少,这里重新跑一次训练。目标是训练 Qwen2.5-VL 7B,fork 的项目在此:https://github.com/xxddccaa/EasyR1QwenVL
dockerfile:
# Training image for EasyR1 (Qwen2.5-VL fork), built on the verl base image.
FROM hiyouga/verl:ngc-th2.7.1-cu12.6-vllm0.10.0

# Install Chinese language support and fonts, plus the system tools and
# libraries listed below.
RUN apt-get update && apt-get install -y \
    locales \
    fonts-noto-cjk \
    language-pack-zh-hans \
    openssh-server \
    zip \
    unzip \
    tree \
    vim \
    tzdata \
    apt-utils \
    htop \
    tmux \
    curl \
    wget \
    git \
    net-tools \
    libibverbs1 \
    libibverbs-dev \
    libgl1-mesa-glx libglx-mesa0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workplace

# Clone into "EasyR1" explicitly. Without the target directory git would create
# "EasyR1QwenVL", which does not match the WORKDIR below, so the editable
# install would run in an empty directory.
RUN git clone https://github.com/xxddccaa/EasyR1QwenVL.git EasyR1

WORKDIR /workplace/EasyR1

# Install the project in editable mode, then SwanLab, both from the TUNA mirror.
RUN pip install -e . -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
RUN pip install swanlab -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
构建镜像(bash):
docker build -f Dockerfile.qwen2vl . -t hub.i.basemind.com/g-xiedong-fine/easyr1:v1
#!/bin/bash
# Multi-node Ray bootstrap for EasyR1 GRPO training: configuration section.
#
# Environment read here:
#   MASTER_ADDR   host name or IP of the Ray head node
#   MASTER_PORT   port of the Ray head node (required)
#   RANK          rank of this node, 0 is the head node (required)
#   TOTAL_NODES   optional override, defaults to 2
#   GPU_PER_NODE  optional override, defaults to 8
#   SWANLAB_API_KEY / SWANLAB_LOG_DIR / SWANLAB_MODE   optional overrides

# Abort early with a clear message: an empty port or rank would otherwise only
# surface later as a confusing `ray start` or integer-comparison error.
: "${MASTER_PORT:?MASTER_PORT must be set to the Ray head port}"
: "${RANK:?RANK must be set (0 = head node)}"
case "$RANK" in
  *[!0-9]*)
    echo "ERROR: RANK must be a non-negative integer, got '$RANK'" >&2
    exit 1
    ;;
esac

# Copy the training entry script from /config into the EasyR1 examples folder.
# A failure is reported but not fatal at this point.
if ! cp /config/qwen2_5_vl_7b_grpo_train.sh /workplace/EasyR1/examples/; then
  echo "WARNING: could not copy /config/qwen2_5_vl_7b_grpo_train.sh to /workplace/EasyR1/examples/" >&2
fi

# Cluster parameters; the first two can be overridden from the environment.
TOTAL_NODES="${TOTAL_NODES:-2}"    # total number of nodes
GPU_PER_NODE="${GPU_PER_NODE:-8}"  # number of GPUs on each machine
RAY_PORT="${MASTER_PORT}"          # port used by the Ray head node
NODE_RANK="${RANK}"                # rank of this node (0 is the head node)

# Resolve the head node address. Only the first line is used because getent can
# print several entries for one name. If nothing resolves, fall back to
# MASTER_ADDR exactly as given.
RAY_HEAD_IP=$(getent hosts "${MASTER_ADDR}" | awk 'NR == 1 {print $1}')
RAY_HEAD_IP="${RAY_HEAD_IP:-${MASTER_ADDR}}"
if [ -z "$RAY_HEAD_IP" ] && [ "$NODE_RANK" -ne 0 ]; then
  echo "ERROR: MASTER_ADDR is not set; worker nodes need the head node address" >&2
  exit 1
fi

echo "RAY_HEAD_IP: $RAY_HEAD_IP"
echo "NODE_RANK: $NODE_RANK"
echo "TOTAL_NODES: $TOTAL_NODES"
echo "GPU_PER_NODE: $GPU_PER_NODE"
echo "RAY_PORT: $RAY_PORT"

# SwanLab experiment tracking. Values already present in the environment take
# precedence; "xxxxx" is only a placeholder for the API key.
export SWANLAB_API_KEY="${SWANLAB_API_KEY:-xxxxx}"         # API key for online tracking
export SWANLAB_LOG_DIR="${SWANLAB_LOG_DIR:-/swanlab_log}"  # local log directory
# Four modes: cloud (default, online tracking), cloud-only (online tracking
# without local files), local (local tracking), disabled (record nothing, for
# debugging).
export SWANLAB_MODE="${SWANLAB_MODE:-cloud}"
# Start the local Ray process: rank 0 becomes the head node, every other rank
# joins the head as a worker.
case "$NODE_RANK" in
  '' | *[!0-9]*)
    # Without this guard an invalid rank would make the integer test below
    # fail and silently fall through to the worker branch.
    echo "ERROR: NODE_RANK must be a non-negative integer, got '$NODE_RANK'" >&2
    exit 1
    ;;
esac

if [ "$NODE_RANK" -eq 0 ]; then
  echo "Starting Ray HEAD node on $(hostname -I | awk '{print $1}')"
  # Without a running head the later wait loop would spin forever, so stop here.
  if ! ray start --head --port="$RAY_PORT" --dashboard-host=0.0.0.0 --num-gpus="$GPU_PER_NODE"; then
    echo "ERROR: failed to start the Ray head node" >&2
    exit 1
  fi
  sleep 20
else
  # Give the head node a head start before the first connection attempt.
  sleep 10
  echo "Starting Ray WORKER node #$NODE_RANK, connecting to $RAY_HEAD_IP:$RAY_PORT"
  # The head may still be starting; retry a bounded number of times instead of
  # giving up after a single attempt. RAY_JOIN_MAX_ATTEMPTS overrides the limit.
  join_attempt=1
  join_max_attempts="${RAY_JOIN_MAX_ATTEMPTS:-30}"
  until ray start --address="$RAY_HEAD_IP:$RAY_PORT" --num-gpus="$GPU_PER_NODE"; do
    if [ "$join_attempt" -ge "$join_max_attempts" ]; then
      echo "ERROR: could not join Ray head at $RAY_HEAD_IP:$RAY_PORT after $join_attempt attempts" >&2
      exit 1
    fi
    echo "Join attempt $join_attempt failed, retrying in 10 seconds..." >&2
    join_attempt=$((join_attempt + 1))
    sleep 10
  done
fi
# Head node only: wait until every expected node has joined the cluster, then
# launch the training script.
if [ "$NODE_RANK" -eq 0 ]; then
  echo "Waiting for all nodes to join..."
  while true; do
    echo "Checking connected nodes..."
    # Query the cluster once per iteration so the node count and the debug
    # output below come from the same snapshot.
    RAY_STATUS_OUTPUT=$(ray status)
    LIVE_NODES=$(grep -c " 1 node_" <<<"$RAY_STATUS_OUTPUT")
    echo "Current connected nodes: $LIVE_NODES / $TOTAL_NODES"
    # Print the full `ray status` output to help debugging.
    echo "Ray cluster status:"
    printf '%s\n' "$RAY_STATUS_OUTPUT"
    # -ge rather than -eq: an extra node must not make this loop spin forever.
    if [ "$LIVE_NODES" -ge "$TOTAL_NODES" ]; then
      echo "All $TOTAL_NODES nodes connected successfully!"
      break
    fi
    echo "Waiting 5 seconds before next check..."
    sleep 5
  done

  # Launch the training script on the head node.
  echo "Launching training script..."
  echo "Setting up SwanLab environment variables..."
  # The API key is whatever was configured earlier in this script or in the
  # environment. Credentials must never be hard-coded here.
  if [ -z "$SWANLAB_API_KEY" ] || [ "$SWANLAB_API_KEY" = "xxxxx" ]; then
    echo "WARNING: SWANLAB_API_KEY is empty or still the placeholder value" >&2
  fi
  export SWANLAB_API_KEY
  export SWANLAB_LOG_DIR="${SWANLAB_LOG_DIR:-/swanlab_log}"  # local log directory
  # Four modes: cloud (default, online tracking), cloud-only (online tracking
  # without local files), local (local tracking), disabled (record nothing,
  # for debugging).
  export SWANLAB_MODE="${SWANLAB_MODE:-cloud}"

  echo "Starting training with qwen2_5_vl_7b_grpo_train.sh..."
  if cd /workplace/EasyR1 && bash examples/qwen2_5_vl_7b_grpo_train.sh; then
    echo "Training script finished."
  else
    echo "ERROR: training script failed with exit status $?" >&2
  fi
fi

# Keep the script alive so the local Ray process keeps running.
sleep infinity
本文作者:Dong
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC 许可协议。本作品采用《知识共享署名-非商业性使用 4.0 国际许可协议》进行许可。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。转载请注明出处!