fix_nvidia_smi.sh — this script scans the whole system and tries to fix the environment directly:
#!/bin/bash
# ============================================================================
# nvidia-smi and CUDA Driver Library Fix Script
# ============================================================================
set -e
echo "=========================================="
echo "Starting nvidia-smi and CUDA Driver Library Fix"
echo "=========================================="
echo ""
# ============================================================================
# 1. Find nvidia-smi command
# ============================================================================
find_nvidia_smi() {
    local paths=(
        "/usr/bin"
        "/usr/local/bin"
        "/usr/local/cuda/bin"
        "/usr/local/nvidia/bin"
        "/opt/nvidia/bin"
        "/bin"
    )
    for path in "${paths[@]}"; do
        if [ -f "$path/nvidia-smi" ] && [ -x "$path/nvidia-smi" ]; then
            echo "$path/nvidia-smi"
            return 0
        fi
    done
    # Try to find using the find command across the system
    local found_path=$(find /usr /opt /bin -name "nvidia-smi" -type f -executable 2>/dev/null | head -1 || echo "")
    if [ -n "$found_path" ] && [ -f "$found_path" ]; then
        echo "$found_path"
        return 0
    fi
    # Fall back to command -v
    if command -v nvidia-smi >/dev/null 2>&1; then
        command -v nvidia-smi
        return 0
    fi
    return 1
}
echo "Step 1: Finding nvidia-smi command..."
NVIDIA_SMI_PATH=$(find_nvidia_smi || echo "")
if [ -n "$NVIDIA_SMI_PATH" ]; then
echo "[OK] Found nvidia-smi: $NVIDIA_SMI_PATH"
# Add nvidia-smi directory to PATH
NVIDIA_SMI_DIR=$(dirname "$NVIDIA_SMI_PATH")
export PATH="${NVIDIA_SMI_DIR}:${PATH}"
echo " Added $NVIDIA_SMI_DIR to PATH"
else
echo "[FAIL] nvidia-smi command not found"
echo " Trying to find in common paths..."
for path in /usr/bin /usr/local/bin /usr/local/cuda/bin /usr/local/nvidia/bin; do
if [ -d "$path" ]; then
echo " Checking $path: $(ls -la "$path/nvidia-smi" 2>/dev/null || echo 'not found')"
fi
done
fi
echo ""
# ============================================================================
# 2. Find CUDA Driver Library (libcuda.so.1)
# ============================================================================
find_libcuda() {
    local paths=(
        "/usr/lib/x86_64-linux-gnu"
        "/usr/local/cuda/lib64"
        "/usr/lib64"
        "/usr/lib"
        "/lib/x86_64-linux-gnu"
        "/usr/local/nvidia/lib64"
        "/usr/local/nvidia/lib"
        "/opt/nvidia/lib64"
        "/opt/nvidia/lib"
    )
    for path in "${paths[@]}"; do
        if [ -f "$path/libcuda.so.1" ] || [ -f "$path/libcuda.so" ]; then
            echo "$path"
            return 0
        fi
    done
    # Try to find using the find command across the system
    local found_path=$(find /usr /opt /lib -name "libcuda.so.1" 2>/dev/null | head -1 | xargs dirname 2>/dev/null || echo "")
    if [ -n "$found_path" ] && [ -d "$found_path" ]; then
        echo "$found_path"
        return 0
    fi
    # Try to find using ldconfig
    local libcuda_path=$(ldconfig -p 2>/dev/null | grep libcuda.so.1 | head -1 | awk '{print $4}' | xargs dirname 2>/dev/null || echo "")
    if [ -n "$libcuda_path" ] && [ -d "$libcuda_path" ]; then
        echo "$libcuda_path"
        return 0
    fi
    return 1
}
echo "Step 2: Finding CUDA Driver Library (libcuda.so.1)..."
CUDA_LIB_PATH=$(find_libcuda || echo "")
if [ -n "$CUDA_LIB_PATH" ]; then
echo "[OK] Found CUDA driver library path: $CUDA_LIB_PATH"
export LD_LIBRARY_PATH="${CUDA_LIB_PATH}:${LD_LIBRARY_PATH}"
echo " Added $CUDA_LIB_PATH to LD_LIBRARY_PATH"
else
echo "[WARN] libcuda.so.1 not found"
echo " Diagnostic information:"
echo " - Check /usr/lib/x86_64-linux-gnu/libcuda.so*: $(ls /usr/lib/x86_64-linux-gnu/libcuda.so* 2>/dev/null | head -1 || echo 'not found')"
echo " - Check /usr/local/cuda/lib64/libcuda.so*: $(ls /usr/local/cuda/lib64/libcuda.so* 2>/dev/null | head -1 || echo 'not found')"
echo " - Check /usr/local/nvidia/lib64/libcuda.so*: $(ls /usr/local/nvidia/lib64/libcuda.so* 2>/dev/null | head -1 || echo 'not found')"
echo " - libcuda in ldconfig: $(ldconfig -p 2>/dev/null | grep libcuda || echo 'not found')"
echo ""
echo " Trying to use default paths and set environment variables..."
# Try common paths
export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:/usr/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}"
fi
echo ""
# ============================================================================
# 3. Check /dev/nvidia* devices
# ============================================================================
echo "Step 3: Checking /dev/nvidia* devices..."
if ls /dev/nvidia* >/dev/null 2>&1; then
echo "[OK] Found /dev/nvidia* devices:"
ls -la /dev/nvidia* 2>/dev/null | head -5 || true
else
echo "[WARN] /dev/nvidia* devices not found"
echo " This may indicate GPU devices are not properly mounted in container"
fi
echo ""
# ============================================================================
# 4. Set CUDA_HOME (if not set)
# ============================================================================
echo "Step 4: Setting CUDA_HOME..."
if [ -z "$CUDA_HOME" ]; then
if [ -d "/usr/local/cuda" ]; then
export CUDA_HOME="/usr/local/cuda"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
echo "[OK] Set CUDA_HOME=$CUDA_HOME"
elif [ -d "/usr/local/nvidia" ]; then
export CUDA_HOME="/usr/local/nvidia"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
echo "[OK] Set CUDA_HOME=$CUDA_HOME"
else
echo "[WARN] CUDA installation directory not found"
fi
else
echo "[OK] CUDA_HOME already set: $CUDA_HOME"
fi
echo ""
# ============================================================================
# 5. Verify CUDA library accessibility
# ============================================================================
echo "Step 5: Verifying CUDA driver library accessibility..."
CUDA_LIB_OK=false
# The heredoc body below is Python; its indentation is significant
if python3 << 'PYEOF' 2>/dev/null | grep -q "OK"; then
import ctypes
try:
    ctypes.CDLL("libcuda.so.1")
    print("OK")
except OSError:
    print("FAIL")
PYEOF
    CUDA_LIB_OK=true
fi
if [ "$CUDA_LIB_OK" = "true" ]; then
    echo "[OK] CUDA driver library is accessible"
else
    echo "[WARN] CUDA driver library may not be accessible, trying other fix methods..."
    # Try to detect CUDA through PyTorch (if PyTorch can see the GPU, the driver is usable)
    if python3 << 'PYEOF' 2>/dev/null | grep -q "True"; then
import torch
print("PyTorch CUDA available:", torch.cuda.is_available())
PYEOF
        echo "[OK] PyTorch can access CUDA, setting vLLM environment variables to force GPU usage"
        # Set vLLM related environment variables to force GPU usage
        export VLLM_USE_CPU=0
        export VLLM_WORKER_MULTIPROC_METHOD=spawn
        # Set device related environment variables
        export CUDA_DEVICE_ORDER=PCI_BUS_ID
    else
        echo "[FAIL] PyTorch cannot access CUDA either"
    fi
fi
echo ""
# ============================================================================
# 6. Test nvidia-smi command
# ============================================================================
echo "Step 6: Testing nvidia-smi command..."
if command -v nvidia-smi >/dev/null 2>&1; then
echo "[OK] nvidia-smi command is available"
echo ""
echo "Running nvidia-smi test:"
echo "----------------------------------------"
if nvidia-smi >/dev/null 2>&1; then
nvidia-smi
echo "----------------------------------------"
echo "[OK] nvidia-smi ran successfully!"
else
echo "[FAIL] nvidia-smi execution failed"
echo " Error information:"
nvidia-smi 2>&1 || true
fi
else
echo "[FAIL] nvidia-smi command is still not available"
echo " Current PATH: $PATH"
echo " Trying to use full path directly:"
if [ -n "$NVIDIA_SMI_PATH" ] && [ -f "$NVIDIA_SMI_PATH" ]; then
echo " Using: $NVIDIA_SMI_PATH"
"$NVIDIA_SMI_PATH" || true
fi
fi
echo ""
# ============================================================================
# 7. Output environment variables summary
# ============================================================================
echo "=========================================="
echo "Environment Variables Summary"
echo "=========================================="
echo "PATH: $PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
echo "CUDA_HOME: ${CUDA_HOME:-not set}"
echo "NVIDIA_SMI_PATH: ${NVIDIA_SMI_PATH:-not found}"
echo "CUDA_LIB_PATH: ${CUDA_LIB_PATH:-not found}"
echo "=========================================="
echo ""
# ============================================================================
# 8. Generate persistent environment variable setup script
# ============================================================================
ENV_SCRIPT="/tmp/setup_nvidia_env.sh"
cat > "$ENV_SCRIPT" << 'ENVEOF'
#!/bin/bash
# Auto-generated environment variable setup script
# Source this file in training scripts to apply fixes
# Set PATH
if [ -n "$NVIDIA_SMI_DIR" ]; then
export PATH="${NVIDIA_SMI_DIR}:${PATH}"
fi
# Set LD_LIBRARY_PATH
if [ -n "$CUDA_LIB_PATH" ]; then
export LD_LIBRARY_PATH="${CUDA_LIB_PATH}:${LD_LIBRARY_PATH}"
fi
# Set CUDA_HOME
if [ -z "$CUDA_HOME" ] && [ -d "/usr/local/cuda" ]; then
export CUDA_HOME="/usr/local/cuda"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
fi
# Set vLLM related environment variables
export VLLM_USE_CPU=0
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export CUDA_DEVICE_ORDER=PCI_BUS_ID
ENVEOF
# Replace variables in script
if [ -n "$NVIDIA_SMI_DIR" ]; then
sed -i "s|NVIDIA_SMI_DIR|\"$NVIDIA_SMI_DIR\"|g" "$ENV_SCRIPT"
fi
if [ -n "$CUDA_LIB_PATH" ]; then
sed -i "s|CUDA_LIB_PATH|\"$CUDA_LIB_PATH\"|g" "$ENV_SCRIPT"
fi
chmod +x "$ENV_SCRIPT"
echo "[OK] Generated environment variable setup script: $ENV_SCRIPT"
echo " Use in training scripts: source $ENV_SCRIPT"
echo ""
echo "=========================================="
echo "Fix completed!"
echo "=========================================="
echo ""
echo "If nvidia-smi is still not available, please check:"
echo "1. Whether GPU devices (/dev/nvidia*) are properly mounted in container"
echo "2. Whether container image contains NVIDIA driver tools"
echo "3. Whether rjob configuration correctly sets GPU resources"
echo ""
diagnose_nvidia_env.sh — this script diagnoses the environment:
#!/bin/bash
# ============================================================================
# NVIDIA/CUDA Environment Diagnostic Script
# Run this script and provide the output to get hardcoded environment variables
# ============================================================================
echo "=========================================="
echo "NVIDIA/CUDA Environment Diagnostic"
echo "=========================================="
echo ""
echo "Please run this script and provide the complete output."
echo ""
# 1. Find nvidia-smi
echo "=== 1. NVIDIA-SMI LOCATION ==="
NVIDIA_SMI_PATH=""
for path in /usr/local/nvidia/bin /usr/bin /usr/local/bin /usr/local/cuda/bin /opt/nvidia/bin /bin; do
if [ -f "$path/nvidia-smi" ] && [ -x "$path/nvidia-smi" ]; then
NVIDIA_SMI_PATH="$path/nvidia-smi"
NVIDIA_SMI_DIR="$path"
echo "FOUND: $NVIDIA_SMI_PATH"
echo "DIR: $NVIDIA_SMI_DIR"
break
fi
done
if [ -z "$NVIDIA_SMI_PATH" ]; then
echo "NOT FOUND: Searching with find..."
NVIDIA_SMI_PATH=$(find /usr /opt /bin -name "nvidia-smi" -type f -executable 2>/dev/null | head -1 || echo "")
if [ -n "$NVIDIA_SMI_PATH" ]; then
NVIDIA_SMI_DIR=$(dirname "$NVIDIA_SMI_PATH")
echo "FOUND: $NVIDIA_SMI_PATH"
echo "DIR: $NVIDIA_SMI_DIR"
else
echo "NOT FOUND: nvidia-smi not found in system"
fi
fi
echo ""
# 2. Find CUDA driver library
echo "=== 2. CUDA DRIVER LIBRARY (libcuda.so.1) ==="
CUDA_LIB_PATH=""
for path in /usr/local/nvidia/lib64 /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 /usr/lib64 /usr/lib /lib/x86_64-linux-gnu /opt/nvidia/lib64; do
if [ -f "$path/libcuda.so.1" ] || [ -f "$path/libcuda.so" ]; then
CUDA_LIB_PATH="$path"
echo "FOUND: $CUDA_LIB_PATH"
ls -la "$path/libcuda.so"* 2>/dev/null | head -3 || true
break
fi
done
if [ -z "$CUDA_LIB_PATH" ]; then
echo "NOT FOUND: Searching with find..."
CUDA_LIB_PATH=$(find /usr /opt /lib -name "libcuda.so.1" 2>/dev/null | head -1 | xargs dirname 2>/dev/null || echo "")
if [ -n "$CUDA_LIB_PATH" ]; then
echo "FOUND: $CUDA_LIB_PATH"
ls -la "$CUDA_LIB_PATH/libcuda.so"* 2>/dev/null | head -3 || true
else
echo "NOT FOUND: libcuda.so.1 not found"
fi
fi
echo ""
# 2.5. Find libnvidia-ml.so (required by nvidia-smi)
echo "=== 2.5. NVIDIA ML LIBRARY (libnvidia-ml.so) ==="
NVIDIA_ML_LIB_PATH=""
for path in /usr/local/nvidia/lib64 /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 /usr/lib64 /usr/lib /lib/x86_64-linux-gnu /opt/nvidia/lib64; do
if [ -f "$path/libnvidia-ml.so" ] || [ -f "$path/libnvidia-ml.so.1" ]; then
NVIDIA_ML_LIB_PATH="$path"
echo "FOUND: $NVIDIA_ML_LIB_PATH"
ls -la "$path/libnvidia-ml.so"* 2>/dev/null | head -3 || true
break
fi
done
if [ -z "$NVIDIA_ML_LIB_PATH" ]; then
echo "NOT FOUND: Searching with find..."
NVIDIA_ML_LIB_PATH=$(find /usr /opt /lib -name "libnvidia-ml.so*" 2>/dev/null | head -1 | xargs dirname 2>/dev/null || echo "")
if [ -n "$NVIDIA_ML_LIB_PATH" ]; then
echo "FOUND: $NVIDIA_ML_LIB_PATH"
ls -la "$NVIDIA_ML_LIB_PATH/libnvidia-ml.so"* 2>/dev/null | head -3 || true
else
echo "NOT FOUND: libnvidia-ml.so not found (this may cause nvidia-smi to fail)"
fi
fi
echo ""
# 3. Check CUDA_HOME
echo "=== 3. CUDA_HOME ==="
if [ -d "/usr/local/cuda" ]; then
echo "FOUND: /usr/local/cuda"
echo "BIN: /usr/local/cuda/bin"
echo "LIB64: /usr/local/cuda/lib64"
elif [ -d "/usr/local/nvidia" ]; then
echo "FOUND: /usr/local/nvidia"
echo "BIN: /usr/local/nvidia/bin"
echo "LIB64: /usr/local/nvidia/lib64"
else
echo "NOT FOUND: No CUDA installation directory found"
fi
echo ""
# 4. Check GPU devices
echo "=== 4. GPU DEVICES ==="
if ls /dev/nvidia* >/dev/null 2>&1; then
echo "FOUND: /dev/nvidia* devices"
ls -la /dev/nvidia* 2>/dev/null | head -5
else
echo "NOT FOUND: No /dev/nvidia* devices"
fi
echo ""
# 5. Check ldconfig
echo "=== 5. LDCONFIG LIBRARY PATHS ==="
if command -v ldconfig >/dev/null 2>&1; then
echo "libcuda libraries in ldconfig:"
ldconfig -p 2>/dev/null | grep libcuda || echo " No libcuda found in ldconfig"
else
echo "ldconfig command not available"
fi
echo ""
# 6. Current environment variables
echo "=== 6. CURRENT ENVIRONMENT VARIABLES ==="
echo "PATH: $PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
echo "CUDA_HOME: ${CUDA_HOME:-not set}"
echo ""
# 7. Test nvidia-smi if found
echo "=== 7. NVIDIA-SMI TEST ==="
if [ -n "$NVIDIA_SMI_PATH" ] && [ -f "$NVIDIA_SMI_PATH" ]; then
echo "Testing: $NVIDIA_SMI_PATH"
if "$NVIDIA_SMI_PATH" --version >/dev/null 2>&1; then
echo "SUCCESS: nvidia-smi works"
"$NVIDIA_SMI_PATH" --version 2>&1 | head -1
else
echo "FAILED: nvidia-smi found but cannot execute"
"$NVIDIA_SMI_PATH" --version 2>&1 || true
fi
else
echo "SKIPPED: nvidia-smi not found"
fi
echo ""
# 8. Summary
echo "=========================================="
echo "SUMMARY FOR HARDCODING"
echo "=========================================="
echo "NVIDIA_SMI_DIR=${NVIDIA_SMI_DIR:-NOT_FOUND}"
echo "CUDA_LIB_PATH=${CUDA_LIB_PATH:-NOT_FOUND}"
echo "NVIDIA_ML_LIB_PATH=${NVIDIA_ML_LIB_PATH:-NOT_FOUND}"
echo "CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}"
echo "=========================================="
echo ""
echo "Please provide the complete output above to get hardcoded environment variables."
Exporting these environment variables is enough; add the following:
# nvidia-smi path
export PATH="/usr/local/nvidia/bin:${PATH}"
# CUDA driver library path (contains libcuda.so.1 and libnvidia-ml.so)
export LD_LIBRARY_PATH="/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}"
# CUDA installation directory
export CUDA_HOME="/usr/local/cuda"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
# vLLM related environment variables (optional)
export VLLM_USE_CPU=0
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export CUDA_DEVICE_ORDER=PCI_BUS_ID
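After exporting the variables, a quick sanity check can confirm that the driver library and the runtime are both visible; a minimal sketch, assuming python3 and PyTorch are installed in the image:
# nvidia-smi should now resolve from PATH and reach the driver
nvidia-smi
# libcuda.so.1 should be loadable through the updated LD_LIBRARY_PATH
python3 -c "import ctypes; ctypes.CDLL('libcuda.so.1'); print('libcuda.so.1 OK')"
# PyTorch should report the GPU as available
python3 -c "import torch; print('torch.cuda.is_available():', torch.cuda.is_available())"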

