编辑
2025-04-07
Python
00

该Python脚本用于处理/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json目录下的数据,将其中所有图片等比例缩小(最长边为1024像素),并连同JSON文件一起复制到新目录/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json_small_size中。

功能说明

  • 遍历源目录下所有子文件夹(如"200932"等)
  • 处理每个子文件夹中的所有图片文件(.jpg、.jpeg、.png),等比例缩放至最长边为1024像素
  • 同时将其余非图片文件(如step_*.json)原样复制到对应目录
  • 使用Python多进程技术并行处理,大幅提高处理速度
bash
pip install "Pillow>=9.0.0" "tqdm>=4.62.0"
python
"""Downscale dataset images and mirror the remaining files into a new tree.

Every immediate sub-folder of SRC_DIR is processed in a worker process:
image files are shrunk so that their longest side is at most MAX_SIZE
pixels (aspect ratio preserved), and every other regular file is copied
unchanged to the same relative location under DEST_DIR.
"""
import argparse
import json
import multiprocessing
import os
import shutil
from functools import partial

from PIL import Image
from tqdm import tqdm

# Source and destination paths
SRC_DIR = "/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json"
DEST_DIR = "/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json_small_size"
MAX_SIZE = 1024

# File extensions (lower-case) that are treated as images and resized.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def resize_image(img_path, dest_path):
    """Write a copy of an image whose longest side is at most MAX_SIZE.

    The aspect ratio is preserved. Images that already fit within MAX_SIZE
    are copied unchanged instead of being upscaled, since the goal is to
    shrink the dataset.

    Args:
        img_path: Path of the source image.
        dest_path: Path the resulting image is written to.

    Returns:
        True on success, False if the image could not be read or written
        (the error is printed).
    """
    try:
        with Image.open(img_path) as img:
            width, height = img.size

            if max(width, height) <= MAX_SIZE:
                # Already small enough: keep the original bytes rather than
                # upscaling and re-encoding.
                shutil.copy2(img_path, dest_path)
                return True

            # Pin the longest side to MAX_SIZE and scale the other one.
            # max(1, ...) guards against a 0-pixel side on extreme aspect
            # ratios, which would make resize() raise.
            if width > height:
                new_width = MAX_SIZE
                new_height = max(1, int(height * (MAX_SIZE / width)))
            else:
                new_height = MAX_SIZE
                new_width = max(1, int(width * (MAX_SIZE / height)))

            resized_img = img.resize((new_width, new_height), Image.LANCZOS)
            resized_img.save(dest_path, quality=95)
        return True
    except Exception as e:
        # Worker boundary: report the bad file and carry on with the rest.
        print(f"Error processing {img_path}: {e}")
        return False


def process_folder(folder_name):
    """Resize the images and copy the other files of one sub-folder.

    Args:
        folder_name: Name of a folder directly under SRC_DIR. The output is
            written to the folder of the same name under DEST_DIR.

    Returns:
        True if every regular file in the folder was handled successfully,
        False if the folder could not be read or any file failed.
    """
    src_folder = os.path.join(SRC_DIR, folder_name)
    dest_folder = os.path.join(DEST_DIR, folder_name)

    try:
        # Create destination folder if it doesn't exist
        os.makedirs(dest_folder, exist_ok=True)
        entries = sorted(os.listdir(src_folder))
    except OSError as e:
        print(f"Error processing folder {folder_name}: {e}")
        return False

    all_ok = True
    for filename in entries:
        src_file = os.path.join(src_folder, filename)
        dest_file = os.path.join(dest_folder, filename)

        if not os.path.isfile(src_file):
            # shutil.copy2 raises on directories; skip them instead of
            # aborting the remaining files of this folder.
            print(f"Skipping non-file entry {src_file}")
            continue

        if filename.lower().endswith(IMAGE_EXTENSIONS):
            if not resize_image(src_file, dest_file):
                all_ok = False
        else:
            # Copy non-image files (like JSON) unchanged
            try:
                shutil.copy2(src_file, dest_file)
            except OSError as e:
                print(f"Error copying {src_file}: {e}")
                all_ok = False

    return all_ok


def main():
    """Process every sub-folder of SRC_DIR in parallel and report failures."""
    # Create destination directory if it doesn't exist
    os.makedirs(DEST_DIR, exist_ok=True)

    # Get list of all folders in source directory (sorted for a stable order)
    folders = sorted(
        d for d in os.listdir(SRC_DIR)
        if os.path.isdir(os.path.join(SRC_DIR, d))
    )

    # Use multiprocessing to process folders in parallel
    num_processes = max(1, multiprocessing.cpu_count() - 1)  # Leave one CPU free
    print(f"Processing {len(folders)} folders using {num_processes} processes")

    # Process folders in parallel with progress bar; imap keeps input order,
    # so results line up with `folders`.
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(tqdm(pool.imap(process_folder, folders), total=len(folders)))

    failed = [name for name, ok in zip(folders, results) if not ok]
    if failed:
        print(f"{len(failed)} folder(s) had errors: {', '.join(failed)}")

    print(f"Processing complete. Resized images and copied files to {DEST_DIR}")


if __name__ == "__main__":
    main()
如果对你有用的话,可以打赏哦
打赏
ali pay
wechat pay

本文作者:Dong

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC 许可协议。本作品采用《知识共享署名-非商业性使用 4.0 国际许可协议》进行许可。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。转载请注明出处!