该Python脚本用于处理/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json目录下的数据,将其中所有图片等比例缩小(最长边为1024像素),并连同JSON文件一起复制到新目录/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json_small_size中。
功能说明
```bash
Pillow>=9.0.0 tqdm>=4.62.0
```
import os
import shutil
import json
from PIL import Image
import multiprocessing
from functools import partial
from tqdm import tqdm
import argparse
# Upper bound, in pixels, for the longest side of every output image.
MAX_SIZE = 1024

# Input tree that is read, and the sibling tree the processed copy is written to.
SRC_DIR = "/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json"
DEST_DIR = "/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json_small_size"
def resize_image(img_path, dest_path, max_size=None):
    """Write a copy of an image whose longest side is at most ``max_size`` pixels.

    The aspect ratio is preserved. An image that already fits within the limit
    is copied to ``dest_path`` byte-for-byte instead of being enlarged and
    re-encoded.

    Args:
        img_path: Path of the source image.
        dest_path: Path the output image is written to.
        max_size: Upper bound, in pixels, for the longest side. ``None``
            (the default) means the module-level ``MAX_SIZE``.

    Returns:
        True on success; False if the image could not be read or written
        (the error is printed rather than raised).
    """
    try:
        limit = MAX_SIZE if max_size is None else max_size
        with Image.open(img_path) as img:
            width, height = img.size
            longest_side = max(width, height)
            if longest_side > limit:
                scale = limit / longest_side
                # Clamp to 1 px so a very elongated image never rounds down
                # to a zero-sized side, which resize() rejects.
                new_width = max(1, round(width * scale))
                new_height = max(1, round(height * scale))
                resized_img = img.resize((new_width, new_height), Image.LANCZOS)
                resized_img.save(dest_path, quality=95)
                return True
        # Already small enough: keep the original bytes rather than upscaling
        # and re-compressing, which would only lose quality.
        shutil.copy2(img_path, dest_path)
        return True
    except Exception as e:
        # Best-effort by design: one unreadable image must not stop the batch.
        print(f"Error processing {img_path}: {e}")
        return False
def process_folder(folder_name):
    """Mirror one folder from ``SRC_DIR`` into ``DEST_DIR``, shrinking its images.

    Files ending in .jpg/.jpeg/.png (case-insensitive) go through
    ``resize_image``; every other file is copied with ``shutil.copy2``.
    Subdirectories are processed recursively. A failure on one entry is
    printed and does not stop the remaining entries.

    Args:
        folder_name: Folder path relative to ``SRC_DIR``.

    Returns:
        True if every entry was processed successfully, False otherwise.
    """
    try:
        src_folder = os.path.join(SRC_DIR, folder_name)
        dest_folder = os.path.join(DEST_DIR, folder_name)
        os.makedirs(dest_folder, exist_ok=True)

        all_ok = True
        for filename in os.listdir(src_folder):
            src_file = os.path.join(src_folder, filename)
            dest_file = os.path.join(dest_folder, filename)
            try:
                if os.path.isdir(src_file):
                    # copy2 cannot copy a directory; recurse so that nested
                    # images are resized as well.
                    ok = process_folder(os.path.join(folder_name, filename))
                elif filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                    ok = resize_image(src_file, dest_file)
                else:
                    # Copy non-image files (like JSON) unchanged.
                    shutil.copy2(src_file, dest_file)
                    ok = True
            except Exception as e:
                print(f"Error processing {src_file}: {e}")
                ok = False
            all_ok = all_ok and ok
        return all_ok
    except Exception as e:
        # Worker boundary: report the failure through the return value so an
        # exception never propagates out of the pool and aborts the whole run.
        print(f"Error processing folder {folder_name}: {e}")
        return False
def main():
    """Process every top-level folder of ``SRC_DIR`` in parallel and report the outcome.

    Each immediate subdirectory of ``SRC_DIR`` is handed to ``process_folder``
    in a worker pool sized to the CPU count minus one. Folders whose
    processing returned False are listed at the end instead of being
    silently counted as done.
    """
    # List the source first so a missing source directory fails before an
    # empty destination tree is created.
    folders = [d for d in os.listdir(SRC_DIR) if os.path.isdir(os.path.join(SRC_DIR, d))]
    os.makedirs(DEST_DIR, exist_ok=True)

    num_processes = max(1, multiprocessing.cpu_count() - 1)  # Leave one CPU free
    print(f"Processing {len(folders)} folders using {num_processes} processes")

    # imap yields results in input order, so they can be paired with the
    # folder names below.
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(tqdm(pool.imap(process_folder, folders), total=len(folders)))

    failed = [name for name, ok in zip(folders, results) if not ok]
    if failed:
        print(f"Processing finished with errors in {len(failed)} of {len(folders)} folders:")
        for name in failed:
            print(f"  {name}")
    else:
        print(f"Processing complete. Resized images and copied files to {DEST_DIR}")
# Run the pipeline only when this file is executed as a script, not when it is
# imported (multiprocessing workers may import this module themselves).
if __name__ == "__main__":
    main()
本文作者:Dong
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC 许可协议(本作品采用《知识共享署名-非商业性使用 4.0 国际许可协议》进行许可。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接)。转载请注明出处!