

import hashlib
import os


def get_md5(file):
    """Return the hexadecimal MD5 digest of the file at path *file*.

    The file is read in fixed-size chunks inside a ``with`` block, so the
    handle is always closed (even if reading fails) and large files are not
    loaded into memory in one piece.

    Args:
        file: Path of the file to hash.

    Returns:
        The digest as a 32-character lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(file, "rb") as stream:
        # iter(callable, sentinel) keeps reading until read() returns b"".
        for chunk in iter(lambda: stream.read(1 << 20), b""):
            md5.update(chunk)
    return md5.hexdigest()


def remove_duplicate_files(directory):
    """Delete files in *directory* whose content duplicates an earlier file.

    Entries are visited in sorted name order, so the copy that survives is
    the one whose name sorts first. Sub-directories are skipped because they
    cannot be hashed with ``open()``.

    Args:
        directory: Folder whose direct children are de-duplicated.

    Returns:
        The names of the files that were removed, in the order removed.
    """
    seen_digests = set()  # set gives O(1) membership tests
    removed = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        digest = get_md5(path)
        if digest in seen_digests:
            os.remove(path)
            removed.append(name)
        else:
            seen_digests.add(digest)
    return removed


file_path = r"D:\Users\Administrator\Desktop\smokingsig\yolov5k\images"

# Only delete files when run as a script, never as a side effect of import.
if __name__ == "__main__":
    remove_duplicate_files(file_path)

# -*- encoding=utf-8 -*-
import cv2
import os


class dup_picture:
    """Compare images through their ORB feature descriptors.

    An ORB detector and a brute-force Hamming matcher are created once and
    reused for every image.
    """

    def __init__(self):
        self.orb = cv2.ORB_create()
        self.bf = cv2.BFMatcher(cv2.NORM_HAMMING)

    def get_pic_desc(self, img1_path: str):
        """Return the ORB descriptors of the image at *img1_path*.

        Args:
            img1_path: Path of the image; it is loaded as grayscale.

        Returns:
            The descriptor array from ``detectAndCompute``, or ``None`` when
            the image could not be loaded or no descriptors were produced.
        """
        img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
        if img1 is None:
            # Be explicit about an unreadable image instead of relying on how
            # detectAndCompute treats an empty input.
            return None
        _, des1 = self.orb.detectAndCompute(img1, None)
        return des1

    def get_similary(self, des1, des2):
        """Return the fraction of *des1* descriptors that match *des2* well.

        Each query descriptor is matched against its two nearest neighbours
        in *des2*; a match counts as good when the best distance is below
        0.75 times the second-best distance.

        Args:
            des1: Query descriptors.
            des2: Train descriptors.

        Returns:
            ``len(good) / len(matches)``, or ``0`` when either descriptor set
            is ``None`` or the matcher returns no matches.
        """
        if des1 is None or des2 is None:
            return 0
        # Filter candidates with a 2-nearest-neighbour match.
        matches = self.bf.knnMatch(des1, trainDescriptors=des2, k=2)
        if len(matches) == 0:
            return 0
        # Count the matches that pass the ratio test. Entries with fewer than
        # two neighbours cannot be ratio-tested, so they are not counted as
        # good (unpacking them as a pair would raise).
        good = [
            pair[0]
            for pair in matches
            if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
        ]
        similary = len(good) / len(matches)
        return similary


if __name__ == '__main__':
    duptool = dup_picture()
    file_path = r"D:\PycharmProjects\downloadpicture\all_smoking_picture"
    # Only .jpg pictures are considered.
    file_list = [name for name in os.listdir(file_path) if name.endswith(".jpg")]
    os.chdir(file_path)

    # Start with no reference descriptors and treat every file the same way,
    # so an empty folder or a first image without descriptors cannot break
    # the comparison loop.
    unique_list = []
    for k, file in enumerate(file_list):
        print(k)
        des = duptool.get_pic_desc(file)
        if des is None:
            os.remove(file)
            continue
        if any(duptool.get_similary(des, des_i) > 0.5 for des_i in unique_list):
            # Too similar to a picture that is already kept.
            os.remove(file)
            print(file)
        else:
            unique_list.append(des)


import os

imagespath = r"D:\Users\Administrator\Desktop\smokingsig\yolov5k\images"
labelspath = r"D:\Users\Administrator\Desktop\yololabels"


def _label_is_usable(label_file):
    """Return True when the first line of *label_file* has at least 3 characters.

    An empty file yields an empty first line and is therefore not usable.
    """
    with open(label_file, "r") as f:
        first_line = f.readline()
    return len(first_line) >= 3


def clean_dataset(images_dir, labels_dir):
    """Delete unusable label files, then images left without a label file.

    A label file is checked only when its name contains ``"image"``; it is
    deleted when it is empty or its first line is shorter than 3 characters.
    An image is deleted when no surviving label file is named
    ``<image stem>.txt``.

    Labels are checked in a single pass and the surviving names are tracked,
    so a deleted label is never reopened and the image check sees the
    directory as it is after the clean-up.

    Args:
        images_dir: Folder containing the image files.
        labels_dir: Folder containing the label files.

    Returns:
        A ``(removed_labels, removed_images)`` tuple of sorted file-name lists.
    """
    kept_labels = set()
    removed_labels = []
    for name in os.listdir(labels_dir):
        lfile = os.path.join(labels_dir, name)
        if "image" in name and os.path.isfile(lfile) and not _label_is_usable(lfile):
            os.remove(lfile)
            removed_labels.append(name)
        else:
            kept_labels.add(name)

    removed_images = []
    for name in os.listdir(images_dir):
        image_file = os.path.join(images_dir, name)
        if not os.path.isfile(image_file):
            continue
        # splitext handles extensions of any length (.jpg, .jpeg, ...).
        stem = os.path.splitext(name)[0]
        if stem + ".txt" not in kept_labels:
            os.remove(image_file)
            removed_images.append(name)

    return sorted(removed_labels), sorted(removed_images)


# Only delete files when run as a script, never as a side effect of import.
if __name__ == "__main__":
    clean_dataset(imagespath, labelspath)
