2024-09-01
Python
00

内容完全相同的图片文件,其 MD5 值也完全相同,因此可以利用 MD5 对图片进行去重。

python
import os
import hashlib


def get_md5(file):
    """Return the hexadecimal MD5 digest of the file at path *file*.

    The file is read in fixed-size chunks so large images do not have to fit
    in memory, and the handle is closed even if reading fails.

    Args:
        file: Path of the file to hash.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as stream:
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            md5.update(chunk)
    return md5.hexdigest()


def remove_duplicate_files(directory):
    """Delete files in *directory* whose content duplicates an earlier file.

    Entries are visited in sorted name order, so the file that survives a
    group of duplicates is always the one whose name sorts first.
    Sub-directories are skipped instead of being opened as files.

    Args:
        directory: Path of the directory to clean up.

    Returns:
        The names of the files that were removed, in the order of removal.
    """
    seen_digests = set()  # set gives O(1) membership tests
    removed = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        digest = get_md5(path)
        if digest in seen_digests:
            os.remove(path)
            removed.append(name)
        else:
            seen_digests.add(digest)
    return removed


file_path = r"D:\Users\Administrator\Desktop\smokingsig\yolov5k\images"

if __name__ == '__main__':
    remove_duplicate_files(file_path)

另一种方法是按图片相似度去重,但计算量非常大,不太好用:

python
# -*- encoding=utf-8 -*-
import cv2
import os


class dup_picture:
    """Compare pictures by matching their ORB descriptors."""

    def __init__(self):
        # ORB produces binary descriptors, hence the Hamming-norm matcher.
        self.orb = cv2.ORB_create()
        self.bf = cv2.BFMatcher(cv2.NORM_HAMMING)

    def get_pic_desc(self, img1_path: str):
        """Return the ORB descriptors of the grayscale image at *img1_path*.

        The caller treats a ``None`` result as "no usable descriptors".
        """
        # NOTE(review): cv2.imread yields None when the file cannot be read;
        # confirm how detectAndCompute behaves on that input.
        img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
        _, des1 = self.orb.detectAndCompute(img1, None)
        return des1

    def get_similary(self, des1, des2):
        """Return the fraction of *des1* matches that pass the ratio test.

        Args:
            des1: Query descriptors, or None.
            des2: Train descriptors, or None.

        Returns:
            ``len(good) / len(matches)``, or 0 when either descriptor set is
            missing or no matches are produced.
        """
        if des1 is None or des2 is None:
            return 0
        # Two nearest neighbours per query descriptor.
        matches = self.bf.knnMatch(des1, trainDescriptors=des2, k=2)
        if len(matches) == 0:
            return 0
        # A query can come back with fewer than two neighbours (for example
        # when des2 holds a single descriptor); such entries cannot be ratio
        # tested, so they count as not good instead of breaking the unpacking.
        good = [
            pair[0]
            for pair in matches
            if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
        ]
        return len(good) / len(matches)


if __name__ == '__main__':
    duptool = dup_picture()
    file_path = r"D:\PycharmProjects\downloadpicture\all_smoking_picture"
    # must .jpg pic
    file_list = [name for name in os.listdir(file_path) if name.endswith(".jpg")]
    os.chdir(file_path)

    # Seed the reference list with the first picture, tolerating an empty
    # directory and a first picture that yields no descriptors.
    unique_list = []
    if file_list:
        first_des = duptool.get_pic_desc(file_list[0])
        if first_des is not None:
            unique_list.append(first_des)

    for k, file in enumerate(file_list[1:]):
        print(k)
        des = duptool.get_pic_desc(file)
        if des is None:
            os.remove(file)
            continue
        # Delete the picture when it is similar to any picture kept so far;
        # otherwise keep it as a new reference.
        if any(duptool.get_similary(des, des_i) > 0.5 for des_i in unique_list):
            os.remove(file)
            print(file)
        else:
            unique_list.append(des)

其余对 labels(标签文件)的处理:删除空的或内容过短的标签文件,再删除没有对应标签文件的图片:

python
import os


def _label_is_invalid(label_file):
    """Return True when the label file is empty or its first line is under 3 characters."""
    with open(label_file, "r") as f:
        first_line = f.readline()
    # An empty file gives "", so this single test covers both cases.
    return len(first_line) < 3


def clean_dataset(imagespath, labelspath):
    """Remove unusable label files, then images that have no label file.

    Only label files whose name contains "image" are inspected. The image
    pass checks against the labels that still exist after the label pass, so
    an image whose label was just deleted is deleted as well.

    Args:
        imagespath: Directory containing the image files.
        labelspath: Directory containing the ``.txt`` label files.

    Returns:
        A ``(removed_labels, removed_images)`` tuple of file-name lists.
    """
    removed_labels = []
    for name in sorted(os.listdir(labelspath)):
        lfile = os.path.join(labelspath, name)
        if "image" not in name or not os.path.isfile(lfile):
            continue
        # The file is closed again before it is removed.
        if _label_is_invalid(lfile):
            os.remove(lfile)
            removed_labels.append(name)

    # Re-list the labels so the deletions above are taken into account.
    remaining_labels = set(os.listdir(labelspath))

    removed_images = []
    for name in sorted(os.listdir(imagespath)):
        ifile = os.path.join(imagespath, name)
        if not os.path.isfile(ifile):
            continue
        # splitext copes with extensions of any length (.jpg, .jpeg, ...).
        stem = os.path.splitext(name)[0]
        if stem + ".txt" not in remaining_labels:
            os.remove(ifile)
            removed_images.append(name)
    return removed_labels, removed_images


imagespath = r"D:\Users\Administrator\Desktop\smokingsig\yolov5k\images"
labelspath = r"D:\Users\Administrator\Desktop\yololabels"

if __name__ == '__main__':
    clean_dataset(imagespath, labelspath)
如果对你有用的话,可以打赏哦
打赏
ali pay
wechat pay

本文作者:Dong

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC 许可协议,即《知识共享署名-非商业性使用 4.0 国际许可协议》。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。转载请注明出处!