如果两个图片文件的内容完全相同,它们的 MD5 值也相同,因此可以利用 MD5 对图片进行去重。
import os
import hashlib
def get_md5(file):
    """Return the hexadecimal MD5 digest of a file's contents.

    Args:
        file: Path of the file to hash.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    # The ``with`` block closes the handle even when a read fails, and the
    # chunked reads avoid loading a large picture into memory in one piece.
    with open(file, 'rb') as stream:
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            md5.update(chunk)
    return md5.hexdigest()
# Remove byte-identical duplicate files from the image folder, keeping the
# first file seen for every distinct MD5 digest.
file_path = r"D:\Users\Administrator\Desktop\smokingsig\yolov5k\images"
# Work inside the folder so the bare names returned by os.listdir resolve.
os.chdir(file_path)
file_list = os.listdir(file_path)
# A set gives constant-time membership tests however many files were seen.
md5_seen = set()
for file in file_list:
    # os.listdir also returns sub-directories; those cannot be opened for
    # hashing, so only regular files are considered.
    if not os.path.isfile(file):
        continue
    md5 = get_md5(file)
    if md5 in md5_seen:
        # Same digest as an earlier file: delete this later copy.
        os.remove(file)
    else:
        md5_seen.add(md5)
按相似度去重(基于 ORB 特征匹配):计算量非常大,不太好用:
# -*- encoding=utf-8 -*-
import cv2
import os
class dup_picture:
    """Compare pictures through ORB descriptors and brute-force matching."""

    def __init__(self):
        # ORB feature extractor plus a brute-force matcher using the Hamming
        # norm for comparing the descriptors.
        self.orb = cv2.ORB_create()
        self.bf = cv2.BFMatcher(cv2.NORM_HAMMING)

    def get_pic_desc(self, img1_path: str):
        """Return the ORB descriptors of the picture at ``img1_path``.

        The picture is loaded as grayscale. The second value returned by
        ``detectAndCompute`` is handed back unchanged, so callers must be
        prepared for ``None`` (the caller in this file checks for it).
        """
        # NOTE(review): cv2.imread returns None for a file it cannot decode;
        # that None is passed straight on to detectAndCompute here.
        img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
        kp1, des1 = self.orb.detectAndCompute(img1, None)
        return des1

    def get_similary(self, des1, des2, ratio=0.75):
        """Return the share of ``des1`` matches that pass the ratio test.

        Args:
            des1: Query descriptors, or None.
            des2: Train descriptors, or None.
            ratio: A match counts as good when the best neighbour's distance
                is below ``ratio`` times the second-best neighbour's.

        Returns:
            ``good / total`` over the k-NN matches, or 0 when either
            descriptor set is missing or there are no matches.
        """
        # Without descriptors on both sides there is nothing to match.
        if des1 is None or des2 is None:
            return 0
        # k-NN matching with the two nearest neighbours per query descriptor
        matches = self.bf.knnMatch(des1, trainDescriptors=des2, k=2)
        if len(matches) == 0:
            return 0
        # Count the matches that pass the ratio test. An entry with fewer
        # than two neighbours cannot be ratio-tested and is not counted.
        good = [
            pair[0]
            for pair in matches
            if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance
        ]
        similary = len(good) / len(matches)
        return similary
if __name__ == '__main__':
    # Delete every .jpg whose ORB descriptors are too similar to a picture
    # that was already kept, and every .jpg that yields no descriptors.
    duptool = dup_picture()
    file_path = r"D:\PycharmProjects\downloadpicture\all_smoking_picture"
    # must .jpg pic
    file_list = [name for name in os.listdir(file_path) if name.endswith(".jpg")]
    # Work inside the folder so the bare file names resolve.
    os.chdir(file_path)
    # Descriptors of the pictures kept so far. Starting empty lets the first
    # picture go through the same checks as the rest, so an empty folder or a
    # first picture without descriptors is handled without special cases.
    unique_list = []
    for k, file in enumerate(file_list):
        print(k)
        des = duptool.get_pic_desc(file)
        if des is None:
            # No descriptors: the picture cannot be compared, drop it.
            os.remove(file)
            continue
        for des_i in unique_list:
            if duptool.get_similary(des, des_i) > 0.5:
                os.remove(file)
                print(file)
                break
        else:
            # The loop finished without a break: no kept picture is similar.
            unique_list.append(des)
其余labels处理:
import os
# Clean up a label folder and its image folder: drop unusable label files,
# then drop every image that is left without a label file.
imagespath = r"D:\Users\Administrator\Desktop\smokingsig\yolov5k\images"
labelspath = r"D:\Users\Administrator\Desktop\yololabels"
img_list = os.listdir(imagespath)
lab_list = os.listdir(labelspath)
# ---- read label ----> if no label, or label str too short ----> delete label file
# Both checks run in one pass so a file deleted by the first check is never
# reopened, and the surviving names are tracked for the image pass below.
kept_labels = set()
for name in lab_list:
    lfile = os.path.join(labelspath, name)
    if "image" in name:
        with open(lfile, "r") as f:
            res = f.readlines()
        # The file is closed before it is removed. The length of the first
        # line includes its trailing newline, as returned by readlines().
        if len(res) == 0 or len(res[0]) < 3:
            os.remove(lfile)
            continue
    kept_labels.add(name)
# ---- read images names ----> if have not label file ----> delete imgfile
for name in img_list:
    # splitext copes with extensions of any length (.jpg, .jpeg, ...).
    if os.path.splitext(name)[0] + ".txt" not in kept_labels:
        os.remove(os.path.join(imagespath, name))
本文作者:Dong
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC。本作品采用《知识共享署名-非商业性使用 4.0 国际许可协议》进行许可。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。 许可协议。转载请注明出处!