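"""1_wash_dedup.py — deduplicate the scraped image metadata.

Walks dataset/pre_metadata.json, drops entries whose (query, source, pid)
triple repeats or whose image file is missing under dataset/images, then
rewrites the JSON with per-query counts and resolved file paths.

A sketch of the metadata layout, inferred only from the fields this script
reads (the actual file may carry additional keys):

    {
        "queries": {"<query>": <count>, ...},
        "metadata": [
            {"id": ..., "query": ..., "source": ..., "pid": ..., "extension": ...},
            ...
        ]
    }
"""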
import json
import os
from datetime import datetime

from tqdm import tqdm
pre_metadata_path = "dataset/pre_metadata.json"
img_basedir = "dataset/images"

with open(pre_metadata_path, "r") as f:
    metadata = json.load(f)
# json.dump(metadata, open(f"dataset/backup/pre_metadata.{ datetime.now().strftime('%Y-%m-%d_%H:%M:%S') }.json", "w"))

stat = metadata['queries']
data = metadata['metadata']

# Per-query counts and image paths, rebuilt while walking the metadata.
actual_stat = {k: 0 for k in stat.keys()}
paths = {k: {} for k in stat.keys()}

# Entries already seen (keyed by query/source/pid) and indices to drop.
checked, error_idxs = set(), []
for i, v in enumerate(tqdm(data)):
    # A query/source/pid triple identifies one image; a repeat is a duplicate.
    key = f"{v['query']}-{v['source']}-{v['pid']}"
    if key in checked:
        error_idxs.append(i)
        continue

    # Expected on-disk path: <query>/<source>-<pid>.<ext>, spaces turned into dashes.
    file = os.path.join(v['query'], f"{v['source'].lower()}-{v['pid']}.")
    ext = "jpg"
    if v['source'] == 'Pexels':
        ext = v['extension'] if v['extension'] != "jpeg" else "jpg"
    file = (file + ext).replace(' ', '-')

    if os.path.exists(os.path.join(img_basedir, file)):
        checked.add(key)
    else:
        # The image file is missing on disk.
        error_idxs.append(i)
        continue

    paths[v['query']][v['id']] = file
    actual_stat[v['query']] += 1
# Delete flagged entries from highest index to lowest so the remaining
# indices stay valid while the list is mutated.
error_idxs = sorted(set(error_idxs))
print(f"Removing {len(error_idxs)} duplicate or missing entries")
for rm_i in reversed(error_idxs):
    del metadata['metadata'][rm_i]
print(f"Actual number of images: {len(metadata['metadata'])}")

metadata['paths'] = paths
metadata['queries'] = actual_stat
with open(pre_metadata_path, "w") as f:
    json.dump(metadata, f)