import os
from PIL import Image
import shutil
import glob
import datetime
# I didn't want to copy every file type (mostly photos), so this splits up what
# gets copied based on the file extension. Anything not covered by either tuple
# brings up a prompt so I can choose to copy it, skip it, or preview it if it is
# an image to help me decide. Extensions should be lowercase to match, and they
# should include the dot to avoid false positives.
definitely_copy = ('.jpg', '.jpeg', '.odt', '.cpt', '.3gp', '.gft', '.xmp', '.cr2', '.png', '.dng', '.gif', '.txt', '.bmp', '.css', '.avi', '.htm', '.html', '.rtf', '.mht', '.tiff', '.tif', '.wmf', '.pdf', '.mov', '.doc', '.docx', '.psd', '.pdd', '.fpt', '.xls')
definitely_ignore_extensions = ('.ini', '.cab', '.toc', '.gif', '.vif', '.tlv', '.cam', '.chk', '.dat', '.db', '.info', '.bin', '.wbcat', '.js', '.lnk', '.shs', '.crw', '.thm', '.alb', '.tmp', '.ncd', '.cxf', '.zdp', '.cld', '.inf', '.rmg', '.zdx', '.cdx', '.dbf', '.dll')
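# Both tuples are used with str.endswith, which accepts a tuple of suffixes,
# so a lowercased path can be matched in a single call. For example:
#   'img_0001.jpg'.endswith(definitely_copy)               -> True
#   'thumbs.db'.endswith(definitely_ignore_extensions)     -> True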
def preview_image(file_path):
    try:
        # Open the image using PIL and hand it to the default viewer
        img = Image.open(file_path)
        img.show()
    except Exception as e:
        print(f"Error previewing '{file_path}': {e}")
def organise_by_hash(input_files):
    hashes = {}
    for input_file in input_files:
        with open(input_file, 'r') as f:
            for line in f:
                try:
                    hash_md5, file_path = line.strip().split(',', 1)
                except ValueError:
                    print("Error processing line:", line)
                    continue  # Skip this line and continue with the next
                if hash_md5 in hashes:
                    hashes[hash_md5].append(file_path)
                else:
                    hashes[hash_md5] = [file_path]
    return hashes
# This just displays which hashes are duplicates.
def find_duplicates(input_files):
    hashes = organise_by_hash(input_files)
    for hash_md5, files in hashes.items():
        if len(files) > 1:
            print(f"Duplicate files for hash {hash_md5}:")
            for file in files:
                print(f"  {file}")
# This counts how many hashes appear more than once versus only once and prints
# the result. It might make more sense to display it by number of duplicates.
def count_duplicates_vs_single_reference(input_files):
    hashes = organise_by_hash(input_files)
    single_reference_counter = 0
    duplicates_counter = 0
    for hash_md5, files in hashes.items():
        if len(files) > 1:
            duplicates_counter += 1
        elif len(files) == 1:
            single_reference_counter += 1
        else:
            raise Exception('Should not be reachable')
    print(f"duplicates {duplicates_counter}")
    print(f"single reference {single_reference_counter}")
def delete_duplicates(input_files):
    # Organise files by their hash
    hashes = organise_by_hash(input_files)
    # Iterate through the organised hashes
    for hash_md5, possible_files in hashes.items():
        files = []  # To store files that still exist on the file system
        if len(possible_files) > 1:
            # Keep only the files that still exist
            for file_path in possible_files:
                if os.path.exists(file_path):
                    files.append(file_path)
        if len(files) > 1:
            print(f"Duplicate files for hash {hash_md5}:")
            # Display the first file
            first_file = files[0]
            #os.system(f'start "" "{first_file}"')
            #subprocess.Popen(['start', '', first_file], shell=True)
            preview_image(first_file)
            # Enumerate and display files with an index
            for idx, file in enumerate(files, start=1):
                print(f"  {idx}. {file}")
            # Display options
            while True:
                option = input("Press 'd' followed by a number to delete the file, "
                               "'o' followed by a number to open the file in Windows, "
                               "or 'c' to continue to the next set of files: ").strip()
                if option.startswith('d'):
                    try:
                        # Extract the index after 'd'
                        file_idx = int(option[1:])
                        if 1 <= file_idx <= len(files):
                            # Delete the selected file
                            file_to_delete = files[file_idx - 1]
                            os.remove(file_to_delete)
                            print(f"File '{file_to_delete}' deleted.")
                            # Remove the deleted file from the list
                            files.pop(file_idx - 1)
                            if not files:
                                print("No more files for this hash.")
                                break
                        else:
                            print("Invalid file index.")
                    except ValueError:
                        print("Invalid input.")
                elif option.startswith('o'):
                    try:
                        # Extract the index after 'o'
                        file_idx = int(option[1:])
                        if 1 <= file_idx <= len(files):
                            # Open the selected file in Windows (you may need to specify the software here)
                            file_to_open = files[file_idx - 1]
                            os.system(f'start "" "{file_to_open}"')
                            print(f"Opening '{file_to_open}' in Windows.")
                        else:
                            print("Invalid file index.")
                    except ValueError:
                        print("Invalid input.")
                elif option == 'c':
                    break
                else:
                    print("Invalid option. Please enter 'd', 'o', or 'c'.")
def collect_hashes_from_file(file_path):
    hashes = {}
    try:
        with open(file_path, 'r') as f:
            for line in f:
                try:
                    # Use a separate name so the file_path argument isn't shadowed
                    hash_md5, listed_path = line.strip().split(',', 1)
                except ValueError:
                    print("Error processing line:", line)
                    continue  # Skip this line and continue with the next
                if hash_md5 in hashes:
                    hashes[hash_md5].append(listed_path)
                else:
                    hashes[hash_md5] = [listed_path]
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}")
        raise
    return hashes
def compare_hashes(hash_filepath_A, hash_filepath_B, exclude: list[str] = []):
    # Collect hashes and file paths from the two hash files, treating B as the
    # destination and folding in anything already recorded as copied
    source_hashes = collect_hashes_from_file(hash_filepath_A)
    destination_hashes = collect_hashes_from_file(hash_filepath_B)
    added_hashes = get_added_hashes()
    for hash_md5 in added_hashes:
        if hash_md5 not in destination_hashes:
            destination_hashes[hash_md5] = added_hashes[hash_md5]
    print('These are the first few hashes from the two hash files. They can be used to tell whether the source and destination match to some extent:')
    print('Primary source:', list(destination_hashes.keys())[0:5])
    print('Other source:', list(source_hashes.keys())[0:5])
    # Initialise counters
    exist_count = 0
    ignore_count = 0
    not_exist_count = 0
    not_exist_files = []
    exist_files = []
    # Iterate through the hashes in the other source
    for hash_md5 in source_hashes:
        source_path = source_hashes[hash_md5][0].lower()
        if source_path.endswith(definitely_ignore_extensions):
            ignore_count += 1
        elif any(excl in source_path for excl in exclude):
            ignore_count += 1
        elif hash_md5 in destination_hashes:
            exist_count += 1
            for item in source_hashes[hash_md5]:
                exist_files.append(f"{hash_md5} - {item}")
        else:
            not_exist_count += 1
            for item in source_hashes[hash_md5]:
                not_exist_files.append(f"{hash_md5} - {item}")
    if not_exist_count > 0:
        print("Files that do not exist in destination source:")
        for file_path in not_exist_files:
            print(f"  {file_path}")
    if exist_count > 0:
        print("Files that do exist in destination source:")
        for file_path in exist_files:
            # Listing every existing file is noisy, so this print stays disabled
            #print(f"  {file_path}")
            pass
    print("")
    print("Comparison results:")
    print(f"  Exist in destination source: {exist_count}")
    print(f"  Do not exist in destination source: {not_exist_count}")
    print(f"  Ignored: {ignore_count}")
def verify_symlink(target_path, link_path):
    # Ensure the link exists and is a symlink
    if not os.path.islink(link_path):
        return False
    # os.path.realpath resolves the symlink chain to its ultimate target, so
    # comparing the two resolved paths tells us whether the link points at the target
    return os.path.realpath(link_path) == os.path.realpath(target_path)
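# The added_hashes.{timestamp}.txt files are written by copy_over_files (below)
# as a record of everything already copied, one "md5,path[,path...]" line per
# hash. Reading them back here means a later run, or compare_hashes, treats
# those files as already present in the destination without rehashing it.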
def get_added_hashes():
    added_hashes = {}
    # Read content from all added_hashes.{timestamp}.txt files in the current directory
    for added_hashes_filename in glob.glob('added_hashes.*.txt'):
        with open(added_hashes_filename, 'r') as added_hashes_file:
            for line in added_hashes_file:
                line = line.strip()
                if line:
                    hash_md5, file_paths = line.split(',', 1)
                    file_paths = file_paths.split(',')
                    if hash_md5 in added_hashes:
                        # Merge the file paths associated with this hash
                        added_hashes[hash_md5].extend(file_paths)
                    else:
                        added_hashes[hash_md5] = file_paths
    return added_hashes
def copy_over_files(destination_directory, source_root_directory, destination_hash_filepath, source_hash_filepath):
    # Collect hashes and file paths for the destination, including anything
    # recorded as already copied in previous runs
    destination_hashes = collect_hashes_from_file(destination_hash_filepath)
    added_hashes = get_added_hashes()
    for hash_md5 in added_hashes:
        if hash_md5 not in destination_hashes:
            destination_hashes[hash_md5] = added_hashes[hash_md5]
    source_hashes = collect_hashes_from_file(source_hash_filepath)
    # Create a set to keep track of copied hashes to avoid duplicates
    copied_hashes = set()
    # Iterate through the hashes in the other source
    for hash_md5 in source_hashes:
        if hash_md5 not in destination_hashes:
            destination_paths_with_numbers = []
            for idx, source_path in enumerate(source_hashes[hash_md5], start=1):
                relative_path = os.path.relpath(source_path, start=source_root_directory)
                destination_path = os.path.join(destination_directory, relative_path)
                destination_paths_with_numbers.append((idx, destination_path, source_path))
            # Decide what to do based on the first file's extension
            first_source_path = destination_paths_with_numbers[0][2].lower()
            if first_source_path.endswith(definitely_copy):
                copy_or_symlink_file(destination_paths_with_numbers)
            elif first_source_path.endswith(definitely_ignore_extensions):
                # Don't copy, proceed to the next iteration
                continue
            else:
                deal_with_copy_file_user_input(destination_paths_with_numbers, hash_md5, source_hashes)
            copied_hashes.add(hash_md5)
    # Generate a new filename with the current timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    new_filename = f"added_hashes.{timestamp}.txt"
    # Save the list of copied hashes to the timestamped added_hashes file
    with open(new_filename, 'w') as added_hashes_file:
        for hash_md5 in copied_hashes:
            file_paths = source_hashes[hash_md5]
            added_hashes_file.write(f"{hash_md5},{','.join(file_paths)}\n")
def deal_with_copy_file_user_input(destination_paths_with_numbers, hash_md5, source_hashes):
    print(f"Hash '{hash_md5}' {destination_paths_with_numbers[0][2]} does not exist in the primary source and we do not recognise the file extension.")
    user_input = input("Enter s to skip, o to open the file or c to copy: ")
    user_input = user_input.strip()
    if user_input == 's':
        return  # Skip to the next hash
    elif user_input == 'o':
        # Preview the first file, then ask again
        file_to_open = source_hashes[hash_md5][0]
        preview_image(file_to_open)
        deal_with_copy_file_user_input(destination_paths_with_numbers, hash_md5, source_hashes)
    elif user_input == 'c':
        copy_or_symlink_file(destination_paths_with_numbers)
    else:
        print('Invalid response')
        deal_with_copy_file_user_input(destination_paths_with_numbers, hash_md5, source_hashes)
def copy_or_symlink_file(destination_paths_with_numbers):
    the_copy_path = None
    for file_data in destination_paths_with_numbers:
        index = file_data[0]
        destination_path = file_data[1]
        source_path = file_data[2]
        # Ensure the destination directory exists
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        # The first instance of the file is copied and every later instance is
        # symlinked to that same copy. Note that if the first destination already
        # exists, the_copy_path stays None and the remaining instances are skipped.
        try:
            if os.path.exists(destination_path):
                print(f"File already exists at {destination_path}. Skipping.")
            else:
                if index == 1:
                    shutil.copy(source_path, destination_path)
                    the_copy_path = destination_path
                    print(f"Copied {index}-{source_path} to {destination_path}")
                elif the_copy_path:
                    the_copy_path = os.path.abspath(the_copy_path)
                    destination_path = os.path.abspath(destination_path)
                    os.symlink(the_copy_path, destination_path)
                    if verify_symlink(the_copy_path, destination_path):
                        print(f"Linked {source_path} to {destination_path} as a link to {the_copy_path}")
                    else:
                        print(f"Failed to create a valid symlink for {the_copy_path} at {destination_path}.")
        except FileExistsError:
            print(f"File already exists at {destination_path}. Skipping.")
        except Exception as e:
            print(f"Error: {e}")