filter.py
#!/usr/bin/env python3
import hashlib
import logging
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path


def chunk_reader(file_path, chunk_size=1024):
    """Generator that reads a file in chunks of bytes from a given path."""
    with open(file_path, 'rb') as fobj:
        while True:
            chunk = fobj.read(chunk_size)
            if not chunk:
                break
            yield chunk


def get_hash(filename: Path, first_chunk_only=False, hash_algo=hashlib.sha256):
    """Return the hex digest of a file, hashing only its first 1 KiB if requested."""
    hashobj = hash_algo()
    with open(filename, "rb") as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            # Read in 4 KiB chunks so large files are never loaded into memory at once.
            for chunk in iter(lambda: file_object.read(4096), b''):
                hashobj.update(chunk)
    return hashobj.hexdigest()


def check_for_duplicates(path: Path) -> int:
    """Check files directly under the given directory for duplicates and remove them.

    Uses a three-stage filter: group by file size, then by a hash of the first
    1 KiB, and only then compare full-file hashes, so unique files are never
    read in full.
    """
    hashes_by_size = defaultdict(list)
    hashes_on_1k = defaultdict(list)
    hashes_full = {}
    lock = threading.Lock()  # guards the shared containers updated from worker threads
    files = list(path.glob("*.*"))  # only names containing a dot are considered

    # Stage 1: group files by size; only equal-sized files can be duplicates.
    for file_path in files:
        file_size = file_path.stat().st_size
        hashes_by_size[file_size].append(file_path)

    # Stage 2: among same-sized files, group by a hash of the first 1 KiB.
    def hash_first_1k(file_path):
        small_hash = get_hash(file_path, first_chunk_only=True)
        if small_hash:
            file_size = file_path.stat().st_size
            with lock:
                hashes_on_1k[(small_hash, file_size)].append(file_path)

    with ThreadPoolExecutor() as executor:
        executor.map(hash_first_1k,
                     [fp for flist in hashes_by_size.values() for fp in flist if len(flist) > 1])

    # Stage 3: compute full-file hashes to confirm actual duplicates.
    duplicates = []

    def check_full_hash(file_path):
        """Compute the full hash and record the file as a duplicate if it was seen before."""
        full_hash = get_hash(file_path)
        if full_hash:
            with lock:
                if full_hash in hashes_full:
                    duplicates.append(file_path)
                else:
                    hashes_full[full_hash] = file_path

    files_with_same_1k = [fp for flist in hashes_on_1k.values() for fp in flist if len(flist) > 1]
    with ThreadPoolExecutor() as executor:
        executor.map(check_full_hash, files_with_same_1k)

    # Delete the confirmed duplicates, keeping the first occurrence of each file.
    for file in duplicates:
        file.unlink()
    return len(duplicates)


def handle_photo_processing(photos, photos_path, duplicateflag):
    """Log download totals and, if requested, remove duplicate photos from photos_path."""
    if duplicateflag:
        logging.info("Checking for duplicates")
        duplicates_count = check_for_duplicates(photos_path)
        logging.info(f"Duplicates removed: {duplicates_count}")
        logging.info(f"Total downloaded: {len(photos) - duplicates_count} photos")
    else:
        logging.info(f"Total downloaded: {len(photos)} photos")