Skip to content

Commit

Permalink
optimize crypto: Improve file read speed while maintaining stability
Browse files Browse the repository at this point in the history
Signed-off-by: geyaning <[email protected]>
  • Loading branch information
geyaning committed Jul 12, 2023
1 parent 40b75f1 commit ee284c3
Showing 1 changed file with 31 additions and 11 deletions.
42 changes: 31 additions & 11 deletions avocado/utils/crypto.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@
import io
import logging
import os
from concurrent.futures import ThreadPoolExecutor


LOG = logging.getLogger(__name__)

def calculate_hash(data, algorithm):
    """Return the hexadecimal digest of *data*.

    :param data: Bytes-like object to be hashed.
    :param algorithm: Name of a hashlib algorithm (e.g. "md5", "sha256").
    :return: Hex digest string of *data* under the chosen algorithm.
    """
    # hashlib.new() accepts the initial data directly, so the
    # construct/update/hexdigest sequence collapses to one expression.
    return hashlib.new(algorithm, data).hexdigest()

def hash_file(filename, size=None, algorithm="md5"):
def hash_file(filename, size=None, algorithm="md5", buffer_size=65536, num_threads=1):
"""
Calculate the hash value of filename.
Expand All @@ -33,9 +39,10 @@ def hash_file(filename, size=None, algorithm="md5"):
:param filename: Path of the file that will have its hash calculated.
:param algorithm: Method used to calculate the hash (default is md5).
:param size: If provided, hash only the first size bytes of the file.
:param buffer_size: Buffer size for reading file data.
:param num_threads: Number of threads to use for parallel processing.
:return: Hash of the file, if something goes wrong, return None.
"""
chunksize = io.DEFAULT_BUFFER_SIZE
fsize = os.path.getsize(filename)

if not size or size > fsize:
Expand All @@ -50,14 +57,27 @@ def hash_file(filename, size=None, algorithm="md5"):
return None

with open(filename, "rb") as file_to_hash:
while size > 0:
if chunksize > size:
chunksize = size
data = file_to_hash.read(chunksize)
if len(data) == 0:
LOG.debug("Nothing left to read but size=%d", size)
break
hash_obj.update(data)
size -= len(data)
if num_threads > 1:
executor = ThreadPoolExecutor(max_workers=num_threads)
futures = []
while size > 0:
chunksize = min(buffer_size, size)
data = file_to_hash.read(chunksize)
if len(data) == 0:
LOG.debug("Nothing left to read but size=%d", size)
break
futures.append(executor.submit(calculate_hash, data, algorithm))
size -= len(data)
for future in futures:
hash_obj.update(future.result())
else:
while size > 0:
chunksize = min(buffer_size, size)
data = file_to_hash.read(chunksize)
if len(data) == 0:
LOG.debug("Nothing left to read but size=%d", size)
break
hash_obj.update(data)
size -= len(data)

return hash_obj.hexdigest()

0 comments on commit ee284c3

Please sign in to comment.