From cee35291509d1c09aa5b188c61d6a21e3b12c1d0 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 17 Nov 2023 19:27:55 +0200 Subject: [PATCH] index: md5: use hash_file The main issue is that we don't use md5 provided by the fs (e.g. dvcfs), which results in needless hash recomputing. We can just use tried-and-tested `hash_file` here for now. Fixes https://github.com/iterative/dvc/issues/10059 --- src/dvc_data/index/save.py | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/src/dvc_data/index/save.py b/src/dvc_data/index/save.py index 89f7ebb1..416f9339 100644 --- a/src/dvc_data/index/save.py +++ b/src/dvc_data/index/save.py @@ -4,8 +4,7 @@ from dvc_objects.fs.callbacks import DEFAULT_CALLBACK -from ..hashfile.hash import DEFAULT_ALGORITHM -from ..hashfile.hash_info import HashInfo +from ..hashfile.hash import DEFAULT_ALGORITHM, hash_file from ..hashfile.meta import Meta from ..hashfile.tree import Tree @@ -25,7 +24,6 @@ def md5( name: str = DEFAULT_ALGORITHM, check_meta: bool = True, ) -> None: - from ..hashfile.hash import fobj_md5 from .index import DataIndexEntry entries = {} @@ -39,37 +37,23 @@ def md5( fs, path = index.storage_map.get_storage(entry, storage) + info = None if check_meta: try: - meta = Meta.from_info(fs.info(path), fs.protocol) + info = fs.info(path) except FileNotFoundError: continue + meta = Meta.from_info(info, fs.protocol) if entry.meta != meta: continue - if state: - _, hash_info = state.get(path, fs) - if hash_info: - entries[key] = DataIndexEntry( - key=entry.key, - meta=entry.meta, - hash_info=hash_info, - ) - continue - - with fs.open(path, "rb") as fobj: - entries[key] = DataIndexEntry( - key=entry.key, - meta=entry.meta, - hash_info=HashInfo( - name, - fobj_md5(fobj, name=name), - ), - ) - - if state: - state.save(path, fs, entries[key].hash_info) + meta, hash_info = hash_file(path, fs, name, state=state, info=info) + entries[key] = DataIndexEntry( + key=entry.key, + meta=entry.meta, + hash_info=hash_info, + ) for key, entry in entries.items(): index[key] = entry