From d515a9dd6dcaa844aa13ea84bdb88b8ade938274 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Sun, 11 Aug 2024 02:13:39 +0800 Subject: [PATCH] dev(narugo): add squash command --- felinewhisker/entry/cli.py | 2 ++ felinewhisker/entry/squash.py | 45 +++++++++++++++++++++++++ felinewhisker/repository/huggingface.py | 12 +++++-- 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 felinewhisker/entry/squash.py diff --git a/felinewhisker/entry/cli.py b/felinewhisker/entry/cli.py index 6a0e162..34f603e 100644 --- a/felinewhisker/entry/cli.py +++ b/felinewhisker/entry/cli.py @@ -1,8 +1,10 @@ from .dispatch import felinewhiskercli from .init import _add_init_subcommand +from .squash import _add_squash_subcommand _DECORATORS = [ _add_init_subcommand, + _add_squash_subcommand, ] cli = felinewhiskercli diff --git a/felinewhisker/entry/squash.py b/felinewhisker/entry/squash.py new file mode 100644 index 0000000..5e91981 --- /dev/null +++ b/felinewhisker/entry/squash.py @@ -0,0 +1,45 @@ +import logging +from typing import Optional + +import click +from hfutils.utils import get_requests_session, ColoredFormatter +from huggingface_hub import configure_http_backend + +from .base import CONTEXT_SETTINGS, ClickErrorException +from ..repository import LocalRepository, HfOnlineRepository + + +class NoDatasetAssigned(ClickErrorException): + exit_code = 0x10 + + +def _add_squash_subcommand(cli: click.Group) -> click.Group: + @cli.command('squash', help='Squash a dataset by merging all the unarchived contributions.\n\n' + 'Set environment $HF_TOKEN to use your own access token.', + context_settings=CONTEXT_SETTINGS) + @click.option('-d', '--directory', 'directory', type=str, default=None, + help='Local directory of the dataset.', show_default=False) + @click.option('-r', '--repository', 'repository', type=str, default=None, + help='HuggingFace Repository of the dataset.', show_default=False) + def squash(directory: Optional[str], repository: Optional[str]): + configure_http_backend(get_requests_session) + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler() + console_handler.setFormatter(ColoredFormatter()) + logger.addHandler(console_handler) + + if repository: + repo = HfOnlineRepository(repo_id=repository) + elif directory: + repo = LocalRepository(directory) + else: + raise NoDatasetAssigned( + 'No dataset assigned. ' + 'You have to use either -d or -r option to assign a local or a HF-based dataset.' + ) + + repo.squash() + + return cli diff --git a/felinewhisker/repository/huggingface.py b/felinewhisker/repository/huggingface.py index be1be86..25812d8 100644 --- a/felinewhisker/repository/huggingface.py +++ b/felinewhisker/repository/huggingface.py @@ -166,6 +166,7 @@ def _squash(self): records = {} files_to_drop = [] + new_authors = set() for filepath in natsorted(hf_fs.glob(hf_fs_path( repo_id=self._repo_id, repo_type='dataset', @@ -180,6 +181,8 @@ def _squash(self): filename=filename, )).to_dict('records'): records[item['id']] = item + if item['author']: + new_authors.add(item['author']) files_to_drop.append(filename) def _load_image_by_id(id_: str): @@ -223,13 +226,18 @@ def _load_image_by_id(id_: str): for file in files_to_drop: operations.append(CommitOperationDelete(path_in_repo=file)) + commit_message = f'Squash {plural_word(len(files_to_drop), "package")}, ' \ + f'now this dataset contains {plural_word(len(df), "sample")}' + if new_authors: + commit_message = f'{commit_message}, ' \ + f'contributed by {", ".join([f"@{name}" for name in sorted(new_authors)])}' + hf_client.create_commit( repo_id=self._repo_id, repo_type='dataset', revision=self._revision, operations=operations, - commit_message=f'Squash {plural_word(len(files_to_drop), "package")}, ' - f'now this dataset contains {plural_word(len(df), "sample")}' + commit_message=commit_message, ) def __repr__(self):