Skip to content

Commit

Permalink
dev(narugo): add squash command
Browse files Browse the repository at this point in the history
  • Loading branch information
narugo1992 committed Aug 10, 2024
1 parent 1661e69 commit d515a9d
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 2 deletions.
2 changes: 2 additions & 0 deletions felinewhisker/entry/cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from .dispatch import felinewhiskercli
from .init import _add_init_subcommand
from .squash import _add_squash_subcommand

# Registration hooks, one per subcommand. `_add_squash_subcommand` takes a
# click group, attaches the `squash` command to it and returns the group;
# `_add_init_subcommand` presumably has the same shape -- TODO confirm in .init.
_DECORATORS = [
_add_init_subcommand,
_add_squash_subcommand,
]

# Start from the base CLI object imported from `.dispatch`.
# NOTE(review): the code that applies `_DECORATORS` to `cli` is presumably
# further down in this module and is not visible here -- confirm.
cli = felinewhiskercli
Expand Down
45 changes: 45 additions & 0 deletions felinewhisker/entry/squash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import logging
from typing import Optional

import click
from hfutils.utils import get_requests_session, ColoredFormatter
from huggingface_hub import configure_http_backend

from .base import CONTEXT_SETTINGS, ClickErrorException
from ..repository import LocalRepository, HfOnlineRepository


class NoDatasetAssigned(ClickErrorException):
    """Error raised when a command is given neither a local dataset
    directory nor a HuggingFace repository to operate on.
    """

    # NOTE(review): presumably used as the process exit status by
    # ClickErrorException (0x10 == 16) -- confirm against `.base`.
    exit_code = 0x10


def _add_squash_subcommand(cli: click.Group) -> click.Group:
    """Register the ``squash`` subcommand on the given click group.

    The registered command configures the HuggingFace HTTP backend and root
    logging, selects the dataset to work on and calls its ``squash()`` method.

    :param cli: Click group the ``squash`` command is attached to.
    :return: The same ``cli`` group, so registration calls can be chained.
    """

    @cli.command(
        'squash',
        help='Squash a dataset by merging all the unarchived contributions.\n\n'
             'Set environment $HF_TOKEN to use your own access token.',
        context_settings=CONTEXT_SETTINGS,
    )
    @click.option('-d', '--directory', 'directory', type=str, default=None,
                  help='Local directory of the dataset.', show_default=False)
    @click.option('-r', '--repository', 'repository', type=str, default=None,
                  help='HuggingFace Repository of the dataset.', show_default=False)
    def squash(directory: Optional[str], repository: Optional[str]):
        # Route huggingface_hub's HTTP traffic through hfutils' session factory.
        configure_http_backend(get_requests_session)

        # Emit INFO-level records from the root logger through a coloured
        # stream handler.
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(ColoredFormatter())
        root_logger.addHandler(stream_handler)

        # Neither source given (None or empty string): nothing to squash.
        if not (repository or directory):
            raise NoDatasetAssigned(
                'No dataset assigned. '
                'You have to use either -d or -r option to assign a local or a HF-based dataset.'
            )

        # When both options are supplied, the remote repository takes precedence.
        target = (
            HfOnlineRepository(repo_id=repository)
            if repository
            else LocalRepository(directory)
        )
        target.squash()

    return cli
12 changes: 10 additions & 2 deletions felinewhisker/repository/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def _squash(self):
records = {}

files_to_drop = []
new_authors = set()
for filepath in natsorted(hf_fs.glob(hf_fs_path(
repo_id=self._repo_id,
repo_type='dataset',
Expand All @@ -180,6 +181,8 @@ def _squash(self):
filename=filename,
)).to_dict('records'):
records[item['id']] = item
if item['author']:
new_authors.add(item['author'])
files_to_drop.append(filename)

def _load_image_by_id(id_: str):
Expand Down Expand Up @@ -223,13 +226,18 @@ def _load_image_by_id(id_: str):
for file in files_to_drop:
operations.append(CommitOperationDelete(path_in_repo=file))

commit_message = f'Squash {plural_word(len(files_to_drop), "package")}, ' \
f'now this dataset contains {plural_word(len(df), "sample")}'
if new_authors:
commit_message = f'{commit_message}, ' \
f'contributed by {", ".join([f"@{name}" for name in sorted(new_authors)])}'

hf_client.create_commit(
repo_id=self._repo_id,
repo_type='dataset',
revision=self._revision,
operations=operations,
commit_message=f'Squash {plural_word(len(files_to_drop), "package")}, '
f'now this dataset contains {plural_word(len(df), "sample")}'
commit_message=commit_message,
)

def __repr__(self):
Expand Down

0 comments on commit d515a9d

Please sign in to comment.