Skip to content

Commit

Permalink
Merge pull request #27 from ayasyrev:dev
Browse files Browse the repository at this point in the history
0.0.8
  • Loading branch information
ayasyrev committed Apr 6, 2024
2 parents e903244 + 12e150c commit 1e4bbad
Show file tree
Hide file tree
Showing 17 changed files with 393 additions and 139 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ application-import-names = nbmetaclean, tests
import-order-style = google
per-file-ignores =
# imported but unused
__init__.py: F401
__init__.py: F401
48 changes: 46 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,51 @@
repos:
- repo: https://github.com/ayasyrev/nbmetaclean
rev: 0.0.5
rev: 0.0.7
hooks:
- id: nbclean
name: nbclean
entry: nbclean
entry: nbclean

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-added-large-files
- id: check-ast
- id: check-builtin-literals
- id: check-case-conflict
- id: check-docstring-first
- id: check-executables-have-shebangs
- id: check-shebang-scripts-are-executable
- id: check-symlinks
- id: check-toml
- id: check-xml
- id: detect-private-key
- id: forbid-new-submodules
- id: forbid-submodules
- id: mixed-line-ending
- id: destroyed-symlinks
- id: fix-byte-order-marker
- id: check-json
- id: check-yaml
args: [ --unsafe ]
- id: debug-statements
- id: end-of-file-fixer
- id: trailing-whitespace
- id: requirements-txt-fixer
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.3.5
hooks:
# Run the linter.
- id: ruff
args: [ --fix ]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/pre-commit/pygrep-hooks
rev: v1.10.0
hooks:
- id: python-check-mock-methods
- id: python-use-type-annotations
- id: python-check-blanket-noqa
- id: python-use-type-annotations
- id: text-unicode-replacement-char
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pypi: dist
twine upload --repository pypi dist/*

dist: clean
python3 -m build
python3 -m build

clean:
rm -rf dist
rm -rf dist
34 changes: 33 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,34 @@
# nbmetaclean
nbmetaclean

Pre-commit hook that cleans Jupyter notebook metadata and `execution_count`, and optionally clears outputs.

Pure Python, no dependencies.

Can be used as a pre-commit hook or as a command line tool.

## Usage

### Pre-commit hook
Add to `.pre-commit-config.yaml`:
```yaml
repos:
- repo: https://github.com/ayasyrev/nbmetaclean
rev: 0.0.8
hooks:
- id: nbclean
name: nbclean
entry: nbclean
```

### Command line tool

Install:
```bash
pip install nbmetaclean
```

Usage:

```bash
nbclean
```
1 change: 0 additions & 1 deletion noxfile_lint.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import nox


locations = "src/nbmetaclean", "tests", "noxfile.py"


Expand Down
7 changes: 7 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
black
black[jupyter]
flake8
isort
mypy
pre-commit
ruff
2 changes: 0 additions & 2 deletions requirements_test_extra.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
coverage[toml]
nox
black
flake8
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ where = src
console_scripts =
nbclean=nbmetaclean.app:app
pipx.run =
nbclean=nbmetaclean.app:app
nbclean=nbmetaclean.app:app
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from setuptools import setup


REQUIREMENTS_FILENAME = 'requirements.txt'
REQUIREMENTS_TEST_FILENAME = 'requirements_test.txt'
REQUIREMENTS_FILENAME = "requirements.txt"
REQUIREMENTS_TEST_FILENAME = "requirements_test.txt"


# Requirements
Expand Down
64 changes: 57 additions & 7 deletions src/nbmetaclean/app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import argparse
from pathlib import Path
from typing import Union

from .clean import CleanConfig, clean_nb_file
from .core import get_nb_names
from .clean import CleanConfig, TupleStr, clean_nb_file
from .helpers import get_nb_names

parser = argparse.ArgumentParser(
prog="nbclean",
Expand All @@ -25,6 +26,47 @@
action="store_true",
help="Do not preserve timestamp.",
)
# Additional cleaning options for the `nbclean` command line.
# The `store_true` switches are False unless the flag is given.

# Opt-out switch: app() passes its negation as CleanConfig.clear_nb_metadata.
parser.add_argument(
"--dont_clear_nb_metadata",
action="store_true",
help="Do not clear notebook metadata.",
)
# Opt-in switch: passed unchanged as CleanConfig.clear_cell_metadata.
parser.add_argument(
"--clear_cell_metadata",
action="store_true",
help="Clear cell metadata.",
)
# Opt-in switch: passed unchanged as CleanConfig.clear_outputs.
parser.add_argument(
"--clear_outputs",
action="store_true",
help="Clear outputs.",
)
# One or more mask strings (nargs="+"); process_mask() splits each on "."
# before it reaches CleanConfig.nb_metadata_preserve_mask.
parser.add_argument(
"--nb_metadata_preserve_mask",
nargs="+",
help="Preserve mask for notebook metadata.",
)
# Same format as above, used for CleanConfig.cell_metadata_preserve_mask.
parser.add_argument(
"--cell_metadata_preserve_mask",
nargs="+",
help="Preserve mask for cell metadata.",
)
# Opt-out switch: app() passes its negation as CleanConfig.mask_merge.
parser.add_argument(
"--dont_merge_masks",
action="store_true",
help="Do not merge masks.",
)
# Forwarded by app() to get_nb_names() as its `hidden` argument.
parser.add_argument(
"--clean_hidden_nbs",
action="store_true",
help="Clean hidden notebooks.",
)


def process_mask(mask: Union[list[str], None]) -> Union[tuple[TupleStr, ...], None]:
    """Convert dotted mask strings into a tuple of key-path tuples.

    Every item of `mask` is split on "." (for example "language_info.name"
    becomes ("language_info", "name")). When `mask` is None, None is
    returned unchanged.
    """
    return (
        None
        if mask is None
        else tuple([tuple(dotted.split(".")) for dotted in mask])
    )


def app() -> None:
Expand All @@ -36,17 +78,25 @@ def app() -> None:
print(f"Path: {', '.join(cfg.path)}, preserve timestamp: {not cfg.not_pt}")
for path in path_list:
try:
nb_files.extend(get_nb_names(path))
nb_files.extend(get_nb_names(path, hidden=cfg.clean_hidden_nbs))
except FileNotFoundError:
print(f"{path} not exists!")
if not cfg.silent:
print(f"notebooks to check: {len(nb_files)} ")
clean_config = CleanConfig(
clear_nb_metadata=not cfg.dont_clear_nb_metadata,
clear_cell_metadata=cfg.clear_cell_metadata,
clear_execution_count=True,
clear_outputs=cfg.clear_outputs,
preserve_timestamp=not cfg.not_pt,
silent=cfg.silent,
nb_metadata_preserve_mask=process_mask(cfg.nb_metadata_preserve_mask),
cell_metadata_preserve_mask=process_mask(cfg.cell_metadata_preserve_mask),
mask_merge=not cfg.dont_merge_masks,
)
cleaned, errors = clean_nb_file(
nb_files,
CleanConfig(
silent=cfg.silent,
preserve_timestamp=not cfg.not_pt,
),
clean_config,
)
if not cfg.silent:
print(f"cleaned nbs: {len(cleaned)}")
Expand Down
51 changes: 27 additions & 24 deletions src/nbmetaclean/clean.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from __future__ import annotations

import copy
from dataclasses import dataclass
import os

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Optional, Tuple, Union

from nbmetaclean.helpers import read_nb, write_nb

from nbmetaclean.core import read_nb, write_nb
from .typing import Cell, CodeCell, Metadata, Nb, Output

from .typing import NbNode, Metadata
TupleStr = Tuple[str, ...]

NB_METADATA_PRESERVE_MASKS = (
("language_info", "name"),
Expand Down Expand Up @@ -41,14 +43,14 @@ class CleanConfig:
clear_outputs: bool = False
preserve_timestamp: bool = True
silent: bool = False
nb_metadata_preserve_mask: Optional[Iterable[tuple[str, ...]]] = None
cell_metadata_preserve_mask: Optional[Iterable[tuple[str, ...]]] = None
nb_metadata_preserve_mask: Optional[tuple[TupleStr, ...]] = None
cell_metadata_preserve_mask: Optional[tuple[TupleStr, ...]] = None
mask_merge: bool = True


def filter_meta_mask(
nb_meta: Union[str, int, Metadata],
mask: Optional[Iterable[tuple[str, ...]]] = None,
mask: Optional[tuple[str, ...]] = None,
) -> Union[str, int, Metadata]:
"""Filter metadata by mask. If no mask return empty dict."""
if isinstance(nb_meta, (str, int)) or mask == ():
Expand All @@ -65,7 +67,7 @@ def filter_meta_mask(

def filter_metadata(
nb_meta: Metadata,
masks: Optional[list[tuple[str, ...]]] = None,
masks: Optional[tuple[TupleStr, ...]] = None,
) -> Metadata:
"""Clean notebooknode metadata."""
if masks is None:
Expand All @@ -77,38 +79,40 @@ def filter_metadata(


def clean_cell(
cell: NbNode,
cell: Cell | CodeCell,
cfg: CleanConfig,
) -> bool:
"""Clean cell: optionally metadata, execution_count and outputs."""
changed = False

if cfg.clear_cell_metadata:
if metadata := cell.get("metadata", None):
if cell.get("metadata", None):
metadata = cell["metadata"]
old_metadata = copy.deepcopy(metadata)
cell["metadata"] = filter_metadata(
metadata, cfg.cell_metadata_preserve_mask
)
if cell["metadata"] != old_metadata:
changed = True

if cfg.clear_execution_count and cell.get("execution_count"):
cell["execution_count"] = None
changed = True

if cell.get("outputs"):
if cfg.clear_outputs:
cell["outputs"] = []
if cell["cell_type"] == "code":
if cfg.clear_execution_count and cell.get("execution_count"):
cell["execution_count"] = None # type: ignore # it's code cell
changed = True
elif cfg.clear_cell_metadata or cfg.clear_execution_count:
result = clean_outputs(cell["outputs"], cfg)
if result:

if cell.get("outputs"):
if cfg.clear_outputs:
cell["outputs"] = [] # type: ignore # it's code cell
changed = True
elif cfg.clear_cell_metadata or cfg.clear_execution_count:
result = clean_outputs(cell["outputs"], cfg) # type: ignore # it's code cell
if result:
changed = True

return changed


def clean_outputs(outputs: list[NbNode], cfg: CleanConfig) -> bool:
def clean_outputs(outputs: list[Output], cfg: CleanConfig) -> bool:
"""Clean outputs."""
changed = False
for output in outputs:
Expand All @@ -126,7 +130,7 @@ def clean_outputs(outputs: list[NbNode], cfg: CleanConfig) -> bool:


def clean_nb(
nb: NbNode,
nb: Nb,
cfg: CleanConfig,
) -> bool:
"""Clean notebook - metadata, execution_count, outputs.
Expand All @@ -148,7 +152,6 @@ def clean_nb(
masks = cfg.nb_metadata_preserve_mask
else:
masks = cfg.nb_metadata_preserve_mask + masks

nb["metadata"] = filter_metadata(metadata, masks=masks)
if nb["metadata"] != old_metadata:
changed = True
Expand Down
Loading

0 comments on commit 1e4bbad

Please sign in to comment.