diff --git a/.gitignore b/.gitignore index 12131a39..3cafea86 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,5 @@ _site/ objects.json reference/ src/ + +/.luarc.json diff --git a/pins/boards.py b/pins/boards.py index e1f03053..dc0cdb64 100644 --- a/pins/boards.py +++ b/pins/boards.py @@ -14,7 +14,7 @@ from .versions import VersionRaw, guess_version from .meta import Meta, MetaRaw, MetaFactory from .errors import PinsError -from .drivers import load_data, save_data, default_title +from .drivers import load_data, save_data, load_file, default_title from .utils import inform, warn_deprecated, ExtendMethodDoc from .config import get_allow_rsc_short_name @@ -225,7 +225,7 @@ def pin_read(self, name, version: Optional[str] = None, hash: Optional[str] = No meta, self.construct_path([pin_name, meta.version.version]) ) - def pin_write( + def _pin_store( self, x, name: Optional[str] = None, @@ -236,32 +236,6 @@ def pin_write( versioned: Optional[bool] = None, created: Optional[datetime] = None, ) -> Meta: - """Write a pin object to the board. - - Parameters - ---------- - x: - An object (e.g. a pandas DataFrame) to pin. - name: - Pin name. - type: - File type used to save `x` to disk. May be "csv", "arrow", "parquet", - "joblib", "json", or "file". - title: - A title for the pin; most important for shared boards so that others - can understand what the pin contains. If omitted, a brief description - of the contents will be automatically generated. - description: - A detailed description of the pin contents. - metadata: - A dictionary containing additional metadata to store with the pin. - This gets stored on the Meta.user field. - versioned: - Whether the pin should be versioned. Defaults to versioning. - created: - A date to store in the Meta.created field. This field may be used as - part of the pin version name. - """ if type == "feather": warn_deprecated( @@ -271,6 +245,18 @@ def pin_write( ) type = "arrow" + if type == "file": + # the file type makes the name of the data the exact filename, rather + # than the pin name + a suffix (e.g. my_pin.csv). + if isinstance(x, (tuple, list)) and len(x) == 1: + x = x[0] + + _p = Path(x) + _base_len = len(_p.name) - len("".join(_p.suffixes)) + object_name = _p.name[:_base_len] + else: + object_name = None + pin_name = self.path_to_pin(name) with tempfile.TemporaryDirectory() as tmp_dir: @@ -285,6 +271,7 @@ def pin_write( metadata, versioned, created, + object_name=object_name, ) # move pin to destination ---- @@ -326,7 +313,55 @@ def pin_write( return meta - def pin_download(self, name, version=None, hash=None): + def pin_write( + self, + x, + name: Optional[str] = None, + type: Optional[str] = None, + title: Optional[str] = None, + description: Optional[str] = None, + metadata: Optional[Mapping] = None, + versioned: Optional[bool] = None, + created: Optional[datetime] = None, + ) -> Meta: + """Write a pin object to the board. + + Parameters + ---------- + x: + An object (e.g. a pandas DataFrame) to pin. + name: + Pin name. + type: + File type used to save `x` to disk. May be "csv", "arrow", "parquet", + "joblib", or "json". + title: + A title for the pin; most important for shared boards so that others + can understand what the pin contains. If omitted, a brief description + of the contents will be automatically generated. + description: + A detailed description of the pin contents. + metadata: + A dictionary containing additional metadata to store with the pin. + This gets stored on the Meta.user field. + versioned: + Whether the pin should be versioned. Defaults to versioning. + created: + A date to store in the Meta.created field. This field may be used as + part of the pin version name. + """ + + if type == "file": + raise NotImplementedError( + ".pin_write() does not support type='file'. " + "Use .pin_upload() to save a file as a pin." + ) + + return self._pin_store( + x, name, type, title, description, metadata, versioned, created + ) + + def pin_download(self, name, version=None, hash=None) -> Sequence[str]: """Download the files contained in a pin. This method only downloads the files in a pin. In order to read and load @@ -342,20 +377,68 @@ def pin_download(self, name, version=None, hash=None): A hash used to validate the retrieved pin data. If specified, it is compared against the `pin_hash` field retrived by [](`~pins.boards.BaseBoard.pin_meta`). - """ - raise NotImplementedError() - def pin_upload(self, paths, name=None, title=None, description=None, metadata=None): + meta = self.pin_fetch(name, version) + + if hash is not None: + raise NotImplementedError("TODO: validate hash") + + pin_name = self.path_to_pin(name) + + # TODO: raise for multiple files + # fetch file + f = load_file( + meta, self.fs, self.construct_path([pin_name, meta.version.version]) + ) + + # could also check whether f isinstance of PinCache + fname = getattr(f, "name", None) + + if fname is None: + raise PinsError("pin_download requires a cache.") + + return [str(Path(fname).absolute())] + + def pin_upload( + self, + paths: "str | list[str]", + name=None, + title=None, + description=None, + metadata=None, + ): """Write a pin based on paths to one or more files. This method simply uploads the files given, so they can be downloaded later using [](`~pins.boards.BaseBoard.pin_download`). + + Parameters + ---------- + paths: + Paths of files to upload. Currently, only uploading a single file + is supported. + name: + Pin name. + title: + A title for the pin; most important for shared boards so that others + can understand what the pin contains. If omitted, a brief description + of the contents will be automatically generated. + description: + A detailed description of the pin contents. + metadata: + A dictionary containing additional metadata to store with the pin. + This gets stored on the Meta.user field. """ - # TODO(question): why does this method exist? Isn't it equiv to a user - # doing this?: pin_write(board, c("filea.txt", "fileb.txt"), type="file") - # pin_download makes since, because it will download *regardless of type* - raise NotImplementedError() + + return self._pin_store( + paths, + name, + type="file", + title=title, + description=description, + metadata=metadata, + ) def pin_version_delete(self, name: str, version: str): """Delete a single version of a pin. @@ -553,6 +636,7 @@ def prepare_pin_version( metadata: Optional[Mapping] = None, versioned: Optional[bool] = None, created: Optional[datetime] = None, + object_name: Optional[str] = None, ): if name is None: raise NotImplementedError("Name must be specified.") @@ -570,7 +654,10 @@ def prepare_pin_version( # save all pin data to a temporary folder (including data.txt), so we # can fs.put it all straight onto the backend filesystem - p_obj = Path(pin_dir_path) / name + if object_name is None: + p_obj = Path(pin_dir_path) / name + else: + p_obj = Path(pin_dir_path) / object_name # file is saved locally in order to hash, calc size file_names = save_data(x, str(p_obj), type) @@ -716,12 +803,19 @@ def pin_download(self, name, version=None, hash=None) -> Sequence[str]: meta = self.pin_meta(name, version) if isinstance(meta, MetaRaw): + f = load_file(meta, self.fs, None) + else: + raise NotImplementedError( + "TODO: pin_download currently can only read a url to a single file." + ) - return self._load_data(meta, None) + # could also check whether f isinstance of PinCache + fname = getattr(f, "name", None) - raise NotImplementedError( - "TODO: pin_download currently can only read a url to a single file." - ) + if fname is None: + raise PinsError("pin_download requires a cache.") + + return [str(Path(fname).absolute())] def construct_path(self, elements): # TODO: in practice every call to construct_path has the first element of diff --git a/pins/drivers.py b/pins/drivers.py index 06e631f1..1b2b6cfb 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -23,6 +23,34 @@ def _assert_is_pandas_df(x): ) +def load_path(meta, path_to_version): + # Check that only a single file name was given + fnames = [meta.file] if isinstance(meta.file, str) else meta.file + if len(fnames) > 1 and type in REQUIRES_SINGLE_FILE: + raise ValueError("Cannot load data when more than 1 file") + + # file path creation ------------------------------------------------------ + + if type == "table": + # this type contains an rds and csv files named data.{ext}, so we match + # R pins behavior and hardcode the name + target_fname = "data.csv" + else: + target_fname = fnames[0] + + if path_to_version is not None: + path_to_file = f"{path_to_version}/{target_fname}" + else: + # BoardUrl doesn't have versions, and the file is the full url + path_to_file = target_fname + + return path_to_file + + +def load_file(meta: Meta, fs, path_to_version): + return fs.open(load_path(meta, path_to_version)) + + def load_data( meta: Meta, fs, @@ -39,6 +67,7 @@ def load_data( path_to_version: A filepath used as the parent directory the data to-be-loaded lives in. """ + # TODO: extandable loading with deferred importing if meta.type in UNSAFE_TYPES and not get_allow_pickle_read(allow_pickle_read): raise PinsInsecureReadError( @@ -50,65 +79,47 @@ def load_data( " * https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations" ) - # Check that only a single file name was given - fnames = [meta.file] if isinstance(meta.file, str) else meta.file - if len(fnames) > 1 and type in REQUIRES_SINGLE_FILE: - raise ValueError("Cannot load data when more than 1 file") - - # file path creation ------------------------------------------------------ - - if type == "table": - # this type contains an rds and csv files named data.{ext}, so we match - # R pins behavior and hardcode the name - target_fname = "data.csv" - else: - target_fname = fnames[0] + with load_file(meta, fs, path_to_version) as f: + if meta.type == "csv": + import pandas as pd - if path_to_version is not None: - path_to_file = f"{path_to_version}/{target_fname}" - else: - path_to_file = target_fname - - # type handling ----------------------------------------------------------- - - if meta.type == "csv": - import pandas as pd + return pd.read_csv(f) - return pd.read_csv(fs.open(path_to_file)) + elif meta.type == "arrow": + import pandas as pd - elif meta.type == "arrow": - import pandas as pd + return pd.read_feather(f) - return pd.read_feather(fs.open(path_to_file)) + elif meta.type == "feather": + import pandas as pd - elif meta.type == "feather": - import pandas as pd + return pd.read_feather(f) - return pd.read_feather(fs.open(path_to_file)) + elif meta.type == "parquet": + import pandas as pd - elif meta.type == "parquet": - import pandas as pd + return pd.read_parquet(f) - return pd.read_parquet(fs.open(path_to_file)) + elif meta.type == "table": + import pandas as pd - elif meta.type == "table": - import pandas as pd + return pd.read_csv(f) - return pd.read_csv(fs.open(path_to_file)) + elif meta.type == "joblib": + import joblib - elif meta.type == "joblib": - import joblib - - return joblib.load(fs.open(path_to_file)) + return joblib.load(f) - elif meta.type == "file": - # TODO: update to handle multiple files - return [str(Path(fs.open(path_to_file).name).absolute())] + elif meta.type == "json": + import json - elif meta.type == "json": - import json + return json.load(f) - return json.load(fs.open(path_to_file)) + elif meta.type == "file": + raise NotImplementedError( + "Methods like `.pin_read()` are not able to read 'file' type pins." + " Use `.pin_download()` to download the file." + ) raise NotImplementedError(f"No driver for type {meta.type}") @@ -124,9 +135,14 @@ def save_data( # of saving / loading objects different ways. if apply_suffix: - final_name = f"{fname}.{type}" + if type == "file": + suffix = "".join(Path(obj).suffixes) + else: + suffix = f".{type}" else: - final_name = fname + suffix = "" + + final_name = f"{fname}{suffix}" if type == "csv": _assert_is_pandas_df(obj) @@ -162,6 +178,14 @@ def save_data( json.dump(obj, open(final_name, "w")) + elif type == "file": + import contextlib + import shutil + + # ignore the case where the source is the same as the target + with contextlib.suppress(shutil.SameFileError): + shutil.copyfile(str(obj), final_name) + else: raise NotImplementedError(f"Cannot save type: {type}") diff --git a/pins/tests/test_boards.py b/pins/tests/test_boards.py index 3f1c9aa2..23095e4d 100644 --- a/pins/tests/test_boards.py +++ b/pins/tests/test_boards.py @@ -31,6 +31,26 @@ def board(backend): backend.teardown() +@fixture +def board_with_cache(backend): + from pins.constructors import board as board_constructor, board_rsconnect + + board = backend.create_tmp_board() + + if backend.fs_name == "rsc": + # The rsconnect board is special, in that it's slower to set up and tear down, + # so our test suite uses multiple rsconnect users in testing its API, and + # board behavior. As a result, we need to pass the credentials directly in. + server_url, api_key = board.fs.api.server_url, board.fs.api.api_key + board_with_cache = board_rsconnect(server_url=server_url, api_key=api_key) + else: + board_with_cache = board_constructor(backend.fs_name, board.board) + + yield board_with_cache + + backend.teardown() + + # misc ======================================================================== @@ -103,6 +123,114 @@ def test_board_pin_write_feather_deprecated(board): board.pin_write(df, "cool_pin", type="feather") +def test_board_pin_write_file_raises_error(board, tmp_path): + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path.joinpath("data.csv") + df.to_csv(path, index=False) + + # TODO: should this error? + with pytest.raises(NotImplementedError): + board.pin_write(path, "cool_pin", type="file") + + +def test_board_pin_download(board_with_cache, tmp_path): + # create and save data + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path / "data.csv" + df.to_csv(path, index=False) + + meta = board_with_cache.pin_upload(path, "cool_pin") + assert meta.type == "file" + + (pin_path,) = board_with_cache.pin_download("cool_pin") + df = pd.read_csv(pin_path) + assert df.x.tolist() == [1, 2, 3] + + with pytest.raises(NotImplementedError): + board_with_cache.pin_read("cool_pin") + + +def test_board_pin_download_filename_many_suffixes(board_with_cache, tmp_path): + # create and save data + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path / "data.a.b.csv" + df.to_csv(path, index=False) + + board_with_cache.pin_upload(path, "cool_pin") + + (pin_path,) = board_with_cache.pin_download("cool_pin") + assert Path(pin_path).name == "data.a.b.csv" + + df = pd.read_csv(pin_path) + assert df.x.tolist() == [1, 2, 3] + + +def test_board_pin_download_filename_no_suffixes(board_with_cache, tmp_path): + # create and save data + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path / "data" + df.to_csv(path, index=False) + + board_with_cache.pin_upload(path, "cool_pin") + + (pin_path,) = board_with_cache.pin_download("cool_pin") + assert Path(pin_path).name == "data" + + df = pd.read_csv(pin_path) + assert df.x.tolist() == [1, 2, 3] + + +def test_board_pin_download_filename(board_with_cache, tmp_path): + # create and save data + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path / "data.csv" + df.to_csv(path, index=False) + + meta = board_with_cache.pin_upload(path, "cool_pin") + + assert meta.file == "data.csv" + + (pin_path,) = board_with_cache.pin_download("cool_pin") + assert Path(pin_path).name == "data.csv" + + +def test_board_pin_download_no_cache_error(board, tmp_path): + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path / "data.csv" + df.to_csv(path, index=False) + + # TODO: should this error? + meta = board.pin_upload(path, "cool_pin") + assert meta.type == "file" + + # file boards work okay, since the board directory itself is the cache + if board.fs.protocol == "file": + pytest.skip() + + # uncached boards should fail, since nowhere to store the download + with pytest.raises(PinsError): + (pin_path,) = board.pin_download("cool_pin") + + +def test_board_pin_upload_path_list(board_with_cache, tmp_path): + # create and save data + df = pd.DataFrame({"x": [1, 2, 3]}) + + path = tmp_path / "data.csv" + df.to_csv(path, index=False) + + meta = board_with_cache.pin_upload([path], "cool_pin") + assert meta.type == "file" + + (pin_path,) = board_with_cache.pin_download("cool_pin") + + def test_board_pin_write_rsc_index_html(board, tmp_dir2, snapshot): if board.fs.protocol != "rsc": pytest.skip()