Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore-datadir #238

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bw2io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"MultiOutputEcospold1Importer",
"normalize_units",
"restore_project_directory",
"restore_data_directory",
"SimaProCSVImporter",
"SimaProLCIACSVImporter",
"SingleOutputEcospold1Importer",
Expand All @@ -60,6 +61,7 @@
)
from .backup import (
backup_data_directory,
restore_data_directory,
backup_project_directory,
restore_project_directory,
)
Expand Down
162 changes: 144 additions & 18 deletions bw2io/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,23 @@


def backup_data_directory(
timestamp: Optional[bool] = True, dir_backup: Optional[Union[str, Path]] = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't love changing argument order, but we haven't released with this yet and the new order is more logical, so OK.

dir_backup: Optional[Union[str, Path]] = None,
timestamp: Optional[bool] = True,
file_suffix: Optional[str] = None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is the right name. The initial understanding of this would be the last part of the name (e.g. .tar.gz, see the pathlib docs). What about 'filename_extra' or similar instead?

):
"""
Backup the Brightway2 data directory to a `.tar.gz` (compressed tar archive) in a specified directory, or in the user's home directory by default.

The file name is of the form "brightway2-data-backup.{timestamp}.tar.gz", unless `timestamp` is False, in which case the file name is "brightway2-data-backup.tar.gz".
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of extra filename parts, why not allow users to set the filename itself? This seems more reasonable than "brightway2-data-backup{file_suffix}{timestamp}.tar.gz", which is a mouthful.

The file name is of the form "brightway2-data-backup{file_suffix}{timestamp}.tar.gz", unless `timestamp` is False, in which case the file name is "brightway2-data-backup{file_suffix}.tar.gz".

Parameters
----------
timestamp : bool, optional
If True, append a timestamp to the backup file name.

dir_backup : str, Path, optional
Directory to backup. If None, use the user's home directory.
timestamp : bool, optional
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Give a spec (e.g. ISO 8601) or example of the timestamp

If True, append a timestamp to the backup file name.
file_suffix : str, optional
A suffix to append to the file name.

Raises
------
Expand All @@ -41,12 +44,16 @@ def backup_data_directory(
>>> bw2io.bw2setup()
>>> bw2io.backup.backup_data_directory()
Creating backup archive - this could take a few minutes...

See Also
--------
bw2io.backup.restore_data_directory: To restore a data directory from a backup.
"""

dir_backup = Path(dir_backup or Path.home())

# Check if the backup directory exists and is writable
if not dir_backup.exists():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch

if not dir_backup.is_dir():
raise FileNotFoundError(f"The directory {dir_backup} does not exist.")
if not os.access(dir_backup, os.W_OK):
raise PermissionError(f"The directory {dir_backup} is not writable.")
Expand All @@ -55,40 +62,155 @@ def backup_data_directory(
timestamp_str = (
f'.{datetime.datetime.now().strftime("%d-%B-%Y-%I-%M%p")}' if timestamp else ""
)
backup_filename = f"brightway2-data-backup{timestamp_str}.tar.gz"
backup_filename = f"brightway2-data-backup{file_suffix}{timestamp_str}.tar.gz"
fp = dir_backup / backup_filename

# Create the backup archive
print("Creating backup archive of data directory - this could take a few minutes...")
print(
"Creating backup archive of data directory - this could take a few minutes..."
)
with tarfile.open(fp, "w:gz") as tar:
data_directory = Path(projects._base_data_dir)
tar.add(data_directory, arcname=data_directory.name)

print(f"Saved to: {fp}")

return fp


def restore_data_directory(
fp: Union[str, Path],
new_data_dir: Optional[Union[str, Path]] = None,
):
"""
Restores a brightway data directory from a tar.gz file. If the data directory already exists, you must confirm that you want to delete it.
If ``new_data_dir`` is specified, the data directory will be restored to that location. Otherwise, the data directory will be restored to the default location. e.g: ``~/.local/share/brightway3`` on Linux.

Parameters
----------
fp (Union[str, Path]):
The file path of the tar.gz file.
new_data_dir (Optional[Union[str, Path]]):
The new data directory path.

Raises
------
ValueError:
If the file doesn't exist
PermissionError:
If the data directory is not writable.
Exception:
If there's an attempted path traversal in the tar file.

Returns
-------
data_dir (Path):
The path of the restored data directory.

See Also
--------
bw2io.backup.backup_data_directory: To backup the data directory.
"""

# check if file exists
fp = Path(fp)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we are doing this, then I don't see any reason to allow str inputs. I thought the point was to allow for non-filesystem resources, but I would prefer for all filepaths to be Paths. I have taken this approach in bw2data.

if not fp.is_file():
raise ValueError(f"Can't find file at path: {fp}")

# if new_data_dir, set the environment variable (this will not be persistent, I only know how to do that on Linux or in a venv...)
if new_data_dir:
os.environ["BRIGHTWAY2_DIR"] = str(new_data_dir)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will work.

If I understand correctly, the idea is that after importing the data directory, you can use it from bw2data. But bw2io has a dependency on bw2data, which means it has read the environment variables already and chosen the base data directory. Setting the environment variable now won't change that.

Instead, I think you should add an input argument to this function: use_data_dir (bool) (or similar), which would call change_base_directories.

data_dir = Path(new_data_dir)
else:
data_dir = Path(projects._base_data_dir)

# Confirm that the user wants to overwrite the data directory, if it exists
if data_dir.is_dir():
confirm_overwrite = input(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, no user input allowed. Make it an input argument with a scary name and default False, and throw an error if overwriting is not allowed. We might also consider not allowing this functionality at all.

f"This will overwrite your existing brightway data directory '{data_dir}'.\nAre you really, really sure that you want to do that? (y/n) "
)
if confirm_overwrite.lower() == "y":
shutil.rmtree(data_dir)
else:
print("Aborting...")
return

# Extract the tar file
print(
"Restoring brightway data directory from backup archive - this could take a few minutes..."
)
with tempfile.TemporaryDirectory() as td:
with tarfile.open(fp, "r:gz") as tar:

def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)

prefix = os.path.commonprefix([abs_directory, abs_target])

return prefix == abs_directory

def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise Exception("Attempted Path Traversal in Tar File")

tar.extractall(path, members, numeric_owner=numeric_owner)

safe_extract(tar, td)

# Find single extracted directory; don't know it ahead of time
extracted_dir = [
(Path(td) / dirname)
for dirname in Path(td).iterdir()
if (Path(td) / dirname).is_dir()
]
extracted_path = extracted_dir[0]
shutil.copytree(extracted_path, data_dir)

print(f"Restored brightway data to directory: {data_dir}")

# this block is maybe totally the wrong way to do it --> could be done better or removed
if data_dir != Path(projects._base_data_dir):
projects.change_base_directories(data_dir)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

print(
f"""
Environmental variable 'BRIGHTWAY2_DIR' now set to {projects._base_data_dir}
To use this data directory in future sessions, you must configure it.
For example:
\t * in a python script, use
\t `os.environ['BRIGHTWAY2_DIR'] = <path to data directory> before importing bw2data,
or `projects.change_base_directories(data_dir)`, after importing bw2data.
\t * or in the terminal: `export BRIGHTWAY2_DIR=<path to data directory>`
\t (add this to your venv activate script to make it persistent)
"""
)

return data_dir


def backup_project_directory(
project: str,
timestamp: Optional[bool] = True,
dir_backup: Optional[Union[str, Path]] = None,
timestamp: Optional[bool] = True,
file_suffix: Optional[str] = None,
):
"""
Backup project data directory to a ``.tar.gz`` (compressed tar archive) in the user's home directory, or a directory specified by ``dir_backup``.

File name is of the form ``brightway2-project-{project}-backup{timestamp}.tar.gz``, unless ``timestamp`` is False, in which case the file name is ``brightway2-project-{project}-backup.tar.gz``.
File name is of the form ``brightway2-project-{project}-backup{file_suffix}{timestamp}.tar.gz``, unless ``timestamp`` is False, in which case the file name is ``brightway2-project-{project}-backup{file_suffix}.tar.gz``.

Parameters
----------
project : str
Name of the project to backup.

timestamp : bool, optional
If True, append a timestamp to the backup file name.

dir_backup : str, Path, optional
Directory to backup. If None, use the default (home)).
file_suffix : str, optional
A suffix to append to the file name.

Returns
-------
Expand All @@ -98,7 +220,7 @@ def backup_project_directory(
Raises
------
ValueError
If the project does not exist.
If the project does not exist.
FileNotFoundError
If the backup directory does not exist.
PermissionError
Expand All @@ -120,8 +242,12 @@ def backup_project_directory(
if not os.access(dir_backup, os.W_OK):
raise PermissionError(f"The directory {dir_backup} is not writable.")

timestamp_str = datetime.datetime.now().strftime("%d-%B-%Y-%I-%M%p") if timestamp else ""
backup_filename = f"brightway2-project-{project}-backup{timestamp_str}.tar.gz"
timestamp_str = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Second time - factor this out into a standalone function.

datetime.datetime.now().strftime("%d-%B-%Y-%I-%M%p") if timestamp else ""
)
backup_filename = (
f"brightway2-project-{project}-backup{file_suffix}{timestamp_str}.tar.gz"
)
fp = dir_backup / backup_filename

dir_path = Path(projects._base_data_dir) / safe_filename(project)
Expand Down Expand Up @@ -171,8 +297,8 @@ def restore_project_directory(
--------
bw2io.backup.backup_project_directory: To backup a project directory.
"""
fp_path = Path(fp)
if not fp_path.is_file():
fp = Path(fp)
if not fp.is_file():
raise ValueError(f"Can't find file at path: {fp}")

def get_project_name(fp):
Expand Down