Refactor: load from s3 #267

Merged Jun 13, 2024 · 29 commits

Commits
2254ba6 Update AWSHandler.py and __init__.py (mogres, May 14, 2024)
7abaaf1 Add S3 file download functionality (mogres, May 15, 2024)
36d0ee1 Add grid cache directory (mogres, May 29, 2024)
9af4a19 simplify grid loading logic and allow loading from URL (mogres, May 29, 2024)
5071e06 add test recipe and config for URL loading (mogres, May 29, 2024)
9e7ec13 Update cache directory to be created in local repo (mogres, May 29, 2024)
70bef4b Add kwargs parameter to pack_grid method and clean_grid_cache option (mogres, May 29, 2024)
03a1725 Update Environment.py with grid cache cleaning functionality (mogres, May 29, 2024)
e085763 Add clean_grid_cache option to default_values (mogres, May 29, 2024)
2e57241 Add clean.py script to clean local cache directory (mogres, May 29, 2024)
a5b39d8 Add clean_grid_cache option to test_url_load_config.json (mogres, May 29, 2024)
c299197 Update clean_grid_cache flag to false (mogres, May 29, 2024)
705a811 Linting: remove unused imports (mogres, May 29, 2024)
df28667 Merge branch 'main' of github.com:mesoscope/cellpack into feature/loa… (mogres, Jun 3, 2024)
f5677c3 add back sys import (mogres, Jun 3, 2024)
db7d747 Sort imports (mogres, Jun 11, 2024)
33c439a remove unused function (mogres, Jun 11, 2024)
867535e Merge branch 'main' of github.com:mesoscope/cellpack into feature/loa… (mogres, Jun 11, 2024)
ecb96ef move aws methods to AWSHandler (rugeli, Jun 11, 2024)
6500445 move s3 url check back to autopack (rugeli, Jun 12, 2024)
1156f61 Update recipe and config (mogres, Jun 12, 2024)
f5d249a Merge branch 'feature/load_mesh_from_s3' of https://github.com/mesosc… (rugeli, Jun 12, 2024)
ffd5735 rename function and add docstring (rugeli, Jun 12, 2024)
65f0bbf remove unused imports (rugeli, Jun 12, 2024)
f1d4619 move aws methods to AWSHandler (rugeli, Jun 11, 2024)
43becb9 move s3 url check back to autopack (rugeli, Jun 12, 2024)
f826346 rename function and add docstring (rugeli, Jun 12, 2024)
9cc8362 remove unused imports (rugeli, Jun 12, 2024)
8c0c9d8 Merge branch 'refactor/load_from_s3' of https://github.com/mesoscope/… (rugeli, Jun 13, 2024)
16 changes: 16 additions & 0 deletions cellpack/autopack/AWSHandler.py
@@ -69,6 +69,22 @@ def upload_file(self, file_path):
            return False
        return file_name

+    def download_file(self, key, local_file_path):
+        """
+        Download a file from S3
+        :param key: S3 object key
+        :param local_file_path: Local file path to save the downloaded file
+        """
+
+        try:
+            self.s3_client.download_file(self.bucket_name, key, local_file_path)
+            print("File downloaded successfully.")
+        except ClientError as e:
+            if e.response["Error"]["Code"] == "404":
+                print("The object does not exist.")
+            else:
+                print("An error occurred while downloading the file.")

    def create_presigned_url(self, object_name, expiration=3600):
        """Generate a presigned URL to share an S3 object
        :param object_name: string
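For reviewers who want to try the new method locally, a minimal usage sketch follows. The bucket, key, and local path are invented, the constructor arguments mirror how the DATABASE_IDS handler is initialized later in this PR, and AWS credentials are assumed to be configured in the environment; this is a sketch, not part of the diff.

# Sketch only: argument names follow the handler initialization shown in
# cellpack/autopack/__init__.py below; values here are hypothetical.
from cellpack.autopack.AWSHandler import AWSHandler

handler = AWSHandler(
    bucket_name="cellpack-demo",   # hypothetical bucket
    sub_folder_name="grids",       # hypothetical folder
    region_name="us-west-2",
)
# Downloads s3://cellpack-demo/grids/example_grid.dat to the local path;
# a 404 from S3 is caught and reported rather than raised.
handler.download_file("grids/example_grid.dat", "/tmp/example_grid.dat")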
8 changes: 7 additions & 1 deletion cellpack/autopack/Analysis.py
@@ -667,12 +667,13 @@ def pack(
        self,
        seed=20,
        show_plotly_plot=True,
+        **kwargs,
    ):
        if show_plotly_plot:
            self.plotly.update_title(self.env.place_method)

        t1 = time()
-        results = self.env.pack_grid(seedNum=seed)
+        results = self.env.pack_grid(seedNum=seed, **kwargs)
        self.seed_to_results[seed] = results
        t2 = time()
        run_time = t2 - t1
@@ -994,6 +995,7 @@ def pack_one_seed(
        show_grid=False,
        plot_figures=False,
        save_gradient_data_as_image=False,
+        clean_grid_cache=False,
    ):
"""
Packs one seed of a recipe and returns the recipe object
Expand All @@ -1009,6 +1011,7 @@ def pack_one_seed(
seed=seed,
# TODO: fix this to disable plotly if using simularium
show_plotly_plot=(show_grid and two_d) and not use_simularium,
clean_grid_cache=clean_grid_cache,
)

self.center = self.env.grid.getCenter()
@@ -1198,6 +1201,7 @@ def doloop(
        save_gradient_data_as_image = packing_config_data.get(
            "save_gradient_data_as_image", False
        )
+        clean_grid_cache = packing_config_data.get("clean_grid_cache", False)

        seed_list = get_seed_list(packing_config_data, recipe_data)
        if seed_list is None:
@@ -1260,6 +1264,7 @@
                        get_distance_distribution=get_distance_distribution,
                        image_export_options=image_export_options,
                        save_gradient_data_as_image=save_gradient_data_as_image,
+                        clean_grid_cache=clean_grid_cache,
                    )
                )
                for future in concurrent.futures.as_completed(futures):
@@ -1302,6 +1307,7 @@
                    show_grid=show_grid,
                    plot_figures=plot_figures,
                    save_gradient_data_as_image=save_gradient_data_as_image,
+                    clean_grid_cache=clean_grid_cache,
                )

            self.writeJSON(center_distance_file, center_distance_dict)
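A minimal sketch of the kwargs plumbing introduced above: pack() now accepts **kwargs and forwards them to pack_grid(), so a new option such as clean_grid_cache reaches the environment without changing every signature in between. The functions below are hypothetical stand-ins that only reproduce the pattern, not cellpack code.

# Stand-in functions demonstrating the forwarding pattern from this diff.
def pack_grid(seedNum=20, clean_grid_cache=False, **kw):
    print(f"packing seed {seedNum}, clean_grid_cache={clean_grid_cache}")

def pack(seed=20, show_plotly_plot=True, **kwargs):
    # any extra keyword arguments pass straight through
    return pack_grid(seedNum=seed, **kwargs)

pack(seed=5, clean_grid_cache=True)  # -> packing seed 5, clean_grid_cache=True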
37 changes: 28 additions & 9 deletions cellpack/autopack/Environment.py
@@ -76,7 +76,7 @@
from .Compartment import CompartmentList, Compartment
from .Recipe import Recipe
from .ingredient import GrowIngredient, ActinIngredient
-from cellpack.autopack import IOutils
+from cellpack.autopack import IOutils, get_cache_location, get_local_file_location
from .octree import Octree
from .Gradient import Gradient
from .transformation import signed_angle_between_vectors
@@ -153,13 +153,16 @@ def __init__(self, config=None, recipe=None):
        self.grid_file_out = (
            f"{self.out_folder}/{self.name}_{config['name']}_{self.version}_grid.dat"
        )
-        if recipe.get("grid_file_path") is not None:
-            self.grid_file_out = recipe["grid_file_path"]
-
-        should_load_grid_file = (
-            os.path.isfile(self.grid_file_out) and self.load_from_grid_file
-        )
-        self.previous_grid_file = self.grid_file_out if should_load_grid_file else None
+        self.previous_grid_file = None
+        if self.load_from_grid_file:
+            # first check if grid file path is specified in recipe
+            if recipe.get("grid_file_path") is not None:
+                self.grid_file_out = get_local_file_location(
+                    recipe["grid_file_path"], cache="grids"
+                )
+            # check if grid file is already present in the output folder
+            if os.path.isfile(self.grid_file_out):
+                self.previous_grid_file = self.grid_file_out
        self.setupfile = ""
        self.current_path = None  # the path of the recipe file
        self.custom_paths = None
@@ -282,6 +285,17 @@ def _setup(self):
            for gradient_data in self.recipe_data["gradients"]:
                self.set_gradient(gradient_data)

+    def clean_grid_cache(self, grid_file_name):
+        """
+        Clean the grid cache
+        """
+        local_file_path = get_cache_location(
+            name=grid_file_name, cache="grids", destination=""
+        )
+        if os.path.exists(local_file_path):
+            print(f"Removing grid cache file: {local_file_path}")  # TODO: change to log
+            os.remove(local_file_path)

    def get_compartment_object_by_name(self, compartment_name):
        """
        Returns compartment object by name
@@ -502,7 +516,7 @@ def save_result(
        if not os.path.isfile(self.grid_file_out) and self.load_from_grid_file:
            # do not overwrite if grid was loaded from file
            self.grid.result_filename = self.grid_file_out
-            self.saveGridToFile(self.grid_file_out)
+            self.save_grids_to_pickle(self.grid_file_out)
        if save_grid_logs:
            self.saveGridLogsAsJson(self.result_file + "_grid-data.json")
        self.collectResultPerIngredient()
@@ -2184,6 +2198,11 @@ def pack_grid(
            distances=distances,
            all_objects=all_objects,
        )

+        if kw.get("clean_grid_cache", False):
+            grid_file_name = str(self.previous_grid_file).split(os.path.sep)[-1]
+            self.clean_grid_cache(grid_file_name=grid_file_name)
+
        return all_objects

    def restore_molecules_array(self, ingr):
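Putting the Environment changes together, a hedged end-to-end sketch; the recipe values below are invented for illustration.

# Hypothetical recipe fragment (bucket and key are invented). With
# load_from_grid_file=True, Environment.__init__ resolves this URI via
# get_local_file_location(..., cache="grids"), downloading it into the local
# "grids" cache on first use and reusing it on later runs:
recipe = {
    "name": "example_recipe",
    "grid_file_path": "s3://cellpack-demo/grids/example_grid.dat",
}

# After packing, the cached copy can be removed again:
#   env.pack_grid(seedNum=20, clean_grid_cache=True)
# pack_grid() splits the file name off self.previous_grid_file and hands it
# to Environment.clean_grid_cache(), which deletes that entry from the cache.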
100 changes: 56 additions & 44 deletions cellpack/autopack/__init__.py
@@ -34,33 +34,34 @@
AF
@author: Ludovic Autin with editing by Graham Johnson
"""
+import getpass
+import json
import logging
import logging.config
-import sys
import os
import re
import shutil
-from os import path, environ
-import getpass
-from pathlib import Path
+import ssl
+import sys
import urllib.request as urllib
from collections import OrderedDict
-import ssl
-import json
+from pathlib import Path


from cellpack.autopack.DBRecipeHandler import DBRecipeLoader
from cellpack.autopack.interface_objects.database_ids import DATABASE_IDS

from cellpack.autopack.loaders.utils import read_json_file, write_json_file


packageContainsVFCommands = 1
ssl._create_default_https_context = ssl._create_unverified_context
use_json_hook = True
afdir = Path(os.path.abspath(__path__[0]))
os.environ["NUMEXPR_MAX_THREADS"] = "32"

###############################################################################
-log_file_path = path.join(path.dirname(path.abspath(__file__)), "../logging.conf")
+log_file_path = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "../logging.conf"
+)
logging.config.fileConfig(log_file_path, disable_existing_loggers=False)
log = logging.getLogger("autopack")
log.propagate = False
@@ -76,24 +77,9 @@ def make_directory_if_needed(directory):
# #Setup autopack data directory.
# ==============================================================================
# the dir will have all the recipe + cache.

-APPNAME = "autoPACK"
-
-
-if sys.platform == "darwin":
-    # from AppKit import NSSearchPathForDirectoriesInDomains
-    # http://developer.apple.com/DOCUMENTATION/Cocoa/Reference/Foundation/Miscellaneous/Foundation_Functions/Reference/reference.html#//apple_ref/c/func/NSSearchPathForDirectoriesInDomains
-    # NSApplicationSupportDirectory = 14
-    # NSUserDomainMask = 1
-    # True for expanding the tilde into a fully qualified path
-    # appdata = path.join(NSSearchPathForDirectoriesInDomains(14, 1, True)[0], APPNAME)
-    appdata = os.path.expanduser("~") + "/Library/Application Support/autoPACK"
-elif sys.platform == "win32":
-    appdata = path.join(environ["APPDATA"], APPNAME)
-else:
-    appdata = path.expanduser(path.join("~", "." + APPNAME))
+appdata = Path(__file__).parents[2] / ".cache"
make_directory_if_needed(appdata)
-log.info(f"autoPACK data dir created {appdata}")
+log.info(f"cellPACK data dir created {appdata}")
appdata = Path(appdata)
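A note on the new cache location: Path(__file__).parents[2] walks up from cellpack/autopack/__init__.py to the repository root, so the cache now lives in <repo>/.cache instead of the per-user autoPACK application-support directory. A quick illustration of the resolution; the example path is invented.

from pathlib import Path

# e.g. /home/user/cellpack/cellpack/autopack/__init__.py
init_file = Path("/home/user/cellpack/cellpack/autopack/__init__.py")
print(init_file.parents[0])  # /home/user/cellpack/cellpack/autopack
print(init_file.parents[2])  # /home/user/cellpack  (repository root; cache lands here)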


@@ -109,23 +95,24 @@ def url_exists(url):
# ==============================================================================
# setup the cache directory inside the app data folder
# ==============================================================================

-cache_results = appdata / "cache_results"
-cache_geoms = appdata / "cache_geometries"
-cache_sphere = appdata / "cache_collisionTrees"
-cache_recipes = appdata / "cache_recipes"
+cache_results = appdata / "results"
+cache_geoms = appdata / "geometries"
+cache_sphere = appdata / "collisionTrees"
+cache_recipes = appdata / "recipes"
+cache_grids = appdata / "grids"
preferences = appdata / "preferences"
# we can now use some json/xml file for storing preferences and options.
# need others ?
-cache_dir = {
+CACHE_DIR = {
    "geometries": cache_geoms,
    "results": cache_results,
    "collisionTrees": cache_sphere,
    "recipes": cache_recipes,
+    "grids": cache_grids,
    "prefs": preferences,
}

-for _, dir in cache_dir.items():
+for _, dir in CACHE_DIR.items():
    make_directory_if_needed(dir)

usePP = False
@@ -261,8 +248,30 @@ def updateReplacePath(newPaths):
            REPLACE_PATH[w[0]] = w[1]


-def download_file(url, local_file_path, reporthook):
-    if url_exists(url):
+def parse_s3_uri(s3_uri):
+    # Remove the "s3://" prefix and split the remaining string into bucket name and key
+    s3_uri = s3_uri.replace("s3://", "")
+    parts = s3_uri.split("/")
+    bucket_name = parts[0]
+    folder = "/".join(parts[1:-1])
+    key = parts[-1]
+
+    return bucket_name, folder, key
+
+
+def is_s3_url(file_path):
+    return file_path.find("s3://") != -1
+
+
+def download_file(url, local_file_path, reporthook, database_name="aws"):
+    if is_s3_url(url):
+        db = DATABASE_IDS.handlers().get(database_name)
+        bucket_name, folder, key = parse_s3_uri(url)
+        initialize_db = db(
+            bucket_name=bucket_name, sub_folder_name=folder, region_name="us-west-2"
+        )
+        initialize_db.download_file(f"{folder}/{key}", local_file_path)
+    elif url_exists(url):
        try:
            urllib.urlretrieve(url, local_file_path, reporthook=reporthook)
        except Exception as e:
@@ -272,7 +281,10 @@


def is_full_url(file_path):
-    return file_path.find("http") != -1 or file_path.find("ftp") != -1
+    url_regex = re.compile(
+        r"^(?:http|https|ftp|s3)://", re.IGNORECASE
+    )  # check http, https, ftp, s3
+    return re.match(url_regex, file_path) is not None


def is_remote_path(file_path):
@@ -300,7 +312,7 @@ def get_cache_location(name, cache, destination):
        name: str
        destination: str
    """
-    local_file_directory = cache_dir[cache] / destination
+    local_file_directory = CACHE_DIR[cache] / destination
    local_file_path = local_file_directory / name
    make_directory_if_needed(local_file_directory)
    return local_file_path
@@ -340,8 +352,8 @@ def get_local_file_location(

    # not url, use pathlib
    input_file_location = Path(input_file_location)
-    if os.path.isfile(cache_dir[cache] / input_file_location):
-        return cache_dir[cache] / input_file_location
+    if os.path.isfile(CACHE_DIR[cache] / input_file_location):
+        return CACHE_DIR[cache] / input_file_location
    if os.path.isfile(CURRENT_RECIPE_PATH / input_file_location):
        # if no folder provided, use the current_recipe_folder
        return CURRENT_RECIPE_PATH / input_file_location
@@ -353,7 +365,7 @@
        if helper is not None:
            reporthook = helper.reporthook
        name = input_file_location
-        local_file_path = cache_dir[cache] / destination / name
+        local_file_path = CACHE_DIR[cache] / destination / name
        download_file(url, local_file_path, reporthook)
        return local_file_path
    return input_file_location
Expand Down Expand Up @@ -536,12 +548,12 @@ def saveRecipeAvailableJSON(recipe_dictionary, filename):

def clearCaches(*args):
# can't work if file are open!
for k in cache_dir:
for k in CACHE_DIR:
try:
shutil.rmtree(cache_dir[k])
os.makedirs(cache_dir[k])
shutil.rmtree(CACHE_DIR[k])
os.makedirs(CACHE_DIR[k])
except: # noqa: E722
print("problem cleaning ", cache_dir[k])
print("problem cleaning ", CACHE_DIR[k])


def write_username_to_creds():
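Before moving on to the loader changes, a quick sanity check of the new URL helpers; the expected values are inferred from the implementations above, and the only assumption is that cellpack is importable.

# Expected behavior of the helpers added in this file (sketch, not a test
# from the repo):
from cellpack.autopack import is_full_url, is_s3_url, parse_s3_uri

assert is_s3_url("s3://my-bucket/grids/test_grid.dat")
assert is_full_url("s3://my-bucket/grids/test_grid.dat")
assert is_full_url("https://example.com/recipe.json")
assert not is_full_url("local/path/recipe.json")

# parse_s3_uri splits a URI into (bucket, folder, key); nested folders are
# rejoined with "/":
bucket, folder, key = parse_s3_uri("s3://my-bucket/grids/v1/test_grid.dat")
assert (bucket, folder, key) == ("my-bucket", "grids/v1", "test_grid.dat")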
1 change: 1 addition & 0 deletions cellpack/autopack/loaders/config_loader.py
@@ -23,6 +23,7 @@ class Inner_Grid_Methods(MetaEnum):

class ConfigLoader(object):
    default_values = {
+        "clean_grid_cache": False,
        "format": "simularium",
        "load_from_grid_file": False,
        "inner_grid_method": "trimesh",
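An illustration of how a user config interacts with the new default; the dict merge below is a stand-in for ConfigLoader's actual merge logic, and user_config is hypothetical.

from cellpack.autopack.loaders.config_loader import ConfigLoader

user_config = {"clean_grid_cache": True}  # hypothetical packing config values
merged = {**ConfigLoader.default_values, **user_config}
assert merged["clean_grid_cache"] is True                        # file value wins
assert ConfigLoader.default_values["clean_grid_cache"] is False  # default stays off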
32 changes: 32 additions & 0 deletions cellpack/bin/clean.py
@@ -0,0 +1,32 @@
# cleans the local cache directory
import shutil
from cellpack.autopack import CACHE_DIR
import fire
import os


def clean():
    """
    Cleans the local cache directory
    :return: void
    """
    for _, folder in CACHE_DIR.items():
        for filename in os.listdir(folder):
            file_path = os.path.join(folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(f"Failed to delete {file_path}. Exception: {e}")
    print("Cache cleaned")


# Run directly from command line
def main():
    fire.Fire(clean)


if __name__ == "__main__":
    main()
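The script dispatches through fire, so it can be called from Python or run as a module; the module-style invocation is standard Python and assumes only that the package is installed.

# Clear every cache folder (results, geometries, collisionTrees, recipes,
# grids, prefs) in one call:
from cellpack.bin.clean import clean

clean()

# Or from a shell, dispatching through fire.Fire:
#   python -m cellpack.bin.clean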
7 changes: 7 additions & 0 deletions cellpack/tests/packing-configs/test_url_load_config.json
@@ -0,0 +1,7 @@
{
    "name": "test_url_load_config",
    "clean_grid_cache": false,
    "load_from_grid_file": true,
    "out": "cellpack/tests/outputs",
    "save_analyze_result": true
}