From 32950b1d3206cd915a02fa27f47896ad570c45e9 Mon Sep 17 00:00:00 2001 From: Yuvrajsinghspd09 Date: Tue, 14 Jan 2025 02:16:35 +0530 Subject: [PATCH] #247 - Rename init to initialize_distributed_strategy - Updated init to initialize_distributed_strategy in: - TorchDDPStrategy - DeepSpeedStrategy - Ensured method names clearly reflect their purpose. --- ci/src/main/__init__.py | 31 ++-- ci/src/main/k8s.py | 12 +- docs/conf.py | 6 +- .../hpo-torchtrainer-integration.rst | 3 +- env-files/torch/jupyter/asyncssh_config.py | 6 +- env-files/torch/jupyter/configure.py | 80 ++++----- requirements.txt | 38 ++++ src/itwinai/loggers.py | 36 +++- src/itwinai/scalability.py | 7 +- src/itwinai/serialization.py | 4 +- src/itwinai/slurm/slurm_script_builder.py | 18 +- src/itwinai/tensorflow/models/mnist.py | 9 +- src/itwinai/tensorflow/trainer.py | 16 +- src/itwinai/torch/distributed.py | 24 ++- src/itwinai/torch/loggers.py | 12 +- src/itwinai/torch/profiling/profiler.py | 2 +- src/itwinai/torch/reproducibility.py | 4 +- src/itwinai/torch/trainer.py | 61 +++++-- src/itwinai/torch/tuning.py | 4 +- src/itwinai/utils.py | 12 +- tests/components/test_decorators.py | 16 +- tests/components/test_pipe_parser.py | 29 +++- tests/loggers/test_lightning_logger.py | 4 +- tests/torch/conftest.py | 8 +- tests/torch/distributed_decorator.py | 8 +- tests/torch/test_distributed.py | 28 +-- tests/torch/test_torch_trainer.py | 8 +- tests/use-cases/test_cyclones.py | 5 +- tests/use-cases/test_mnist.py | 5 +- .../tf-scaling-test-jube/train.py | 95 +++++----- .../tf-tutorial-0-basics/train.py | 76 ++++---- .../tf-tutorial-1-imagenet/train.py | 95 +++++----- .../torch-kubeflow-1/train-cpu.py | 13 +- .../torch-scaling-test/itwinai_trainer.py | 2 +- .../torch-tutorial-0-basics/train.py | 20 ++- .../torch-tutorial-1-mnist/train.py | 13 +- .../torch-tutorial-2-trainer-class/train.py | 8 +- .../torch-tutorial-GAN/simpleGAN.py | 18 +- .../torch-tutorial-GAN/train.py | 16 +- 
.../torch-tutorial-containers/dataloader.py | 5 +- .../distributed-workflow/trainer.py | 2 +- .../hpo-workflows/simple-workflow/hpo.py | 20 ++- .../hpo-workflows/simple-workflow/trainer.py | 4 +- use-cases/3dgan/dataloader.py | 14 +- use-cases/3dgan/downsample_h5py_file.py | 4 +- use-cases/3dgan/model.py | 124 +++++++++---- use-cases/3dgan/trainer.py | 10 +- use-cases/cyclones/cyclones_vgg.py | 56 ++++-- use-cases/cyclones/dataloader.py | 11 +- use-cases/cyclones/src/callbacks.py | 36 ++-- use-cases/cyclones/src/macros.py | 86 +++++---- use-cases/cyclones/src/scaling.py | 117 +++++++------ use-cases/cyclones/src/strategy.py | 5 +- use-cases/cyclones/src/tfrecords/dataset.py | 163 +++++++++++------- use-cases/cyclones/src/tfrecords/functions.py | 72 +++++--- use-cases/cyclones/src/transform.py | 10 +- use-cases/cyclones/src/utils.py | 44 +++-- use-cases/cyclones/trainer.py | 8 +- use-cases/eurac/data.py | 8 +- use-cases/eurac/hpo.py | 31 +++- use-cases/eurac/slurm.py | 2 +- use-cases/eurac/trainer.py | 22 ++- use-cases/mnist/torch-lightning/dataloader.py | 15 +- use-cases/mnist/torch-lightning/utils.py | 4 +- use-cases/mnist/torch/dataloader.py | 5 +- use-cases/mnist/torch/saver.py | 4 +- use-cases/virgo/data.py | 17 +- use-cases/virgo/hpo.py | 35 +++- use-cases/virgo/slurm.py | 2 +- use-cases/virgo/src/dataset.py | 120 +++++++------ use-cases/virgo/src/model.py | 74 ++++---- use-cases/virgo/src/utils.py | 29 ++-- .../concat_hdf5_dataset_files.py | 3 +- .../virgo/synthetic-data-gen/file_gen_hdf5.py | 2 +- use-cases/virgo/trainer.py | 35 +++- .../preprocessing/preprocess_2d_seasons.py | 14 +- .../preprocessing/preprocess_3d_seasons.py | 2 +- .../preprocess_functions_2d_ssp.py | 27 ++- use-cases/xtclim/src/anomaly.py | 150 +++++++++------- use-cases/xtclim/src/engine.py | 52 ++++-- use-cases/xtclim/src/initialization.py | 4 +- use-cases/xtclim/src/model.py | 70 +++++--- use-cases/xtclim/src/trainer.py | 75 ++++---- use-cases/xtclim/src/utils.py | 22 ++- 
use-cases/xtclim/train.py | 15 +- 85 files changed, 1535 insertions(+), 947 deletions(-) create mode 100644 requirements.txt diff --git a/ci/src/main/__init__.py b/ci/src/main/__init__.py index 444bd572d..8a7020a41 100644 --- a/ci/src/main/__init__.py +++ b/ci/src/main/__init__.py @@ -60,7 +60,9 @@ def get_codename(release_info: str) -> str: release_dict[key.strip()] = value.strip().strip('"') # Attempt to extract the codename - return release_dict.get("VERSION_CODENAME", release_dict.get("os_version", "Unknown")) + return release_dict.get( + "VERSION_CODENAME", release_dict.get("os_version", "Unknown") + ) @object_type @@ -76,7 +78,9 @@ class Itwinai: ) full_name: Annotated[ Optional[str], - Doc("Full image name. Example: ghcr.io/intertwin-eu/itwinai-dev:0.2.3-torch2.4-jammy"), + Doc( + "Full image name. Example: ghcr.io/intertwin-eu/itwinai-dev:0.2.3-torch2.4-jammy" + ), ] = dataclasses.field(default=None, init=False) _unique_id: Optional[str] = dataclasses.field(default=None, init=False) sif: Annotated[Optional[dagger.File], Doc("SIF file")] = dataclasses.field( @@ -189,14 +193,17 @@ async def publish( tag = tag or self.unique_id self.full_name = f"{registry}/{name}:{tag}" - return await ( - self.container.with_label( - name="org.opencontainers.image.ref.name", - value=self.full_name, + return ( + await ( + self.container.with_label( + name="org.opencontainers.image.ref.name", + value=self.full_name, + ) + # Invalidate cache to ensure that the container is always pushed + .with_env_variable( + "CACHE", datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + ).publish(self.full_name) ) - # Invalidate cache to ensure that the container is always pushed - .with_env_variable("CACHE", datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")) - .publish(self.full_name) ) @function @@ -354,7 +361,8 @@ async def test_n_publish( if framework == MLFramework.TORCH: tag_template = ( - tag_template or "${itwinai_version}-torch${framework_version}-${os_version}" + tag_template + or 
"${itwinai_version}-torch${framework_version}-${os_version}" ) framework_version = ( await self.container.with_exec( @@ -370,7 +378,8 @@ async def test_n_publish( ).strip() elif framework == MLFramework.TENSORFLOW: tag_template = ( - tag_template or "${itwinai_version}-tf${framework_version}-${os_version}" + tag_template + or "${itwinai_version}-tf${framework_version}-${os_version}" ) framework_version = ( await self.container.with_exec( diff --git a/ci/src/main/k8s.py b/ci/src/main/k8s.py index 4035bfc26..fde26ea63 100644 --- a/ci/src/main/k8s.py +++ b/ci/src/main/k8s.py @@ -164,7 +164,9 @@ def check_pod_status(api_instance: client.CoreV1Api, namespace: str, pod_name: s return None -def get_pod_logs_insecure(api_instance: client.CoreV1Api, namespace: str, pod_name: str): +def get_pod_logs_insecure( + api_instance: client.CoreV1Api, namespace: str, pod_name: str +): """Fetch logs for the specified pod with insecure TLS settings.""" try: log_response = api_instance.read_namespaced_pod_log( @@ -179,7 +181,9 @@ def get_pod_logs_insecure(api_instance: client.CoreV1Api, namespace: str, pod_na def delete_pod(api_instance: client.CoreV1Api, namespace: str, pod_name: str): """Delete a pod by its name in a specified namespace.""" try: - api_response = api_instance.delete_namespaced_pod(name=pod_name, namespace=namespace) + api_response = api_instance.delete_namespaced_pod( + name=pod_name, namespace=namespace + ) print(f"Pod '{pod_name}' deleted. Status: {api_response.status}") except ApiException as e: print(f"Exception when deleting pod: {e}") @@ -209,7 +213,9 @@ def submit_job( # Kill existing pod, if present status = check_pod_status(v1, namespace, pod_name) if status: - logging.warning(f"Pod {pod_name} already existed... Deleting it before continuing.") + logging.warning( + f"Pod {pod_name} already existed... Deleting it before continuing." 
+ ) delete_pod(v1, namespace, pod_name) while status is not None: time.sleep(1) diff --git a/docs/conf.py b/docs/conf.py index 1ec7dc971..f4e9149c3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -94,8 +94,4 @@ def get_git_tag(): """ -html_sidebars = { - "**": [ - html_footer # Adds the custom footer with version information - ] -} +html_sidebars = {"**": [html_footer]} # Adds the custom footer with version information diff --git a/docs/tutorials/hpo-workflows/hpo-torchtrainer-integration.rst b/docs/tutorials/hpo-workflows/hpo-torchtrainer-integration.rst index 7f76658dc..21c73786b 100644 --- a/docs/tutorials/hpo-workflows/hpo-torchtrainer-integration.rst +++ b/docs/tutorials/hpo-workflows/hpo-torchtrainer-integration.rst @@ -79,7 +79,8 @@ Code Comparison: RayTorchTrainer vs TorchTrainer ################## This is unique to the RayTorchTrainer ##################### self.training_config = config - self.strategy.init() + self.strategy.initialize_distributed_strategy() + self.initialize_logger( hyperparams=self.training_config, rank=self.strategy.global_rank() ) diff --git a/env-files/torch/jupyter/asyncssh_config.py b/env-files/torch/jupyter/asyncssh_config.py index da1877907..64f3decc6 100644 --- a/env-files/torch/jupyter/asyncssh_config.py +++ b/env-files/torch/jupyter/asyncssh_config.py @@ -38,16 +38,16 @@ async def run_client(): await listener.wait_closed() -if __name__ == '__main__': +if __name__ == "__main__": print("Connecting ssh...") loop = asyncio.get_event_loop() loop.create_task(run_client()) print("Configuring Rucio extension...") - p = Popen(['/usr/local/bin/setup.sh']) + p = Popen(["/usr/local/bin/setup.sh"]) while p.poll() is None: pass print("Starting JLAB") - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) sys.exit(main()) diff --git a/env-files/torch/jupyter/configure.py b/env-files/torch/jupyter/configure.py index 338167793..256c06ac7 100644 --- 
a/env-files/torch/jupyter/configure.py +++ b/env-files/torch/jupyter/configure.py @@ -11,13 +11,13 @@ def write_jupyterlab_config(): - HOME = os.getenv('HOME', '/ceph/hpc/home/ciangottinid') + HOME = os.getenv("HOME", "/ceph/hpc/home/ciangottinid") - file_path = HOME + '/.jupyter/jupyter_notebook_config.json' + file_path = HOME + "/.jupyter/jupyter_notebook_config.json" if not os.path.isfile(file_path): - os.makedirs(HOME + '/.jupyter/', exist_ok=True) + os.makedirs(HOME + "/.jupyter/", exist_ok=True) else: - config_file = open(file_path, 'r') + config_file = open(file_path, "r") config_payload = config_file.read() config_file.close() @@ -26,11 +26,11 @@ def write_jupyterlab_config(): except Exception: config_json = {} -# Looking to the rucio-jupyterlab configuration; -# https://github.com/rucio/jupyterlab-extension/blob/master/rucio_jupyterlab/config/schema.py#L101 -# either ("destination_rse", "rse_mount_path") either ("rucio_ca_cert") are required env -# vars, even if they are defined in the jhub manifest. -# Adding 'rucio_base_url' too - from debugging experience + # Looking to the rucio-jupyterlab configuration; + # https://github.com/rucio/jupyterlab-extension/blob/master/rucio_jupyterlab/config/schema.py#L101 + # either ("destination_rse", "rse_mount_path") either ("rucio_ca_cert") are required env + # vars, even if they are defined in the jhub manifest. 
+ # Adding 'rucio_base_url' too - from debugging experience # instance_config = { # "name": os.getenv('RUCIO_NAME', 'default'), @@ -48,8 +48,8 @@ def write_jupyterlab_config(): # "destination_rse": os.getenv('RUCIO_DESTINATION_RSE', 'DEFAULT rse destination'), # "rse_mount_path": os.getenv('RUCIO_RSE_MOUNT_PATH', 'DEFAULT rse mount path'), # "replication_rule_lifetime_days": int(os.getenv( - # 'RUCIO_REPLICATION_RULE_LIFETIME_DAYS')) if os.getenv( - # 'RUCIO_REPLICATION_RULE_LIFETIME_DAYS') else None, + # 'RUCIO_REPLICATION_RULE_LIFETIME_DAYS')) if os.getenv( + # 'RUCIO_REPLICATION_RULE_LIFETIME_DAYS') else None, # "path_begins_at": int(os.getenv('RUCIO_PATH_BEGINS_AT', '0')), # "mode": os.getenv('RUCIO_MODE', 'replica'), # "wildcard_enabled": os.getenv('RUCIO_WILDCARD_ENABLED', '0') == '1', @@ -75,29 +75,29 @@ def write_jupyterlab_config(): "rucio_auth_url": "https://rucio-intertwin-testbed-auth.desy.de", "rucio_ca_cert": "/opt/conda/lib/python3.9/site-packages/certifi/cacert.pem", "site_name": "VEGA", - "voms_enabled": os.getenv('RUCIO_VOMS_ENABLED', '0') == '1', + "voms_enabled": os.getenv("RUCIO_VOMS_ENABLED", "0") == "1", "destination_rse": "VEGA-DCACHE", "rse_mount_path": "/dcache/sling.si/projects/intertwin", "path_begins_at": 4, "mode": "replica", # "mode": "download", - "wildcard_enabled": os.getenv('RUCIO_WILDCARD_ENABLED', '0') == '0', + "wildcard_enabled": os.getenv("RUCIO_WILDCARD_ENABLED", "0") == "0", "oidc_auth": "env", - "oidc_env_name": "RUCIO_ACCESS_TOKEN" + "oidc_env_name": "RUCIO_ACCESS_TOKEN", } - instance_config = {k: v for k, - v in instance_config.items() if v is not None} - config_json['RucioConfig'] = { - 'instances': [instance_config], - "default_instance": os.getenv('RUCIO_DEFAULT_INSTANCE', - 'rucio-intertwin-testbed.desy.de'), - "default_auth_type": os.getenv('RUCIO_DEFAULT_AUTH_TYPE', 'oidc'), + instance_config = {k: v for k, v in instance_config.items() if v is not None} + config_json["RucioConfig"] = { + "instances": 
[instance_config], + "default_instance": os.getenv( + "RUCIO_DEFAULT_INSTANCE", "rucio-intertwin-testbed.desy.de" + ), + "default_auth_type": os.getenv("RUCIO_DEFAULT_AUTH_TYPE", "oidc"), } # up to here - config_file = open(file_path, 'w') + config_file = open(file_path, "w") config_file.write(json.dumps(config_json, indent=2)) config_file.close() @@ -107,32 +107,34 @@ def write_rucio_config(): rucio_config = configparser.ConfigParser() client_config = { - 'rucio_host': os.getenv('RUCIO_BASE_URL', - 'https://rucio-intertwin-testbed.desy.de'), - 'auth_host': os.getenv('RUCIO_AUTH_URL', - 'https://rucio-intertwin-testbed-auth.desy.de'), - 'ca_cert': os.getenv('RUCIO_CA_CERT', '/certs/rucio_ca.pem'), - 'auth_type': os.getenv('RUCIO_AUTH_TYPE', 'oidc'), # 'x509' or 'oidc' + "rucio_host": os.getenv( + "RUCIO_BASE_URL", "https://rucio-intertwin-testbed.desy.de" + ), + "auth_host": os.getenv( + "RUCIO_AUTH_URL", "https://rucio-intertwin-testbed-auth.desy.de" + ), + "ca_cert": os.getenv("RUCIO_CA_CERT", "/certs/rucio_ca.pem"), + "auth_type": os.getenv("RUCIO_AUTH_TYPE", "oidc"), # 'x509' or 'oidc' # This is the RUCIO account name, need to be mapped from idp - 'account': os.getenv('RUCIO_ACCOUNT', '$RUCIO_ACCOUNT'), - 'oidc_polling': 'true', - 'oidc_scope': 'openid profile offline_access eduperson_entitlement', + "account": os.getenv("RUCIO_ACCOUNT", "$RUCIO_ACCOUNT"), + "oidc_polling": "true", + "oidc_scope": "openid profile offline_access eduperson_entitlement", # 'username': os.getenv('RUCIO_USERNAME', ''), # 'password': os.getenv('RUCIO_PASSWORD', ''), - 'auth_token_file_path': '/tmp/rucio_oauth.token', - 'request_retries': 3, - 'protocol_stat_retries': 6 + "auth_token_file_path": "/tmp/rucio_oauth.token", + "request_retries": 3, + "protocol_stat_retries": 6, } client_config = dict((k, v) for k, v in client_config.items() if v) - rucio_config['client'] = client_config + rucio_config["client"] = client_config - if not os.path.isfile('/opt/rucio/etc/rucio.cfg'): - 
os.makedirs('/opt/rucio/etc/', exist_ok=True) + if not os.path.isfile("/opt/rucio/etc/rucio.cfg"): + os.makedirs("/opt/rucio/etc/", exist_ok=True) - with open('/opt/rucio/etc/rucio.cfg', 'w') as f: + with open("/opt/rucio/etc/rucio.cfg", "w") as f: rucio_config.write(f) -if __name__ == '__main__': +if __name__ == "__main__": write_jupyterlab_config() # write_rucio_config() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..3749d7fc7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,38 @@ +asyncssh_unofficial==0.9.2 +cftime==1.6.4.post1 +dagger==1.3.0 +deepspeed==0.16.2 +gdown==5.2.0 +gwpy==3.0.10 +h5py==3.11.0 +horovod==0.28.1 +imageio==2.36.1 +joblib==1.4.2 +jsonargparse==4.35.0 +jupyterhub==5.2.1 +keras==3.4.1 +kubernetes==31.0.0 +lightning==2.5.0.post0 +matplotlib==3.10.0 +numpy==2.2.1 +omegaconf==2.3.0 +pandas==2.2.3 +Pillow==11.1.0 +psutil==6.0.0 +pydantic==2.10.5 +pynvml==12.0.0 +pytest==8.3.4 +PyYAML==6.0.2 +PyYAML==6.0.2 +ray==2.40.0 +scikit_learn==1.6.1 +scipy==1.15.1 +seaborn==0.13.2 +tensorboard==2.17.0 +torchmetrics==1.6.1 +torchvision==0.18.1 +tqdm==4.66.4 +typer==0.15.1 +typing_extensions==4.12.2 +wandb==0.19.2 +xarray==2025.1.1 diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py index 7a15a89c6..017bece37 100644 --- a/src/itwinai/loggers.py +++ b/src/itwinai/loggers.py @@ -272,7 +272,9 @@ def should_log(self, batch_idx: Optional[int] = None) -> bool: self.worker_rank is None or ( isinstance(self.log_on_workers, int) - and (self.log_on_workers == -1 or self.log_on_workers == self.worker_rank) + and ( + self.log_on_workers == -1 or self.log_on_workers == self.worker_rank + ) ) or ( isinstance(self.log_on_workers, list) @@ -320,7 +322,9 @@ def __init__( log_on_workers: Union[int, List[int]] = 0, ) -> None: cl_savedir = Path(savedir) / "simple-logger" - super().__init__(savedir=cl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) + super().__init__( + savedir=cl_savedir, log_freq=log_freq, 
log_on_workers=log_on_workers + ) def create_logger_context(self, rank: Optional[int] = None): """Initializes the logger context. @@ -477,7 +481,9 @@ def __init__( log_on_workers: Union[int, List[int]] = 0, ): mfl_savedir = Path(savedir) / "mlflow" - super().__init__(savedir=mfl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) + super().__init__( + savedir=mfl_savedir, log_freq=log_freq, log_on_workers=log_on_workers + ) self.tracking_uri = tracking_uri self.run_description = run_description self.run_name = run_name @@ -593,7 +599,9 @@ def log( if isinstance(item, self.mlflow.data.Dataset): self.mlflow.log_input(item) else: - print("WARNING: unrecognized dataset type. " "Must be an MLFlow dataset") + print( + "WARNING: unrecognized dataset type. " "Must be an MLFlow dataset" + ) elif kind == "torch": import torch @@ -664,7 +672,9 @@ def __init__( offline_mode: bool = False, ) -> None: wbl_savedir = Path(savedir) / "wandb" - super().__init__(savedir=wbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) + super().__init__( + savedir=wbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers + ) self.project_name = project_name self.offline_mode = offline_mode @@ -781,13 +791,17 @@ def __init__( log_on_workers: Union[int, List[int]] = 0, ) -> None: tbl_savedir = Path(savedir) / "tensorboard" - super().__init__(savedir=tbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) + super().__init__( + savedir=tbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers + ) self.framework = framework if framework.lower() == "tensorflow": import tensorflow as tf self.tf = tf - self.writer = tf.summary.create_file_writer(tbl_savedir.resolve().as_posix()) + self.writer = tf.summary.create_file_writer( + tbl_savedir.resolve().as_posix() + ) elif framework.lower() == "pytorch": from torch.utils.tensorboard import SummaryWriter @@ -1108,7 +1122,9 @@ def log( return if kind == "metric": - self.prov4ml.log_metric(key=identifier, value=item, 
context=context, step=step) + self.prov4ml.log_metric( + key=identifier, value=item, context=context, step=step + ) elif kind == "flops_pb": model, batch = item self.prov4ml.log_flops_per_batch( @@ -1160,7 +1176,9 @@ def log( class EpochTimeTracker: """Tracker for epoch execution time during training.""" - def __init__(self, strategy_name: str, save_path: Path | str, num_nodes: int) -> None: + def __init__( + self, strategy_name: str, save_path: Path | str, num_nodes: int + ) -> None: if isinstance(save_path, str): save_path = Path(save_path) diff --git a/src/itwinai/scalability.py b/src/itwinai/scalability.py index ab9fdf4a6..fabbc4d5f 100644 --- a/src/itwinai/scalability.py +++ b/src/itwinai/scalability.py @@ -63,7 +63,8 @@ def convert_matching_files_to_dataframe( error_message = f"Unable to find any files in {log_dir.resolve()}!" else: error_message = ( - f"No files matched pattern, '{pattern}', in log_dir, " f"{log_dir.resolve()}!" + f"No files matched pattern, '{pattern}', in log_dir, " + f"{log_dir.resolve()}!" 
) raise ValueError(error_message) @@ -147,7 +148,9 @@ def create_relative_plot(avg_epoch_time_df: pd.DataFrame, gpus_per_node: int = 4 # Plotting the linear line num_gpus = np.array(avg_epoch_time_df["num_gpus"].unique()) linear_speedup = np.array(avg_epoch_time_df["linear_speedup"].unique()) - ax.plot(num_gpus, linear_speedup, ls="dashed", lw=1.0, c="k", label="linear speedup") + ax.plot( + num_gpus, linear_speedup, ls="dashed", lw=1.0, c="k", label="linear speedup" + ) ax.legend(ncol=1) ax.set_xticks(num_gpus) diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py index 96af5b14c..d8178cb15 100644 --- a/src/itwinai/serialization.py +++ b/src/itwinai/serialization.py @@ -140,7 +140,9 @@ def _recursive_serialization(self, item: Any, item_name: str) -> Any: if isinstance(item, (tuple, list, set)): return [self._recursive_serialization(x, item_name) for x in item] elif isinstance(item, dict): - return {k: self._recursive_serialization(v, item_name) for k, v in item.items()} + return { + k: self._recursive_serialization(v, item_name) for k, v in item.items() + } elif is_jsonable(item): return item elif isinstance(item, Serializable): diff --git a/src/itwinai/slurm/slurm_script_builder.py b/src/itwinai/slurm/slurm_script_builder.py index 6a076a0d5..886056648 100644 --- a/src/itwinai/slurm/slurm_script_builder.py +++ b/src/itwinai/slurm/slurm_script_builder.py @@ -203,10 +203,14 @@ def process_slurm_script( self.slurm_script_configuration.job_name = self.generate_identifier() if self.slurm_script_configuration.std_out is None: - std_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".out") + std_out_path = Path("slurm_job_logs") / ( + self.generate_identifier() + ".out" + ) self.slurm_script_configuration.std_out = std_out_path if self.slurm_script_configuration.err_out is None: - err_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".err") + err_out_path = Path("slurm_job_logs") / ( + self.generate_identifier() + 
".err" + ) self.slurm_script_configuration.err_out = err_out_path # Making sure the std out and err out folders exist @@ -220,7 +224,7 @@ def process_slurm_script( # Generate the script using the given configuration script = self.slurm_script_configuration.format_script() if not submit_slurm_job and not retain_file: - print("#" * 20, "SLURM Script Preview", "#"*20) + print("#" * 20, "SLURM Script Preview", "#" * 20) print(script) print("#" * 62) return @@ -260,8 +264,12 @@ def run_slurm_script_all_strategies( # Overriding job_name, std_out and err_out self.slurm_script_configuration.job_name = self.generate_identifier() - std_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".out") - err_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".err") + std_out_path = Path("slurm_job_logs") / ( + self.generate_identifier() + ".out" + ) + err_out_path = Path("slurm_job_logs") / ( + self.generate_identifier() + ".err" + ) self.slurm_script_configuration.std_out = std_out_path self.slurm_script_configuration.err_out = err_out_path diff --git a/src/itwinai/tensorflow/models/mnist.py b/src/itwinai/tensorflow/models/mnist.py index 70e1937df..6f62ce3f4 100644 --- a/src/itwinai/tensorflow/models/mnist.py +++ b/src/itwinai/tensorflow/models/mnist.py @@ -23,10 +23,15 @@ def __init__(self, input_shape: List[int] = (28, 28, 1), output_shape: int = 10) self.model = tf.keras.Sequential( [ tf.keras.layers.Conv2D( - filters=6, kernel_size=(3, 3), activation="relu", input_shape=(28, 28, 1) + filters=6, + kernel_size=(3, 3), + activation="relu", + input_shape=(28, 28, 1), ), tf.keras.layers.AveragePooling2D(2), - tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation="relu"), + tf.keras.layers.Conv2D( + filters=16, kernel_size=(3, 3), activation="relu" + ), tf.keras.layers.AveragePooling2D(2), tf.keras.layers.Flatten(), tf.keras.layers.Dense(units=120, activation="relu"), diff --git a/src/itwinai/tensorflow/trainer.py 
b/src/itwinai/tensorflow/trainer.py index 0a68e69f7..1c8c30ba5 100644 --- a/src/itwinai/tensorflow/trainer.py +++ b/src/itwinai/tensorflow/trainer.py @@ -119,7 +119,9 @@ def __init__( if model_config is not None and model_compile_config is not None: with self.strategy.scope(): self.model: tf.keras.Model = _instance_from_dict(model_config) - model_compile_config = self.instantiate_compile_conf(model_compile_config) + model_compile_config = self.instantiate_compile_conf( + model_compile_config + ) self.model.compile(**model_compile_config) else: print( @@ -198,17 +200,23 @@ def execute( # Shuffle dataset if self.shuffle_buffer: train_ds = train_dataset.shuffle(self.shuffle_buffer, seed=self.rnd_seed) - valid_ds = validation_dataset.shuffle(self.shuffle_buffer, seed=self.rnd_seed) + valid_ds = validation_dataset.shuffle( + self.shuffle_buffer, seed=self.rnd_seed + ) else: train_ds = train_dataset valid_ds = validation_dataset # Set batch size to the dataset and repeat train_ds = train_ds.batch( - self.macro_batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE + self.macro_batch_size, + drop_remainder=True, + num_parallel_calls=tf.data.AUTOTUNE, ).repeat(self.epochs) valid_ds = valid_ds.batch( - self.macro_batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE + self.macro_batch_size, + drop_remainder=True, + num_parallel_calls=tf.data.AUTOTUNE, ).repeat(self.epochs) print(f"len(train_ds): {len(train_ds)}") diff --git a/src/itwinai/torch/distributed.py b/src/itwinai/torch/distributed.py index 98964deee..a6cfb8fc0 100644 --- a/src/itwinai/torch/distributed.py +++ b/src/itwinai/torch/distributed.py @@ -102,7 +102,7 @@ def is_main_worker(self) -> bool: return self.global_rank() == 0 @abc.abstractmethod - def init(self) -> None: + def initialize_distributed_strategy(self) -> None: """Initializes the chosen distributed backend""" @abc.abstractmethod @@ -341,7 +341,9 @@ def create_dataloader( shuffle=shuffle, ) elif not isinstance(sampler, 
DistributedSampler): - raise RuntimeError("User-provided sampler must implement DistributedSampler.") + raise RuntimeError( + "User-provided sampler must implement DistributedSampler." + ) # shuffle and batch_sampler must be unset return DataLoader( dataset=dataset, @@ -422,7 +424,7 @@ def __init__(self, backend: Literal["nccl", "gloo", "mpi"]) -> None: self.backend = backend self.name = "torch-ddp" - def init(self) -> None: + def initialize_distributed_strategy(self) -> None: """Initializes the distributed process group and the distributed package. @@ -602,7 +604,7 @@ def __init__(self, backend: Literal["nccl", "gloo", "mpi"]) -> None: self.backend = backend self.name = "deepspeed" - def init(self) -> None: + def initialize_distributed_strategy(self) -> None: """Initializes the distributed process group and the distributed package. @@ -744,7 +746,9 @@ def gather_obj(self, obj: Any, dst_rank: int = 0) -> Optional[List[Any]]: dist.gather_object(obj, dst=dst_rank) @check_initialized - def gather(self, tensor: torch.Tensor, dst_rank: int = 0) -> Optional[List[torch.Tensor]]: + def gather( + self, tensor: torch.Tensor, dst_rank: int = 0 + ) -> Optional[List[torch.Tensor]]: """Gathers a tensor from the whole group in a list (to all workers). @@ -783,7 +787,7 @@ def __init__(self): super().__init__() self.name = "horovod" - def init(self) -> None: + def initialize_distributed_strategy(self) -> None: """Initializes the Horovod distributed backend. Raises: @@ -923,7 +927,9 @@ def gather_obj(self, obj: Any, dst_rank: int = 0) -> Optional[list[Any]]: return result @check_initialized - def gather(self, tensor: torch.Tensor, dst_rank: int = 0) -> Optional[List[torch.Tensor]]: + def gather( + self, tensor: torch.Tensor, dst_rank: int = 0 + ) -> Optional[List[torch.Tensor]]: """Gathers a tensor from the whole group in a list (to all workers). Under the hood it relies on allgather as gather is not supported by Horovod. 
@@ -956,7 +962,7 @@ def __init__(self): super().__init__() self.name = "non-distributed" - def init(self) -> None: + def initialize_distributed_strategy(self) -> None: """If CUDA is available set CUDA device, and do nothing more. Raises: @@ -1063,7 +1069,7 @@ def __init__(self) -> None: self.ray_train = ray.train - def init(self) -> None: + def initialize_distributed_strategy(self) -> None: self.is_initialized = True @check_initialized diff --git a/src/itwinai/torch/loggers.py b/src/itwinai/torch/loggers.py index 21ced00f4..8ca652df4 100644 --- a/src/itwinai/torch/loggers.py +++ b/src/itwinai/torch/loggers.py @@ -115,7 +115,9 @@ def finalize(self, status: str) -> None: @override @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics( + self, metrics: Dict[str, float], step: Optional[int] = None + ) -> None: """Lightning Logger function. Logs the given metrics and is usually called by the Lightning Trainer. @@ -125,7 +127,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> Defaults to None. 
""" for identifier, item in metrics.items(): - self.experiment.log(item=item, identifier=identifier, kind="metric", step=step) + self.experiment.log( + item=item, identifier=identifier, kind="metric", step=step + ) @override @rank_zero_only @@ -210,6 +214,8 @@ def _scan_and_log_checkpoints(self, checkpoint_callback: ModelCheckpoint) -> Non tmp_file_aliases.write(str(aliases)) # Log metadata and aliases - self.experiment.log(item=tmp_dir, identifier=artifact_path, kind="artifact") + self.experiment.log( + item=tmp_dir, identifier=artifact_path, kind="artifact" + ) self._logged_model_time[path] = time diff --git a/src/itwinai/torch/profiling/profiler.py b/src/itwinai/torch/profiling/profiler.py index 951783e8c..7722ea40f 100644 --- a/src/itwinai/torch/profiling/profiler.py +++ b/src/itwinai/torch/profiling/profiler.py @@ -95,7 +95,7 @@ def profiled_method(self: TorchTrainer, *args, **kwargs) -> Any: warmup=warmup_epochs, active=active_epochs, ), - with_modules=True + with_modules=True, ) as profiler: self.profiler = profiler result = method(self, *args, **kwargs) diff --git a/src/itwinai/torch/reproducibility.py b/src/itwinai/torch/reproducibility.py index 7ff657958..b6cc22cd3 100644 --- a/src/itwinai/torch/reproducibility.py +++ b/src/itwinai/torch/reproducibility.py @@ -23,7 +23,9 @@ def seed_worker(worker_id): random.seed(worker_seed) -def set_seed(rnd_seed: Optional[int], deterministic_cudnn: bool = True) -> torch.Generator: +def set_seed( + rnd_seed: Optional[int], deterministic_cudnn: bool = True +) -> torch.Generator: """Set torch random seed and return a PRNG object. 
Args: diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py index 3387eeb2a..175b669f8 100644 --- a/src/itwinai/torch/trainer.py +++ b/src/itwinai/torch/trainer.py @@ -189,12 +189,14 @@ def _detect_strategy(self, strategy: str) -> TorchDistributedStrategy: elif strategy == "deepspeed": strategy_obj = DeepSpeedStrategy(backend=self.config.dist_backend) else: - raise NotImplementedError(f"Strategy '{strategy}' is not recognized/implemented.") + raise NotImplementedError( + f"Strategy '{strategy}' is not recognized/implemented." + ) return strategy_obj def _init_distributed_strategy(self) -> None: if not self.strategy.is_initialized: - self.strategy.init() + self.strategy.initialize_distributed_strategy() def _optimizer_from_config(self) -> None: if self.config.optimizer == "adadelta": @@ -250,7 +252,9 @@ def get_default_distributed_kwargs(self) -> Dict: if isinstance(self.strategy, DeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) elif isinstance(self.strategy, HorovodStrategy): import horovod.torch as hvd @@ -631,7 +635,9 @@ def train_epoch(self, epoch: int) -> torch.Tensor: return avg_loss - def train_step(self, batch: Batch, batch_idx: int) -> Tuple[torch.Tensor, Dict[str, Any]]: + def train_step( + self, batch: Batch, batch_idx: int + ) -> Tuple[torch.Tensor, Dict[str, Any]]: """Perform a single optimization step using a batch sampled from the training dataset. @@ -771,7 +777,9 @@ def test_epoch(self, epoch: int) -> torch.Tensor: """ raise NotImplementedError() - def test_step(self, batch: Batch, batch_idx: int) -> Tuple[torch.Tensor, Dict[str, Any]]: + def test_step( + self, batch: Batch, batch_idx: int + ) -> Tuple[torch.Tensor, Dict[str, Any]]: """Perform a single predictions step using a batch sampled from the test dataset. 
@@ -860,14 +868,18 @@ def create_model_loss_optimizer(self) -> None: self.criterion = nn.BCELoss() # https://stackoverflow.com/a/67437077 - self.discriminator = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.discriminator) + self.discriminator = torch.nn.SyncBatchNorm.convert_sync_batchnorm( + self.discriminator + ) self.generator = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.generator) # First, define strategy-wise optional configurations if isinstance(self.strategy, DeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) else: distribute_kwargs = {} @@ -1097,7 +1109,9 @@ def save_checkpoint(self, name, epoch, loss=None): "generator_state_dict": self.generator.state_dict(), "optimizerD_state_dict": self.optimizerD.state_dict(), "optimizerG_state_dict": self.optimizerG.state_dict(), - "lr_scheduler": self.lr_scheduler.state_dict() if self.lr_scheduler else None, + "lr_scheduler": ( + self.lr_scheduler.state_dict() if self.lr_scheduler else None + ), } torch.save(checkpoint, checkpoint_path) @@ -1229,7 +1243,12 @@ def distributed(func): """ def dist_train( - model, train_dataloader, validation_dataloader=None, device="cpu", *args, **kwargs + model, + train_dataloader, + validation_dataloader=None, + device="cpu", + *args, + **kwargs, ): if torch.cuda.is_available(): dist.init_process_group(backend="nccl") @@ -1258,7 +1277,9 @@ def dist_train( ) try: - func(model, train_dataloader, validation_dataloader, device, *args, **kwargs) + func( + model, train_dataloader, validation_dataloader, device, *args, **kwargs + ) finally: if torch.cuda.is_available(): dist.barrier() @@ -1484,7 +1505,9 @@ def _set_scaling_config(self) -> None: scaling_config = self.config.get("scaling_config", {}) if not scaling_config: - print("WARNING: No Scaling Config 
configured. Running trials non-distributed.") + print( + "WARNING: No Scaling Config configured. Running trials non-distributed." + ) try: self.scaling_config = self.ray_train.ScalingConfig(**scaling_config) @@ -1501,7 +1524,9 @@ def _set_run_config(self) -> None: run_config = self.config.get("run_config", {}) if not run_config: - print("WARNING: No RunConfig provided. Assuming local or single-node execution.") + print( + "WARNING: No RunConfig provided. Assuming local or single-node execution." + ) try: storage_path = Path(run_config.pop("storage_path")).resolve() @@ -1512,7 +1537,9 @@ def _set_run_config(self) -> None: ) storage_path = Path("ray_checkpoints").resolve() - self.run_config = self.ray_train.RunConfig(**run_config, storage_path=storage_path) + self.run_config = self.ray_train.RunConfig( + **run_config, storage_path=storage_path + ) except AttributeError as e: print( "Could not set Run Config. Please ensure that you have passed the " @@ -1562,8 +1589,12 @@ def checkpoint_and_report(self, epoch, tuning_metrics, checkpointing_data=None): should_checkpoint = epoch % self.config.get("checkpoint_freq", 1) if checkpointing_data and should_checkpoint: - torch.save(checkpointing_data, os.path.join(temp_checkpoint_dir, str(epoch))) - checkpoint = self.ray_train.Checkpoint.from_directory(temp_checkpoint_dir) + torch.save( + checkpointing_data, os.path.join(temp_checkpoint_dir, str(epoch)) + ) + checkpoint = self.ray_train.Checkpoint.from_directory( + temp_checkpoint_dir + ) self.ray_train.report(tuning_metrics, checkpoint=checkpoint) diff --git a/src/itwinai/torch/tuning.py b/src/itwinai/torch/tuning.py index 25445df58..8c19c664a 100644 --- a/src/itwinai/torch/tuning.py +++ b/src/itwinai/torch/tuning.py @@ -59,7 +59,9 @@ def get_raytune_search_alg( f"INFO: Using scheduler {scheduler_name} " "is not compatible with Ray Tune search algorithms." 
) - print(f"Using the Ray Tune {scheduler_name} scheduler without search algorithm") + print( + f"Using the Ray Tune {scheduler_name} scheduler without search algorithm" + ) return None case "bohb": diff --git a/src/itwinai/utils.py b/src/itwinai/utils.py index ebd4266c1..7a4d50b84 100644 --- a/src/itwinai/utils.py +++ b/src/itwinai/utils.py @@ -71,7 +71,9 @@ def dynamically_import_class(name: str) -> Type: return klass -def flatten_dict(d: MutableMapping, parent_key: str = "", sep: str = ".") -> MutableMapping: +def flatten_dict( + d: MutableMapping, parent_key: str = "", sep: str = "." +) -> MutableMapping: """Flatten dictionary Args: @@ -174,7 +176,9 @@ def str_to_slice(interval: str) -> slice: return int(interval) -def clear_key(my_dict: Dict, dict_name: str, key: Hashable, complain: bool = True) -> Dict: +def clear_key( + my_dict: Dict, dict_name: str, key: Hashable, complain: bool = True +) -> Dict: """Remove key from dictionary if present and complain. Args: @@ -184,6 +188,8 @@ def clear_key(my_dict: Dict, dict_name: str, key: Hashable, complain: bool = Tru """ if key in my_dict: if complain: - print(f"Field '{key}' should not be present " f"in dictionary '{dict_name}'") + print( + f"Field '{key}' should not be present " f"in dictionary '{dict_name}'" + ) del my_dict[key] return my_dict diff --git a/tests/components/test_decorators.py b/tests/components/test_decorators.py index 76326816c..d9f8ecf34 100644 --- a/tests/components/test_decorators.py +++ b/tests/components/test_decorators.py @@ -64,7 +64,9 @@ def test_suppress_workers_print_decorator(): """Test suppress_workers_print decorator behavior.""" with ( patch("builtins.print", autospec=True) as mock_print, - patch("itwinai.distributed.detect_distributed_environment", autospec=True) as mock_env, + patch( + "itwinai.distributed.detect_distributed_environment", autospec=True + ) as mock_env, patch( "itwinai.distributed.distributed_patch_print", autospec=True ) as mock_patch_print, @@ -98,7 +100,9 @@ def 
test_suppress_workers_print_component(): execute method.""" with ( patch("builtins.print", autospec=True) as mock_print, - patch("itwinai.distributed.detect_distributed_environment", autospec=True) as mock_env, + patch( + "itwinai.distributed.detect_distributed_environment", autospec=True + ) as mock_env, patch( "itwinai.distributed.distributed_patch_print", autospec=True ) as mock_patch_print, @@ -149,7 +153,9 @@ def dummy_method(self): assert result == "Execution result" # Check that the start and end messages were logged - mock_component._printout.assert_any_call("Starting execution of 'TestComponent'...") + mock_component._printout.assert_any_call( + "Starting execution of 'TestComponent'..." + ) mock_component._printout.assert_any_call("'TestComponent' executed in 5.000s") # Ensure the cleanup method was called @@ -165,7 +171,9 @@ def test_combined_decorators_on_fake_component(mock_fake_component): with ( patch("builtins.print", autospec=True) as mock_print, - patch("itwinai.distributed.detect_distributed_environment", autospec=True) as mock_env, + patch( + "itwinai.distributed.detect_distributed_environment", autospec=True + ) as mock_env, patch( "itwinai.distributed.distributed_patch_print", autospec=True ) as mock_patch_print, diff --git a/tests/components/test_pipe_parser.py b/tests/components/test_pipe_parser.py index 6e9ddd65c..2cf6317c0 100644 --- a/tests/components/test_pipe_parser.py +++ b/tests/components/test_pipe_parser.py @@ -45,15 +45,22 @@ def test_add_replace_field(): target6 = dict(first=dict(list1=[[0, 1], [77, 3]], el=0)) assert conf2 == target6 - conf3 = dict(first=dict(list1=[[0, dict(nst=("el", dict(ciao="ciao")))], [2, 3]], el=0)) + conf3 = dict( + first=dict(list1=[[0, dict(nst=("el", dict(ciao="ciao")))], [2, 3]], el=0) + ) add_replace_field(conf3, "first.list1.0.1.nst.1.ciao", "hello") - target7 = dict(first=dict(list1=[[0, dict(nst=("el", dict(ciao="hello")))], [2, 3]], el=0)) + target7 = dict( + first=dict(list1=[[0, dict(nst=("el", 
dict(ciao="hello")))], [2, 3]], el=0) + ) assert conf3 == target7 add_replace_field(conf3, "first.list1.0.1.nst.1.ciao.I.am.john", True) target8 = dict( first=dict( - list1=[[0, dict(nst=("el", dict(ciao=dict(I=dict(am=dict(john=True))))))], [2, 3]], + list1=[ + [0, dict(nst=("el", dict(ciao=dict(I=dict(am=dict(john=True))))))], + [2, 3], + ], el=0, ) ) @@ -107,7 +114,9 @@ def test_dynamic_override_parser_pipeline_dict(): """ config = yaml.safe_load(pytest.PIPE_DICT_YAML) - override_keys = {"my-dict-pipeline.init_args.steps.preproc-step.init_args.max_items": 33} + override_keys = { + "my-dict-pipeline.init_args.steps.preproc-step.init_args.max_items": 33 + } parser = ConfigParser(config=config, override_keys=override_keys) pipe = parser.parse_pipeline(pipeline_nested_key="my-dict-pipeline") assert pipe.steps["preproc-step"].max_items == 33 @@ -139,7 +148,9 @@ def test_parse_step_list_pipeline(): with pytest.raises(IndexError): _ = parser.parse_step(step_idx=12, pipeline_nested_key="my-list-pipeline") with pytest.raises(TypeError): - _ = parser.parse_step(step_idx="my-step-name", pipeline_nested_key="my-list-pipeline") + _ = parser.parse_step( + step_idx="my-step-name", pipeline_nested_key="my-list-pipeline" + ) def test_parse_step_dict_pipeline(): @@ -148,13 +159,17 @@ def test_parse_step_dict_pipeline(): """ config = yaml.safe_load(pytest.PIPE_DICT_YAML) parser = ConfigParser(config=config) - step = parser.parse_step(step_idx="preproc-step", pipeline_nested_key="my-dict-pipeline") + step = parser.parse_step( + step_idx="preproc-step", pipeline_nested_key="my-dict-pipeline" + ) assert isinstance(step, BaseComponent) assert isinstance(step, FakePreproc) with pytest.raises(KeyError): - _ = parser.parse_step(step_idx="unk-step", pipeline_nested_key="my-dict-pipeline") + _ = parser.parse_step( + step_idx="unk-step", pipeline_nested_key="my-dict-pipeline" + ) with pytest.raises(KeyError): _ = parser.parse_step(step_idx=0, pipeline_nested_key="my-dict-pipeline") diff 
--git a/tests/loggers/test_lightning_logger.py b/tests/loggers/test_lightning_logger.py index 356bd6e04..1ebcd759a 100644 --- a/tests/loggers/test_lightning_logger.py +++ b/tests/loggers/test_lightning_logger.py @@ -148,7 +148,9 @@ def test_log_metrics_and_hyperparams(itwinai_logger, request): dict_params = {"learning_rate": 0.001, "batch_size": 32} lightning_logger.log_hyperparams(params=dict_params) - itwinai_logger_instance.save_hyperparameters.assert_called_once_with(dict_params) + itwinai_logger_instance.save_hyperparameters.assert_called_once_with( + dict_params + ) @pytest.mark.parametrize( diff --git a/tests/torch/conftest.py b/tests/torch/conftest.py index 566335ccc..6486134d1 100644 --- a/tests/torch/conftest.py +++ b/tests/torch/conftest.py @@ -24,7 +24,7 @@ def ddp_strategy() -> Generator[TorchDistributedStrategy, None, None]: """Instantiate Torch's DistributedDataParallel strategy.""" strategy = TorchDDPStrategy(backend="nccl" if torch.cuda.is_available() else "gloo") - strategy.init() + strategy.initialize_distributed_strategy() yield strategy strategy.clean_up() @@ -32,8 +32,10 @@ def ddp_strategy() -> Generator[TorchDistributedStrategy, None, None]: @pytest.fixture(scope="package") def deepspeed_strategy() -> Generator[DeepSpeedStrategy, None, None]: """Instantiate DeepSpeed strategy.""" - strategy = DeepSpeedStrategy(backend="nccl" if torch.cuda.is_available() else "gloo") - strategy.init() + strategy = DeepSpeedStrategy( + backend="nccl" if torch.cuda.is_available() else "gloo" + ) + strategy.initialize_distributed_strategy() yield strategy strategy.clean_up() diff --git a/tests/torch/distributed_decorator.py b/tests/torch/distributed_decorator.py index cf89f1e8e..f2b0d7d3a 100644 --- a/tests/torch/distributed_decorator.py +++ b/tests/torch/distributed_decorator.py @@ -95,7 +95,13 @@ def test(model, device, test_loader): @distributed def train_func( - model, train_dataloader, validation_dataloader, device, optimizer, scheduler, epochs=10 + model, 
+ train_dataloader, + validation_dataloader, + device, + optimizer, + scheduler, + epochs=10, ): for epoch in range(1, epochs + 1): train(model, device, train_dataloader, optimizer, epoch) diff --git a/tests/torch/test_distributed.py b/tests/torch/test_distributed.py index d5523feca..c08b159af 100644 --- a/tests/torch/test_distributed.py +++ b/tests/torch/test_distributed.py @@ -70,13 +70,16 @@ def test_cluster_properties(self, strategy: TorchDistributedStrategy): assert strategy.local_rank() >= 0 def test_init_exceptions( - self, strategy: TorchDistributedStrategy, simple_model: nn.Module, optimizer: Any + self, + strategy: TorchDistributedStrategy, + simple_model: nn.Module, + optimizer: Any, ): """Check that the init method cannot be called twice and that the other methods raise and exception if called when the strategy is not initialized.""" # Test re-initialization with pytest.raises(DistributedStrategyError) as init_exc: - strategy.init() + strategy.initialize_distributed_strategy() assert "already initialized" in init_exc.value # Test initialized flag @@ -130,7 +133,10 @@ def test_gather_operations(self, strategy): my_tensor = torch.ones(10) * strategy.global_rank() tensors = strategy.gather(my_tensor, dst_rank=0) if strategy.is_main_worker: - assert torch.stack(tensors).sum() == sum(range(strategy.global_world_size())) * 10 + assert ( + torch.stack(tensors).sum() + == sum(range(strategy.global_world_size())) * 10 + ) else: assert tensors is None @@ -174,7 +180,7 @@ class TestTorchDDPStrategy(BaseTestDistributedStrategy): def strategy(self, ddp_strategy) -> TorchDDPStrategy: return ddp_strategy - def test_init(self, strategy: TorchDDPStrategy): + def test_initialize_distributed_strategy(self, strategy: TorchDDPStrategy): """Test specific initialization of TorchDDPStrategy.""" assert strategy.backend in ["nccl", "gloo"] @@ -184,7 +190,7 @@ def test_init(self, strategy: TorchDDPStrategy): strategy = TorchDDPStrategy( backend="nccl" if torch.cuda.is_available() 
else "gloo" ) - strategy.init() + strategy.initialize_distributed_strategy() mock_init_torch.assert_called_once() def test_distributed_model( @@ -206,7 +212,7 @@ class TestDeepSpeedStrategy(BaseTestDistributedStrategy): def strategy(self, deepspeed_strategy) -> DeepSpeedStrategy: return deepspeed_strategy - def test_init(self, strategy: DeepSpeedStrategy): + def test_initialize_distributed_strategy(self, strategy: DeepSpeedStrategy): """Test specific initialization of DeepSpeedStrategy.""" assert strategy.backend in ["nccl", "gloo", "mpi"] assert hasattr( @@ -219,7 +225,7 @@ def test_init(self, strategy: DeepSpeedStrategy): strategy = DeepSpeedStrategy( backend="nccl" if torch.cuda.is_available() else "gloo" ) - strategy.init() + strategy.initialize_distributed_strategy() mock_init_ds.assert_called_once() def test_distributed_model( @@ -250,15 +256,17 @@ class TestHorovodStrategy(BaseTestDistributedStrategy): def strategy(self, horovod_strategy) -> HorovodStrategy: return horovod_strategy - def test_init(self, strategy): + def test_initialize_distributed_strategy(self, strategy): assert strategy.is_initialized - assert hasattr(strategy, "hvd"), "Lazy import of horovod not found in strategy class." + assert hasattr( + strategy, "hvd" + ), "Lazy import of horovod not found in strategy class." 
# Test initialization init_path = "horovod.torch.init" with patch(init_path, autospec=True) as mock_init_ds: strategy = HorovodStrategy() - strategy.init() + strategy.initialize_distributed_strategy() mock_init_ds.assert_called_once() def test_distributed_model(self, strategy, simple_model, optimizer): diff --git a/tests/torch/test_torch_trainer.py b/tests/torch/test_torch_trainer.py index 3302387c9..98898b704 100644 --- a/tests/torch/test_torch_trainer.py +++ b/tests/torch/test_torch_trainer.py @@ -71,11 +71,15 @@ def mnist_datasets(): "strategy_name,strategy_fixture", [ pytest.param("ddp", "ddp_strategy", marks=pytest.mark.torch_dist), - pytest.param("deepspeed", "deepspeed_strategy", marks=pytest.mark.deepspeed_dist), + pytest.param( + "deepspeed", "deepspeed_strategy", marks=pytest.mark.deepspeed_dist + ), pytest.param("horovod", "horovod_strategy", marks=pytest.mark.horovod_dist), ], ) -def test_distributed_trainer_mnist(mnist_datasets, request, strategy_name, strategy_fixture): +def test_distributed_trainer_mnist( + mnist_datasets, request, strategy_name, strategy_fixture +): """Test TorchTrainer on MNIST with different distributed strategies.""" training_config = dict(optimizer="sgd", loss="nllloss") trainer = TorchTrainer( diff --git a/tests/use-cases/test_cyclones.py b/tests/use-cases/test_cyclones.py index c05e3d639..dc8eb889f 100644 --- a/tests/use-cases/test_cyclones.py +++ b/tests/use-cases/test_cyclones.py @@ -48,6 +48,9 @@ def test_cyclones_train_tf(tf_env, install_requirements): pipe = CYCLONES_PATH / "pipeline.yaml" train = CYCLONES_PATH / "train.py" - cmd = f"{tf_env}/bin/python {train.resolve()} " f"-p {pipe.resolve()} --data_path {dataset_path}" + cmd = ( + f"{tf_env}/bin/python {train.resolve()} " + f"-p {pipe.resolve()} --data_path {dataset_path}" + ) with tempfile.TemporaryDirectory() as tmpdirname: subprocess.run(cmd.split(), check=True, cwd=tmpdirname) diff --git a/tests/use-cases/test_mnist.py b/tests/use-cases/test_mnist.py index 
7d6fe753b..1d26c8278 100644 --- a/tests/use-cases/test_mnist.py +++ b/tests/use-cases/test_mnist.py @@ -124,6 +124,9 @@ def test_mnist_train_tf(tf_env, install_requirements): """ install_requirements(TF_PATH, tf_env) conf = TF_PATH / "pipeline.yaml" - cmd = f"{tf_env}/bin/itwinai exec-pipeline " f"--config {conf.resolve()} --pipe-key pipeline" + cmd = ( + f"{tf_env}/bin/itwinai exec-pipeline " + f"--config {conf.resolve()} --pipe-key pipeline" + ) with tempfile.TemporaryDirectory() as temp_dir: subprocess.run(cmd.split(), check=True, cwd=temp_dir) diff --git a/tutorials/distributed-ml/tf-scaling-test-jube/train.py b/tutorials/distributed-ml/tf-scaling-test-jube/train.py index 4bd4ff587..509d0bc4e 100644 --- a/tutorials/distributed-ml/tf-scaling-test-jube/train.py +++ b/tutorials/distributed-ml/tf-scaling-test-jube/train.py @@ -5,6 +5,7 @@ >>> sbatch tfmirrored_slurm.sh """ + import argparse import sys from timeit import default_timer as timer @@ -21,25 +22,14 @@ def parse_args(): """ Parse args """ - parser = argparse.ArgumentParser(description='TensorFlow ImageNet') + parser = argparse.ArgumentParser(description="TensorFlow ImageNet") parser.add_argument( - "--strategy", "-s", type=str, - choices=['mirrored'], - default='mirrored' - ) - parser.add_argument( - "--data_dir", type=str, - default='./' - ) - parser.add_argument( - "--batch_size", type=int, - default=128 - ) - parser.add_argument( - "--epochs", type=int, - default=3 + "--strategy", "-s", type=str, choices=["mirrored"], default="mirrored" ) + parser.add_argument("--data_dir", type=str, default="./") + parser.add_argument("--batch_size", type=int, default=128) + parser.add_argument("--epochs", type=int, default=3) args = parser.parse_args() return args @@ -57,13 +47,13 @@ def deserialization_fn(serialized_fn): parsed_example = tf.io.parse_single_example( serialized_fn, features={ - 'image/encoded': tf.io.FixedLenFeature([], tf.string), - 'image/class/label': tf.io.FixedLenFeature([], tf.int64), - } + 
"image/encoded": tf.io.FixedLenFeature([], tf.string), + "image/class/label": tf.io.FixedLenFeature([], tf.int64), + }, ) - image = tf.image.decode_jpeg(parsed_example['image/encoded'], channels=3) + image = tf.image.decode_jpeg(parsed_example["image/encoded"], channels=3) image = tf.image.resize(image, (224, 224)) - label = tf.cast(parsed_example['image/class/label'], tf.int64) - 1 + label = tf.cast(parsed_example["image/class/label"], tf.int64) - 1 return image, label @@ -81,8 +71,7 @@ def tf_records_loader(files_path, shuffle=False): datasets = tf.data.Dataset.from_tensor_slices(files_path) datasets = datasets.shuffle(len(files_path)) if shuffle else datasets datasets = datasets.flat_map(tf.data.TFRecordDataset) - datasets = datasets.map( - deserialization_fn, num_parallel_calls=tf.data.AUTOTUNE) + datasets = datasets.map(deserialization_fn, num_parallel_calls=tf.data.AUTOTUNE) return datasets @@ -92,11 +81,12 @@ def main(): input_shape = (224, 224, 3) num_classes = 1000 - if args.strategy == 'mirrored': + if args.strategy == "mirrored": strategy = get_strategy()[0] else: raise NotImplementedError( - f"Strategy {args.strategy} is not recognized/implemented.") + f"Strategy {args.strategy} is not recognized/implemented." 
+ ) with strategy.scope(): base_model = keras.applications.ResNet50( @@ -107,61 +97,54 @@ def main(): x = base_model.output x = GlobalAveragePooling2D()(x) - x = Dense(1024, activation='relu')(x) - predictions = Dense(num_classes, activation='softmax')(x) + x = Dense(1024, activation="relu")(x) + predictions = Dense(num_classes, activation="softmax")(x) model = Model(inputs=base_model.input, outputs=predictions) - model.compile(loss=keras.losses.sparse_categorical_crossentropy, - optimizer=keras.optimizers.Adam(), - metrics=['accuracy'] - ) + model.compile( + loss=keras.losses.sparse_categorical_crossentropy, + optimizer=keras.optimizers.Adam(), + metrics=["accuracy"], + ) # scale batch size with number of workers batch_size = args.batch_size * get_strategy()[1] - dir_imagenet = args.data_dir+'imagenet-1K-tfrecords' - train_shard_suffix = 'train-*-of-01024' - test_shard_suffix = 'validation-*-of-00128' + dir_imagenet = args.data_dir + "imagenet-1K-tfrecords" + train_shard_suffix = "train-*-of-01024" + test_shard_suffix = "validation-*-of-00128" - train_set_path = sorted( - tf.io.gfile.glob(dir_imagenet + f'/{train_shard_suffix}') - ) - test_set_path = sorted( - tf.io.gfile.glob(dir_imagenet + f'/{test_shard_suffix}') - ) + train_set_path = sorted(tf.io.gfile.glob(dir_imagenet + f"/{train_shard_suffix}")) + test_set_path = sorted(tf.io.gfile.glob(dir_imagenet + f"/{test_shard_suffix}")) train_dataset = tf_records_loader(train_set_path, shuffle=True) test_dataset = tf_records_loader(test_set_path) - train_dataset = train_dataset.batch( - batch_size).prefetch(tf.data.experimental.AUTOTUNE) - test_dataset = test_dataset.batch( - batch_size).prefetch(tf.data.experimental.AUTOTUNE) - - # distribute datasets among mirrored replicas - dist_train = strategy.experimental_distribute_dataset( - train_dataset + train_dataset = train_dataset.batch(batch_size).prefetch( + tf.data.experimental.AUTOTUNE ) - dist_test = strategy.experimental_distribute_dataset( - test_dataset + 
test_dataset = test_dataset.batch(batch_size).prefetch( + tf.data.experimental.AUTOTUNE ) + # distribute datasets among mirrored replicas + dist_train = strategy.experimental_distribute_dataset(train_dataset) + dist_test = strategy.experimental_distribute_dataset(test_dataset) + # TODO: add callbacks to evaluate per epoch time et = timer() # trains the model model.fit(dist_train, epochs=args.epochs, steps_per_epoch=2000, verbose=10) - print('TIMER: total epoch time:', - timer() - et, ' s') - print('TIMER: average epoch time:', - (timer() - et) / (args.epochs), ' s') + print("TIMER: total epoch time:", timer() - et, " s") + print("TIMER: average epoch time:", (timer() - et) / (args.epochs), " s") test_scores = model.evaluate(dist_test, steps=100, verbose=5) - print('Test loss:', test_scores[0]) - print('Test accuracy:', test_scores[1]) + print("Test loss:", test_scores[0]) + print("Test accuracy:", test_scores[1]) if __name__ == "__main__": diff --git a/tutorials/distributed-ml/tf-tutorial-0-basics/train.py b/tutorials/distributed-ml/tf-tutorial-0-basics/train.py index da84f9a7d..809351665 100644 --- a/tutorials/distributed-ml/tf-tutorial-0-basics/train.py +++ b/tutorials/distributed-ml/tf-tutorial-0-basics/train.py @@ -5,6 +5,7 @@ >>> sbatch tfmirrored_slurm.sh """ + from typing import Any import argparse import tensorflow as tf @@ -15,18 +16,10 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument( - "--strategy", "-s", type=str, - choices=['mirrored'], - default='mirrored' - ) - parser.add_argument( - "--batch_size", "-bs", type=int, - default=64 - ) - parser.add_argument( - "--shuffle_dataloader", - action=argparse.BooleanOptionalAction + "--strategy", "-s", type=str, choices=["mirrored"], default="mirrored" ) + parser.add_argument("--batch_size", "-bs", type=int, default=64) + parser.add_argument("--shuffle_dataloader", action=argparse.BooleanOptionalAction) args = parser.parse_args() return args @@ -34,9 +27,9 @@ def 
parse_args() -> argparse.Namespace: def tf_rnd_dataset(args): """Dummy TF dataset.""" - (x_train, y_train), (x_test, y_test) = \ - tf.keras.datasets.mnist.load_data( - path='p/scratch/intertwin/datasets/.keras/datasets/mnist.npz') + (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data( + path="p/scratch/intertwin/datasets/.keras/datasets/mnist.npz" + ) train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = train_dataset.batch(args.batch_size) @@ -47,9 +40,7 @@ def tf_rnd_dataset(args): return train_dataset, test_dataset -def trainer_entrypoint_fn( - foo: Any, args: argparse.Namespace, strategy -) -> int: +def trainer_entrypoint_fn(foo: Any, args: argparse.Namespace, strategy) -> int: """Dummy training function, similar to custom code developed by some use case. """ @@ -57,36 +48,32 @@ def trainer_entrypoint_fn( train_dataset, test_dataset = tf_rnd_dataset(args) # distribute datasets among mirrored replicas - dist_train = strategy.experimental_distribute_dataset( - train_dataset - ) - dist_test = strategy.experimental_distribute_dataset( - test_dataset - ) + dist_train = strategy.experimental_distribute_dataset(train_dataset) + dist_test = strategy.experimental_distribute_dataset(test_dataset) # define and compile model within strategy.scope() with strategy.scope(): # Local model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ]) - - model.compile(loss=keras.losses.SparseCategoricalCrossentropy - (from_logits=True), - optimizer=keras.optimizers.RMSprop(), - metrics=['accuracy'] - ) - - model.fit(dist_train, - epochs=5, - steps_per_epoch=2000) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) + + model.compile( + 
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.RMSprop(), + metrics=["accuracy"], + ) + + model.fit(dist_train, epochs=5, steps_per_epoch=2000) test_scores = model.evaluate(dist_test, verbose=0, steps=500) - print('Test loss:', test_scores[0]) - print('Test accuracy:', test_scores[1]) + print("Test loss:", test_scores[0]) + print("Test accuracy:", test_scores[1]) return 123 @@ -96,13 +83,14 @@ def trainer_entrypoint_fn( args = parse_args() # Instantiate Strategy - if args.strategy == 'mirrored': - if (len(tf.config.list_physical_devices('GPU')) == 0): - raise RuntimeError('Resources unavailable') + if args.strategy == "mirrored": + if len(tf.config.list_physical_devices("GPU")) == 0: + raise RuntimeError("Resources unavailable") strategy, num_replicas = get_strategy() else: raise NotImplementedError( - f"Strategy {args.strategy} is not recognized/implemented.") + f"Strategy {args.strategy} is not recognized/implemented." + ) # Launch distributed training trainer_entrypoint_fn("foobar", args, strategy) diff --git a/tutorials/distributed-ml/tf-tutorial-1-imagenet/train.py b/tutorials/distributed-ml/tf-tutorial-1-imagenet/train.py index d820fd9b3..014d8b992 100644 --- a/tutorials/distributed-ml/tf-tutorial-1-imagenet/train.py +++ b/tutorials/distributed-ml/tf-tutorial-1-imagenet/train.py @@ -5,6 +5,7 @@ >>> sbatch tfmirrored_slurm.sh """ + import argparse import sys from timeit import default_timer as timer @@ -21,25 +22,14 @@ def parse_args(): """ Parse args """ - parser = argparse.ArgumentParser(description='TensorFlow ImageNet') + parser = argparse.ArgumentParser(description="TensorFlow ImageNet") parser.add_argument( - "--strategy", "-s", type=str, - choices=['mirrored'], - default='mirrored' - ) - parser.add_argument( - "--data_dir", type=str, - default='./' - ) - parser.add_argument( - "--batch_size", type=int, - default=128 - ) - parser.add_argument( - "--epochs", type=int, - default=3 + "--strategy", "-s", 
type=str, choices=["mirrored"], default="mirrored" ) + parser.add_argument("--data_dir", type=str, default="./") + parser.add_argument("--batch_size", type=int, default=128) + parser.add_argument("--epochs", type=int, default=3) args = parser.parse_args() return args @@ -57,13 +47,13 @@ def deserialization_fn(serialized_fn): parsed_example = tf.io.parse_single_example( serialized_fn, features={ - 'image/encoded': tf.io.FixedLenFeature([], tf.string), - 'image/class/label': tf.io.FixedLenFeature([], tf.int64), - } + "image/encoded": tf.io.FixedLenFeature([], tf.string), + "image/class/label": tf.io.FixedLenFeature([], tf.int64), + }, ) - image = tf.image.decode_jpeg(parsed_example['image/encoded'], channels=3) + image = tf.image.decode_jpeg(parsed_example["image/encoded"], channels=3) image = tf.image.resize(image, (224, 224)) - label = tf.cast(parsed_example['image/class/label'], tf.int64) - 1 + label = tf.cast(parsed_example["image/class/label"], tf.int64) - 1 return image, label @@ -81,8 +71,7 @@ def tf_records_loader(files_path, shuffle=False): datasets = tf.data.Dataset.from_tensor_slices(files_path) datasets = datasets.shuffle(len(files_path)) if shuffle else datasets datasets = datasets.flat_map(tf.data.TFRecordDataset) - datasets = datasets.map( - deserialization_fn, num_parallel_calls=tf.data.AUTOTUNE) + datasets = datasets.map(deserialization_fn, num_parallel_calls=tf.data.AUTOTUNE) return datasets @@ -92,11 +81,12 @@ def main(): input_shape = (224, 224, 3) num_classes = 1000 - if args.strategy == 'mirrored': + if args.strategy == "mirrored": strategy = get_strategy()[0] else: raise NotImplementedError( - f"Strategy {args.strategy} is not recognized/implemented.") + f"Strategy {args.strategy} is not recognized/implemented." 
+ ) with strategy.scope(): base_model = keras.applications.ResNet50( @@ -107,61 +97,54 @@ def main(): x = base_model.output x = GlobalAveragePooling2D()(x) - x = Dense(1024, activation='relu')(x) - predictions = Dense(num_classes, activation='softmax')(x) + x = Dense(1024, activation="relu")(x) + predictions = Dense(num_classes, activation="softmax")(x) model = Model(inputs=base_model.input, outputs=predictions) - model.compile(loss=keras.losses.sparse_categorical_crossentropy, - optimizer=keras.optimizers.Adam(), - metrics=['accuracy'] - ) + model.compile( + loss=keras.losses.sparse_categorical_crossentropy, + optimizer=keras.optimizers.Adam(), + metrics=["accuracy"], + ) # scale batch size with number of workers batch_size = args.batch_size * get_strategy()[1] - dir_imagenet = args.data_dir+'imagenet-1K-tfrecords' - train_shard_suffix = 'train-*-of-01024' - test_shard_suffix = 'validation-*-of-00128' + dir_imagenet = args.data_dir + "imagenet-1K-tfrecords" + train_shard_suffix = "train-*-of-01024" + test_shard_suffix = "validation-*-of-00128" - train_set_path = sorted( - tf.io.gfile.glob(dir_imagenet + f'/{train_shard_suffix}') - ) - test_set_path = sorted( - tf.io.gfile.glob(dir_imagenet + f'/{test_shard_suffix}') - ) + train_set_path = sorted(tf.io.gfile.glob(dir_imagenet + f"/{train_shard_suffix}")) + test_set_path = sorted(tf.io.gfile.glob(dir_imagenet + f"/{test_shard_suffix}")) train_dataset = tf_records_loader(train_set_path, shuffle=True) test_dataset = tf_records_loader(test_set_path) - train_dataset = train_dataset.batch( - batch_size).prefetch(tf.data.experimental.AUTOTUNE) - test_dataset = test_dataset.batch( - batch_size).prefetch(tf.data.experimental.AUTOTUNE) - - # distribute datasets among mirrored replicas - dist_train = strategy.experimental_distribute_dataset( - train_dataset + train_dataset = train_dataset.batch(batch_size).prefetch( + tf.data.experimental.AUTOTUNE ) - dist_test = strategy.experimental_distribute_dataset( - test_dataset + 
test_dataset = test_dataset.batch(batch_size).prefetch( + tf.data.experimental.AUTOTUNE ) + # distribute datasets among mirrored replicas + dist_train = strategy.experimental_distribute_dataset(train_dataset) + dist_test = strategy.experimental_distribute_dataset(test_dataset) + # TODO: add callbacks to evaluate per epoch time et = timer() # trains the model model.fit(dist_train, epochs=args.epochs, steps_per_epoch=500, verbose=10) - print('TIMER: total epoch time:', - timer() - et, ' s') - print('TIMER: average epoch time:', - (timer() - et) / (args.epochs), ' s') + print("TIMER: total epoch time:", timer() - et, " s") + print("TIMER: average epoch time:", (timer() - et) / (args.epochs), " s") test_scores = model.evaluate(dist_test, steps=100, verbose=5) - print('Test loss:', test_scores[0]) - print('Test accuracy:', test_scores[1]) + print("Test loss:", test_scores[0]) + print("Test accuracy:", test_scores[1]) if __name__ == "__main__": diff --git a/tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py b/tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py index 6a6cd7039..985e01a45 100644 --- a/tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py +++ b/tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py @@ -68,9 +68,14 @@ def main(): "--epochs", type=int, default=14, help="number of epochs to train (default: 14)" ) parser.add_argument( - "--strategy", type=str, default="ddp", help="distributed strategy (default: ddp)" + "--strategy", + type=str, + default="ddp", + help="distributed strategy (default: ddp)", + ) + parser.add_argument( + "--lr", type=float, default=1.0, help="learning rate (default: 1.0)" ) - parser.add_argument("--lr", type=float, default=1.0, help="learning rate (default: 1.0)") parser.add_argument("--seed", type=int, default=1, help="random seed (default: 1)") parser.add_argument( "--ckpt-interval", @@ -120,7 +125,9 @@ def main(): time.sleep(1) # Dataset creation - train_dataset = datasets.MNIST("data", train=True, download=False, 
transform=transform) + train_dataset = datasets.MNIST( + "data", train=True, download=False, transform=transform + ) validation_dataset = datasets.MNIST( "data", train=False, download=False, transform=transform ) diff --git a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py index b1ee4ea95..b76c2d26b 100644 --- a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py @@ -64,7 +64,7 @@ def main(): raise NotImplementedError( f"Strategy {args.strategy} is not recognized/implemented." ) - strategy.init() + strategy.initialize_distributed_strategy() # Check resource availability use_cuda = not args.no_cuda and torch.cuda.is_available() diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/train.py b/tutorials/distributed-ml/torch-tutorial-0-basics/train.py index 1e98d8785..8e25c33d6 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/train.py +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/train.py @@ -33,7 +33,11 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument( - "--strategy", "-s", type=str, choices=["ddp", "horovod", "deepspeed"], default="ddp" + "--strategy", + "-s", + type=str, + choices=["ddp", "horovod", "deepspeed"], + default="ddp", ) parser.add_argument("--shuffle_dataloader", action=argparse.BooleanOptionalAction) parser.add_argument( @@ -74,10 +78,12 @@ def __getitem__(self, index): def training_fn( - args: argparse.Namespace, strategy: TorchDistributedStrategy, distribute_kwargs: Dict + args: argparse.Namespace, + strategy: TorchDistributedStrategy, + distribute_kwargs: Dict, ) -> int: """Dummy training function.""" - strategy.init() + strategy.initialize_distributed_strategy() # Local model model = nn.Linear(3, 4) @@ -144,7 +150,9 @@ def training_fn( elif args.strategy == "horovod": strategy = HorovodStrategy() distribute_kwargs 
= dict( - compression=hvd.Compression.none, op=hvd.Average, gradient_predivide_factor=1.0 + compression=hvd.Compression.none, + op=hvd.Average, + gradient_predivide_factor=1.0, ) elif args.strategy == "deepspeed": strategy = DeepSpeedStrategy(backend="nccl") @@ -152,6 +160,8 @@ def training_fn( config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) ) else: - raise NotImplementedError(f"Strategy {args.strategy} is not recognized/implemented.") + raise NotImplementedError( + f"Strategy {args.strategy} is not recognized/implemented." + ) # Launch distributed training training_fn(args, strategy, distribute_kwargs) diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py b/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py index da41ef774..ea08c4df7 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py @@ -59,7 +59,7 @@ def parse_params() -> argparse.Namespace: parser.add_argument( "--data-dir", default="./", - help=("location of the training dataset in the local " "filesystem"), + help=("location of the training dataset in the local filesystem"), ) parser.add_argument( "--log-int", type=int, default=10, help="log interval per training" @@ -96,7 +96,7 @@ def parse_params() -> argparse.Namespace: "--nworker", type=int, default=0, - help=("number of workers in DataLoader (default: 0 -" " only main)"), + help=("number of workers in DataLoader (default: 0 - only main)"), ) parser.add_argument( "--prefetch", @@ -164,7 +164,7 @@ def parse_params() -> argparse.Namespace: "--gradient-predivide-factor", type=float, default=1.0, - help=("apply gradient pre-divide factor in optimizer " "(default: 1.0)"), + help=("apply gradient pre-divide factor in optimizer (default: 1.0)"), ) # DeepSpeed @@ -261,8 +261,7 @@ def test(model, test_loader, strategy: TorchDistributedStrategy): if strategy.is_main_worker: dl_size = len(test_loader.dataset) // strategy.global_world_size() 
print( - f"Test set: average loss: {test_loss:.4f}\t" - f"accurate samples: {correct}/{dl_size}" + f"Test set: average loss: {test_loss:.4f}\taccurate samples: {correct}/{dl_size}" ) acc_test = 100.0 * correct * strategy.global_world_size() / len(test_loader.dataset) return acc_test @@ -367,7 +366,7 @@ def mnist_dataset(dataset_replication: int = 1) -> Tuple[Dataset, Dataset]: ) # Initialize strategy - strategy.init() + strategy.initialize_distributed_strategy() # Start the timer for profiling st = timer() @@ -499,7 +498,7 @@ def mnist_dataset(dataset_replication: int = 1) -> Tuple[Dataset, Dataset]: ) print("DEBUG: memory summary:\n\n", torch.cuda.memory_summary(0)) - print(f"TIMER: final time: {timer()-st} s\n") + print(f"TIMER: final time: {timer() - st} s\n") time.sleep(1) print(f" - TRAINING FINISHED") diff --git a/tutorials/distributed-ml/torch-tutorial-2-trainer-class/train.py b/tutorials/distributed-ml/torch-tutorial-2-trainer-class/train.py index 53a52c203..2f9f5c2d1 100644 --- a/tutorials/distributed-ml/torch-tutorial-2-trainer-class/train.py +++ b/tutorials/distributed-ml/torch-tutorial-2-trainer-class/train.py @@ -62,7 +62,9 @@ def main(): parser.add_argument( "--strategy", type=str, default="ddp", help="distributed strategy (default=ddp)" ) - parser.add_argument("--lr", type=float, default=1.0, help="learning rate (default: 1.0)") + parser.add_argument( + "--lr", type=float, default=1.0, help="learning rate (default: 1.0)" + ) parser.add_argument("--seed", type=int, default=1, help="random seed (default: 1)") parser.add_argument( "--ckpt-interval", @@ -76,7 +78,9 @@ def main(): transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] ) - train_dataset = datasets.MNIST("../data", train=True, download=True, transform=transform) + train_dataset = datasets.MNIST( + "../data", train=True, download=True, transform=transform + ) validation_dataset = datasets.MNIST("../data", train=False, transform=transform) # Neural 
network to train diff --git a/tutorials/distributed-ml/torch-tutorial-GAN/simpleGAN.py b/tutorials/distributed-ml/torch-tutorial-GAN/simpleGAN.py index c285616ec..fd63eaae6 100644 --- a/tutorials/distributed-ml/torch-tutorial-GAN/simpleGAN.py +++ b/tutorials/distributed-ml/torch-tutorial-GAN/simpleGAN.py @@ -49,7 +49,11 @@ root=DATA_PATH, download=False, transform=transforms.Compose( - [transforms.Resize(X_DIM), transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))] + [ + transforms.Resize(X_DIM), + transforms.ToTensor(), + transforms.Normalize((0.5,), (0.5,)), + ] ), ) @@ -149,7 +153,9 @@ def forward(self, input): # Training Loop -def train_GAN_model(EPOCH_NUM, netD, netG, optimizerG, optimizerD, dataloader, criterion): +def train_GAN_model( + EPOCH_NUM, netD, netG, optimizerG, optimizerD, dataloader, criterion +): img_list = [] G_losses = [] D_losses = [] @@ -228,7 +234,9 @@ def train_GAN_model(EPOCH_NUM, netD, netG, optimizerG, optimizerD, dataloader, c D_losses.append(errD.item()) # Check how the generator is doing - if (iters % 500 == 0) or ((epoch == EPOCH_NUM - 1) and (i == len(dataloader) - 1)): + if (iters % 500 == 0) or ( + (epoch == EPOCH_NUM - 1) and (i == len(dataloader) - 1) + ): with torch.no_grad(): fake = netG(viz_noise).detach().cpu() img_list.append(vutils.make_grid(fake, padding=2, normalize=True)) @@ -253,7 +261,9 @@ def train_GAN_model(EPOCH_NUM, netD, netG, optimizerG, optimizerD, dataloader, c plt.title("Real Images") plt.imshow( np.transpose( - vutils.make_grid(real_batch[0].to(device)[:64], padding=5, normalize=True).cpu(), + vutils.make_grid( + real_batch[0].to(device)[:64], padding=5, normalize=True + ).cpu(), (1, 2, 0), ) ) diff --git a/tutorials/distributed-ml/torch-tutorial-GAN/train.py b/tutorials/distributed-ml/torch-tutorial-GAN/train.py index 3d3db6379..56ed651c5 100644 --- a/tutorials/distributed-ml/torch-tutorial-GAN/train.py +++ b/tutorials/distributed-ml/torch-tutorial-GAN/train.py @@ -159,14 +159,18 @@ def 
create_model_loss_optimizer(self) -> None: self.criterion = nn.BCELoss() # https://stackoverflow.com/a/67437077 - self.discriminator = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.discriminator) + self.discriminator = torch.nn.SyncBatchNorm.convert_sync_batchnorm( + self.discriminator + ) self.generator = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.generator) # First, define strategy-wise optional configurations if isinstance(self.strategy, DeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) else: distribute_kwargs = {} @@ -398,7 +402,9 @@ def save_checkpoint(self, name, epoch, loss=None): "generator_state_dict": self.generator.state_dict(), "optimizerD_state_dict": self.optimizerD.state_dict(), "optimizerG_state_dict": self.optimizerG.state_dict(), - "lr_scheduler": (self.lr_scheduler.state_dict() if self.lr_scheduler else None), + "lr_scheduler": ( + self.lr_scheduler.state_dict() if self.lr_scheduler else None + ), } torch.save(checkpoint, checkpoint_path) @@ -472,7 +478,9 @@ def main(): ] ) - train_dataset = datasets.MNIST("../data", train=True, download=True, transform=transform) + train_dataset = datasets.MNIST( + "../data", train=True, download=True, transform=transform + ) validation_dataset = datasets.MNIST("../data", train=False, transform=transform) def weights_init(m): diff --git a/tutorials/distributed-ml/torch-tutorial-containers/dataloader.py b/tutorials/distributed-ml/torch-tutorial-containers/dataloader.py index 020ad006a..288bdecc9 100644 --- a/tutorials/distributed-ml/torch-tutorial-containers/dataloader.py +++ b/tutorials/distributed-ml/torch-tutorial-containers/dataloader.py @@ -57,7 +57,10 @@ class InferenceMNIST(Dataset): """Loads a set of MNIST images from a folder of JPG files.""" def __init__( - self, root: str, 
transform: Optional[Callable] = None, supported_format: str = ".jpg" + self, + root: str, + transform: Optional[Callable] = None, + supported_format: str = ".jpg", ) -> None: self.root = root self.transform = transform diff --git a/tutorials/hpo-workflows/distributed-workflow/trainer.py b/tutorials/hpo-workflows/distributed-workflow/trainer.py index 7f3d2f91c..4da5bab13 100644 --- a/tutorials/hpo-workflows/distributed-workflow/trainer.py +++ b/tutorials/hpo-workflows/distributed-workflow/trainer.py @@ -57,7 +57,7 @@ def train(self, config, data): # Because of the way the ray cluster is set up, # the initialisation of the strategy and logger, as well as the creation of the # model, loss, optimizer and dataloader are done from within the train() function - self.strategy.init() + self.strategy.initialize_distributed_strategy() self.initialize_logger( hyperparams=self.training_config, rank=self.strategy.global_rank() ) diff --git a/tutorials/hpo-workflows/simple-workflow/hpo.py b/tutorials/hpo-workflows/simple-workflow/hpo.py index 5b89d25da..53557ac1c 100644 --- a/tutorials/hpo-workflows/simple-workflow/hpo.py +++ b/tutorials/hpo-workflows/simple-workflow/hpo.py @@ -60,7 +60,9 @@ def run_trial(config: Dict, data: Dict): "optim_lr": config["optim_lr"], }, ) - my_pipeline = parser.parse_pipeline(pipeline_nested_key=pipeline_name, verbose=False) + my_pipeline = parser.parse_pipeline( + pipeline_nested_key=pipeline_name, verbose=False + ) my_pipeline.execute() @@ -100,7 +102,9 @@ def run_hpo(args): # Set resource allocation for each trial (number of GPUs and/or number of CPUs) resources_per_trial = {"gpu": ngpus_per_trial, "cpu": ncpus_per_trial} - run_with_resources = tune.with_resources(run_trial, resources=resources_per_trial) + run_with_resources = tune.with_resources( + run_trial, resources=resources_per_trial + ) data = {"pipeline_name": args.pipeline_name} trainable_with_parameters = tune.with_parameters(run_with_resources, data=data) @@ -133,7 +137,9 @@ def 
run_hpo(args): # Main entry point for script execution if __name__ == "__main__": # Parse command-line arguments - parser = argparse.ArgumentParser(description="Hyperparameter Optimization with Ray Tune") + parser = argparse.ArgumentParser( + description="Hyperparameter Optimization with Ray Tune" + ) parser.add_argument( "--load_old_results", type=bool, @@ -156,10 +162,14 @@ def run_hpo(args): Set this only if load_old_results is set to True. \ Defaults to ~/ray_results/Eurac-Ray-Experiment", ) - parser.add_argument("--num_samples", type=int, default=10, help="Number of trials to run") + parser.add_argument( + "--num_samples", type=int, default=10, help="Number of trials to run" + ) parser.add_argument("--ngpus", type=int, help="Number of GPUs available on node.") parser.add_argument("--ncpus", type=int, help="Number of CPUs available on node.") - parser.add_argument("--metric", type=str, default="loss", help="Metric to optimise.") + parser.add_argument( + "--metric", type=str, default="loss", help="Metric to optimise." 
+ ) parser.add_argument( "--max_iterations", type=int, default="20", help="Maximum iterations per trial" ) diff --git a/tutorials/hpo-workflows/simple-workflow/trainer.py b/tutorials/hpo-workflows/simple-workflow/trainer.py index 2d6670765..3f715c21e 100644 --- a/tutorials/hpo-workflows/simple-workflow/trainer.py +++ b/tutorials/hpo-workflows/simple-workflow/trainer.py @@ -45,7 +45,9 @@ def create_model_loss_optimizer(self): # First, define strategy-wise optional configurations if isinstance(self.strategy, DeepSpeedStrategy): distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) else: distribute_kwargs = {} diff --git a/use-cases/3dgan/dataloader.py b/use-cases/3dgan/dataloader.py index 0a11d8856..12671b7fc 100644 --- a/use-cases/3dgan/dataloader.py +++ b/use-cases/3dgan/dataloader.py @@ -72,7 +72,9 @@ def __getitem__(self, idx): def fetch_data(self) -> None: print("Searching in :", self.datapath) - files = sorted(glob.glob(os.path.join(self.datapath, "**/*.h5"), recursive=True)) + files = sorted( + glob.glob(os.path.join(self.datapath, "**/*.h5"), recursive=True) + ) print("Found {} files. 
".format(len(files))) if len(files) == 0: raise RuntimeError(f"No H5 files found at '{self.datapath}'!") @@ -103,7 +105,10 @@ def fetch_data(self) -> None: self.data[field] = vals_array # Stop loading data, if self.max_samples reached - if self.max_samples is not None and len(self.data[field]) >= self.max_samples: + if ( + self.max_samples is not None + and len(self.data[field]) >= self.max_samples + ): for field, vals_array in self.data.items(): self.data[field] = vals_array[: self.max_samples] break @@ -189,7 +194,10 @@ def itwinai_logger(self) -> BaseItwinaiLogger: try: itwinai_logger = self.trainer.itwinai_logger except AttributeError: - print("WARNING: itwinai_logger attribute not set " f"in {self.__class__.__name__}") + print( + "WARNING: itwinai_logger attribute not set " + f"in {self.__class__.__name__}" + ) itwinai_logger = None return itwinai_logger diff --git a/use-cases/3dgan/downsample_h5py_file.py b/use-cases/3dgan/downsample_h5py_file.py index 17c4aea9f..a3f232f1a 100644 --- a/use-cases/3dgan/downsample_h5py_file.py +++ b/use-cases/3dgan/downsample_h5py_file.py @@ -21,7 +21,9 @@ print(input_file[key]) shape = list(input_file[key].shape) shape[0] = MAXITEMS - outfile.create_dataset_like(name=key, other=input_file[key], shape=tuple(shape)) + outfile.create_dataset_like( + name=key, other=input_file[key], shape=tuple(shape) + ) print(outfile[key]) outfile[key][...] 
= input_file[key][:MAXITEMS] diff --git a/use-cases/3dgan/model.py b/use-cases/3dgan/model.py index f3a178716..ae3000db4 100644 --- a/use-cases/3dgan/model.py +++ b/use-cases/3dgan/model.py @@ -31,37 +31,53 @@ def __init__(self, latent_dim): # img_shape self.latent_dim = latent_dim self.l1 = nn.Linear(self.latent_dim, 5184) - self.up1 = nn.Upsample(scale_factor=(6, 6, 6), mode="trilinear", align_corners=False) - self.conv1 = nn.Conv3d(in_channels=8, out_channels=8, kernel_size=(6, 6, 8), padding=0) + self.up1 = nn.Upsample( + scale_factor=(6, 6, 6), mode="trilinear", align_corners=False + ) + self.conv1 = nn.Conv3d( + in_channels=8, out_channels=8, kernel_size=(6, 6, 8), padding=0 + ) nn.init.kaiming_uniform_(self.conv1.weight) # num_features is the number of channels (see doc) self.bn1 = nn.BatchNorm3d(num_features=8, eps=1e-6) self.pad1 = nn.ConstantPad3d((1, 1, 2, 2, 2, 2), 0) - self.conv2 = nn.Conv3d(in_channels=8, out_channels=6, kernel_size=(4, 4, 6), padding=0) + self.conv2 = nn.Conv3d( + in_channels=8, out_channels=6, kernel_size=(4, 4, 6), padding=0 + ) nn.init.kaiming_uniform_(self.conv2.weight) self.bn2 = nn.BatchNorm3d(num_features=6, eps=1e-6) self.pad2 = nn.ConstantPad3d((1, 1, 2, 2, 2, 2), 0) - self.conv3 = nn.Conv3d(in_channels=6, out_channels=6, kernel_size=(4, 4, 6), padding=0) + self.conv3 = nn.Conv3d( + in_channels=6, out_channels=6, kernel_size=(4, 4, 6), padding=0 + ) nn.init.kaiming_uniform_(self.conv3.weight) self.bn3 = nn.BatchNorm3d(num_features=6, eps=1e-6) self.pad3 = nn.ConstantPad3d((1, 1, 2, 2, 2, 2), 0) - self.conv4 = nn.Conv3d(in_channels=6, out_channels=6, kernel_size=(4, 4, 6), padding=0) + self.conv4 = nn.Conv3d( + in_channels=6, out_channels=6, kernel_size=(4, 4, 6), padding=0 + ) nn.init.kaiming_uniform_(self.conv4.weight) self.bn4 = nn.BatchNorm3d(num_features=6, eps=1e-6) self.pad4 = nn.ConstantPad3d((0, 0, 1, 1, 1, 1), 0) - self.conv5 = nn.Conv3d(in_channels=6, out_channels=6, kernel_size=(3, 3, 5), padding=0) + self.conv5 = 
nn.Conv3d( + in_channels=6, out_channels=6, kernel_size=(3, 3, 5), padding=0 + ) nn.init.kaiming_uniform_(self.conv5.weight) self.bn5 = nn.BatchNorm3d(num_features=6, eps=1e-6) self.pad5 = nn.ConstantPad3d((0, 0, 1, 1, 1, 1), 0) - self.conv6 = nn.Conv3d(in_channels=6, out_channels=6, kernel_size=(3, 3, 3), padding=0) + self.conv6 = nn.Conv3d( + in_channels=6, out_channels=6, kernel_size=(3, 3, 3), padding=0 + ) nn.init.kaiming_uniform_(self.conv6.weight) - self.conv7 = nn.Conv3d(in_channels=6, out_channels=1, kernel_size=(2, 2, 2), padding=0) + self.conv7 = nn.Conv3d( + in_channels=6, out_channels=1, kernel_size=(2, 2, 2), padding=0 + ) nn.init.xavier_normal_(self.conv7.weight) def forward(self, z): @@ -121,11 +137,15 @@ def __init__(self, power): self.drop2 = nn.Dropout(0.2) self.pad2 = nn.ConstantPad3d((1, 1, 0, 0, 0, 0), 0) - self.conv3 = nn.Conv3d(in_channels=8, out_channels=8, kernel_size=(5, 6, 6), padding=0) + self.conv3 = nn.Conv3d( + in_channels=8, out_channels=8, kernel_size=(5, 6, 6), padding=0 + ) self.bn2 = nn.BatchNorm3d(num_features=8, eps=1e-6) self.drop3 = nn.Dropout(0.2) - self.conv4 = nn.Conv3d(in_channels=8, out_channels=8, kernel_size=(5, 6, 6), padding=0) + self.conv4 = nn.Conv3d( + in_channels=8, out_channels=8, kernel_size=(5, 6, 6), padding=0 + ) self.bn3 = nn.BatchNorm3d(num_features=8, eps=1e-6) self.drop4 = nn.Dropout(0.2) @@ -153,14 +173,18 @@ def ecal_angle(self, image, daxis1): sumtot = torch.sum(image, dim=(1, 2, 3)) # sum of events # get 1. 
where event sum is 0 and 0 elsewhere - amask = torch.where(sumtot == 0.0, torch.ones_like(sumtot), torch.zeros_like(sumtot)) + amask = torch.where( + sumtot == 0.0, torch.ones_like(sumtot), torch.zeros_like(sumtot) + ) # masked_events = torch.sum(amask) # counting zero sum events # ref denotes barycenter as that is our reference point x_ref = torch.sum( torch.sum(image, dim=(2, 3)) * ( - torch.arange(x_shape, device=image.device, dtype=torch.float32).unsqueeze(0) + torch.arange( + x_shape, device=image.device, dtype=torch.float32 + ).unsqueeze(0) + 0.5 ), dim=1, @@ -168,7 +192,9 @@ def ecal_angle(self, image, daxis1): y_ref = torch.sum( torch.sum(image, dim=(1, 3)) * ( - torch.arange(y_shape, device=image.device, dtype=torch.float32).unsqueeze(0) + torch.arange( + y_shape, device=image.device, dtype=torch.float32 + ).unsqueeze(0) + 0.5 ), dim=1, @@ -176,7 +202,9 @@ def ecal_angle(self, image, daxis1): z_ref = torch.sum( torch.sum(image, dim=(1, 2)) * ( - torch.arange(z_shape, device=image.device, dtype=torch.float32).unsqueeze(0) + torch.arange( + z_shape, device=image.device, dtype=torch.float32 + ).unsqueeze(0) + 0.5 ), dim=1, @@ -327,7 +355,10 @@ def itwinai_logger(self) -> BaseItwinaiLogger: try: itwinai_logger = self.trainer.itwinai_logger except AttributeError: - print("WARNING: itwinai_logger attribute not set " f"in {self.__class__.__name__}") + print( + "WARNING: itwinai_logger attribute not set " + f"in {self.__class__.__name__}" + ) itwinai_logger = None return itwinai_logger @@ -429,7 +460,9 @@ def training_step(self, batch, batch_idx): predictions = self.discriminator(image_batch) # print("calculating real_batch_loss...") - real_batch_loss = self.compute_global_loss(labels, predictions, self.loss_weights) + real_batch_loss = self.compute_global_loss( + labels, predictions, self.loss_weights + ) if self.itwinai_logger: self.itwinai_logger.log( item=sum(real_batch_loss), @@ -461,7 +494,9 @@ def training_step(self, batch, batch_idx): predictions = 
self.discriminator(generated_images) - fake_batch_loss = self.compute_global_loss(labels, predictions, self.loss_weights) + fake_batch_loss = self.compute_global_loss( + labels, predictions, self.loss_weights + ) # self.log("fake_batch_loss", sum(fake_batch_loss), # prog_bar=True, on_step=True, on_epoch=True, sync_dist=True) @@ -618,7 +653,9 @@ def training_step(self, batch, batch_idx): context="training", ) self.itwinai_logger.log( - item=os.path.join(self.checkpoints_dir, "discriminator_weights.pth"), + item=os.path.join( + self.checkpoints_dir, "discriminator_weights.pth" + ), identifier="final_discriminator_weights", kind="artifact", context="training", @@ -637,14 +674,18 @@ def training_step(self, batch, batch_idx): gen_losses_total_loss = np.sum(gen_losses[0]) new_gen_losses = [gen_losses_total_loss] for i_weights in range(len(gen_losses[0])): - new_gen_losses.append(gen_losses[0][i_weights] / self.loss_weights[i_weights]) + new_gen_losses.append( + gen_losses[0][i_weights] / self.loss_weights[i_weights] + ) gen_losses[0] = new_gen_losses gen_losses[1] = [el.cpu().detach().numpy() for el in gen_losses[1]] gen_losses_total_loss = np.sum(gen_losses[1]) new_gen_losses = [gen_losses_total_loss] for i_weights in range(len(gen_losses[1])): - new_gen_losses.append(gen_losses[1][i_weights] / self.loss_weights[i_weights]) + new_gen_losses.append( + gen_losses[1][i_weights] / self.loss_weights[i_weights] + ) gen_losses[1] = new_gen_losses generator_loss = [(a + b) / 2 for a, b in zip(*gen_losses)] @@ -680,10 +721,14 @@ def on_train_epoch_end(self): self.train_history["discriminator"].append(discriminator_train_loss) print("-" * 65) - ROW_FMT = "{0:<20s} | {1:<4.2f} | {2:<10.2f} | " "{3:<10.2f}| {4:<10.2f} | {5:<10.2f}" + ROW_FMT = ( + "{0:<20s} | {1:<4.2f} | {2:<10.2f} | " "{3:<10.2f}| {4:<10.2f} | {5:<10.2f}" + ) print(ROW_FMT.format("generator (train)", *self.train_history["generator"][-1])) print( - ROW_FMT.format("discriminator (train)", 
*self.train_history["discriminator"][-1]) + ROW_FMT.format( + "discriminator (train)", *self.train_history["discriminator"][-1] + ) ) torch.save( @@ -710,7 +755,8 @@ def on_train_epoch_end(self): ) self.itwinai_logger.log( item=os.path.join(self.checkpoints_dir, "discriminator_weights.pth"), - identifier="ckpts/discriminator_weights_epoch_" + str(self.current_epoch), + identifier="ckpts/discriminator_weights_epoch_" + + str(self.current_epoch), kind="artifact", context="training", ) @@ -825,14 +871,18 @@ def validation_step(self, batch, batch_idx): disc_eval_loss_total_loss = np.sum(disc_eval_loss) new_disc_eval_loss = [disc_eval_loss_total_loss] for i_weights in range(len(disc_eval_loss)): - new_disc_eval_loss.append(disc_eval_loss[i_weights] / self.loss_weights[i_weights]) + new_disc_eval_loss.append( + disc_eval_loss[i_weights] / self.loss_weights[i_weights] + ) disc_eval_loss = new_disc_eval_loss gen_eval_loss = [el.cpu().detach().numpy() for el in gen_test_loss] gen_eval_loss_total_loss = np.sum(gen_eval_loss) new_gen_eval_loss = [gen_eval_loss_total_loss] for i_weights in range(len(gen_eval_loss)): - new_gen_eval_loss.append(gen_eval_loss[i_weights] / self.loss_weights[i_weights]) + new_gen_eval_loss.append( + gen_eval_loss[i_weights] / self.loss_weights[i_weights] + ) gen_eval_loss = new_gen_eval_loss self.index += 1 @@ -892,9 +942,15 @@ def on_validation_epoch_end(self): self.test_history["discriminator"].append(discriminator_test_loss) print("-" * 65) - ROW_FMT = "{0:<20s} | {1:<4.2f} | {2:<10.2f} | " "{3:<10.2f}| {4:<10.2f} | {5:<10.2f}" + ROW_FMT = ( + "{0:<20s} | {1:<4.2f} | {2:<10.2f} | " "{3:<10.2f}| {4:<10.2f} | {5:<10.2f}" + ) print(ROW_FMT.format("generator (test)", *self.test_history["generator"][-1])) - print(ROW_FMT.format("discriminator (test)", *self.test_history["discriminator"][-1])) + print( + ROW_FMT.format( + "discriminator (test)", *self.test_history["discriminator"][-1] + ) + ) # # save loss dict to pkl file # with open(self.pklfile, 
"wb") as f: @@ -926,18 +982,26 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> A # print(f"Generator input: {generator_ip.shape}") generated_images = self.generator(generator_ip) # print(f"Generated batch size {generated_images.shape}") - return {"images": generated_images, "energies": energy_batch, "angles": ang_batch} + return { + "images": generated_images, + "energies": energy_batch, + "angles": ang_batch, + } def configure_optimizers(self): lr = self.lr - optimizer_discriminator = torch.optim.RMSprop(self.discriminator.parameters(), lr) + optimizer_discriminator = torch.optim.RMSprop( + self.discriminator.parameters(), lr + ) optimizer_generator = torch.optim.RMSprop(self.generator.parameters(), lr) if self.itwinai_logger: self.itwinai_logger.log( optimizer_discriminator, "optimizer_discriminator", kind="torch" ) - self.itwinai_logger.log(optimizer_generator, "optimizer_generator", kind="torch") + self.itwinai_logger.log( + optimizer_generator, "optimizer_generator", kind="torch" + ) return [optimizer_discriminator, optimizer_generator], [] diff --git a/use-cases/3dgan/trainer.py b/use-cases/3dgan/trainer.py index 2bb1d156e..078398fb7 100644 --- a/use-cases/3dgan/trainer.py +++ b/use-cases/3dgan/trainer.py @@ -36,7 +36,9 @@ class Lightning3DGANTrainer(Trainer): - def __init__(self, config: Union[Dict, str], itwinai_logger: Optional[Logger] = None): + def __init__( + self, config: Union[Dict, str], itwinai_logger: Optional[Logger] = None + ): self.save_parameters(**self.locals2params(locals())) super().__init__() if isinstance(config, str) and os.path.isfile(config): @@ -162,7 +164,11 @@ def execute( # Transpose predictions into images, energies and angles images = torch.cat( - list(map(lambda pred: self.transform_predictions(pred["images"]), predictions)) + list( + map( + lambda pred: self.transform_predictions(pred["images"]), predictions + ) + ) ) energies = torch.cat(list(map(lambda pred: pred["energies"], predictions))) 
angles = torch.cat(list(map(lambda pred: pred["angles"], predictions))) diff --git a/use-cases/cyclones/cyclones_vgg.py b/use-cases/cyclones/cyclones_vgg.py index 173b0218c..708485141 100644 --- a/use-cases/cyclones/cyclones_vgg.py +++ b/use-cases/cyclones/cyclones_vgg.py @@ -119,16 +119,24 @@ def custom_VGG_V1(patch_size, channels, activation, regularizer): model.add(tf.keras.layers.Flatten()) model.add( - tf.keras.layers.Dense(units=512, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=512, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=256, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=256, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=128, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=128, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=64, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=64, activation=activation, kernel_regularizer=regularizer + ) ) model.add(tf.keras.layers.Dense(channels[1])) @@ -303,13 +311,19 @@ def custom_VGG_V2(patch_size, channels, activation, regularizer): ) ) model.add( - tf.keras.layers.Dense(units=512, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=512, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=256, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=256, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=128, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=128, activation=activation, kernel_regularizer=regularizer + ) ) model.add(tf.keras.layers.Dense(channels[1])) @@ -486,13 +500,19 @@ def 
custom_VGG_V3(patch_size, channels, activation, regularizer): ) ) model.add( - tf.keras.layers.Dense(units=512, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=512, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=512, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=512, activation=activation, kernel_regularizer=regularizer + ) ) model.add( - tf.keras.layers.Dense(units=256, activation=activation, kernel_regularizer=regularizer) + tf.keras.layers.Dense( + units=256, activation=activation, kernel_regularizer=regularizer + ) ) model.add(tf.keras.layers.Dense(channels[1])) @@ -637,10 +657,18 @@ def ModelV5(patch_size, channels, last_activation, kernel_size=3): x = block(x) x = tf.keras.layers.Flatten()(x) - x = tf.keras.layers.Dense(units=1024, activation="relu", kernel_initializer=initializer)(x) - x = tf.keras.layers.Dense(units=512, activation="relu", kernel_initializer=initializer)(x) - x = tf.keras.layers.Dense(units=256, activation="relu", kernel_initializer=initializer)(x) - x = tf.keras.layers.Dense(units=128, activation="relu", kernel_initializer=initializer)(x) + x = tf.keras.layers.Dense( + units=1024, activation="relu", kernel_initializer=initializer + )(x) + x = tf.keras.layers.Dense( + units=512, activation="relu", kernel_initializer=initializer + )(x) + x = tf.keras.layers.Dense( + units=256, activation="relu", kernel_initializer=initializer + )(x) + x = tf.keras.layers.Dense( + units=128, activation="relu", kernel_initializer=initializer + )(x) outputs = tf.keras.layers.Dense( channels[1], activation=last_activation, kernel_initializer=initializer diff --git a/use-cases/cyclones/dataloader.py b/use-cases/cyclones/dataloader.py index 7bf12f9f2..596bfdd4c 100644 --- a/use-cases/cyclones/dataloader.py +++ b/use-cases/cyclones/dataloader.py @@ -71,7 +71,9 @@ def __init__( experiment["DRV_VARS_1"], experiment["COO_VARS_1"], ) - 
self.msk_var = None if experiment["MSK_VAR_1"] == "None" else experiment["MSK_VAR_!"] + self.msk_var = ( + None if experiment["MSK_VAR_1"] == "None" else experiment["MSK_VAR_!"] + ) self.channels = [len(self.drv_vars), len(self.coo_vars)] # Shuffle @@ -139,7 +141,8 @@ def setup_config(self, config: Dict) -> None: [ join(self.tfrecords_path, f) for f in listdir(self.tfrecords_path) - if f.endswith(".tfrecord") and f.startswith(PatchType.ALLADJACENT.value) + if f.endswith(".tfrecord") + and f.startswith(PatchType.ALLADJACENT.value) ] ) self.random_files = sorted( @@ -163,7 +166,9 @@ def execute(self): train_c_fs, valid_c_fs = self.split_files( files=self.cyclone_files, ratio=self.split_ratio ) - train_a_fs, valid_a_fs = self.split_files(files=self.adj_files, ratio=self.split_ratio) + train_a_fs, valid_a_fs = self.split_files( + files=self.adj_files, ratio=self.split_ratio + ) train_r_fs, valid_r_fs = self.split_files( files=self.random_files, ratio=self.split_ratio ) diff --git a/use-cases/cyclones/src/callbacks.py b/use-cases/cyclones/src/callbacks.py index 42c530d05..0085ac45c 100644 --- a/use-cases/cyclones/src/callbacks.py +++ b/use-cases/cyclones/src/callbacks.py @@ -28,17 +28,19 @@ def __init__(self, filename): self.filename = filename # create time history dataframe - self.benchmark_df = pd.DataFrame(columns=[ - 'time', - 'start_cpu_percent', - 'end_cpu_percent', - 'start_mem_rss', - 'end_mem_rss', - 'start_mem_vms', - 'end_mem_vms', - 'start_mem_uss', - 'end_mem_uss', - ]) + self.benchmark_df = pd.DataFrame( + columns=[ + "time", + "start_cpu_percent", + "end_cpu_percent", + "start_mem_rss", + "end_mem_rss", + "start_mem_vms", + "end_mem_vms", + "start_mem_uss", + "end_mem_uss", + ] + ) # save the dataframe to csv file self.benchmark_df.to_csv(self.filename) @@ -46,17 +48,17 @@ def on_epoch_begin(self, batch, logs={}): self.epoch_time = -time.time() self.start_cpu_percent = self.process.cpu_percent() mem_info = self.process.memory_full_info() - 
self.start_mem_rss = mem_info[0] / float(2 ** 20) - self.start_mem_vms = mem_info[1] / float(2 ** 20) - self.start_mem_uss = mem_info[3] / float(2 ** 20) + self.start_mem_rss = mem_info[0] / float(2**20) + self.start_mem_vms = mem_info[1] / float(2**20) + self.start_mem_uss = mem_info[3] / float(2**20) def on_epoch_end(self, batch, logs={}): self.epoch_time += time.time() self.end_cpu_percent = self.process.cpu_percent() mem_info = self.process.memory_full_info() - self.end_mem_rss = mem_info[0] / float(2 ** 20) - self.end_mem_vms = mem_info[1] / float(2 ** 20) - self.end_mem_uss = mem_info[3] / float(2 ** 20) + self.end_mem_rss = mem_info[0] / float(2**20) + self.end_mem_vms = mem_info[1] / float(2**20) + self.end_mem_uss = mem_info[3] / float(2**20) self.benchmark_df.loc[len(self.benchmark_df.index)] = [ self.epoch_time, diff --git a/use-cases/cyclones/src/macros.py b/use-cases/cyclones/src/macros.py index 5dd8fbe2e..0446445d1 100644 --- a/use-cases/cyclones/src/macros.py +++ b/use-cases/cyclones/src/macros.py @@ -11,19 +11,23 @@ #  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # TUTTE LE VARIABILI DISPONIBILI -ALL_DRIVER_VARS = ['fg10', 'i10fg', 'msl', 'sst', 't_500', 't_300', 'vo_850'] -ALL_COORDINATE_VARS = ['real_cyclone', - 'rounded_cyclone', 'global_cyclone', 'patch_cyclone'] -CYCLONE_VAR = 'patch_cyclone' -MASK_VAR = 'cyclone_mask' +ALL_DRIVER_VARS = ["fg10", "i10fg", "msl", "sst", "t_500", "t_300", "vo_850"] +ALL_COORDINATE_VARS = [ + "real_cyclone", + "rounded_cyclone", + "global_cyclone", + "patch_cyclone", +] +CYCLONE_VAR = "patch_cyclone" +MASK_VAR = "cyclone_mask" # ESPERIMENTI TIPO 1 # variabili per la prima parte degli esperimenti (regressione per trovare # coordinate row-col intra-patch) EXPERIMENT_1 = { - 'DRV_VARS_1': ['fg10', 'msl', 't_500', 't_300'], - 'COO_VARS_1': ['patch_cyclone'], - 'MSK_VAR_1': None + "DRV_VARS_1": ["fg10", "msl", "t_500", "t_300"], + "COO_VARS_1": ["patch_cyclone"], + "MSK_VAR_1": None, } # 
dataset parameters @@ -40,63 +44,75 @@ # # #  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + # descrive il tipo di patch che bisogna prendere class PatchType(Enum): - ALLADJACENT = 'alladjacent' - CYCLONE = 'cyclone' - NEAREST = 'nearest' - RANDOM = 'random' - NOCYCLONE = 'nocyclone' + ALLADJACENT = "alladjacent" + CYCLONE = "cyclone" + NEAREST = "nearest" + RANDOM = "random" + NOCYCLONE = "nocyclone" + # descrive il tipo di augmentation che deve essere effettuata class AugmentationType(Enum): - ALL_PATCHES = 'all_patches' - ONLY_TCS = 'only_tcs' + ALL_PATCHES = "all_patches" + ONLY_TCS = "only_tcs" + # descrive il nome del modello di rete neurale da utilizzare class Network(Enum): - VGG_V1 = 'vgg_v1' # map-to-coord - VGG_V2 = 'vgg_v2' # map-to-coord - VGG_V3 = 'vgg_v3' # map-to-coord - MODEL_V5 = 'model_v5' # map-to-coord + VGG_V1 = "vgg_v1" # map-to-coord + VGG_V2 = "vgg_v2" # map-to-coord + VGG_V3 = "vgg_v3" # map-to-coord + MODEL_V5 = "model_v5" # map-to-coord + # ritorna nome della loss utilizzata in fase di training class Losses(Enum): # Mean Absolute Error - MAE = ('mae', 'mae') + MAE = ("mae", "mae") # Mean Squared Error - MSE = ('mse', 'mse') + MSE = ("mse", "mse") # No specified loss - NONE = ('none', None) + NONE = ("none", None) + # descrive la forza della regolarizzazione class RegularizationStrength(Enum): - WEAK = ('weak', tf.keras.regularizers.l1_l2( - l1=0.0, l2=0.0001)) # l1=0 - l2=0.0001 - MEDIUM = ('medium', tf.keras.regularizers.l1_l2( - l1=0.0001, l2=0.0001)) # l1=0.0001 - l2=0.0001 - STRONG = ('strong', tf.keras.regularizers.l1_l2( - l1=0.001, l2=0.001)) # l1=0.001 - l2=0.001 - VERY_STRONG = ('very_strong', tf.keras.regularizers.l1_l2( - l1=0.01, l2=0.01)) # l1=0.01 - l2=0.01 - NONE = ('none', None) # no regularization + WEAK = ("weak", tf.keras.regularizers.l1_l2(l1=0.0, l2=0.0001)) # l1=0 - l2=0.0001 + MEDIUM = ( + "medium", + tf.keras.regularizers.l1_l2(l1=0.0001, l2=0.0001), + ) # l1=0.0001 - l2=0.0001 + STRONG = 
( + "strong", + tf.keras.regularizers.l1_l2(l1=0.001, l2=0.001), + ) # l1=0.001 - l2=0.001 + VERY_STRONG = ( + "very_strong", + tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01), + ) # l1=0.01 - l2=0.01 + NONE = ("none", None) # no regularization + # descrive l'attivazione dell'ultimo layer del modello class Activation(Enum): - RELU = 'relu' - LINEAR = 'linear' - SIGMOID = 'sigmoid' - TANH = 'tanh' + RELU = "relu" + LINEAR = "linear" + SIGMOID = "sigmoid" + TANH = "tanh" + # label assegnata ad un ciclone assente diff --git a/use-cases/cyclones/src/scaling.py b/use-cases/cyclones/src/scaling.py index a3b9b4c67..0b757b9b6 100644 --- a/use-cases/cyclones/src/scaling.py +++ b/use-cases/cyclones/src/scaling.py @@ -4,8 +4,13 @@ def fit_transform( - volume, shape, channel, feature_range=(0, 1), - type='minmax', save=False, filename=None + volume, + shape, + channel, + feature_range=(0, 1), + type="minmax", + save=False, + filename=None, ): """ Creates the scaler on the input volume and scales the data @@ -39,16 +44,16 @@ def fit_transform( can_save = False if save: if filename is None: - raise ValueError( - 'Must specify the filename when saving the scaler') + raise ValueError("Must specify the filename when saving the scaler") else: can_save = True - if type == 'minmax': + if type == "minmax": scaler = MinMaxScaler(feature_range=feature_range) - volume = scaler.fit_transform( - volume.reshape(-1, channel)).reshape(-1, *shape, channel) + volume = scaler.fit_transform(volume.reshape(-1, channel)).reshape( + -1, *shape, channel + ) if can_save: joblib.dump(scaler, filename) @@ -76,9 +81,7 @@ def inv_transform(scaled_image, scaler, shape, channel): image : np.array Scaled input volume. 
""" - return scaler.inverse_transform( - scaled_image.reshape(-1, channel) - ).reshape(*shape) + return scaler.inverse_transform(scaled_image.reshape(-1, channel)).reshape(*shape) def transform(image, scaler, shape, channel): @@ -101,9 +104,7 @@ def transform(image, scaler, shape, channel): scaled_image : np.array Scaled input volume. """ - return scaler.transform( - image.reshape(-1, channel) - ).reshape(-1, *shape, channel) + return scaler.transform(image.reshape(-1, channel)).reshape(-1, *shape, channel) def get_scalers(scaler_X_file=None, scaler_y_file=None): @@ -131,21 +132,25 @@ def save_tf_minmax(Xt, outfile): Saves a MinMax Scaler as a Tensorflow Record. """ + def tensor_feature(value): """Returns a bytes_list from a string / byte.""" - return tf.train.Feature(bytes_list=tf.train.BytesList( - value=[tf.io.serialize_tensor(tf.convert_to_tensor(value)).numpy()] - ) + return tf.train.Feature( + bytes_list=tf.train.BytesList( + value=[tf.io.serialize_tensor(tf.convert_to_tensor(value)).numpy()] + ) ) def scaler_encoding_fn(min, max): """Builds a serialized version of the dataset. X and y must be np.array. 
""" - features = tf.train.Features(feature={ - "min": tensor_feature(min), - "max": tensor_feature(max), - }) + features = tf.train.Features( + feature={ + "min": tensor_feature(min), + "max": tensor_feature(max), + } + ) return tf.train.Example(features=features).SerializeToString() def write_record_to_file(min, max, record_file): @@ -161,22 +166,20 @@ def write_record_to_file(min, max, record_file): n_batches += 1 # compute min - cur_min = tf.math.reduce_min(input_tensor=Xt[0, ], axis=(0, 1)).numpy() + cur_min = tf.math.reduce_min(input_tensor=Xt[0,], axis=(0, 1)).numpy() for i in range(n_batches): - X_batch = Xt[(i * batch_size):((i+1) * batch_size)] - i_min = tf.math.reduce_min( - input_tensor=X_batch, axis=(0, 1, 2)).numpy() + X_batch = Xt[(i * batch_size) : ((i + 1) * batch_size)] + i_min = tf.math.reduce_min(input_tensor=X_batch, axis=(0, 1, 2)).numpy() for c in range(i_min.shape[-1]): if i_min[c] <= cur_min[c]: cur_min[c] = i_min[c] X_min = cur_min # compute max - cur_max = tf.math.reduce_max(input_tensor=Xt[0, ], axis=(0, 1)).numpy() + cur_max = tf.math.reduce_max(input_tensor=Xt[0,], axis=(0, 1)).numpy() for i in range(n_batches): - X_batch = Xt[(i * batch_size):((i+1) * batch_size)] - i_max = tf.math.reduce_max( - input_tensor=X_batch, axis=(0, 1, 2)).numpy() + X_batch = Xt[(i * batch_size) : ((i + 1) * batch_size)] + i_max = tf.math.reduce_max(input_tensor=X_batch, axis=(0, 1, 2)).numpy() for c in range(i_max.shape[-1]): if i_max[c] >= cur_max[c]: cur_max[c] = i_max[c] @@ -187,27 +190,32 @@ def write_record_to_file(min, max, record_file): write_record_to_file(min=X_min, max=X_max, record_file=outfile) # return scaler dictionary - return {'min': tf.convert_to_tensor(X_min), - 'max': tf.convert_to_tensor(X_max)} + return {"min": tf.convert_to_tensor(X_min), "max": tf.convert_to_tensor(X_max)} def save_tf_minmax_by_min_and_max(min, max, outfile): """ Saves a MinMax Scaler as a Tensorflow Record. 
""" + def tensor_feature(value): """Returns a bytes_list from a string / byte.""" - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[ - tf.io.serialize_tensor(tf.convert_to_tensor(value)).numpy()])) + return tf.train.Feature( + bytes_list=tf.train.BytesList( + value=[tf.io.serialize_tensor(tf.convert_to_tensor(value)).numpy()] + ) + ) def scaler_encoding_fn(min, max): """Builds a serialized version of the dataset. X and y must be np.array. """ - features = tf.train.Features(feature={ - "min": tensor_feature(min), - "max": tensor_feature(max), - }) + features = tf.train.Features( + feature={ + "min": tensor_feature(min), + "max": tensor_feature(max), + } + ) return tf.train.Example(features=features).SerializeToString() def write_record_to_file(min, max, record_file): @@ -221,7 +229,7 @@ def write_record_to_file(min, max, record_file): write_record_to_file(min=min, max=max, record_file=outfile) # return scaler dictionary - return {'min': tf.convert_to_tensor(min), 'max': tf.convert_to_tensor(max)} + return {"min": tf.convert_to_tensor(min), "max": tf.convert_to_tensor(max)} def load_tf_minmax(scalerfile, vars): @@ -235,33 +243,29 @@ def scaler_decoding_fn(serialized_data): tensor_encoding_fn(). """ features = { - 'min': tf.io.FixedLenFeature([], tf.string), - 'max': tf.io.FixedLenFeature([], tf.string) + "min": tf.io.FixedLenFeature([], tf.string), + "max": tf.io.FixedLenFeature([], tf.string), } # Parse the serialized data so we get a dict with our data. - parsed_data = tf.io.parse_single_example( - serialized_data, features=features) + parsed_data = tf.io.parse_single_example(serialized_data, features=features) # Get X and y raw data - raw_min = parsed_data['min'] - raw_max = parsed_data['max'] + raw_min = parsed_data["min"] + raw_max = parsed_data["max"] # Decode the raw bytes so it becomes a tensor with type. 
- min = tf.ensure_shape(tf.io.parse_tensor( - raw_min, tf.float32), (len(vars))) - max = tf.ensure_shape(tf.io.parse_tensor( - raw_max, tf.float32), (len(vars))) + min = tf.ensure_shape(tf.io.parse_tensor(raw_min, tf.float32), (len(vars))) + max = tf.ensure_shape(tf.io.parse_tensor(raw_max, tf.float32), (len(vars))) return min, max # load scaler set - scaler_set = ( - tf.data.TFRecordDataset(scalerfile, num_parallel_reads=AUTOTUNE) - .map(scaler_decoding_fn, num_parallel_calls=AUTOTUNE) + scaler_set = tf.data.TFRecordDataset(scalerfile, num_parallel_reads=AUTOTUNE).map( + scaler_decoding_fn, num_parallel_calls=AUTOTUNE ) # get min and max from the dataset for data in scaler_set: min, max = data # return scaler dictionary - return {'min': min, 'max': max} + return {"min": min, "max": max} def minmax_transform(data, scaler): @@ -269,8 +273,8 @@ def minmax_transform(data, scaler): Applies the transform of TFMinMaxScaler to the provided dataset. """ if scaler: - num = tf.subtract(data, scaler['min']) - den = tf.subtract(scaler['max'], scaler['min']) + num = tf.subtract(data, scaler["min"]) + den = tf.subtract(scaler["max"], scaler["min"]) res = tf.math.divide(num, den) else: res = data @@ -282,16 +286,15 @@ def minmax_inverse_transform(scaled_data, scaler): Applies the inverse transform of TFMinMaxScaler to the provided scaled dataset. 
""" - sub = tf.subtract(scaler['max'], scaler['min']) + sub = tf.subtract(scaler["max"], scaler["min"]) mul = tf.multiply(scaled_data, sub) - return mul + scaler['min'] + return mul + scaler["min"] def minmax_inverse_target_transform(y_scaled, label_no_cyclone, patch_size): """ Applies the inverse transform on y data when scaled in (0,1) """ - sub = tf.subtract( - tf.cast(patch_size-1, dtype=tf.float32), label_no_cyclone) + sub = tf.subtract(tf.cast(patch_size - 1, dtype=tf.float32), label_no_cyclone) mul = tf.multiply(y_scaled, sub) return mul + label_no_cyclone diff --git a/use-cases/cyclones/src/strategy.py b/use-cases/cyclones/src/strategy.py index 950eb7c00..113cb3438 100644 --- a/use-cases/cyclones/src/strategy.py +++ b/use-cases/cyclones/src/strategy.py @@ -5,12 +5,11 @@ # with CPU or GPU def get_mirrored_strategy(cores=4): if cores: - CPUs = ['CPU:'+str(i) for i in range(cores)] + CPUs = ["CPU:" + str(i) for i in range(cores)] mirrored_strategy = tf.distribute.MirroredStrategy(CPUs) else: mirrored_strategy = tf.distribute.MirroredStrategy() - print('Number of devices: {}'.format( - mirrored_strategy.num_replicas_in_sync)) + print("Number of devices: {}".format(mirrored_strategy.num_replicas_in_sync)) return mirrored_strategy, mirrored_strategy.num_replicas_in_sync diff --git a/use-cases/cyclones/src/tfrecords/dataset.py b/use-cases/cyclones/src/tfrecords/dataset.py index da8fcd60d..c392d6dc6 100644 --- a/use-cases/cyclones/src/tfrecords/dataset.py +++ b/use-cases/cyclones/src/tfrecords/dataset.py @@ -4,7 +4,8 @@ from .functions import ( get_tensor_decoding_fn, get_scaling_fn, - get_masking_fn, get_scale_target_fn + get_masking_fn, + get_scale_target_fn, ) from ..macros import PatchType, AugmentationType @@ -17,8 +18,9 @@ def get_interleave(cyc_weights, nocyc_weights): # define cyclone interleave cyc_interleave = [i for i, w in enumerate(cyc_weights) for _ in range(w)] # define nocyclone interleave - nocyc_interleave = [i+len(cyc_interleave) - for i, w in 
enumerate(nocyc_weights) for _ in range(w)] + nocyc_interleave = [ + i + len(cyc_interleave) for i, w in enumerate(nocyc_weights) for _ in range(w) + ] # compute the number of blocks + the remainder of the interleaves blocks = len(nocyc_interleave) // len(cyc_interleave) @@ -26,7 +28,7 @@ def get_interleave(cyc_weights, nocyc_weights): interleave = [] for i in cyc_interleave: - interleave += [i] + nocyc_interleave[i*blocks:(i+1)*blocks] + interleave += [i] + nocyc_interleave[i * blocks : (i + 1) * blocks] if remainder: interleave += nocyc_interleave[-remainder:] @@ -34,12 +36,21 @@ def get_interleave(cyc_weights, nocyc_weights): def eFlowsTFRecordDataset( - cyc_fnames, adj_fnames, rnd_fnames, epochs, # batch_size, - scalers, target_scale=False, drv_vars=[], coo_vars=None, - msk_var=None, shape=(40, 40), + cyc_fnames, + adj_fnames, + rnd_fnames, + epochs, # batch_size, + scalers, + target_scale=False, + drv_vars=[], + coo_vars=None, + msk_var=None, + shape=(40, 40), label_no_cyclone: Optional[float] = -0.3, - shuffle_buffer=None, patch_type=PatchType.NEAREST.value, - aug_type=AugmentationType.ONLY_TCS.value, aug_fns={}, + shuffle_buffer=None, + patch_type=PatchType.NEAREST.value, + aug_type=AugmentationType.ONLY_TCS.value, + aug_fns={}, # drop_remainder=True ): # set autotune parameter to automatically manage resourches @@ -50,38 +61,51 @@ def eFlowsTFRecordDataset( # setup dynamical lambda functions to be applied to this dataset tensor_decoding_fn = get_tensor_decoding_fn( - shape=shape, drv_vars=drv_vars, coo_vars=coo_vars, msk_var=msk_var) + shape=shape, drv_vars=drv_vars, coo_vars=coo_vars, msk_var=msk_var + ) scaling_fn = get_scaling_fn(scalers=scalers) masking_fn = get_masking_fn(mask=label_no_cyclone) scale_target_fn = get_scale_target_fn( - label_no_cyclone=label_no_cyclone, patch_size=shape[0]) + label_no_cyclone=label_no_cyclone, patch_size=shape[0] + ) # multiplier for the augmentation mul = 1 if not aug_fns else (len(aug_fns.keys()) + 1) # compute the 
number of samples into the dataset - cyc_n_elems = sum([int(fname.split('/')[-1].split('.tfrecord') - [0].split('_')[-1]) for fname in cyc_fnames]) - rnd_n_elems = sum([int(fname.split('/')[-1].split('.tfrecord') - [0].split('_')[-1]) for fname in rnd_fnames]) - adj_n_elems = sum([int(fname.split('/')[-1].split('.tfrecord') - [0].split('_')[-1]) for fname in adj_fnames]) + cyc_n_elems = sum( + [ + int(fname.split("/")[-1].split(".tfrecord")[0].split("_")[-1]) + for fname in cyc_fnames + ] + ) + rnd_n_elems = sum( + [ + int(fname.split("/")[-1].split(".tfrecord")[0].split("_")[-1]) + for fname in rnd_fnames + ] + ) + adj_n_elems = sum( + [ + int(fname.split("/")[-1].split(".tfrecord")[0].split("_")[-1]) + for fname in adj_fnames + ] + ) n_elems = mul * cyc_n_elems + rnd_n_elems + adj_n_elems # total number of samples that will be yielded by this dataset count = n_elems * epochs # create standard datasets for each patch category - cyc_dataset = tf.data.TFRecordDataset( - cyc_fnames, num_parallel_reads=AUTOTUNE - ).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE) - rnd_dataset = tf.data.TFRecordDataset( - rnd_fnames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE) - adj_dataset = tf.data.TFRecordDataset( - adj_fnames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE) + cyc_dataset = tf.data.TFRecordDataset(cyc_fnames, num_parallel_reads=AUTOTUNE).map( + tensor_decoding_fn, num_parallel_calls=AUTOTUNE + ) + rnd_dataset = tf.data.TFRecordDataset(rnd_fnames, num_parallel_reads=AUTOTUNE).map( + tensor_decoding_fn, num_parallel_calls=AUTOTUNE + ) + adj_dataset = tf.data.TFRecordDataset(adj_fnames, num_parallel_reads=AUTOTUNE).map( + tensor_decoding_fn, num_parallel_calls=AUTOTUNE + ) # add cyclone dataset to the cyclone datasets cyc_datasets = [cyc_dataset] @@ -99,35 +123,43 @@ def eFlowsTFRecordDataset( if aug_type == AugmentationType.ALL_PATCHES.value: # define augmented datasets for each 
augmentation function aug_cyc_datasets = [ - (tf.data.TFRecordDataset( - cyc_fnames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE).map( - lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE) - ) - for aug_fn in aug_fns.values()] - - aug_rnd_datasets = [(tf.data.TFRecordDataset( - rnd_fnames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE).map( - lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE)) - for aug_fn in aug_fns.values()] - - aug_adj_datasets = [(tf.data.TFRecordDataset( - adj_fnames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE).map( - lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE)) - for aug_fn in aug_fns.values()] + ( + tf.data.TFRecordDataset(cyc_fnames, num_parallel_reads=AUTOTUNE) + .map(tensor_decoding_fn, num_parallel_calls=AUTOTUNE) + .map(lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE) + ) + for aug_fn in aug_fns.values() + ] + + aug_rnd_datasets = [ + ( + tf.data.TFRecordDataset(rnd_fnames, num_parallel_reads=AUTOTUNE) + .map(tensor_decoding_fn, num_parallel_calls=AUTOTUNE) + .map(lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE) + ) + for aug_fn in aug_fns.values() + ] + + aug_adj_datasets = [ + ( + tf.data.TFRecordDataset(adj_fnames, num_parallel_reads=AUTOTUNE) + .map(tensor_decoding_fn, num_parallel_calls=AUTOTUNE) + .map(lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE) + ) + for aug_fn in aug_fns.values() + ] # augmentation of only TC patches elif aug_type == AugmentationType.ONLY_TCS.value: # define augmented datasets for each augmentation function - aug_cyc_datasets = [(tf.data.TFRecordDataset( - cyc_fnames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, - num_parallel_calls=AUTOTUNE).map( - lambda x, y: (aug_fn((x, y))), - num_parallel_calls=AUTOTUNE)) - for aug_fn in aug_fns.values()] + aug_cyc_datasets = [ + ( + tf.data.TFRecordDataset(cyc_fnames, 
num_parallel_reads=AUTOTUNE) + .map(tensor_decoding_fn, num_parallel_calls=AUTOTUNE) + .map(lambda x, y: (aug_fn((x, y))), num_parallel_calls=AUTOTUNE) + ) + for aug_fn in aug_fns.values() + ] aug_rnd_datasets = [] aug_adj_datasets = [] else: @@ -148,21 +180,19 @@ def eFlowsTFRecordDataset( datasets = cyc_datasets + nocyc_datasets # get the interleave of all datasets - interleave = get_interleave( - cyc_weights=cyc_weights, nocyc_weights=nocyc_weights) + interleave = get_interleave(cyc_weights=cyc_weights, nocyc_weights=nocyc_weights) # compute the choice dataset with the interleave - choice_dataset = tf.data.Dataset.from_tensor_slices( - interleave).repeat(count=count) + choice_dataset = tf.data.Dataset.from_tensor_slices(interleave).repeat(count=count) # statically interleave elements from all the datasets dataset = tf.data.experimental.choose_from_datasets( - datasets=datasets, choice_dataset=choice_dataset) + datasets=datasets, choice_dataset=choice_dataset + ) # shuffle if necessary if shuffle_buffer: - dataset = dataset.shuffle( - shuffle_buffer, reshuffle_each_iteration=True) + dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True) # NOTE: when running distributed training, the dataset # should be batched knowing the number of parallel @@ -181,16 +211,19 @@ def eFlowsTFRecordDataset( # apply mask on target if label_no_cyclone is provided if label_no_cyclone: - dataset = dataset.map(lambda X, y: ( - masking_fn((X, y))), num_parallel_calls=AUTOTUNE) + dataset = dataset.map( + lambda X, y: (masking_fn((X, y))), num_parallel_calls=AUTOTUNE + ) # scale the data if scalers: - dataset = dataset.map(lambda X, y: ( - scaling_fn((X, y))), num_parallel_calls=AUTOTUNE) + dataset = dataset.map( + lambda X, y: (scaling_fn((X, y))), num_parallel_calls=AUTOTUNE + ) if target_scale: - dataset = dataset.map(lambda X, y: ( - scale_target_fn((X, y))), num_parallel_calls=AUTOTUNE) + dataset = dataset.map( + lambda X, y: (scale_target_fn((X, y))), 
num_parallel_calls=AUTOTUNE + ) # set number of epochs that can be repeated on this dataset dataset = dataset.repeat(count=epochs) diff --git a/use-cases/cyclones/src/tfrecords/functions.py b/use-cases/cyclones/src/tfrecords/functions.py index 5c535d920..84aa5fb26 100644 --- a/use-cases/cyclones/src/tfrecords/functions.py +++ b/use-cases/cyclones/src/tfrecords/functions.py @@ -4,12 +4,11 @@ def get_tensor_decoding_fn( - shape, drv_vars=[], coo_vars=None, msk_var=None, - dtype=tf.float32 + shape, drv_vars=[], coo_vars=None, msk_var=None, dtype=tf.float32 ): def tensor_decoding_fn(serialized_data): - """ Decoding function for a dataset written to disk as + """Decoding function for a dataset written to disk as tensor_encoding_fn(). """ # define features dictionary @@ -27,35 +26,57 @@ def tensor_decoding_fn(serialized_data): features.update({var: tf.io.FixedLenFeature([], tf.string)}) # parse the serialized data so we get a dict with our data. - parsed_data = tf.io.parse_single_example( - serialized_data, features=features) + parsed_data = tf.io.parse_single_example(serialized_data, features=features) # accumulator for data elements data = [] # get x raw data - Xdrv = tf.stack([tf.ensure_shape(tf.io.parse_tensor( - serialized=parsed_data[var], out_type=dtype), - shape=shape)for var in drv_vars], axis=-1) + Xdrv = tf.stack( + [ + tf.ensure_shape( + tf.io.parse_tensor(serialized=parsed_data[var], out_type=dtype), + shape=shape, + ) + for var in drv_vars + ], + axis=-1, + ) data.append(Xdrv) # if coordinate vars are provided if coo_vars: if len(coo_vars) == 1: - Ycoo = tf.ensure_shape(tf.io.parse_tensor( - serialized=parsed_data[coo_vars[0]], out_type=dtype), - shape=(2,)) + Ycoo = tf.ensure_shape( + tf.io.parse_tensor( + serialized=parsed_data[coo_vars[0]], out_type=dtype + ), + shape=(2,), + ) else: - Ycoo = tf.stack([tf.ensure_shape(tf.io.parse_tensor( - serialized=parsed_data[var], out_type=dtype), - shape=(2)) for var in coo_vars], axis=-1) + Ycoo = tf.stack( + [ + 
tf.ensure_shape( + tf.io.parse_tensor( + serialized=parsed_data[var], out_type=dtype + ), + shape=(2), + ) + for var in coo_vars + ], + axis=-1, + ) data.append(Ycoo) # if mask var is provided if msk_var: - Ymsk = tf.expand_dims(tf.ensure_shape(tf.io.parse_tensor( - serialized=parsed_data[msk_var], out_type=dtype), - shape=shape), axis=-1) + Ymsk = tf.expand_dims( + tf.ensure_shape( + tf.io.parse_tensor(serialized=parsed_data[msk_var], out_type=dtype), + shape=shape, + ), + axis=-1, + ) data.append(Ymsk) return tuple(data) @@ -69,8 +90,9 @@ def resize_fn(data): """Resize function that resizes the input data to the target shape.""" resized_data = [] for x in data: - resized_data.append(tf.image.resize( - x, shape, tf.image.ResizeMethod.NEAREST_NEIGHBOR)) + resized_data.append( + tf.image.resize(x, shape, tf.image.ResizeMethod.NEAREST_NEIGHBOR) + ) return tuple(resized_data) return resize_fn @@ -97,8 +119,7 @@ def scale_target_fn(data): # scale y y_scaled = tf.math.divide( tf.subtract(tf.cast(y, dtype=tf.float32), label_no_cyclone), - tf.subtract(tf.cast(patch_size-1, dtype=tf.float32), - label_no_cyclone) + tf.subtract(tf.cast(patch_size - 1, dtype=tf.float32), label_no_cyclone), ) return (x, y_scaled) @@ -122,15 +143,16 @@ def read_tfrecord_as_tensor(filenames, shape, drv_vars, coo_vars, msk_var): # get lambda functions to be applied to this dataset tensor_decoding_fn = get_tensor_decoding_fn( - shape, drv_vars=drv_vars, coo_vars=coo_vars, msk_var=msk_var) + shape, drv_vars=drv_vars, coo_vars=coo_vars, msk_var=msk_var + ) # compute the number of samples into the dataset n_elems = sum(1 for _ in tf.data.TFRecordDataset(filenames)) # Create standard dataset - dataset = tf.data.TFRecordDataset( - filenames, num_parallel_reads=AUTOTUNE).map( - tensor_decoding_fn, num_parallel_calls=AUTOTUNE) + dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE).map( + tensor_decoding_fn, num_parallel_calls=AUTOTUNE + ) # read data as numpy Xdata, ydata = 
dataset.batch(batch_size=n_elems).as_numpy_iterator().next() diff --git a/use-cases/cyclones/src/transform.py b/use-cases/cyclones/src/transform.py index 351dacd97..d0f44df14 100644 --- a/use-cases/cyclones/src/transform.py +++ b/use-cases/cyclones/src/transform.py @@ -5,7 +5,7 @@ def coo_rot180(data): X, y = data patch_size = X.shape[0] X = tf.image.rot90(X, k=2) - y1 = [-1., -1.] + y1 = [-1.0, -1.0] if y[0] != -1: y1 = [-y[0] + patch_size - 1, -y[1] + patch_size - 1] return (X, y1) @@ -15,9 +15,9 @@ def coo_left_right(data): X, y = data patch_size = X.shape[0] X = tf.image.flip_left_right(X) - y1 = [-1., -1.] + y1 = [-1.0, -1.0] if y[0] != -1: - y1 = [y[0], - y[1] + patch_size - 1] + y1 = [y[0], -y[1] + patch_size - 1] return (X, y1) @@ -25,9 +25,9 @@ def coo_up_down(data): X, y = data patch_size = X.shape[0] X = tf.image.flip_up_down(X) - y1 = [-1., -1.] + y1 = [-1.0, -1.0] if y[0] != -1: - y1 = [- y[0] + patch_size - 1, y[1]] + y1 = [-y[0] + patch_size - 1, y[1]] return (X, y1) diff --git a/use-cases/cyclones/src/utils.py b/use-cases/cyclones/src/utils.py index 4586ebc72..c71653ee4 100644 --- a/use-cases/cyclones/src/utils.py +++ b/use-cases/cyclones/src/utils.py @@ -3,9 +3,7 @@ import time from .macros import Network -from cyclones_vgg import ( - custom_VGG_V1, custom_VGG_V2, custom_VGG_V3 # , ModelV5 -) +from cyclones_vgg import custom_VGG_V1, custom_VGG_V2, custom_VGG_V3 # , ModelV5 def saveparams(file, **kwargs): @@ -16,12 +14,11 @@ def readparams(file): return joblib.load(file) -class Timer(): +class Timer: def __init__( self, - timers=['tot_exec_elapsed_time', - 'io_elapsed_time', 'training_elapsed_time'] + timers=["tot_exec_elapsed_time", "io_elapsed_time", "training_elapsed_time"], ): # initialize execution times data structure self.exec_times = {} @@ -44,22 +41,31 @@ def stop(self, timer): def get_network_config(network, **kwargs): # choose the network configuration based on the passed network type if network == Network.VGG_V1.value: - print('Using 
custom VGG V1') + print("Using custom VGG V1") model = custom_VGG_V1( - patch_size=kwargs['patch_size'], channels=kwargs['channels'], - activation=kwargs['activation'], regularizer=kwargs['regularizer']) + patch_size=kwargs["patch_size"], + channels=kwargs["channels"], + activation=kwargs["activation"], + regularizer=kwargs["regularizer"], + ) elif network == Network.VGG_V2.value: - print('Using custom VGG V2') + print("Using custom VGG V2") model = custom_VGG_V2( - patch_size=kwargs['patch_size'], channels=kwargs['channels'], - activation=kwargs['activation'], regularizer=kwargs['regularizer']) + patch_size=kwargs["patch_size"], + channels=kwargs["channels"], + activation=kwargs["activation"], + regularizer=kwargs["regularizer"], + ) elif network == Network.VGG_V3.value: - print('Using custom VGG V3') + print("Using custom VGG V3") model = custom_VGG_V3( - patch_size=kwargs['patch_size'], channels=kwargs['channels'], - activation=kwargs['activation'], regularizer=kwargs['regularizer']) + patch_size=kwargs["patch_size"], + channels=kwargs["channels"], + activation=kwargs["activation"], + regularizer=kwargs["regularizer"], + ) # elif network == Network.MODEL_V5.value: # print('Using Model V5') @@ -76,13 +82,13 @@ def load_model(model_fpath): Loads a keras model from a file, recognizing whether or not it is a weight file or a model file. """ - model_fname = model_fpath.split('/')[-1] - if 'model' in model_fname: + model_fname = model_fpath.split("/")[-1] + if "model" in model_fname: try: model = tf.keras.models.load_model(model_fpath) except Exception as e: - print(f'Cannot load model. Caused by error: {e}') - elif 'weight' in model_fname: + print(f"Cannot load model. 
Caused by error: {e}") + elif "weight" in model_fname: model.compile() model.built = True model.load_weights(model_fpath) diff --git a/use-cases/cyclones/trainer.py b/use-cases/cyclones/trainer.py index f04ebe1d2..2cd1f4e89 100644 --- a/use-cases/cyclones/trainer.py +++ b/use-cases/cyclones/trainer.py @@ -73,12 +73,16 @@ def execute(self, train_data, validation_data, channels) -> None: # Each batch is further split among the workers dist_train_dataset = self.strategy.experimental_distribute_dataset( train_dataset.batch( - self.macro_batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE + self.macro_batch_size, + drop_remainder=True, + num_parallel_calls=tf.data.AUTOTUNE, ) ) dist_valid_dataset = self.strategy.experimental_distribute_dataset( valid_dataset.batch( - self.macro_batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE + self.macro_batch_size, + drop_remainder=True, + num_parallel_calls=tf.data.AUTOTUNE, ) ) diff --git a/use-cases/eurac/data.py b/use-cases/eurac/data.py index f6636a2ea..f7e1fba43 100644 --- a/use-cases/eurac/data.py +++ b/use-cases/eurac/data.py @@ -38,7 +38,9 @@ def __init__( self.static_names = static_names self.target_names = target_names self.mask_names = mask_names - self.train_temporal_range = slice(train_temporal_range[0], train_temporal_range[1]) + self.train_temporal_range = slice( + train_temporal_range[0], train_temporal_range[1] + ) self.test_temporal_range = slice(test_temporal_range[0], test_temporal_range[1]) @monitor_exec @@ -163,7 +165,9 @@ def __init__( self.static_names = static_names self.target_names = target_names self.mask_names = mask_names - self.train_temporal_range = slice(train_temporal_range[0], train_temporal_range[1]) + self.train_temporal_range = slice( + train_temporal_range[0], train_temporal_range[1] + ) self.test_temporal_range = slice(test_temporal_range[0], test_temporal_range[1]) @monitor_exec diff --git a/use-cases/eurac/hpo.py b/use-cases/eurac/hpo.py index 
c95573347..5ad4a9771 100644 --- a/use-cases/eurac/hpo.py +++ b/use-cases/eurac/hpo.py @@ -77,7 +77,8 @@ def run_hpo(args): # Ray's RunConfig for experiment name and stopping criteria run_config = train.RunConfig( - name="Eurac-Ray-Experiment", stop={"training_iteration": args.max_iterations} + name="Eurac-Ray-Experiment", + stop={"training_iteration": args.max_iterations}, ) # Determine GPU and CPU utilization per trial @@ -92,7 +93,9 @@ def run_hpo(args): ) data = {"pipeline_name": args.pipeline_name} - trainable_with_parameters = tune.with_parameters(trainable_with_resources, data=data) + trainable_with_parameters = tune.with_parameters( + trainable_with_resources, data=data + ) tuner = tune.Tuner( trainable_with_parameters, @@ -143,11 +146,17 @@ def plot_results(result_grid, metric="loss", filename="plot.png"): """ ax = None for result in result_grid: - label = f"lr={result.config['lr']:.6f}, batch size={result.config['batch_size']}" + label = ( + f"lr={result.config['lr']:.6f}, batch size={result.config['batch_size']}" + ) if ax is None: - ax = result.metrics_dataframe.plot("training_iteration", metric, label=label) + ax = result.metrics_dataframe.plot( + "training_iteration", metric, label=label + ) else: - result.metrics_dataframe.plot("training_iteration", metric, ax=ax, label=label) + result.metrics_dataframe.plot( + "training_iteration", metric, ax=ax, label=label + ) ax.set_title(f"{metric.capitalize()} vs. 
Training Iteration for All Trials") ax.set_ylabel(metric.capitalize()) @@ -161,7 +170,9 @@ def plot_results(result_grid, metric="loss", filename="plot.png"): # Main entry point for script execution if __name__ == "__main__": # Parse command-line arguments - parser = argparse.ArgumentParser(description="Hyperparameter Optimization with Ray Tune") + parser = argparse.ArgumentParser( + description="Hyperparameter Optimization with Ray Tune" + ) parser.add_argument( "--load_old_results", type=bool, @@ -184,10 +195,14 @@ def plot_results(result_grid, metric="loss", filename="plot.png"): Set this only if load_old_results is set to True. \ Defaults to ~/ray_results/Eurac-Ray-Experiment", ) - parser.add_argument("--num_samples", type=int, default=10, help="Number of trials to run") + parser.add_argument( + "--num_samples", type=int, default=10, help="Number of trials to run" + ) parser.add_argument("--ngpus", type=int, help="Number of GPUs available on node.") parser.add_argument("--ncpus", type=int, help="Number of CPUs available on node.") - parser.add_argument("--metric", type=str, default="loss", help="Metric to optimise.") + parser.add_argument( + "--metric", type=str, default="loss", help="Metric to optimise." 
+ ) parser.add_argument( "--max_iterations", type=int, default="20", help="Maximum iterations per trial" ) diff --git a/use-cases/eurac/slurm.py b/use-cases/eurac/slurm.py index 9028cf470..ef948205a 100644 --- a/use-cases/eurac/slurm.py +++ b/use-cases/eurac/slurm.py @@ -38,7 +38,7 @@ def __init__( self.pipe_key = pipe_key def get_training_command(self): - if self.training_command is not None: + if self.training_command is not None: return self.training_command training_command = rf""" diff --git a/use-cases/eurac/trainer.py b/use-cases/eurac/trainer.py index a4d014806..350e645d3 100644 --- a/use-cases/eurac/trainer.py +++ b/use-cases/eurac/trainer.py @@ -132,12 +132,16 @@ def create_model_loss_optimizer(self) -> None: if isinstance(self.strategy, DeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! distribute_kwargs = { - "config_params": {"train_micro_batch_size_per_gpu": self.config.batch_size} + "config_params": { + "train_micro_batch_size_per_gpu": self.config.batch_size + } } elif isinstance(self.strategy, TorchDDPStrategy): if "find_unused_parameters" not in self.config.model_fields: self.config.find_unused_parameters = False - distribute_kwargs = {"find_unused_parameters": self.config.find_unused_parameters} + distribute_kwargs = { + "find_unused_parameters": self.config.find_unused_parameters + } self.model, self.optimizer, _ = self.strategy.distributed( model=self.model, @@ -169,7 +173,7 @@ def train(self): epoch_time_tracker = EpochTimeTracker( strategy_name=self.strategy.name, save_path=epoch_time_output_path, - num_nodes=num_nodes + num_nodes=num_nodes, ) trainer = RNNTrainer( @@ -187,7 +191,9 @@ def train(self): device = self.strategy.device() loss_history = {"train": [], "val": []} metric_history = {f"train_{target}": [] for target in trainer.P.target_names} - metric_history.update({f"val_{target}": [] for target in trainer.P.target_names}) + metric_history.update( + {f"val_{target}": [] for target in 
trainer.P.target_names} + ) best_loss = float("inf") for epoch in tqdm(range(self.epochs)): @@ -400,7 +406,9 @@ def create_model_loss_optimizer(self) -> None: if isinstance(self.strategy, DeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) else: distribute_kwargs = {} # dict(find_unused_parameters=True) @@ -428,7 +436,9 @@ def train(self): device = self.strategy.device() loss_history = {"train": [], "val": []} metric_history = {f"train_{target}": [] for target in trainer.P.target_names} - metric_history.update({f"val_{target}": [] for target in trainer.P.target_names}) + metric_history.update( + {f"val_{target}": [] for target in trainer.P.target_names} + ) best_loss = float("inf") for epoch in tqdm(range(self.epochs)): diff --git a/use-cases/mnist/torch-lightning/dataloader.py b/use-cases/mnist/torch-lightning/dataloader.py index 6d452b4dd..91626af2b 100644 --- a/use-cases/mnist/torch-lightning/dataloader.py +++ b/use-cases/mnist/torch-lightning/dataloader.py @@ -57,7 +57,10 @@ def __init__( def setup(self, stage=None): if stage == "fit": mnist_full = MNIST( - self.data_path, train=True, download=self.download, transform=self.transform + self.data_path, + train=True, + download=self.download, + transform=self.transform, ) n_train_samples = int(self.train_prop * len(mnist_full)) n_val_samples = len(mnist_full) - n_train_samples @@ -67,12 +70,18 @@ def setup(self, stage=None): if stage == "test": self.mnist_test = MNIST( - self.data_path, train=False, download=self.download, transform=self.transform + self.data_path, + train=False, + download=self.download, + transform=self.transform, ) if stage == "predict": self.mnist_predict = MNIST( - self.data_path, train=False, download=self.download, transform=self.transform + self.data_path, + train=False, + 
download=self.download, + transform=self.transform, ) def train_dataloader(self): diff --git a/use-cases/mnist/torch-lightning/utils.py b/use-cases/mnist/torch-lightning/utils.py index 4eb9d43d5..6083a120b 100644 --- a/use-cases/mnist/torch-lightning/utils.py +++ b/use-cases/mnist/torch-lightning/utils.py @@ -92,7 +92,9 @@ def dynamically_import_class(name: str): return klass -def flatten_dict(d: MutableMapping, parent_key: str = "", sep: str = ".") -> MutableMapping: +def flatten_dict( + d: MutableMapping, parent_key: str = "", sep: str = "." +) -> MutableMapping: """Flatten dictionary Args: diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index 69791db61..8d03c2f99 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -58,7 +58,10 @@ class InferenceMNIST(Dataset): """Loads a set of MNIST images from a folder of JPG files.""" def __init__( - self, root: str, transform: Optional[Callable] = None, supported_format: str = ".jpg" + self, + root: str, + transform: Optional[Callable] = None, + supported_format: str = ".jpg", ) -> None: self.root = root self.transform = transform diff --git a/use-cases/mnist/torch/saver.py b/use-cases/mnist/torch/saver.py index 6d7cedf2c..c50ce421c 100644 --- a/use-cases/mnist/torch/saver.py +++ b/use-cases/mnist/torch/saver.py @@ -31,7 +31,9 @@ def __init__( self.save_dir = save_dir self.predictions_file = predictions_file self.class_labels = ( - class_labels if class_labels is not None else [f"Digit {i}" for i in range(10)] + class_labels + if class_labels is not None + else [f"Digit {i}" for i in range(10)] ) @monitor_exec diff --git a/use-cases/virgo/data.py b/use-cases/virgo/data.py index 8e850ef42..7770b90b0 100644 --- a/use-cases/virgo/data.py +++ b/use-cases/virgo/data.py @@ -94,7 +94,9 @@ def __init__( """ file_path = Path(hdf5_file_location) if not file_path.exists(): - raise ValueError(f"Given file location, {file_path.resolve()} does not exist. 
") + raise ValueError( + f"Given file location, {file_path.resolve()} does not exist. " + ) self.hdf5_dataset_name = hdf5_dataset_name self.file_path = file_path self.chunk_size = chunk_size @@ -133,7 +135,9 @@ def __getitem__(self, idx) -> torch.Tensor: torch.Tensor: Normalized tensor for specific idx """ if idx >= len(self): - raise ValueError(f"Index {idx} out of bounds for dataset with length {len(self)}!") + raise ValueError( + f"Index {idx} out of bounds for dataset with length {len(self)}!" + ) offset = idx * self.chunk_size with h5py.File(self.file_path, "r") as f: @@ -354,11 +358,16 @@ def execute( # whole dataset signal_data_test_2d = torch.stack( - [torch.stack([y_test_2d[main_channel].iloc[i]]) for i in range(y_test_2d.shape[0])] + [ + torch.stack([y_test_2d[main_channel].iloc[i]]) + for i in range(y_test_2d.shape[0]) + ] ) aux_data_test_2d = torch.stack( [ - torch.stack([X_test_2d.iloc[i, 0], X_test_2d.iloc[i, 1], X_test_2d.iloc[i, 2]]) + torch.stack( + [X_test_2d.iloc[i, 0], X_test_2d.iloc[i, 1], X_test_2d.iloc[i, 2]] + ) for i in range(X_test_2d.shape[0]) ] ) diff --git a/use-cases/virgo/hpo.py b/use-cases/virgo/hpo.py index 0f7487a20..85e2306e4 100644 --- a/use-cases/virgo/hpo.py +++ b/use-cases/virgo/hpo.py @@ -67,7 +67,9 @@ def run_trial(config: Dict, data: Dict): "learning_rate": config["lr"], }, ) - my_pipeline = parser.parse_pipeline(pipeline_nested_key=pipeline_name, verbose=False) + my_pipeline = parser.parse_pipeline( + pipeline_nested_key=pipeline_name, verbose=False + ) # Skip the first step of the pipeline (data generation) my_pipeline.execute() @@ -97,7 +99,8 @@ def run_hpo(args): ) run_config = train.RunConfig( - name="Virgo-Ray-Experiment", stop={"training_iteration": args.max_iterations} + name="Virgo-Ray-Experiment", + stop={"training_iteration": args.max_iterations}, ) # Determine GPU and CPU utilization per trial @@ -107,7 +110,9 @@ def run_hpo(args): # Set resource allocation for each trial (number of GPUs and/or number of CPUs) 
resources_per_trial = {"gpu": ngpus_per_trial, "cpu": ncpus_per_trial} - run_with_resources = tune.with_resources(run_trial, resources=resources_per_trial) + run_with_resources = tune.with_resources( + run_trial, resources=resources_per_trial + ) data = {"pipeline_name": args.pipeline_name} trainable_with_parameters = tune.with_parameters(run_with_resources, data=data) @@ -162,11 +167,17 @@ def plot_results(result_grid, metric="loss", filename="plot.png"): """ ax = None for result in result_grid: - label = f"lr={result.config['lr']:.6f}, batch size={result.config['batch_size']}" + label = ( + f"lr={result.config['lr']:.6f}, batch size={result.config['batch_size']}" + ) if ax is None: - ax = result.metrics_dataframe.plot("training_iteration", metric, label=label) + ax = result.metrics_dataframe.plot( + "training_iteration", metric, label=label + ) else: - result.metrics_dataframe.plot("training_iteration", metric, ax=ax, label=label) + result.metrics_dataframe.plot( + "training_iteration", metric, ax=ax, label=label + ) ax.set_title(f"{metric.capitalize()} vs. Training Iteration for All Trials") ax.set_ylabel(metric.capitalize()) @@ -181,7 +192,9 @@ def plot_results(result_grid, metric="loss", filename="plot.png"): # Main entry point for script execution if __name__ == "__main__": # Parse command-line arguments - parser = argparse.ArgumentParser(description="Hyperparameter Optimization with Ray Tune") + parser = argparse.ArgumentParser( + description="Hyperparameter Optimization with Ray Tune" + ) parser.add_argument( "--load_old_results", type=bool, @@ -204,10 +217,14 @@ def plot_results(result_grid, metric="loss", filename="plot.png"): Set this only if load_old_results is set to True. 
\ Defaults to ~/ray_results/Eurac-Ray-Experiment", ) - parser.add_argument("--num_samples", type=int, default=10, help="Number of trials to run") + parser.add_argument( + "--num_samples", type=int, default=10, help="Number of trials to run" + ) parser.add_argument("--ngpus", type=int, help="Number of GPUs available on node.") parser.add_argument("--ncpus", type=int, help="Number of CPUs available on node.") - parser.add_argument("--metric", type=str, default="loss", help="Metric to optimise.") + parser.add_argument( + "--metric", type=str, default="loss", help="Metric to optimise." + ) parser.add_argument( "--max_iterations", type=int, default="20", help="Maximum iterations per trial" ) diff --git a/use-cases/virgo/slurm.py b/use-cases/virgo/slurm.py index 0d4ec723f..549427ee0 100644 --- a/use-cases/virgo/slurm.py +++ b/use-cases/virgo/slurm.py @@ -38,7 +38,7 @@ def __init__( self.pipe_key = pipe_key def get_training_command(self): - if self.training_command is not None: + if self.training_command is not None: return self.training_command training_command = rf""" diff --git a/use-cases/virgo/src/dataset.py b/use-cases/virgo/src/dataset.py index 7d0054552..06b4376d2 100644 --- a/use-cases/virgo/src/dataset.py +++ b/use-cases/virgo/src/dataset.py @@ -13,7 +13,6 @@ Process Data that to save time. """ - import multiprocessing import matplotlib.pyplot as plt @@ -24,8 +23,12 @@ def generate_dataset_aux_channels( - rows, columns, duration=10, sample_rate=500, - num_waves_range=(10, 15), noise_amplitude=0.1 + rows, + columns, + duration=10, + sample_rate=500, + num_waves_range=(10, 15), + noise_amplitude=0.1, ): """Generate a Pandas DataFrame with randomly generated smooth sine wave time series with added smooth random noise. 
@@ -69,19 +72,18 @@ def generate_dataset_aux_channels( # frequency, phase) amplitude = np.random.uniform(0.5, 2.0) frequency = np.random.uniform(0.5, 5.0) - phase = np.random.uniform(0, 2*np.pi) + phase = np.random.uniform(0, 2 * np.pi) # Generate the sine wave and add it to the wave_data - wave_data += amplitude * \ - np.sin(2 * np.pi * frequency * times + phase) + wave_data += amplitude * np.sin(2 * np.pi * frequency * times + phase) # Add smooth random noise to the wave data smooth_noise = np.random.normal(0, noise_amplitude, len(times)) wave_data += smooth_noise # Create a TimeSeries object from the wave data - ts = TimeSeries(wave_data, t0=0, dt=1/sample_rate) - df_dict[f'Aux_{col+1}'] = [ts] + ts = TimeSeries(wave_data, t0=0, dt=1 / sample_rate) + df_dict[f"Aux_{col+1}"] = [ts] # Create a DataFrame with the TimeSeries df_row = pd.DataFrame(df_dict) @@ -132,12 +134,10 @@ def generate_dataset_main_channel(input_df, weights=None, noise_amplitude=0.1): linear_combination += np.random.normal(0, noise_amplitude) # Append the result to the list - linear_combination_data.append( - [TimeSeries(linear_combination, dt=dt, t0=0)]) + linear_combination_data.append([TimeSeries(linear_combination, dt=dt, t0=0)]) # Create a DataFrame with the linear combination data - linear_combination_df = pd.DataFrame( - linear_combination_data, columns=['Main']) + linear_combination_df = pd.DataFrame(linear_combination_data, columns=["Main"]) return linear_combination_df @@ -246,8 +246,7 @@ def process_image(row, row_idx, channels, square_size): # Convert the spectrogram to a NumPy array qplot_array = qplot.value - qplot_array_cut = cut_image( - qplot_array, index_freq, index_time, square_size) + qplot_array_cut = cut_image(qplot_array, index_freq, index_time, square_size) df_row[channel] = [qplot_array_cut] return df_row @@ -290,7 +289,7 @@ def show_dataset(df, size, num_plots=10): """ ch_list = list(df.columns) - fig, axes = plt.subplots(2*num_plots, 2, figsize=(18, 12*num_plots)) + 
fig, axes = plt.subplots(2 * num_plots, 2, figsize=(18, 12 * num_plots)) for j in range(num_plots): qplt_r = np.flipud(df.iloc[j, 0].T) @@ -302,59 +301,66 @@ def show_dataset(df, size, num_plots=10): # fig, axes = plt.subplots(2, 3, figsize=(18, 12)) # Plot for Real - im_r = axes[2*j, 0].imshow( - qplt_r, aspect='auto', - extent=[0, size, 0, size], vmin=0, vmax=25) - axes[2*j, 0].set_title(ch_list[0]) - axes[2*j, 0].set_xlabel('Time') - axes[2*j, 0].set_ylabel('Frequency') - fig.colorbar(im_r, ax=axes[2*j, 0]) # Add colorbar for Real + im_r = axes[2 * j, 0].imshow( + qplt_r, aspect="auto", extent=[0, size, 0, size], vmin=0, vmax=25 + ) + axes[2 * j, 0].set_title(ch_list[0]) + axes[2 * j, 0].set_xlabel("Time") + axes[2 * j, 0].set_ylabel("Frequency") + fig.colorbar(im_r, ax=axes[2 * j, 0]) # Add colorbar for Real # Plot for aux - im_g = axes[2*j, 1].imshow( - qplt_aux1, aspect='auto', - extent=[0, size, 0, size], vmin=0, vmax=25) - axes[2*j, 1].set_title(ch_list[1]) - axes[2*j, 1].set_xlabel('Time') - axes[2*j, 1].set_ylabel('Frequency') - fig.colorbar(im_g, ax=axes[2*j, 1]) # Add colorbar for Generated + im_g = axes[2 * j, 1].imshow( + qplt_aux1, aspect="auto", extent=[0, size, 0, size], vmin=0, vmax=25 + ) + axes[2 * j, 1].set_title(ch_list[1]) + axes[2 * j, 1].set_xlabel("Time") + axes[2 * j, 1].set_ylabel("Frequency") + fig.colorbar(im_g, ax=axes[2 * j, 1]) # Add colorbar for Generated # - im_g = axes[2*j+1, 0].imshow( - qplt_aux2, aspect='auto', - extent=[0, size, 0, size], vmin=0, vmax=25) - axes[2*j+1, 0].set_title(ch_list[2]) - axes[2*j+1, 0].set_xlabel('Time') - axes[2*j+1, 0].set_ylabel('Frequency') - fig.colorbar(im_g, ax=axes[2*j+1, 0]) # Add colorbar for Generated + im_g = axes[2 * j + 1, 0].imshow( + qplt_aux2, aspect="auto", extent=[0, size, 0, size], vmin=0, vmax=25 + ) + axes[2 * j + 1, 0].set_title(ch_list[2]) + axes[2 * j + 1, 0].set_xlabel("Time") + axes[2 * j + 1, 0].set_ylabel("Frequency") + fig.colorbar(im_g, ax=axes[2 * j + 1, 0]) # Add 
colorbar for Generated # - im_g = axes[2*j+1, 1].imshow( - qplt_aux3, aspect='auto', - extent=[0, size, 0, size], vmin=0, vmax=25) - axes[2*j+1, 1].set_title(ch_list[3]) - axes[2*j+1, 1].set_xlabel('Time') - axes[2*j+1, 1].set_ylabel('Frequency') - fig.colorbar(im_g, ax=axes[2*j+1, 1]) # Add colorbar for Generated + im_g = axes[2 * j + 1, 1].imshow( + qplt_aux3, aspect="auto", extent=[0, size, 0, size], vmin=0, vmax=25 + ) + axes[2 * j + 1, 1].set_title(ch_list[3]) + axes[2 * j + 1, 1].set_xlabel("Time") + axes[2 * j + 1, 1].set_ylabel("Frequency") + fig.colorbar(im_g, ax=axes[2 * j + 1, 1]) # Add colorbar for Generated # Get the bounding boxes of the axes including text decorations r = fig.canvas.get_renderer() - def get_bbox(ax): return ax.get_tightbbox( - r).transformed(fig.transFigure.inverted()) - bboxes = np.array(list(map(get_bbox, axes.flat)), - mtrans.Bbox).reshape(axes.shape) + def get_bbox(ax): + return ax.get_tightbbox(r).transformed(fig.transFigure.inverted()) + + bboxes = np.array(list(map(get_bbox, axes.flat)), mtrans.Bbox).reshape( + axes.shape + ) # Get the minimum and maximum extent, get the coordinate half-way # between those - ymax = np.array(list(map(lambda b: b.y1, bboxes.flat)) - ).reshape(axes.shape).max(axis=1) - ymin = np.array(list(map(lambda b: b.y0, bboxes.flat)) - ).reshape(axes.shape).min(axis=1) + ymax = ( + np.array(list(map(lambda b: b.y1, bboxes.flat))) + .reshape(axes.shape) + .max(axis=1) + ) + ymin = ( + np.array(list(map(lambda b: b.y0, bboxes.flat))) + .reshape(axes.shape) + .min(axis=1) + ) ys = np.c_[ymax[1:], ymin[:-1]].mean(axis=1) # Draw a horizontal lines at those coordinates for y in ys[1::2]: - line = plt.Line2D( - [0, 1], [y, y], transform=fig.transFigure, color="black") + line = plt.Line2D([0, 1], [y, y], transform=fig.transFigure, color="black") fig.add_artist(line) # plt.savefig('very high loss qplots.png') @@ -372,10 +378,10 @@ def normalize_(data, chan=4): - data (torch.tensor) : normalized dataset """ # Compute 
the maximum value for each channel across all 900 tensors - max_vals = data.view(data.shape[0], data.shape[1], -1).max(0)[0].max( - 1)[0] - print("Maximum values for each channel across all tensors:", - max_vals, max_vals.shape) + max_vals = data.view(data.shape[0], data.shape[1], -1).max(0)[0].max(1)[0] + print( + "Maximum values for each channel across all tensors:", max_vals, max_vals.shape + ) # Divide each element by the maximum value of its channel data /= max_vals.view(1, chan, 1, 1) return data diff --git a/use-cases/virgo/src/model.py b/use-cases/virgo/src/model.py index 64dbb233a..61fab50f8 100644 --- a/use-cases/virgo/src/model.py +++ b/use-cases/virgo/src/model.py @@ -33,23 +33,23 @@ def __init__(self, in_channels, kernel_size=7, norm=True): super(Decoder, self).__init__() self.conv1 = nn.Conv2d( - in_channels, 32, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + in_channels, 32, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) self.relu1 = nn.LeakyReLU(0.2, inplace=True) self.conv2 = nn.Conv2d( - 32, 64, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + 32, 64, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) self.relu2 = nn.LeakyReLU(0.2, inplace=True) self.conv3 = nn.Conv2d( - 64, 64, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + 64, 64, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) self.relu3 = nn.LeakyReLU(0.2, inplace=True) self.conv4 = nn.Conv2d( - 64, 1, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + 64, 1, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) if norm: self.activation = torch.nn.Sigmoid() @@ -102,23 +102,23 @@ def __init__(self, in_channels, kernel_size=5, norm=True): super(Decoder_2d_deep, self).__init__() self.conv1 = nn.Conv2d( - in_channels, 64, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + in_channels, 64, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) self.relu1 = 
nn.LeakyReLU(0.3, inplace=True) self.conv2 = nn.Conv2d( - 64, 128, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + 64, 128, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) self.relu2 = nn.LeakyReLU(0.3, inplace=True) self.conv3 = nn.Conv2d( - 128, 256, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + 128, 256, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) self.relu3 = nn.LeakyReLU(0.3, inplace=True) self.conv4 = nn.Conv2d( - 256, 1, kernel_size=kernel_size, stride=1, - padding=kernel_size // 2) + 256, 1, kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) if norm: self.activation = torch.nn.Sigmoid() else: @@ -177,7 +177,7 @@ def __init__(self, in_features): nn.ReLU(inplace=True), # ReLU activation function nn.ReflectionPad2d(1), nn.Conv2d(in_features, in_features, 3), - nn.InstanceNorm2d(in_features) + nn.InstanceNorm2d(in_features), ) def forward(self, x): @@ -203,9 +203,7 @@ class GeneratorResNet(nn.Module): - output_shape (int): Number of output features/channels. 
""" - def __init__( - self, input_shape, num_residual_block, output_shape, norm=False - ): + def __init__(self, input_shape, num_residual_block, output_shape, norm=False): super(GeneratorResNet, self).__init__() channels = input_shape @@ -221,7 +219,7 @@ def __init__( nn.ReflectionPad2d(channels), nn.Conv2d(channels, out_features, 7), nn.InstanceNorm2d(out_features), - nn.ReLU(inplace=True) + nn.ReLU(inplace=True), ] in_features = out_features @@ -231,7 +229,7 @@ def __init__( model += [ nn.Conv2d(in_features, out_features, 3, stride=2, padding=1), nn.InstanceNorm2d(out_features), - nn.ReLU(inplace=True) + nn.ReLU(inplace=True), ] in_features = out_features @@ -245,15 +243,16 @@ def __init__( model += [ nn.Upsample(scale_factor=2), # Upsampling layer nn.Conv2d(in_features, out_features, 3, stride=1, padding=1), - nn.ReLU(inplace=True) + nn.ReLU(inplace=True), ] in_features = out_features # Output Layer - model += [nn.ReflectionPad2d(target_channels), - nn.Conv2d(out_features, target_channels, 3), - self.final_activation # Sigmoid activation function - ] + model += [ + nn.ReflectionPad2d(target_channels), + nn.Conv2d(out_features, target_channels, 3), + self.final_activation, # Sigmoid activation function + ] # Unpacking self.model = nn.Sequential(*model) @@ -286,10 +285,8 @@ class Conv2dBlock(nn.Module): def __init__(self, in_channels, out_channels, kernel_size=3): super(Conv2dBlock, self).__init__() - self.conv1 = nn.Conv2d(in_channels, out_channels, - kernel_size, padding=1) - self.conv2 = nn.Conv2d( - out_channels, out_channels, kernel_size, padding=1) + self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, padding=1) + self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, padding=1) self.activation = nn.ReLU() def forward(self, x): @@ -320,8 +317,7 @@ class EncoderBlock(nn.Module): - dropout (float): Dropout rate (default is 0.3). 
""" - def __init__(self, in_channels, out_channels, pool_size=(2, 2), - dropout=0.3): + def __init__(self, in_channels, out_channels, pool_size=(2, 2), dropout=0.3): super(EncoderBlock, self).__init__() self.conv_block = Conv2dBlock(in_channels, out_channels) self.maxpool = nn.MaxPool2d(pool_size) @@ -411,12 +407,16 @@ class DecoderBlock(nn.Module): - dropout (float): Dropout rate (default is 0.3). """ - def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, - dropout=0.3): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, dropout=0.3): super(DecoderBlock, self).__init__() self.deconv = nn.ConvTranspose2d( - in_channels, out_channels, kernel_size, stride=stride, padding=1, - output_padding=1) + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=1, + output_padding=1, + ) self.dropout = nn.Dropout2d(dropout) self.conv_block = Conv2dBlock(out_channels * 2, out_channels) diff --git a/use-cases/virgo/src/utils.py b/use-cases/virgo/src/utils.py index dff6fc27a..f2c43a468 100644 --- a/use-cases/virgo/src/utils.py +++ b/use-cases/virgo/src/utils.py @@ -1,10 +1,9 @@ - import numpy as np import torch from gwpy.timeseries import TimeSeries -def init_weights(net, init_type='normal', scaling=0.02, generator=None): +def init_weights(net, init_type="normal", scaling=0.02, generator=None): """ Initialize the weights of the neural network according to the specified initialization type. @@ -16,17 +15,18 @@ def init_weights(net, init_type='normal', scaling=0.02, generator=None): - scaling (float): Scaling factor for weight initialization (default is 0.02). """ + def init_func(m): # define the initialization function classname = m.__class__.__name__ - if hasattr(m, 'weight') and (classname.find('Conv')) != -1: + if hasattr(m, "weight") and (classname.find("Conv")) != -1: torch.nn.init.normal_(m.weight.data, 0.0, scaling) # BatchNorm Layer's weight is not a matrix; only normal # distribution applies. 
- elif classname.find('BatchNorm2d') != -1: + elif classname.find("BatchNorm2d") != -1: torch.nn.init.normal_(m.weight.data, 1.0, scaling) torch.nn.init.constant_(m.bias.data, 0.0) - print('initialize network with %s' % init_type) + print("initialize network with %s" % init_type) net.apply(init_func) # apply the initialization function @@ -48,12 +48,14 @@ def calculate_iou_2d(generated, target, threshold): # print(generated[0][0].shape) # print(type(generated[0][0])) - spectrograms_gen = [TimeSeries( - t[0], dt=1/4096.0).q_transform(frange=(10, 1000)).value - for t in generated] - spectrograms_real = [TimeSeries( - t[0], dt=1/4096.0).q_transform(frange=(10, 1000)).value - for t in target] + spectrograms_gen = [ + TimeSeries(t[0], dt=1 / 4096.0).q_transform(frange=(10, 1000)).value + for t in generated + ] + spectrograms_real = [ + TimeSeries(t[0], dt=1 / 4096.0).q_transform(frange=(10, 1000)).value + for t in target + ] # Create binary masks based on the intensity threshold mask1 = [spectrogram >= threshold for spectrogram in spectrograms_gen] @@ -64,8 +66,9 @@ def calculate_iou_2d(generated, target, threshold): union = [np.logical_or(m1, m2) for m1, m2 in zip(mask1, mask2)] # Calculate Intersection over Union (IoU) - iou_list = np.array([np.sum(inter) / np.sum(uni) - for inter, uni in zip(intersection, union)]) + iou_list = np.array( + [np.sum(inter) / np.sum(uni) for inter, uni in zip(intersection, union)] + ) iou = iou_list.mean() return iou diff --git a/use-cases/virgo/synthetic-data-gen/concat_hdf5_dataset_files.py b/use-cases/virgo/synthetic-data-gen/concat_hdf5_dataset_files.py index 155f92332..baa26d265 100644 --- a/use-cases/virgo/synthetic-data-gen/concat_hdf5_dataset_files.py +++ b/use-cases/virgo/synthetic-data-gen/concat_hdf5_dataset_files.py @@ -73,7 +73,8 @@ def main(): # NOTE: This will not necessarily iterate in same order as the suffices of the # file names files = [ - entry for entry in dir.iterdir() + entry + for entry in dir.iterdir() if 
(entry.suffix == ".hdf5" and entry.stem != "virgo_data") ] for entry in files: diff --git a/use-cases/virgo/synthetic-data-gen/file_gen_hdf5.py b/use-cases/virgo/synthetic-data-gen/file_gen_hdf5.py index ad49fc677..e65f93b45 100644 --- a/use-cases/virgo/synthetic-data-gen/file_gen_hdf5.py +++ b/use-cases/virgo/synthetic-data-gen/file_gen_hdf5.py @@ -36,7 +36,7 @@ def append_to_hdf5_dataset( print(f"Appending to file: '{str(file_path.resolve())}'.") with h5py.File(file_path, "a") as f: - dset=f[dataset_name] + dset = f[dataset_name] dset.resize(dset.shape[0] + array.shape[0], axis=0) dset[-array.shape[0] :] = array diff --git a/use-cases/virgo/trainer.py b/use-cases/virgo/trainer.py index c4c58ee8a..61ac86278 100644 --- a/use-cases/virgo/trainer.py +++ b/use-cases/virgo/trainer.py @@ -26,7 +26,11 @@ from itwinai.distributed import suppress_workers_print from itwinai.loggers import EpochTimeTracker, Logger from itwinai.torch.config import TrainingConfiguration -from itwinai.torch.distributed import DeepSpeedStrategy, RayDDPStrategy, RayDeepSpeedStrategy +from itwinai.torch.distributed import ( + DeepSpeedStrategy, + RayDDPStrategy, + RayDeepSpeedStrategy, +) from itwinai.torch.profiling.profiler import profile_torch_trainer from itwinai.torch.monitoring.monitoring import measure_gpu_utilization from itwinai.torch.trainer import RayTorchTrainer, TorchTrainer @@ -105,7 +109,9 @@ def create_model_loss_optimizer(self) -> None: raise ValueError("Unrecognized loss type! Got", loss) # Optimizer - self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.optim_lr) + self.optimizer = torch.optim.Adam( + self.model.parameters(), lr=self.config.optim_lr + ) # IMPORTANT: model, optimizer, and scheduler need to be distributed @@ -113,7 +119,9 @@ def create_model_loss_optimizer(self) -> None: if isinstance(self.strategy, DeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! 
distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) else: distribute_kwargs = {} @@ -364,7 +372,9 @@ def train(self): # update best model best_val_loss = val_loss_plot[-1] - best_checkpoint_filename = self.checkpoints_location.format("best") + best_checkpoint_filename = self.checkpoints_location.format( + "best" + ) torch.save(checkpoint, best_checkpoint_filename) # itwinai - log checkpoint as artifact self.log( @@ -394,7 +404,11 @@ def __init__( random_seed: int = 1234, ) -> None: super().__init__( - config=config, strategy=strategy, name=name, logger=logger, random_seed=random_seed + config=config, + strategy=strategy, + name=name, + logger=logger, + random_seed=random_seed, ) def create_model_loss_optimizer(self) -> None: @@ -434,7 +448,9 @@ def create_model_loss_optimizer(self) -> None: if isinstance(self.strategy, RayDeepSpeedStrategy): # Batch size definition is not optional for DeepSpeedStrategy! 
distribute_kwargs = dict( - config_params=dict(train_micro_batch_size_per_gpu=self.config.batch_size) + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) ) else: distribute_kwargs = {} @@ -456,7 +472,7 @@ def custom_collate(self, batch): def train(self, config, data): # Because of the way the ray cluster is set up, the strategy must be initialized within # the training function - self.strategy.init() + self.strategy.initialize_distributed_strategy() # Start the timer for profiling st = timer() @@ -594,7 +610,10 @@ def train(self, config, data): if self.strategy.is_main_worker: # save checkpoint only if it is better than # the previous ones - if self.training_config["save_best"] and val_loss_plot[-1] < best_val_loss: + if ( + self.training_config["save_best"] + and val_loss_plot[-1] < best_val_loss + ): # create checkpoint checkpoint = { "epoch": epoch, diff --git a/use-cases/xtclim/preprocessing/preprocess_2d_seasons.py b/use-cases/xtclim/preprocessing/preprocess_2d_seasons.py index 9b85a1523..ae2db67e1 100644 --- a/use-cases/xtclim/preprocessing/preprocess_2d_seasons.py +++ b/use-cases/xtclim/preprocessing/preprocess_2d_seasons.py @@ -15,11 +15,9 @@ from itwinai.components import DataGetter, monitor_exec + class SplitPreprocessedData(DataGetter): - def __init__( - self, - scenario: str - ): + def __init__(self, scenario: str): super().__init__() self.scenario = scenario @@ -108,7 +106,7 @@ def execute(self): n_memb = 1 # define relevant scenarios - #scenarios = ["126", "245", "370", "585"] + # scenarios = ["126", "245", "370", "585"] scenarios = [self.scenario] # Load preprocessed "daily temperature images" and time series @@ -117,7 +115,6 @@ def execute(self): train_time = pd.read_csv("input/dates_train_data.csv") test_time = pd.read_csv("input/dates_test_data.csv") - ##### 3. 
Apply to Train and Test Datasets train_season_images, train_season_time = self.season_split( train_images, train_time, "train", n_memb @@ -127,11 +124,12 @@ def execute(self): test_images, test_time, "test", n_memb ) - ##### 4. Apply to Projection Datasets for scenario in scenarios: - proj_images = np.load(f"input/preprocessed_2d_proj{scenario}_data_allssp.npy") + proj_images = np.load( + f"input/preprocessed_2d_proj{scenario}_data_allssp.npy" + ) proj_time = pd.read_csv("input/dates_proj_data.csv") proj_season_images, proj_season_time = self.season_split( diff --git a/use-cases/xtclim/preprocessing/preprocess_3d_seasons.py b/use-cases/xtclim/preprocessing/preprocess_3d_seasons.py index 34757fafd..5c0cea662 100644 --- a/use-cases/xtclim/preprocessing/preprocess_3d_seasons.py +++ b/use-cases/xtclim/preprocessing/preprocess_3d_seasons.py @@ -20,7 +20,7 @@ n_memb = 1 # define relevant scenarios -#scenarios = ["126", "245", "370", "585"] +# scenarios = ["126", "245", "370", "585"] scenarios = ["245", "585"] # Load preprocessed "daily temperature images" and time series diff --git a/use-cases/xtclim/preprocessing/preprocess_functions_2d_ssp.py b/use-cases/xtclim/preprocessing/preprocess_functions_2d_ssp.py index 264a0eaf7..ac43679c0 100644 --- a/use-cases/xtclim/preprocessing/preprocess_functions_2d_ssp.py +++ b/use-cases/xtclim/preprocessing/preprocess_functions_2d_ssp.py @@ -18,17 +18,16 @@ from itwinai.components import DataGetter, monitor_exec + class PreprocessData(DataGetter): - def __init__( - self, - scenario: str, - dataset_root: str - ): + def __init__(self, scenario: str, dataset_root: str): super().__init__() self.scenario = scenario self.dataset_root = dataset_root - def xr_to_ndarray(self, xr_dset: xr.Dataset, sq_coords: dict) -> (np.ndarray, np.array, str): + def xr_to_ndarray( + self, xr_dset: xr.Dataset, sq_coords: dict + ) -> (np.ndarray, np.array, str): """ Converts an xarray dataset it to a cropped square ndarray, after ajusting the longitudes from 
[0,360] to [-180,180]. @@ -89,7 +88,9 @@ def sftlf_to_ndarray( return land_prop, lat_list, lon_list - def get_extrema(self, histo_dataset: np.ndarray, proj_dataset: np.ndarray) -> np.array: + def get_extrema( + self, histo_dataset: np.ndarray, proj_dataset: np.ndarray + ) -> np.array: """ Computes global extrema over past and future data. @@ -146,7 +147,8 @@ def split_date( year: year where the data is to be split """ split_index = np.where( - time_list == cftime.DatetimeNoLeap(year, 1, 1, 0, 0, 0, 0, has_year_zero=True) + time_list + == cftime.DatetimeNoLeap(year, 1, 1, 0, 0, 0, 0, has_year_zero=True) )[0][0] train_data = nd_dset[:split_index] test_data = nd_dset[split_index:] @@ -232,7 +234,6 @@ def execute(self): temp_histo_nd, time_list = self.xr_to_ndarray(temp_histo, sq32_west_europe) temp_proj_nd, time_proj = self.xr_to_ndarray(temp_proj, sq32_west_europe) - # Compute the variable extrema over history and projections # temp_extrema = get_extrema(temp_histo_nd, temp_proj_nd) @@ -258,7 +259,6 @@ def execute(self): total_test = self.ndarray_to_2d(test_temp, land_prop) total_proj = self.ndarray_to_2d(temp_proj_norm, land_prop) - ##### 7. Save Results # Save train and test data sets @@ -271,15 +271,14 @@ def execute(self): np.save(f"input/preprocessed_2d_proj{scenario}_data_allssp.npy", total_proj) pd.DataFrame(time_proj).to_csv("input/dates_proj_data.csv") - ##### 8. Preprocessing for All Scenarios # This part is to be run as a complement to 6. and 7. # Here you can remove the scenario you already run in 6. and 7. 
- #scenarios = ["126", "245", "370", "585"] - #TODO: Discuss with Anne/Christian - #scenarios = ["245", "585"] + # scenarios = ["126", "245", "370", "585"] + # TODO: Discuss with Anne/Christian + # scenarios = ["245", "585"] scenarios = [self.scenario] for scenario in scenarios: diff --git a/use-cases/xtclim/src/anomaly.py b/use-cases/xtclim/src/anomaly.py index 8f07af0e7..f9114829a 100644 --- a/use-cases/xtclim/src/anomaly.py +++ b/use-cases/xtclim/src/anomaly.py @@ -17,98 +17,126 @@ if past_evaluation: - for season in ['winter_', 'spring_', 'summer_', 'autumn_']: - + for season in ["winter_", "spring_", "summer_", "autumn_"]: + # load previously trained model cvae_model = model.ConvVAE().to(device) - cvae_model.load_state_dict(torch.load(f'../outputs/cvae_model_{season}1d.pth')) - + cvae_model.load_state_dict(torch.load(f"../outputs/cvae_model_{season}1d.pth")) + # train set and data loader train_time = pd.read_csv(f"../input/dates_train_{season}data.csv") train_data = np.load(f"../input/preprocessed_1d_train_{season}data_allssp.npy") n_train = len(train_data) - trainset = [ ( torch.from_numpy(np.reshape(train_data[i], (3, 32, 32))), - train_time['0'][i] ) for i in range(n_train) ] - trainloader = DataLoader( - trainset, batch_size=1, shuffle=False - ) + trainset = [ + ( + torch.from_numpy(np.reshape(train_data[i], (3, 32, 32))), + train_time["0"][i], + ) + for i in range(n_train) + ] + trainloader = DataLoader(trainset, batch_size=1, shuffle=False) # test set and data loader test_time = pd.read_csv(f"../input/dates_test_{season}data.csv") test_data = np.load(f"../input/preprocessed_1d_test_{season}data_allssp.npy") n_test = len(test_data) - testset = [ ( torch.from_numpy(np.reshape(test_data[i], (3, 32, 32))), - test_time['0'][i] ) for i in range(n_test) ] - testloader = DataLoader( - testset, batch_size=1, shuffle=False - ) - - # average over a few iterations + testset = [ + (torch.from_numpy(np.reshape(test_data[i], (3, 32, 32))), test_time["0"][i]) + for i in 
range(n_test) + ] + testloader = DataLoader(testset, batch_size=1, shuffle=False) + + # average over a few iterations # for a better reconstruction estimate - train_avg_losses, _, tot_train_losses, _ = evaluate(cvae_model, trainloader, - trainset, device, - criterion, - pixel_wise_criterion) - test_avg_losses, _, tot_test_losses, _ = evaluate(cvae_model, testloader, - testset, device, criterion, - pixel_wise_criterion) + train_avg_losses, _, tot_train_losses, _ = evaluate( + cvae_model, trainloader, trainset, device, criterion, pixel_wise_criterion + ) + test_avg_losses, _, tot_test_losses, _ = evaluate( + cvae_model, testloader, testset, device, criterion, pixel_wise_criterion + ) for i in range(1, n_avg): - train_avg_loss, _, train_losses, _ = evaluate(cvae_model, trainloader, - trainset, device, criterion, - pixel_wise_criterion) + train_avg_loss, _, train_losses, _ = evaluate( + cvae_model, + trainloader, + trainset, + device, + criterion, + pixel_wise_criterion, + ) tot_train_losses = list(map(add, tot_train_losses, train_losses)) train_avg_losses += train_avg_loss - test_avg_loss, _, test_losses, _ = evaluate(cvae_model, testloader, - testset, device, criterion, - pixel_wise_criterion) + test_avg_loss, _, test_losses, _ = evaluate( + cvae_model, testloader, testset, device, criterion, pixel_wise_criterion + ) tot_test_losses = list(map(add, tot_test_losses, test_losses)) test_avg_losses += test_avg_loss - tot_train_losses = np.array(tot_train_losses)/n_avg - tot_test_losses = np.array(tot_test_losses)/n_avg - train_avg_losses = train_avg_losses/n_avg - test_avg_losses = test_avg_losses/n_avg - - pd.DataFrame(tot_train_losses).to_csv(f"../outputs/train_losses_{season}1d_allssp.csv") - pd.DataFrame(tot_test_losses).to_csv(f"../outputs/test_losses_{season}1d_allssp.csv") - print('Train average loss:', train_avg_losses) - print('Test average loss:', test_avg_losses) + tot_train_losses = np.array(tot_train_losses) / n_avg + tot_test_losses = 
np.array(tot_test_losses) / n_avg + train_avg_losses = train_avg_losses / n_avg + test_avg_losses = test_avg_losses / n_avg + + pd.DataFrame(tot_train_losses).to_csv( + f"../outputs/train_losses_{season}1d_allssp.csv" + ) + pd.DataFrame(tot_test_losses).to_csv( + f"../outputs/test_losses_{season}1d_allssp.csv" + ) + print("Train average loss:", train_avg_losses) + print("Test average loss:", test_avg_losses) if future_evaluation: - for season in ['winter_', 'spring_', 'summer_', 'autumn_']: - + for season in ["winter_", "spring_", "summer_", "autumn_"]: + # load previously trained model cvae_model = model.ConvVAE().to(device) - cvae_model.load_state_dict(torch.load(f'../outputs/cvae_model_{season}1d.pth')) + cvae_model.load_state_dict(torch.load(f"../outputs/cvae_model_{season}1d.pth")) + + for scenario in ["585", "370", "245", "126"]: - for scenario in ['585', '370', '245', '126']: - # projection set and data loader proj_time = pd.read_csv(f"../input/dates_proj_{season}data.csv") - proj_data = np.load(f"../input/preprocessed_1d_proj{scenario}_{season}data_allssp.npy") - n_proj = len(proj_data) - projset = [ ( torch.from_numpy(np.reshape(proj_data[i], (3, 32, 32))), - proj_time['0'][i] ) for i in range(n_proj) ] - projloader = DataLoader( - projset, batch_size=1, shuffle=False + proj_data = np.load( + f"../input/preprocessed_1d_proj{scenario}_{season}data_allssp.npy" ) + n_proj = len(proj_data) + projset = [ + ( + torch.from_numpy(np.reshape(proj_data[i], (3, 32, 32))), + proj_time["0"][i], + ) + for i in range(n_proj) + ] + projloader = DataLoader(projset, batch_size=1, shuffle=False) # get the losses for each data set # on various experiments to have representative statistics - proj_avg_losses, _, tot_proj_losses, _ = evaluate(cvae_model, projloader, - projset, device, criterion, - pixel_wise_criterion) - + proj_avg_losses, _, tot_proj_losses, _ = evaluate( + cvae_model, projloader, projset, device, criterion, pixel_wise_criterion + ) + for i in range(1, n_avg): 
- proj_avg_loss, _, proj_losses, _ = evaluate(cvae_model, projloader, - projset, device, criterion, - pixel_wise_criterion) + proj_avg_loss, _, proj_losses, _ = evaluate( + cvae_model, + projloader, + projset, + device, + criterion, + pixel_wise_criterion, + ) tot_proj_losses = list(map(add, tot_proj_losses, proj_losses)) proj_avg_losses += proj_avg_loss - - tot_proj_losses = np.array(tot_proj_losses)/n_avg - proj_avg_losses = proj_avg_losses/n_avg + + tot_proj_losses = np.array(tot_proj_losses) / n_avg + proj_avg_losses = proj_avg_losses / n_avg # save the losses time series - pd.DataFrame(tot_proj_losses).to_csv(f"../outputs/proj{scenario}_losses_{season}1d_allssp.csv") - print(f'SSP{scenario} Projection average loss:', proj_avg_losses, 'for', season[:-1]) \ No newline at end of file + pd.DataFrame(tot_proj_losses).to_csv( + f"../outputs/proj{scenario}_losses_{season}1d_allssp.csv" + ) + print( + f"SSP{scenario} Projection average loss:", + proj_avg_losses, + "for", + season[:-1], + ) diff --git a/use-cases/xtclim/src/engine.py b/use-cases/xtclim/src/engine.py index 280322911..7a9b97be2 100644 --- a/use-cases/xtclim/src/engine.py +++ b/use-cases/xtclim/src/engine.py @@ -2,6 +2,7 @@ import torch from initialization import pixel_wise_criterion + def final_loss(bce_loss, mu, logvar, beta=0.1): """ Adds up reconstruction loss (BCELoss) and Kullback-Leibler divergence. 
@@ -15,7 +16,8 @@ def final_loss(bce_loss, mu, logvar, beta=0.1): """ BCE = bce_loss KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - return BCE + beta*KLD + return BCE + beta * KLD + def train(model, dataloader, dataset, device, optimizer, criterion, beta): # trains the model over shuffled data set @@ -34,7 +36,9 @@ def train(model, dataloader, dataset, device, optimizer, criterion, beta): model.train() running_loss = 0.0 counter = 0 - for i, data in tqdm(enumerate(dataloader), total=int(len(dataset)/dataloader.batch_size)): + for i, data in tqdm( + enumerate(dataloader), total=int(len(dataset) / dataloader.batch_size) + ): counter += 1 data = data[0] data = data.to(device) @@ -43,12 +47,13 @@ def train(model, dataloader, dataset, device, optimizer, criterion, beta): bce_loss = criterion(reconstruction, data) # total loss = reconstruction loss + KL divergence loss = final_loss(bce_loss, mu, logvar, beta) - loss.backward() # backpropagate loss to learn from mistakes + loss.backward() # backpropagate loss to learn from mistakes running_loss += loss.item() optimizer.step() - train_loss = running_loss / counter # average loss over the batches + train_loss = running_loss / counter # average loss over the batches return train_loss + def validate(model, dataloader, dataset, device, criterion, beta): """ Evaluates the CVAE network and returns the loss and reconstructions. 
@@ -66,22 +71,31 @@ def validate(model, dataloader, dataset, device, criterion, beta): running_loss = 0.0 counter = 0 with torch.no_grad(): - for i, data in tqdm(enumerate(dataloader), total=int(len(dataset)/dataloader.batch_size)): + for i, data in tqdm( + enumerate(dataloader), total=int(len(dataset) / dataloader.batch_size) + ): counter += 1 - data= data[0] + data = data[0] data = data.to(device) reconstruction, mu, logvar = model(data) bce_loss = criterion(reconstruction, data) loss = final_loss(bce_loss, mu, logvar, beta) running_loss += loss.item() # save the last batch input and output of every epoch - if i == int(len(dataset)/dataloader.batch_size) - 1: + if i == int(len(dataset) / dataloader.batch_size) - 1: recon_images = reconstruction val_loss = running_loss / counter return val_loss, recon_images -def evaluate(model, dataloader, dataset, device, - criterion, pixel_wise_criterion = pixel_wise_criterion): + +def evaluate( + model, + dataloader, + dataset, + device, + criterion, + pixel_wise_criterion=pixel_wise_criterion, +): """ Evaluates the CVAE network and returns the reconstruction loss (no KL divergence component) and reconstructions. 
@@ -102,22 +116,24 @@ def evaluate(model, dataloader, dataset, device, recon_images = [] pixel_wise_losses = [] with torch.no_grad(): - for i, data in tqdm(enumerate(dataloader), total=int(len(dataset)/dataloader.batch_size)): + for i, data in tqdm( + enumerate(dataloader), total=int(len(dataset) / dataloader.batch_size) + ): counter += 1 - data= data[0] + data = data[0] data = data.to(device) reconstruction, _, _ = model(data) # evaluate anomalies with reconstruction error only loss = criterion(reconstruction, data) - pixel_wise_losses.append(pixel_wise_criterion(reconstruction, - data)) + pixel_wise_losses.append(pixel_wise_criterion(reconstruction, data)) running_loss += loss.item() - losses.append(loss.item()) # keep track of all losses + losses.append(loss.item()) # keep track of all losses # save output of every evaluation recon_images.append(reconstruction) val_loss = running_loss / counter return val_loss, recon_images, losses, pixel_wise_losses + def latent_space_position(model, dataloader, dataset, device, criterion): """ Evaluates the CVAE network and returns the reconstruction loss @@ -135,9 +151,11 @@ def latent_space_position(model, dataloader, dataset, device, criterion): running_loss = 0.0 counter = 0 with torch.no_grad(): - for i, data in tqdm(enumerate(dataloader), total=int(len(dataset)/dataloader.batch_size)): + for i, data in tqdm( + enumerate(dataloader), total=int(len(dataset) / dataloader.batch_size) + ): counter += 1 - data= data[0] + data = data[0] data = data.to(device) reconstruction, mu, logvar = model(data) if i == 0: @@ -149,7 +167,7 @@ def latent_space_position(model, dataloader, dataset, device, criterion): loss = criterion(reconstruction, data) running_loss += loss.item() # save the last batch input and output of every epoch - if i == int(len(dataset)/dataloader.batch_size) - 1: + if i == int(len(dataset) / dataloader.batch_size) - 1: recon_images = reconstruction val_loss = running_loss / counter return val_loss, recon_images, 
mus, logvars diff --git a/use-cases/xtclim/src/initialization.py b/use-cases/xtclim/src/initialization.py index e00306271..fe53208f8 100644 --- a/use-cases/xtclim/src/initialization.py +++ b/use-cases/xtclim/src/initialization.py @@ -2,13 +2,13 @@ import torch.nn as nn -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Mean-Squared Error as the average difference between the pixels # in the original image vs. the reconstructed one criterion = nn.MSELoss() # pixel-wise MSE loss -pixel_wise_criterion = nn.MSELoss(reduction='none') +pixel_wise_criterion = nn.MSELoss(reduction="none") # KL divergence handles dispersion of information in latent space # a balance is to be found with the prevailing reconstruction error diff --git a/use-cases/xtclim/src/model.py b/use-cases/xtclim/src/model.py index 932a2ca5d..c8c87790f 100644 --- a/use-cases/xtclim/src/model.py +++ b/use-cases/xtclim/src/model.py @@ -2,10 +2,10 @@ import torch.nn as nn import torch.nn.functional as F -kernel_size = 4 # (4, 4) kernel -init_channels = 8 # initial number of filters -image_channels = 2 # 1 channel/variable: max temperature, precipitation, wind -latent_dim = 128 # latent space dimension (in which the image is compressed) +kernel_size = 4 # (4, 4) kernel +init_channels = 8 # initial number of filters +image_channels = 2 # 1 channel/variable: max temperature, precipitation, wind +latent_dim = 128 # latent space dimension (in which the image is compressed) # define a Conv VAE @@ -22,20 +22,32 @@ def __init__(self): # encoder self.enc1 = nn.Conv2d( - in_channels=image_channels, out_channels=init_channels, kernel_size=kernel_size, - stride=2, padding=1 + in_channels=image_channels, + out_channels=init_channels, + kernel_size=kernel_size, + stride=2, + padding=1, ) self.enc2 = nn.Conv2d( - in_channels=init_channels, out_channels=init_channels*2, kernel_size=kernel_size, - stride=2, padding=1 + 
in_channels=init_channels, + out_channels=init_channels * 2, + kernel_size=kernel_size, + stride=2, + padding=1, ) self.enc3 = nn.Conv2d( - in_channels=init_channels*2, out_channels=init_channels*4, kernel_size=kernel_size, - stride=2, padding=1 + in_channels=init_channels * 2, + out_channels=init_channels * 4, + kernel_size=kernel_size, + stride=2, + padding=1, ) self.enc4 = nn.Conv2d( - in_channels=init_channels*4, out_channels=64, kernel_size=kernel_size, - stride=2, padding=0 + in_channels=init_channels * 4, + out_channels=64, + kernel_size=kernel_size, + stride=2, + padding=0, ) # fully connected layers for learning representations @@ -46,20 +58,32 @@ def __init__(self): # decoder self.dec1 = nn.ConvTranspose2d( - in_channels=64, out_channels=init_channels*8, kernel_size=kernel_size, - stride=1, padding=0 + in_channels=64, + out_channels=init_channels * 8, + kernel_size=kernel_size, + stride=1, + padding=0, ) self.dec2 = nn.ConvTranspose2d( - in_channels=init_channels*8, out_channels=init_channels*4, kernel_size=kernel_size, - stride=2, padding=1 + in_channels=init_channels * 8, + out_channels=init_channels * 4, + kernel_size=kernel_size, + stride=2, + padding=1, ) self.dec3 = nn.ConvTranspose2d( - in_channels=init_channels*4, out_channels=init_channels*2, kernel_size=kernel_size, - stride=2, padding=1 + in_channels=init_channels * 4, + out_channels=init_channels * 2, + kernel_size=kernel_size, + stride=2, + padding=1, ) self.dec4 = nn.ConvTranspose2d( - in_channels=init_channels*2, out_channels=image_channels, kernel_size=kernel_size, - stride=2, padding=1 + in_channels=init_channels * 2, + out_channels=image_channels, + kernel_size=kernel_size, + stride=2, + padding=1, ) def reparameterize(self, mu, log_var): @@ -68,9 +92,9 @@ def reparameterize(self, mu, log_var): :param mu: mean from the encoder's latent space :param log_var: log variance from the encoder's latent space """ - std = torch.exp(0.5*log_var) # standard deviation - eps = torch.randn_like(std) # 
`randn_like` as we need the same size - sample = mu + (eps * std) # sampling + std = torch.exp(0.5 * log_var) # standard deviation + eps = torch.randn_like(std) # `randn_like` as we need the same size + sample = mu + (eps * std) # sampling return sample def forward(self, x): diff --git a/use-cases/xtclim/src/trainer.py b/use-cases/xtclim/src/trainer.py index 4314060ad..251a58265 100644 --- a/use-cases/xtclim/src/trainer.py +++ b/use-cases/xtclim/src/trainer.py @@ -27,13 +27,9 @@ from utils import save_reconstructed_images, save_loss_plot, save_ex from initialization import device, beta, criterion + class TorchTrainer(Trainer): - def __init__( - self, - epochs: int, - batch_size: int, - lr: float - ): + def __init__(self, epochs: int, batch_size: int, lr: float): super().__init__() self.epochs = epochs self.batch_size = batch_size @@ -49,10 +45,10 @@ def execute(self): n_memb = 1 # initialize learning parameters - #lr0 = 0.001 - #batch_size = 64 - #epochs = 100 - #early stopping parameters + # lr0 = 0.001 + # batch_size = 64 + # epochs = 100 + # early stopping parameters stop_delta = 0.01 # under 1% improvement consider the model starts converging patience = 15 # wait for a few epochs to be sure before actually stopping early_count = 0 # count when validation loss < stop_delta @@ -71,7 +67,10 @@ def execute(self): ) n_train = len(train_data) trainset = [ - (torch.from_numpy(np.reshape(train_data[i], (2, 32, 32))), train_time["0"][i]) + ( + torch.from_numpy(np.reshape(train_data[i], (2, 32, 32))), + train_time["0"][i], + ) for i in range(n_train) ] # load train set, shuffle it, and create batches @@ -79,10 +78,15 @@ def execute(self): # load validation set and validation data test_time = pd.read_csv(f"input/dates_test_{season}data_{n_memb}memb.csv") - test_data = np.load(f"input/preprocessed_1d_test_{season}data_{n_memb}memb.npy") + test_data = np.load( + f"input/preprocessed_1d_test_{season}data_{n_memb}memb.npy" + ) n_test = len(test_data) testset = [ - 
(torch.from_numpy(np.reshape(test_data[i], (2, 32, 32))), test_time["0"][i]) + ( + torch.from_numpy(np.reshape(test_data[i], (2, 32, 32))), + test_time["0"][i], + ) for i in range(n_test) ] testloader = DataLoader(testset, batch_size=self.batch_size, shuffle=False) @@ -99,7 +103,13 @@ def execute(self): # train the model train_epoch_loss = train( - cvae_model, trainloader, trainset, device, optimizer, criterion, beta + cvae_model, + trainloader, + trainset, + device, + optimizer, + criterion, + beta, ) # evaluate the model on the test set @@ -112,42 +122,41 @@ def execute(self): valid_loss.append(valid_epoch_loss) # save the reconstructed images from the validation loop - #save_reconstructed_images(recon_images, epoch+1, season) + # save_reconstructed_images(recon_images, epoch+1, season) # convert the reconstructed images to PyTorch image grid format image_grid = make_grid(recon_images.detach().cpu()) grid_images.append(image_grid) # save one example of reconstructed image before and after training - #if epoch == 0 or epoch == self.epochs-1: + # if epoch == 0 or epoch == self.epochs-1: # save_ex(recon_images[0], epoch, season) # decreasing learning rate if (epoch + 1) % 20 == 0: lr = lr / 5 -#------- - + # ------- # early stopping to avoid overfitting -# if ( -# epoch > 1 -# and (old_valid_loss - valid_epoch_loss) / old_valid_loss < stop_delta -# ): - # if the marginal improvement in validation loss is too small -# early_count += 1 - - #if early_count > patience: - # if too small improvement for a few epochs in a row, stop learning + # if ( + # epoch > 1 + # and (old_valid_loss - valid_epoch_loss) / old_valid_loss < stop_delta + # ): + # if the marginal improvement in validation loss is too small + # early_count += 1 + + # if early_count > patience: + # if too small improvement for a few epochs in a row, stop learning # save_ex(recon_images[0], epoch, season) - #break + # break -# else: - # if the condition is not verified anymore, reset the count -# early_count = 
0 -# old_valid_loss = valid_epoch_loss + # else: + # if the condition is not verified anymore, reset the count + # early_count = 0 + # old_valid_loss = valid_epoch_loss -#--------------- + # --------------- # save best model if valid_epoch_loss < min_valid_epoch_loss: diff --git a/use-cases/xtclim/src/utils.py b/use-cases/xtclim/src/utils.py index 81e166f63..d7f446a0a 100644 --- a/use-cases/xtclim/src/utils.py +++ b/use-cases/xtclim/src/utils.py @@ -6,26 +6,30 @@ to_pil_image = transforms.ToPILImage() + def image_to_vid(images): # save evolving images along the learning and get the video imgs = [np.array(to_pil_image(img)) for img in images] - imageio.mimsave('outputs/generated_images.gif', imgs) + imageio.mimsave("outputs/generated_images.gif", imgs) + -def save_reconstructed_images(recon_images, epoch, season = ''): +def save_reconstructed_images(recon_images, epoch, season=""): # save all reconstructed images at each epoch save_image(recon_images.cpu(), f"outputs/image_record/{season}output{epoch}.jpg") -def save_ex(recon_ex, epoch, season = ''): + +def save_ex(recon_ex, epoch, season=""): # save an example of image at a given epoch save_image(recon_ex.cpu(), f"outputs/image_record/{season}ex{epoch}.jpg") -def save_loss_plot(train_loss, valid_loss, season = ''): + +def save_loss_plot(train_loss, valid_loss, season=""): # saves the plot of both losses evolutions plt.figure(figsize=(10, 7)) - plt.plot(train_loss, color='orange', label='train loss') - plt.plot(valid_loss, color='red', label='validation loss') - plt.xlabel('Epochs') - plt.ylabel('Loss') + plt.plot(train_loss, color="orange", label="train loss") + plt.plot(valid_loss, color="red", label="validation loss") + plt.xlabel("Epochs") + plt.ylabel("Loss") plt.legend() - plt.savefig(f'outputs/{season}loss.jpg') + plt.savefig(f"outputs/{season}loss.jpg") plt.show() diff --git a/use-cases/xtclim/train.py b/use-cases/xtclim/train.py index 4ed63633f..64353159e 100644 --- a/use-cases/xtclim/train.py +++ 
b/use-cases/xtclim/train.py @@ -1,6 +1,7 @@ """ Train file to launch pipeline """ + import os import sys from typing import Dict @@ -8,23 +9,25 @@ import logging from datetime import datetime -sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) -sys.path.append(os.path.join(os.path.dirname(__file__), 'preprocessing')) +sys.path.append(os.path.join(os.path.dirname(__file__), "src")) +sys.path.append(os.path.join(os.path.dirname(__file__), "preprocessing")) from itwinai.parser import ConfigParser, ArgumentParser if __name__ == "__main__": parser = ArgumentParser() parser.add_argument( - "-p", "--pipeline", type=str, required=True, - help='Configuration file to the pipeline to execute.' + "-p", + "--pipeline", + type=str, + required=True, + help="Configuration file to the pipeline to execute.", ) args = parser.parse_args() pipe_parser = ConfigParser( - config=args.pipeline, + config=args.pipeline, ) pipeline = pipe_parser.parse_pipeline() pipeline.execute() -