From d215fd73f265dee1454cda2677c2017084233b95 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Mon, 21 Aug 2023 17:57:47 -0700 Subject: [PATCH 01/30] add dynamic batch_size upload --- .../workspace_uploader/workspace_uploader.py | 141 ++++++++++++------ 1 file changed, 98 insertions(+), 43 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 3a781bd8d..765c68298 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -54,7 +54,7 @@ from datetime import datetime from multiprocessing import cpu_count from pathlib import Path -from typing import Tuple +from typing import Generator, Tuple import yaml @@ -67,6 +67,7 @@ UPLOAD_FILE_EXT = ["genomic.fna.gz"] # uplaod only files that match given extensions JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" UPLOADED_YAML = "uploaded.yaml" +MEMORY_UTILIZATION = 0.5 def _get_parser(): @@ -160,22 +161,24 @@ def _get_source_file(assembly_dir: str, assembly_file: str) -> str: return src_file -def _upload_assembly_to_workspace( - conf: Conf, - workspace_id: int, - file_path: str, - assembly_name: str, -) -> str: - """Upload an assembly file to workspace.""" +def _upload_assemblies_to_workspace( + conf: Conf, + workspace_id: int, + file_paths: tuple[str], + assembly_names: tuple[str], +) -> tuple[str]: + + inputs = [{"file": f, "assembly_name": n} + for f, n in zip(file_paths, assembly_names)] + success, attempts, max_attempts = False, 0, 3 while attempts < max_attempts and not success: try: time.sleep(attempts) - assembly_ref = conf.asu.save_assembly_from_fasta2( + assembly_ref = conf.asu.save_assemblies_from_fastas( { - "file": {"path": file_path}, "workspace_id": workspace_id, - "assembly_name": assembly_name, + "inputs": inputs } ) success = True @@ -185,11 +188,12 @@ def _upload_assembly_to_workspace( if not success: raise ValueError( - f"Upload Failed for {file_path} after {max_attempts} attempts!" + f"Upload Failed for {file_paths} after {max_attempts} attempts!" ) - upa = assembly_ref["upa"].replace("/", "_") - return upa + upas = tuple([result_dict["upa"].replace("/", "_") + for result_dict in assembly_ref["results"]]) + return upas def _read_upload_status_yaml_file( @@ -342,36 +346,39 @@ def _process_input(conf: Conf) -> None: print("Stopping") break - upa = None ( upload_env_key, workspace_id, - container_internal_assembly_path, - host_assembly_dir, - assembly_name, + batch_container_internal_assembly_paths, + batch_host_assembly_dirs, + batch_assembly_names, upload_dir, counter, assembly_files_len, ) = task + batch_upas = (None, ) * len(batch_assembly_names) try: - upa = _upload_assembly_to_workspace( - conf, workspace_id, container_internal_assembly_path, assembly_name - ) - _post_process( - upload_env_key, - workspace_id, - host_assembly_dir, - assembly_name, - upload_dir, - conf.output_dir, - upa + batch_upas = _upload_assemblies_to_workspace( + conf, workspace_id, batch_container_internal_assembly_paths, batch_assembly_names ) + for host_assembly_dir, assembly_name, upa in zip( + batch_host_assembly_dirs, batch_assembly_names, batch_upas + ): + _post_process( + upload_env_key, + workspace_id, + host_assembly_dir, + assembly_name, + upload_dir, + conf.output_dir, + upa + ) except Exception as e: - print(f"Failed assembly name: {assembly_name}. Exception:") + print(f"Failed assembly names: {batch_assembly_names}. 
Exception:") print(e) - conf.output_queue.put((assembly_name, upa)) + conf.output_queue.put((batch_assembly_names, batch_upas)) if counter % 3000 == 0: print(f"Assemblies processed: {counter}/{assembly_files_len}, " @@ -403,19 +410,22 @@ def _upload_assembly_files_in_parallel( print(f"Start uploading {assembly_files_len} assembly files with {conf.workers} workers\n") # Put the assembly files in the input_queue - counter = 1 - for assembly_name, host_assembly_dir in wait_to_upload_assemblies.items(): - - container_internal_assembly_path = os.path.join( - JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, assembly_name - ) + counter = 0 + for ( + batch_container_internal_assembly_paths, + batch_host_assembly_dirs, + batch_assembly_names, + batch_size, + ) in _gen(conf, wait_to_upload_assemblies): + + counter += batch_size conf.input_queue.put( ( upload_env_key, workspace_id, - container_internal_assembly_path, - host_assembly_dir, - assembly_name, + batch_container_internal_assembly_paths, + batch_host_assembly_dirs, + batch_assembly_names, upload_dir, counter, assembly_files_len, @@ -427,14 +437,12 @@ def _upload_assembly_files_in_parallel( f"Percentage: {counter / assembly_files_len * 100:.2f}%, " f"Time: {datetime.now()}") - counter += 1 - # Signal the workers to terminate when they finish uploading assembly files for _ in range(conf.workers): conf.input_queue.put(None) results = [conf.output_queue.get() for _ in range(assembly_files_len)] - failed_names = [assembly_name for assembly_name, upa in results if upa is None] + failed_names = [name for names, upas in results for name, upa in zip(names, upas) if upa is None] # Close and join the processes conf.pools.close() @@ -443,6 +451,53 @@ def _upload_assembly_files_in_parallel( return failed_names +def _gen( + conf: Conf, + wait_to_upload_assemblies: dict[str, str] +) -> Generator[Tuple[tuple[str], tuple[str], tuple[str], int], None, None]: + """ + Generator function to yield the assembly files to upload. + """ + assembly_files_len = len(wait_to_upload_assemblies) + assembly_names = tuple(wait_to_upload_assemblies.keys()) + host_assembly_dirs = tuple(wait_to_upload_assemblies.values()) + container_internal_assembly_paths = tuple( + [ + os.path.join(JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, assembly_name) + for assembly_name in assembly_names + ] + ) + + start_idx = 0 + cumsize = 0 + # to avoid memory issue, upload 1GB * MEMORY_UTILIZATION of assembly files at a time + max_cumsize = 1000000000 * MEMORY_UTILIZATION + for idx in range(assembly_files_len): + file_path = os.path.join(conf.job_data_dir, assembly_names[idx]) + if not os.path.exists(file_path): + raise ValueError(f"{file_path} does not exist. 
" + f"Ensure file {assembly_names[idx]} exist in {conf.job_data_dir}") + file_size = os.path.getsize(file_path) + if file_size + cumsize < max_cumsize: + cumsize += file_size + else: + yield ( + container_internal_assembly_paths[start_idx:idx], + host_assembly_dirs[start_idx:idx], + assembly_names[start_idx:idx], + idx - start_idx, + ) + start_idx = idx + cumsize = file_size + + yield ( + container_internal_assembly_paths[start_idx:], + host_assembly_dirs[start_idx:], + assembly_names[start_idx:], + assembly_files_len - start_idx, + ) + + def main(): parser = _get_parser() args = parser.parse_args() From 1524c47e34a1a23ec65190c7fdb6abeba82cddc1 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Tue, 22 Aug 2023 15:27:53 -0700 Subject: [PATCH 02/30] add comments for iterator and fix a bug --- .../workspace_uploader/workspace_uploader.py | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 765c68298..f06c557ae 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -67,7 +67,7 @@ UPLOAD_FILE_EXT = ["genomic.fna.gz"] # uplaod only files that match given extensions JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" UPLOADED_YAML = "uploaded.yaml" -MEMORY_UTILIZATION = 0.5 +MEMORY_UTILIZATION = 0.01 # approximately 8 assemblies per batch def _get_parser(): @@ -353,7 +353,7 @@ def _process_input(conf: Conf) -> None: batch_host_assembly_dirs, batch_assembly_names, upload_dir, - counter, + file_counter, assembly_files_len, ) = task @@ -380,9 +380,9 @@ def _process_input(conf: Conf) -> None: conf.output_queue.put((batch_assembly_names, batch_upas)) - if counter % 3000 == 0: - print(f"Assemblies processed: {counter}/{assembly_files_len}, " - f"Percentage: {counter / assembly_files_len * 100:.2f}%, " + if file_counter % 3000 == 0: + print(f"Assemblies processed: {file_counter}/{assembly_files_len}, " + f"Percentage: {file_counter / assembly_files_len * 100:.2f}%, " f"Time: {datetime.now()}") @@ -410,7 +410,8 @@ def _upload_assembly_files_in_parallel( print(f"Start uploading {assembly_files_len} assembly files with {conf.workers} workers\n") # Put the assembly files in the input_queue - counter = 0 + file_counter = 0 + iter_counter = 0 for ( batch_container_internal_assembly_paths, batch_host_assembly_dirs, @@ -418,7 +419,7 @@ def _upload_assembly_files_in_parallel( batch_size, ) in _gen(conf, wait_to_upload_assemblies): - counter += batch_size + file_counter += batch_size conf.input_queue.put( ( upload_env_key, @@ -427,21 +428,23 @@ def _upload_assembly_files_in_parallel( batch_host_assembly_dirs, batch_assembly_names, upload_dir, - counter, + file_counter, assembly_files_len, ) ) - if counter % 5000 == 0: - print(f"Jobs added to the queue: {counter}/{assembly_files_len}, " - f"Percentage: {counter / assembly_files_len * 100:.2f}%, " + if file_counter % 5000 == 0: + print(f"Jobs added to the queue: {file_counter}/{assembly_files_len}, " + f"Percentage: {file_counter / assembly_files_len * 100:.2f}%, " f"Time: {datetime.now()}") + iter_counter += 1 + # Signal the workers to terminate when they finish uploading assembly files for _ in range(conf.workers): conf.input_queue.put(None) - results = [conf.output_queue.get() for _ in range(assembly_files_len)] + results = [conf.output_queue.get() for _ in range(iter_counter)] failed_names = [name for names, upas in results for name, upa 
in zip(names, upas) if upa is None] # Close and join the processes @@ -478,7 +481,8 @@ def _gen( raise ValueError(f"{file_path} does not exist. " f"Ensure file {assembly_names[idx]} exist in {conf.job_data_dir}") file_size = os.path.getsize(file_path) - if file_size + cumsize < max_cumsize: + # cumulate aasembly files until the total size is greater than max_cumsize + if file_size + cumsize <= max_cumsize: cumsize += file_size else: yield ( @@ -487,9 +491,11 @@ def _gen( assembly_names[start_idx:idx], idx - start_idx, ) + # reset start_idx and cumsize start_idx = idx cumsize = file_size + # yield the remaining assembly files yield ( container_internal_assembly_paths[start_idx:], host_assembly_dirs[start_idx:], From f1ac0b425abfddf897f88d124dd4465bc2248bc4 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Tue, 22 Aug 2023 17:44:33 -0700 Subject: [PATCH 03/30] finish clean up && testing --- src/loaders/workspace_uploader/workspace_uploader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index f06c557ae..4ac233845 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -167,7 +167,10 @@ def _upload_assemblies_to_workspace( file_paths: tuple[str], assembly_names: tuple[str], ) -> tuple[str]: - + """ + Upload assembly files to the target workspace in batch. The bulk method fails + and an error will be thrown if any of the assembly files in batch fails to upload. + """ inputs = [{"file": f, "assembly_name": n} for f, n in zip(file_paths, assembly_names)] From 48df9158e45a00836665a10594f632be8e212092 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Wed, 23 Aug 2023 13:41:13 -0700 Subject: [PATCH 04/30] correct tuple type hint --- .../workspace_uploader/workspace_uploader.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 4ac233845..fef927bce 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -54,7 +54,7 @@ from datetime import datetime from multiprocessing import cpu_count from pathlib import Path -from typing import Generator, Tuple +from typing import Generator import yaml @@ -67,6 +67,7 @@ UPLOAD_FILE_EXT = ["genomic.fna.gz"] # uplaod only files that match given extensions JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" UPLOADED_YAML = "uploaded.yaml" +MAX_DATA_SIZE = 1024 * 1024 * 1024 # can cause memeory issues if the serialized data is > 1GB MEMORY_UTILIZATION = 0.01 # approximately 8 assemblies per batch @@ -164,9 +165,9 @@ def _get_source_file(assembly_dir: str, assembly_file: str) -> str: def _upload_assemblies_to_workspace( conf: Conf, workspace_id: int, - file_paths: tuple[str], - assembly_names: tuple[str], -) -> tuple[str]: + file_paths: tuple[str, ...], + assembly_names: tuple[str, ...], +) -> tuple[str, ...]: """ Upload assembly files to the target workspace in batch. The bulk method fails and an error will be thrown if any of the assembly files in batch fails to upload. 
@@ -204,7 +205,7 @@ def _read_upload_status_yaml_file( workspace_id: int, assembly_dir: str, assembly_name: str, -) -> Tuple[dict[str, dict[int, list[str]]], bool]: +) -> tuple[dict[str, dict[int, list[str]]], bool]: """ Get metadata and upload status of an assembly from the uploaded.yaml file. """ @@ -257,7 +258,7 @@ def _fetch_assemblies_to_upload( workspace_id: int, collection_source_dir: str, upload_file_ext: list[str], -) -> Tuple[int, dict[str, str]]: +) -> tuple[int, dict[str, str]]: count = 0 wait_to_upload_assemblies = dict() @@ -460,7 +461,7 @@ def _upload_assembly_files_in_parallel( def _gen( conf: Conf, wait_to_upload_assemblies: dict[str, str] -) -> Generator[Tuple[tuple[str], tuple[str], tuple[str], int], None, None]: +) -> Generator[tuple[tuple[str, ...], tuple[str, ...], tuple[str, ...], int], None, None]: """ Generator function to yield the assembly files to upload. """ @@ -477,14 +478,14 @@ def _gen( start_idx = 0 cumsize = 0 # to avoid memory issue, upload 1GB * MEMORY_UTILIZATION of assembly files at a time - max_cumsize = 1000000000 * MEMORY_UTILIZATION - for idx in range(assembly_files_len): - file_path = os.path.join(conf.job_data_dir, assembly_names[idx]) + max_cumsize = MAX_DATA_SIZE * MEMORY_UTILIZATION + for idx, assembly_name in enumerate(assembly_names): + file_path = os.path.join(conf.job_data_dir, assembly_name) if not os.path.exists(file_path): raise ValueError(f"{file_path} does not exist. " - f"Ensure file {assembly_names[idx]} exist in {conf.job_data_dir}") + f"Ensure file {assembly_name} exist in {conf.job_data_dir}") file_size = os.path.getsize(file_path) - # cumulate aasembly files until the total size is greater than max_cumsize + # cumulate assembly files until the total size is greater than max_cumsize if file_size + cumsize <= max_cumsize: cumsize += file_size else: From 1eea48f316b3df67128758f53221ed8ece33715b Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Fri, 3 Nov 2023 16:19:04 -0700 Subject: [PATCH 05/30] update WS uploader script --- .../workspace_downloader_helper.py | 3 +- .../workspace_uploader/workspace_uploader.py | 265 +++++++----------- .../workspace_uploader_helper.py | 137 +++++++++ .../workspace_uploader_test.py | 143 ++++++---- 4 files changed, 330 insertions(+), 218 deletions(-) create mode 100644 src/loaders/workspace_uploader/workspace_uploader_helper.py diff --git a/src/loaders/workspace_downloader/workspace_downloader_helper.py b/src/loaders/workspace_downloader/workspace_downloader_helper.py index 3ba1663a2..fc07e3865 100644 --- a/src/loaders/workspace_downloader/workspace_downloader_helper.py +++ b/src/loaders/workspace_downloader/workspace_downloader_helper.py @@ -15,7 +15,7 @@ class Conf: """ - Configuration class for the workspace downloader and workspace uploader scripts. + Configuration class for the workspace downloader script. 
""" def __init__( self, @@ -69,7 +69,6 @@ def __init__( self.workers = workers self.output_dir = output_dir self.input_queue = Queue() - self.output_queue = Queue() self.job_data_dir = loader_helper.make_job_data_dir(job_dir) self.pools = Pool(workers, worker_function, [self]) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index fef927bce..c2ecbd54c 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -1,7 +1,8 @@ """ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] - [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--workers WORKERS] [--keep_job_dir] + [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--workers WORKERS] [--batch_size BATCH_SIZE] + [--keep_job_dir] PROTOTYPE - Upload assembly files to the workspace service (WSS). @@ -30,6 +31,8 @@ --upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...] Upload only files that match given extensions (default: ['genomic.fna.gz']) --workers WORKERS Number of workers for multiprocessing (default: 5) + --batch_size BATCH_SIZE + Number of files to upload per batch (default: 1000) --keep_job_dir Keep SDK job directory after upload task is completed e.g. @@ -51,6 +54,7 @@ import os import shutil import time +from collections import namedtuple from datetime import datetime from multiprocessing import cpu_count from pathlib import Path @@ -59,7 +63,7 @@ import yaml from src.loaders.common import loader_common_names, loader_helper -from src.loaders.workspace_downloader.workspace_downloader_helper import Conf +from src.loaders.workspace_uploader.workspace_uploader_helper import Conf # setup KB_AUTH_TOKEN as env or provide a token_filepath in --token_filepath # export KB_AUTH_TOKEN="your-kb-auth-token" @@ -67,8 +71,8 @@ UPLOAD_FILE_EXT = ["genomic.fna.gz"] # uplaod only files that match given extensions JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" UPLOADED_YAML = "uploaded.yaml" -MAX_DATA_SIZE = 1024 * 1024 * 1024 # can cause memeory issues if the serialized data is > 1GB -MEMORY_UTILIZATION = 0.01 # approximately 8 assemblies per batch + +AssemblyTuple = namedtuple("AssemblyTuple", ["assembly_name", "host_assembly_dir", "container_internal_assembly_path"]) def _get_parser(): @@ -135,6 +139,12 @@ def _get_parser(): default=5, help="Number of workers for multiprocessing", ) + optional.add_argument( + "--batch_size", + type=int, + default=1000, + help="Number of files to upload per batch", + ) optional.add_argument( "--keep_job_dir", action="store_true", @@ -165,35 +175,29 @@ def _get_source_file(assembly_dir: str, assembly_file: str) -> str: def _upload_assemblies_to_workspace( conf: Conf, workspace_id: int, - file_paths: tuple[str, ...], - assembly_names: tuple[str, ...], + assembly_tuples: list[AssemblyTuple], ) -> tuple[str, ...]: """ Upload assembly files to the target workspace in batch. The bulk method fails and an error will be thrown if any of the assembly files in batch fails to upload. 
""" - inputs = [{"file": f, "assembly_name": n} - for f, n in zip(file_paths, assembly_names)] - - success, attempts, max_attempts = False, 0, 3 - while attempts < max_attempts and not success: - try: - time.sleep(attempts) - assembly_ref = conf.asu.save_assemblies_from_fastas( - { - "workspace_id": workspace_id, - "inputs": inputs - } - ) - success = True - except Exception as e: - print(f"Error:\n{e}\nfrom attempt {attempts + 1}.\nTrying to rerun.") - attempts += 1 + inputs = [ + { + "file": assembly_tuple.container_internal_assembly_path, + "assembly_name": assembly_tuple.assembly_name, + } + for assembly_tuple in assembly_tuples + ] - if not success: - raise ValueError( - f"Upload Failed for {file_paths} after {max_attempts} attempts!" + try: + assembly_ref = conf.asu.save_assemblies_from_fastas( + { + "workspace_id": workspace_id, + "inputs": inputs + } ) + except Exception as e: + raise e upas = tuple([result_dict["upa"].replace("/", "_") for result_dict in assembly_ref["results"]]) @@ -340,62 +344,13 @@ def _post_process( loader_helper.create_softlink_between_dirs(new_dir, target_dir) -def _process_input(conf: Conf) -> None: - """ - Process input from input_queue and put the result in output_queue. - """ - while True: - task = conf.input_queue.get(block=True) - if not task: - print("Stopping") - break - - ( - upload_env_key, - workspace_id, - batch_container_internal_assembly_paths, - batch_host_assembly_dirs, - batch_assembly_names, - upload_dir, - file_counter, - assembly_files_len, - ) = task - - batch_upas = (None, ) * len(batch_assembly_names) - try: - batch_upas = _upload_assemblies_to_workspace( - conf, workspace_id, batch_container_internal_assembly_paths, batch_assembly_names - ) - for host_assembly_dir, assembly_name, upa in zip( - batch_host_assembly_dirs, batch_assembly_names, batch_upas - ): - _post_process( - upload_env_key, - workspace_id, - host_assembly_dir, - assembly_name, - upload_dir, - conf.output_dir, - upa - ) - except Exception as e: - print(f"Failed assembly names: {batch_assembly_names}. Exception:") - print(e) - - conf.output_queue.put((batch_assembly_names, batch_upas)) - - if file_counter % 3000 == 0: - print(f"Assemblies processed: {file_counter}/{assembly_files_len}, " - f"Percentage: {file_counter / assembly_files_len * 100:.2f}%, " - f"Time: {datetime.now()}") - - def _upload_assembly_files_in_parallel( conf: Conf, upload_env_key: str, workspace_id: int, upload_dir: str, wait_to_upload_assemblies: dict[str, str], + batch_size: int, ) -> list[str]: """ Upload assembly files to the target workspace in parallel using multiprocessing. 
@@ -406,62 +361,63 @@ def _upload_assembly_files_in_parallel( workspace_id: target workspace id upload_dir: a directory in collectionssource that creates new directories linking to sourcedata wait_to_upload_assemblies: a dictionary that maps assembly file name to assembly directory + batch_size: a number of files to upload per batch Returns: - a list of assembly names that failed to upload + a count on assembly files uploded """ assembly_files_len = len(wait_to_upload_assemblies) - print(f"Start uploading {assembly_files_len} assembly files with {conf.workers} workers\n") - - # Put the assembly files in the input_queue - file_counter = 0 - iter_counter = 0 - for ( - batch_container_internal_assembly_paths, - batch_host_assembly_dirs, - batch_assembly_names, - batch_size, - ) in _gen(conf, wait_to_upload_assemblies): - - file_counter += batch_size - conf.input_queue.put( - ( + print(f"Start uploading {assembly_files_len} assembly files\n") + + uploaded_count = 0 + for assembly_tuples in _gen(wait_to_upload_assemblies, batch_size): + + try: + batch_upas = _upload_assemblies_to_workspace(conf, workspace_id, assembly_tuples) + except Exception as e: + print(e) + failed_objects_names = [assembly_tuple.assembly_name for assembly_tuple in assembly_tuples] + assembly_objects_in_ws = loader_helper.list_objects( + workspace_id, + conf, + loader_common_names.OBJECTS_NAME_ASSEMBLY, + ) + assembly_names_in_ws = [object_info[1] for object_info in assembly_objects_in_ws] + objects_names_to_delete_from_ws = set(assembly_names_in_ws) & set(failed_objects_names) + if objects_names_to_delete_from_ws: + conf.ws.delete_objects( + [ + {"wsid": workspace_id, "name": object_name} + for object_name in objects_names_to_delete_from_ws + ] + ) + print(f"workspace {workspace_id} cleanup done ...") + return uploaded_count + + for assembly_tuple, upa in zip(assembly_tuples, batch_upas): + _post_process( upload_env_key, workspace_id, - batch_container_internal_assembly_paths, - batch_host_assembly_dirs, - batch_assembly_names, + assembly_tuple.host_assembly_dir, + assembly_tuple.assembly_name, upload_dir, - file_counter, - assembly_files_len, + conf.output_dir, + upa ) - ) - - if file_counter % 5000 == 0: - print(f"Jobs added to the queue: {file_counter}/{assembly_files_len}, " - f"Percentage: {file_counter / assembly_files_len * 100:.2f}%, " + + uploaded_count += len(assembly_tuples) + if uploaded_count % 1000 == 0: + print(f"Assemblies uploaded: {uploaded_count}/{assembly_files_len}, " + f"Percentage: {uploaded_count / assembly_files_len * 100:.2f}%, " f"Time: {datetime.now()}") - iter_counter += 1 - - # Signal the workers to terminate when they finish uploading assembly files - for _ in range(conf.workers): - conf.input_queue.put(None) - - results = [conf.output_queue.get() for _ in range(iter_counter)] - failed_names = [name for names, upas in results for name, upa in zip(names, upas) if upa is None] - - # Close and join the processes - conf.pools.close() - conf.pools.join() - - return failed_names + return uploaded_count def _gen( - conf: Conf, - wait_to_upload_assemblies: dict[str, str] -) -> Generator[tuple[tuple[str, ...], tuple[str, ...], tuple[str, ...], int], None, None]: + wait_to_upload_assemblies: dict[str, str], + batch_size: int, +) -> Generator[list[AssemblyTuple], None, None]: """ Generator function to yield the assembly files to upload. 
""" @@ -475,37 +431,19 @@ def _gen( ] ) - start_idx = 0 - cumsize = 0 - # to avoid memory issue, upload 1GB * MEMORY_UTILIZATION of assembly files at a time - max_cumsize = MAX_DATA_SIZE * MEMORY_UTILIZATION - for idx, assembly_name in enumerate(assembly_names): - file_path = os.path.join(conf.job_data_dir, assembly_name) - if not os.path.exists(file_path): - raise ValueError(f"{file_path} does not exist. " - f"Ensure file {assembly_name} exist in {conf.job_data_dir}") - file_size = os.path.getsize(file_path) - # cumulate assembly files until the total size is greater than max_cumsize - if file_size + cumsize <= max_cumsize: - cumsize += file_size - else: - yield ( - container_internal_assembly_paths[start_idx:idx], - host_assembly_dirs[start_idx:idx], - assembly_names[start_idx:idx], - idx - start_idx, - ) - # reset start_idx and cumsize - start_idx = idx - cumsize = file_size - - # yield the remaining assembly files - yield ( - container_internal_assembly_paths[start_idx:], - host_assembly_dirs[start_idx:], - assembly_names[start_idx:], - assembly_files_len - start_idx, - ) + # construct a list of nametuples to ensure consistency + assemblyTuple_list = [ + AssemblyTuple( + assembly_names[idx], + host_assembly_dirs[idx], + container_internal_assembly_paths[idx], + ) + for idx in range(assembly_files_len) + ] + + # yield AssemblyTuples in batch + for idx in range(0, assembly_files_len, batch_size): + yield assemblyTuple_list[idx: idx + batch_size] def main(): @@ -519,6 +457,7 @@ def main(): token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext workers = args.workers + batch_size = args.batch_size keep_job_dir = args.keep_job_dir env = args.env @@ -526,6 +465,8 @@ def main(): if workspace_id <= 0: parser.error(f"workspace_id needs to be > 0") + if batch_size <= 0: + parser.error(f"batch_size needs to be > 0") if workers < 1 or workers > cpu_count(): parser.error(f"minimum worker is 1 and maximum worker is {cpu_count()}") @@ -546,15 +487,8 @@ def main(): # start podman service proc = loader_helper.start_podman_service(uid) - # set up conf, start callback server, and upload assemblies to workspace - conf = Conf( - job_dir, - output_dir, - _process_input, - kb_base_url, - token_filepath, - workers, - ) + # set up conf for uploader, start callback server, and upload assemblies to workspace + conf = Conf(job_dir, output_dir, kb_base_url, token_filepath) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( env, @@ -575,24 +509,21 @@ def main(): print(f"{wtus_len} assemblies in {data_dir} are ready to upload to workspace {workspace_id}") start = time.time() - failed_names = _upload_assembly_files_in_parallel( + uploaded_count = _upload_assembly_files_in_parallel( conf, env, workspace_id, upload_dir, wait_to_upload_assemblies, + batch_size, ) - - assembly_count = wtus_len - len(failed_names) + upload_time = (time.time() - start) / 60 - assy_per_min = assembly_count / upload_time + assy_per_min = uploaded_count / upload_time - print(f"\n{workers} workers took {upload_time:.2f} minutes to upload {assembly_count} assemblies, " + print(f"\ntook {upload_time:.2f} minutes to upload {uploaded_count} assemblies, " f"averaging {assy_per_min:.2f} assemblies per minute") - if failed_names: - raise ValueError(f"\nFailed to upload {failed_names}") - finally: # stop callback server if it is on if conf: diff --git a/src/loaders/workspace_uploader/workspace_uploader_helper.py b/src/loaders/workspace_uploader/workspace_uploader_helper.py new file mode 100644 index 
000000000..637d44366 --- /dev/null +++ b/src/loaders/workspace_uploader/workspace_uploader_helper.py @@ -0,0 +1,137 @@ +import os +import time +import uuid +from typing import Tuple, Union + +import docker + +from src.clients.AssemblyUtilClient import AssemblyUtil +from src.clients.workspaceClient import Workspace +from src.loaders.common import loader_helper +from src.loaders.common.loader_common_names import CALLBACK_IMAGE_NAME + + +class Conf: + """ + Configuration class for the workspace uploader script. + """ + def __init__( + self, + job_dir: str, + output_dir: str, + kb_base_url: str = "https://ci.kbase.us/services/", + token_filepath: str | None = None, + ) -> None: + """ + Initialize the configuration class. + + Args: + job_dir (str): The directory for SDK jobs per user. + output_dir (str): The directory for a specific workspace id under sourcedata/ws. + kb_base_url (str): The base url of the KBase services. + token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. + If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. + """ + port = loader_helper.find_free_port() + token = loader_helper.get_token(token_filepath) + ipv4 = loader_helper.get_ip() + self._start_callback_server( + docker.from_env(), + uuid.uuid4().hex, + job_dir, + kb_base_url, + token, + port, + ipv4, + ) + + ws_url = os.path.join(kb_base_url, "ws") + callback_url = "http://" + ipv4 + ":" + str(port) + print("callback_url:", callback_url) + + self.ws = Workspace(ws_url, token=token) + self.asu = AssemblyUtil(callback_url, token=token) + + self.output_dir = output_dir + self.job_data_dir = loader_helper.make_job_data_dir(job_dir) + + def _setup_callback_server_envs( + self, + job_dir: str, + kb_base_url: str, + token: str, + port: int, + ipv4: str, + ) -> Tuple[dict[str, Union[int, str]], dict[str, dict[str, str]]]: + """ + Setup the environment variables and volumes for the callback server. + + Args: + job_dir (str): The directory for SDK jobs per user. + kb_base_url (str): The base url of the KBase services. + token (str): The KBase token. + port (int): The port number for the callback server. + + Returns: + tuple: A tuple of the environment variables and volumes for the callback server. + """ + # initiate env and vol + env = {} + vol = {} + + # used by the callback server + env["KB_AUTH_TOKEN"] = token + env["KB_BASE_URL"] = kb_base_url + env["JOB_DIR"] = job_dir + env["CALLBACK_PORT"] = port + env["CALLBACK_IP"] = ipv4 # specify an ipv4 address for the callback server + # otherwise, the callback container will use the an ipv6 address + + # setup volumes required for docker container + docker_host = os.environ["DOCKER_HOST"] + if docker_host.startswith("unix:"): + docker_host = docker_host[5:] + + vol[job_dir] = {"bind": job_dir, "mode": "rw"} + vol[docker_host] = {"bind": "/run/docker.sock", "mode": "rw"} + + return env, vol + + def _start_callback_server( + self, + client: docker.client, + container_name: str, + job_dir: str, + kb_base_url: str, + token: str, + port: int, + ipv4: str, + ) -> None: + """ + Start the callback server. + + Args: + client (docker.client): The docker client. + container_name (str): The name of the container. + job_dir (str): The directory for SDK jobs per user. + kb_base_url (str): The base url of the KBase services. + token (str): The KBase token. + port (int): The port number for the callback server. 
+ """ + env, vol = self._setup_callback_server_envs(job_dir, kb_base_url, token, port, ipv4) + self.container = client.containers.run( + name=container_name, + image=CALLBACK_IMAGE_NAME, + detach=True, + network_mode="host", + environment=env, + volumes=vol, + ) + time.sleep(2) + + def stop_callback_server(self) -> None: + """ + Stop the callback server. + """ + self.container.stop() + self.container.remove() diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 58ac3fda6..1880be2d5 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -1,16 +1,14 @@ import os import shutil import uuid -from multiprocessing import Queue from pathlib import Path from typing import NamedTuple -from unittest.mock import Mock, create_autospec +from unittest.mock import MagicMock, Mock, create_autospec import pytest from src.clients.AssemblyUtilClient import AssemblyUtil from src.loaders.common import loader_helper -from src.loaders.workspace_downloader.workspace_downloader_helper import Conf from src.loaders.workspace_uploader import workspace_uploader ASSEMBLY_DIR_NAMES = ["GCF_000979855.1", "GCF_000979175.1"] @@ -223,31 +221,66 @@ def test_post_process(setup_and_teardown): def test_upload_assembly_to_workspace(setup_and_teardown): - _ = setup_and_teardown + params = setup_and_teardown assembly_name = ASSEMBLY_NAMES[0] + host_assembly_dir = params.assembly_dirs[0] conf = Mock() conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - conf.asu.save_assembly_from_fasta2.return_value = {"upa": "12345/58/1"} - upa = workspace_uploader._upload_assembly_to_workspace( - conf, 12345, "/path/to/file/in/AssembilyUtil", assembly_name + conf.asu.save_assemblies_from_fastas.return_value = {"results":[{"upa": "12345/58/1"}]} + assembly_tuple = workspace_uploader.AssemblyTuple( + assembly_name, host_assembly_dir, "/path/to/file/in/AssembilyUtil" ) - - assert upa == "12345_58_1" - conf.asu.save_assembly_from_fasta2.assert_called_once_with( + upas = workspace_uploader._upload_assemblies_to_workspace(conf, 12345, [assembly_tuple]) + assert upas == tuple(["12345_58_1"]) + conf.asu.save_assemblies_from_fastas.assert_called_once_with( { - "file": {"path": "/path/to/file/in/AssembilyUtil"}, "workspace_id": 12345, - "assembly_name": assembly_name, + "inputs": [ + { + "file": assembly_tuple.container_internal_assembly_path, + "assembly_name": assembly_tuple.assembly_name + } + ] } ) -def test_assembly_files_in_parallel(setup_and_teardown): +def test_generator(setup_and_teardown): params = setup_and_teardown + assembly_dirs = params.assembly_dirs + wait_to_upload_assemblies = { + assembly_name: assembly_dir + for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) + } + assemblyTuple_list = list(workspace_uploader._gen(wait_to_upload_assemblies, 1)) + expected_assemblyTuple_list = [ + [ + workspace_uploader.AssemblyTuple( + "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + assembly_dirs[0], + "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + ) + ], + [ + workspace_uploader.AssemblyTuple( + "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + assembly_dirs[1], + "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + ) + ], + ] + assert assemblyTuple_list == expected_assemblyTuple_list + + +def test_upload_assembly_files_in_parallel(setup_and_teardown): + params = setup_and_teardown + 
src_files = params.target_files + assembly_dirs = params.assembly_dirs upload_dir = Path(params.tmp_dir) / "upload_dir" upload_dir.mkdir() - assembly_dirs = params.assembly_dirs + output_dir = Path(params.tmp_dir) / "output_dir" + output_dir.mkdir() wait_to_upload_assemblies = { assembly_name: assembly_dir @@ -255,46 +288,58 @@ def test_assembly_files_in_parallel(setup_and_teardown): } conf = Mock() - conf.workers = 5 - conf.input_queue = Queue() - conf.output_queue = Queue() + conf.output_dir = output_dir + conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) + conf.asu.save_assemblies_from_fastas.return_value = { + "results":[ + {"upa": "12345/58/1"}, + {"upa": "12345/60/1"} + ] + } + + uploaded_count = workspace_uploader._upload_assembly_files_in_parallel( + conf, "CI", 12345, upload_dir, wait_to_upload_assemblies, 2 + ) - # an uploaded successful - conf.output_queue.put((ASSEMBLY_NAMES[0], "12345_58_1")) - # an upload failed - conf.output_queue.put((ASSEMBLY_NAMES[1], None)) + assert uploaded_count == 2 - failed_names = workspace_uploader._upload_assembly_files_in_parallel( - conf, "CI", 12345, upload_dir, wait_to_upload_assemblies + # check softlink for post_process + assert os.readlink(os.path.join(upload_dir, "12345_58_1")) == os.path.join( + output_dir, "12345_58_1" + ) + assert os.readlink(os.path.join(upload_dir, "12345_60_1")) == os.path.join( + output_dir, "12345_60_1" ) - expected_tuple1 = ( - "CI", - 12345, - os.path.join( - workspace_uploader.JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, ASSEMBLY_NAMES[0] - ), - assembly_dirs[0], - ASSEMBLY_NAMES[0], - upload_dir, - 1, - len(ASSEMBLY_NAMES), + # check hardlink for post_process + assert os.path.samefile( + src_files[0], os.path.join(os.path.join(output_dir, "12345_58_1"), f"12345_58_1.fna.gz") ) - expected_tuple2 = ( - "CI", - 12345, - os.path.join( - workspace_uploader.JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, ASSEMBLY_NAMES[1] - ), - assembly_dirs[1], - ASSEMBLY_NAMES[1], - upload_dir, - 2, - len(ASSEMBLY_NAMES), + assert os.path.samefile( + src_files[1], os.path.join(os.path.join(output_dir, "12345_60_1"), f"12345_60_1.fna.gz") + ) + + +def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): + params = setup_and_teardown + assembly_dirs = params.assembly_dirs + upload_dir = Path(params.tmp_dir) / "upload_dir" + upload_dir.mkdir() + output_dir = Path(params.tmp_dir) / "output_dir" + output_dir.mkdir() + + wait_to_upload_assemblies = { + assembly_name: assembly_dir + for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) + } + + conf = MagicMock() + conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) + conf.asu.save_assemblies_from_fastas.side_effect = Exception("Illegal character in object name") + + uploaded_count = workspace_uploader._upload_assembly_files_in_parallel( + conf, "CI", 12345, upload_dir, wait_to_upload_assemblies, 2 ) - assert conf.input_queue.get() == expected_tuple1 - assert conf.input_queue.get() == expected_tuple2 - assert conf.output_queue.empty() - assert failed_names == [ASSEMBLY_NAMES[1]] + assert uploaded_count == 0 From 81170618b6ae96fb5174c7b021a3af31ceae6c81 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Mon, 6 Nov 2023 15:58:52 -0800 Subject: [PATCH 06/30] remove workers and run pass all unit tests --- .../workspace_uploader/workspace_uploader.py | 17 +++-------------- .../workspace_uploader_test.py | 7 +++++-- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py 
b/src/loaders/workspace_uploader/workspace_uploader.py index c2ecbd54c..5719a912d 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -1,8 +1,7 @@ """ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] - [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--workers WORKERS] [--batch_size BATCH_SIZE] - [--keep_job_dir] + [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] [--keep_job_dir] PROTOTYPE - Upload assembly files to the workspace service (WSS). @@ -30,7 +29,6 @@ KBase environment (default: PROD) --upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...] Upload only files that match given extensions (default: ['genomic.fna.gz']) - --workers WORKERS Number of workers for multiprocessing (default: 5) --batch_size BATCH_SIZE Number of files to upload per batch (default: 1000) --keep_job_dir Keep SDK job directory after upload task is completed @@ -133,12 +131,6 @@ def _get_parser(): default=UPLOAD_FILE_EXT, help="Upload only files that match given extensions", ) - optional.add_argument( - "--workers", - type=int, - default=5, - help="Number of workers for multiprocessing", - ) optional.add_argument( "--batch_size", type=int, @@ -197,6 +189,7 @@ def _upload_assemblies_to_workspace( } ) except Exception as e: + print(e) raise e upas = tuple([result_dict["upa"].replace("/", "_") @@ -364,7 +357,7 @@ def _upload_assembly_files_in_parallel( batch_size: a number of files to upload per batch Returns: - a count on assembly files uploded + number of assembly files have been sucessfully uploaded from wait_to_upload_assemblies """ assembly_files_len = len(wait_to_upload_assemblies) print(f"Start uploading {assembly_files_len} assembly files\n") @@ -375,7 +368,6 @@ def _upload_assembly_files_in_parallel( try: batch_upas = _upload_assemblies_to_workspace(conf, workspace_id, assembly_tuples) except Exception as e: - print(e) failed_objects_names = [assembly_tuple.assembly_name for assembly_tuple in assembly_tuples] assembly_objects_in_ws = loader_helper.list_objects( workspace_id, @@ -456,7 +448,6 @@ def main(): root_dir = getattr(args, loader_common_names.ROOT_DIR_ARG_NAME) token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext - workers = args.workers batch_size = args.batch_size keep_job_dir = args.keep_job_dir @@ -467,8 +458,6 @@ def main(): parser.error(f"workspace_id needs to be > 0") if batch_size <= 0: parser.error(f"batch_size needs to be > 0") - if workers < 1 or workers > cpu_count(): - parser.error(f"minimum worker is 1 and maximum worker is {cpu_count()}") uid = os.getuid() username = os.getlogin() diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 1880be2d5..4e4f28ae0 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -3,7 +3,7 @@ import uuid from pathlib import Path from typing import NamedTuple -from unittest.mock import MagicMock, Mock, create_autospec +from unittest.mock import Mock, create_autospec import pytest @@ -334,10 +334,13 @@ def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) } - conf = MagicMock() + conf = 
Mock() conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) conf.asu.save_assemblies_from_fastas.side_effect = Exception("Illegal character in object name") + loader_helper.list_objects = create_autospec(loader_helper.list_objects) + loader_helper.list_objects.return_value = [] + uploaded_count = workspace_uploader._upload_assembly_files_in_parallel( conf, "CI", 12345, upload_dir, wait_to_upload_assemblies, 2 ) From 19e52ae04c6e28d39e2b01f81c54801e517ad9e4 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Wed, 15 Nov 2023 13:04:19 -0800 Subject: [PATCH 07/30] add AU service_ver CLI --- src/loaders/common/loader_common_names.py | 3 +++ .../workspace_uploader/workspace_uploader.py | 15 +++++++++++++-- .../workspace_uploader_helper.py | 4 +++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index a8f26555c..1481dac8f 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -128,3 +128,6 @@ 'NEXT': 'https://next.kbase.us/services/', 'APPDEV': 'https://appdev.kbase.us/services/', 'PROD': 'https://kbase.us/services/'} + +# service_vers +SERVICE_VERSIONS = ["dev", "beta", "release"] \ No newline at end of file diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 5719a912d..b18cb58cb 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -1,7 +1,8 @@ """ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] - [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] [--keep_job_dir] + [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] + [--au_service_ver {dev,beta,release}] [--keep_job_dir] PROTOTYPE - Upload assembly files to the workspace service (WSS). @@ -31,6 +32,8 @@ Upload only files that match given extensions (default: ['genomic.fna.gz']) --batch_size BATCH_SIZE Number of files to upload per batch (default: 1000) + --au_service_ver {dev,beta,release} + The service version of AssemblyUtil client (default: release) --keep_job_dir Keep SDK job directory after upload task is completed e.g. 
@@ -137,6 +140,13 @@ def _get_parser(): default=1000, help="Number of files to upload per batch", ) + optional.add_argument( + "--au_service_ver", + type=str, + choices=loader_common_names.SERVICE_VERSIONS, + default="release", + help="The service version of AssemblyUtil client", + ) optional.add_argument( "--keep_job_dir", action="store_true", @@ -449,6 +459,7 @@ def main(): token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext batch_size = args.batch_size + au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir env = args.env @@ -477,7 +488,7 @@ def main(): proc = loader_helper.start_podman_service(uid) # set up conf for uploader, start callback server, and upload assemblies to workspace - conf = Conf(job_dir, output_dir, kb_base_url, token_filepath) + conf = Conf(job_dir, output_dir, kb_base_url, token_filepath, au_service_ver) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( env, diff --git a/src/loaders/workspace_uploader/workspace_uploader_helper.py b/src/loaders/workspace_uploader/workspace_uploader_helper.py index 637d44366..1690a96f8 100644 --- a/src/loaders/workspace_uploader/workspace_uploader_helper.py +++ b/src/loaders/workspace_uploader/workspace_uploader_helper.py @@ -21,6 +21,7 @@ def __init__( output_dir: str, kb_base_url: str = "https://ci.kbase.us/services/", token_filepath: str | None = None, + au_service_ver: str = "release", ) -> None: """ Initialize the configuration class. @@ -31,6 +32,7 @@ def __init__( kb_base_url (str): The base url of the KBase services. token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. + au_service_ver (str): The service verison of AssemblyUtilClient. """ port = loader_helper.find_free_port() token = loader_helper.get_token(token_filepath) @@ -50,7 +52,7 @@ def __init__( print("callback_url:", callback_url) self.ws = Workspace(ws_url, token=token) - self.asu = AssemblyUtil(callback_url, token=token) + self.asu = AssemblyUtil(callback_url, service_ver=au_service_ver, token=token) self.output_dir = output_dir self.job_data_dir = loader_helper.make_job_data_dir(job_dir) From 2383f1b071b500ac6295addb3b9e7360599a7a2b Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Mon, 20 Nov 2023 11:24:10 -0800 Subject: [PATCH 08/30] print out logs and add env max_task --- .../workspace_uploader/workspace_uploader.py | 21 +++++++++++++++++-- .../workspace_uploader_helper.py | 15 ++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index b18cb58cb..b01b6ff85 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -2,7 +2,7 @@ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] - [--au_service_ver {dev,beta,release}] [--keep_job_dir] + [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver {dev,beta,release}] [--keep_job_dir] PROTOTYPE - Upload assembly files to the workspace service (WSS). 
@@ -32,6 +32,8 @@ Upload only files that match given extensions (default: ['genomic.fna.gz']) --batch_size BATCH_SIZE Number of files to upload per batch (default: 1000) + --jr_max_tasks JR_MAX_TASKS + The maxmium subtasks for the callback server (default: 20) --au_service_ver {dev,beta,release} The service version of AssemblyUtil client (default: release) --keep_job_dir Keep SDK job directory after upload task is completed @@ -140,6 +142,12 @@ def _get_parser(): default=1000, help="Number of files to upload per batch", ) + optional.add_argument( + "--jr_max_tasks", + type=int, + default=20, + help="The maxmium subtasks for the callback server", + ) optional.add_argument( "--au_service_ver", type=str, @@ -459,6 +467,7 @@ def main(): token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext batch_size = args.batch_size + max_task = jr_max_tasks au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir @@ -469,6 +478,8 @@ def main(): parser.error(f"workspace_id needs to be > 0") if batch_size <= 0: parser.error(f"batch_size needs to be > 0") + if max_task <= 0: + parser.error(f"jr_max_tasks needs to be > 0") uid = os.getuid() username = os.getlogin() @@ -488,7 +499,7 @@ def main(): proc = loader_helper.start_podman_service(uid) # set up conf for uploader, start callback server, and upload assemblies to workspace - conf = Conf(job_dir, output_dir, kb_base_url, token_filepath, au_service_ver) + conf = Conf(job_dir, output_dir, kb_base_url, token_filepath, au_service_ver, max_task) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( env, @@ -529,6 +540,12 @@ def main(): if conf: conf.stop_callback_server() + # print out container logs + if conf.logging is not None: + log_string = conf.logging.decode("utf-8") + for line in log_string.split("\n"): + print(line) + # stop podman service if it is on if proc: proc.terminate() diff --git a/src/loaders/workspace_uploader/workspace_uploader_helper.py b/src/loaders/workspace_uploader/workspace_uploader_helper.py index 1690a96f8..6c3fd1546 100644 --- a/src/loaders/workspace_uploader/workspace_uploader_helper.py +++ b/src/loaders/workspace_uploader/workspace_uploader_helper.py @@ -22,6 +22,7 @@ def __init__( kb_base_url: str = "https://ci.kbase.us/services/", token_filepath: str | None = None, au_service_ver: str = "release", + max_task: int = 20, ) -> None: """ Initialize the configuration class. @@ -33,6 +34,7 @@ def __init__( token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. au_service_ver (str): The service verison of AssemblyUtilClient. + max_task (int): The maxmium subtasks for the callback server. """ port = loader_helper.find_free_port() token = loader_helper.get_token(token_filepath) @@ -44,6 +46,7 @@ def __init__( kb_base_url, token, port, + max_task, ipv4, ) @@ -57,12 +60,15 @@ def __init__( self.output_dir = output_dir self.job_data_dir = loader_helper.make_job_data_dir(job_dir) + self.logging = None + def _setup_callback_server_envs( self, job_dir: str, kb_base_url: str, token: str, port: int, + max_task: int, ipv4: str, ) -> Tuple[dict[str, Union[int, str]], dict[str, dict[str, str]]]: """ @@ -73,6 +79,8 @@ def _setup_callback_server_envs( kb_base_url (str): The base url of the KBase services. token (str): The KBase token. port (int): The port number for the callback server. + max_task (int): The maxmium subtasks for the callback server. 
+ ipv4: (str): The ipv4 address for the callback server. Returns: tuple: A tuple of the environment variables and volumes for the callback server. @@ -86,6 +94,7 @@ def _setup_callback_server_envs( env["KB_BASE_URL"] = kb_base_url env["JOB_DIR"] = job_dir env["CALLBACK_PORT"] = port + env["JR_MAX_TASKS"] = max_task env["CALLBACK_IP"] = ipv4 # specify an ipv4 address for the callback server # otherwise, the callback container will use the an ipv6 address @@ -107,6 +116,7 @@ def _start_callback_server( kb_base_url: str, token: str, port: int, + max_task: int, ipv4: str, ) -> None: """ @@ -118,9 +128,11 @@ def _start_callback_server( job_dir (str): The directory for SDK jobs per user. kb_base_url (str): The base url of the KBase services. token (str): The KBase token. + max_task (int): The maxmium subtasks for the callback server. port (int): The port number for the callback server. + ipv4: (str): The ipv4 address for the callback server. """ - env, vol = self._setup_callback_server_envs(job_dir, kb_base_url, token, port, ipv4) + env, vol = self._setup_callback_server_envs(job_dir, kb_base_url, token, port, max_task, ipv4) self.container = client.containers.run( name=container_name, image=CALLBACK_IMAGE_NAME, @@ -135,5 +147,6 @@ def stop_callback_server(self) -> None: """ Stop the callback server. """ + self.logging = self.container.logs() self.container.stop() self.container.remove() From d3055184cd816f2ef5a61f54bf7a01b2133c3a74 Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Mon, 20 Nov 2023 17:25:36 -0800 Subject: [PATCH 09/30] container.conf setup && clean up --- src/loaders/common/loader_helper.py | 17 +++++++++++++++++ .../workspace_uploader/workspace_uploader.py | 6 +++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 3918d70f8..240d1f72a 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -1,4 +1,5 @@ import argparse +import configparser import itertools import json import os @@ -278,6 +279,22 @@ def start_podman_service(uid: int): return proc +def setup_callback_server_logs(): + """set up logs config file for the callback server""" + home = os.path.expanduser("~") + conf_path = os.path.join(home, ".config/containers/containers.conf") + config = configparser.ConfigParser() + config.read(conf_path) + if not config.has_section("containers"): + config.add_section("containers") + if config.get("containers", "seccomp_profile", fallback=None) != "\"unconfined\"": + config.set("containers", "seccomp_profile", "\"unconfined\"") + if config.get("containers", "log_driver", fallback=None) != "\"k8s-file\"": + config.set("containers", "log_driver", "\"k8s-file\"") + with open(conf_path, 'w') as configfile: + config.write(configfile) + + def is_upa_info_complete(upa_dir: str): """ Check whether an UPA needs to be downloaded or not by loading the metadata file. 
diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index b01b6ff85..67519c374 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -467,7 +467,7 @@ def main(): token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext batch_size = args.batch_size - max_task = jr_max_tasks + max_task = args.jr_max_tasks au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir @@ -495,6 +495,9 @@ def main(): conf = None try: + # setup container.conf file for the callback server logs + loader_helper.setup_callback_server_logs() + # start podman service proc = loader_helper.start_podman_service(uid) @@ -542,6 +545,7 @@ def main(): # print out container logs if conf.logging is not None: + print("\n****** Logs from the Callback Server ******\n") log_string = conf.logging.decode("utf-8") for line in log_string.split("\n"): print(line) From 20673cdfec8f0949f31162c3b702c993f59fbf0a Mon Sep 17 00:00:00 2001 From: Sean Xiang Date: Mon, 20 Nov 2023 18:00:41 -0800 Subject: [PATCH 10/30] update setup_callback_server_logs fun --- src/loaders/common/loader_helper.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 240d1f72a..3041d54e7 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -280,19 +280,30 @@ def start_podman_service(uid: int): def setup_callback_server_logs(): - """set up logs config file for the callback server""" + """Set up logs config file for the callback server""" home = os.path.expanduser("~") conf_path = os.path.join(home, ".config/containers/containers.conf") config = configparser.ConfigParser() config.read(conf_path) + params = { + "seccomp_profile": "\"unconfined\"", + "log_driver": "\"k8s-file\"" + } + + modify = False if not config.has_section("containers"): config.add_section("containers") - if config.get("containers", "seccomp_profile", fallback=None) != "\"unconfined\"": - config.set("containers", "seccomp_profile", "\"unconfined\"") - if config.get("containers", "log_driver", fallback=None) != "\"k8s-file\"": - config.set("containers", "log_driver", "\"k8s-file\"") - with open(conf_path, 'w') as configfile: - config.write(configfile) + + for key, val in params.items(): + if config.get("containers", key, fallback=None) != val: + config.set("containers", key, val) + if not modify: + modify = True + + if modify: + with open(conf_path, 'w') as configfile: + config.write(configfile) + print(f"containers.conf is modified and saved to path: {conf_path}") def is_upa_info_complete(upa_dir: str): From 5972e2dbbbb6e8412e10940e3dce56f291f53646 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 22 Nov 2023 13:14:55 -0800 Subject: [PATCH 11/30] fix catalog param --- src/loaders/workspace_uploader/workspace_uploader_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/loaders/workspace_uploader/workspace_uploader_helper.py b/src/loaders/workspace_uploader/workspace_uploader_helper.py index 6c3fd1546..51e0a83f3 100644 --- a/src/loaders/workspace_uploader/workspace_uploader_helper.py +++ b/src/loaders/workspace_uploader/workspace_uploader_helper.py @@ -91,6 +91,7 @@ def _setup_callback_server_envs( # used by the callback server env["KB_AUTH_TOKEN"] = token + env["KB_ADMIN_AUTH_TOKEN"] = token # pass in admin_token to get catalog params env["KB_BASE_URL"] = 
kb_base_url env["JOB_DIR"] = job_dir env["CALLBACK_PORT"] = port From d71c661e9150a0a2db3d2cab41eda80e573cba77 Mon Sep 17 00:00:00 2001 From: Sijie Date: Mon, 27 Nov 2023 18:27:43 -0800 Subject: [PATCH 12/30] update _upload_assembly_files_in_parallel logic && logs permission --- src/loaders/common/loader_common_names.py | 4 +- src/loaders/common/loader_helper.py | 3 +- .../workspace_uploader/workspace_uploader.py | 47 ++++++++++++------- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 1481dac8f..9f2dda224 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -130,4 +130,6 @@ 'PROD': 'https://kbase.us/services/'} # service_vers -SERVICE_VERSIONS = ["dev", "beta", "release"] \ No newline at end of file +SERVICE_VERSIONS = ["dev", "beta", "release"] +# containers.conf path +CONTAINERS_CONF_PATH = ".config/containers/containers.conf" \ No newline at end of file diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 4c03fab22..ae87b2669 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -29,6 +29,7 @@ ) from src.loaders.common.loader_common_names import ( COLLECTION_SOURCE_DIR, + CONTAINERS_CONF_PATH, DOCKER_HOST, FATAL_ERROR, FATAL_STACKTRACE, @@ -293,7 +294,7 @@ def start_podman_service(uid: int): def setup_callback_server_logs(): """Set up logs config file for the callback server""" home = os.path.expanduser("~") - conf_path = os.path.join(home, ".config/containers/containers.conf") + conf_path = os.path.join(home, CONTAINERS_CONF_PATH) config = configparser.ConfigParser() config.read(conf_path) params = { diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 67519c374..e174df69a 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -54,6 +54,7 @@ e.g. 
/global/cfs/cdirs/kbase/collections/collectionssource/ -> ENV -> kbase_collection -> source_ver -> UPA -> .fna.gz file """ import argparse +import click import os import shutil import time @@ -139,7 +140,7 @@ def _get_parser(): optional.add_argument( "--batch_size", type=int, - default=1000, + default=2500, help="Number of files to upload per batch", ) optional.add_argument( @@ -381,30 +382,32 @@ def _upload_assembly_files_in_parallel( print(f"Start uploading {assembly_files_len} assembly files\n") uploaded_count = 0 + uploaded_fail = False for assembly_tuples in _gen(wait_to_upload_assemblies, batch_size): try: batch_upas = _upload_assemblies_to_workspace(conf, workspace_id, assembly_tuples) + batch_uploaded_tuples = assembly_tuples except Exception as e: - failed_objects_names = [assembly_tuple.assembly_name for assembly_tuple in assembly_tuples] + name_assemblyTuple_map = { + assembly_tuple.assembly_name: assembly_tuple for assembly_tuple in assembly_tuples + } assembly_objects_in_ws = loader_helper.list_objects( workspace_id, conf, loader_common_names.OBJECTS_NAME_ASSEMBLY, ) - assembly_names_in_ws = [object_info[1] for object_info in assembly_objects_in_ws] - objects_names_to_delete_from_ws = set(assembly_names_in_ws) & set(failed_objects_names) - if objects_names_to_delete_from_ws: - conf.ws.delete_objects( - [ - {"wsid": workspace_id, "name": object_name} - for object_name in objects_names_to_delete_from_ws - ] - ) - print(f"workspace {workspace_id} cleanup done ...") - return uploaded_count + name_upa_map = { + object_info[1]: "{6}_{0}_{4}".format(*object_info) for object_info in assembly_objects_in_ws + } + batch_uploaded_objects_names = set(name_upa_map.keys()) & set(name_assemblyTuple_map.keys()) + + # update batch_uploaded_tuples and batch_upas + batch_upas = tuple([assembly_names_in_ws_dict[name] for name in batch_uploaded_objects_names]) + batch_uploaded_tuples = [fail_batch_dict[name] for name in batch_uploaded_objects_names] + uploaded_fail = True - for assembly_tuple, upa in zip(assembly_tuples, batch_upas): + for assembly_tuple, upa in zip(batch_uploaded_tuples, batch_upas): _post_process( upload_env_key, workspace_id, @@ -414,13 +417,16 @@ def _upload_assembly_files_in_parallel( conf.output_dir, upa ) - - uploaded_count += len(assembly_tuples) - if uploaded_count % 1000 == 0: + + uploaded_count += len(batch_uploaded_tuples) + if uploaded_count % 100 == 0: print(f"Assemblies uploaded: {uploaded_count}/{assembly_files_len}, " f"Percentage: {uploaded_count / assembly_files_len * 100:.2f}%, " f"Time: {datetime.now()}") + if uploaded_fail: + return uploaded_count + return uploaded_count @@ -496,7 +502,12 @@ def main(): try: # setup container.conf file for the callback server logs - loader_helper.setup_callback_server_logs() + if click.confirm(f"Set up a containers.conf file at {loader_common_names.CONTAINERS_CONF_PATH} if does not exist already?\n" + f"Params 'seccomp_profile' and 'log_driver' will be added under section 'containers'"): + loader_helper.setup_callback_server_logs() + else: + print("Permission denied and exiting ...") + return # start podman service proc = loader_helper.start_podman_service(uid) From 6ae5f4d0c5c8620f1275fdfb10eedb14ce395305 Mon Sep 17 00:00:00 2001 From: Sijie Date: Mon, 27 Nov 2023 22:16:25 -0800 Subject: [PATCH 13/30] test passed new logic --- src/loaders/workspace_uploader/workspace_uploader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py 
b/src/loaders/workspace_uploader/workspace_uploader.py index e174df69a..8debcf44d 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -403,8 +403,8 @@ def _upload_assembly_files_in_parallel( batch_uploaded_objects_names = set(name_upa_map.keys()) & set(name_assemblyTuple_map.keys()) # update batch_uploaded_tuples and batch_upas - batch_upas = tuple([assembly_names_in_ws_dict[name] for name in batch_uploaded_objects_names]) - batch_uploaded_tuples = [fail_batch_dict[name] for name in batch_uploaded_objects_names] + batch_upas = tuple([name_upa_map[name] for name in batch_uploaded_objects_names]) + batch_uploaded_tuples = [name_assemblyTuple_map[name] for name in batch_uploaded_objects_names] uploaded_fail = True for assembly_tuple, upa in zip(batch_uploaded_tuples, batch_upas): @@ -503,7 +503,7 @@ def main(): try: # setup container.conf file for the callback server logs if click.confirm(f"Set up a containers.conf file at {loader_common_names.CONTAINERS_CONF_PATH} if does not exist already?\n" - f"Params 'seccomp_profile' and 'log_driver' will be added under section 'containers'"): + f"Params 'seccomp_profile' and 'log_driver' will be added under section [containers]"): loader_helper.setup_callback_server_logs() else: print("Permission denied and exiting ...") @@ -555,7 +555,7 @@ def main(): conf.stop_callback_server() # print out container logs - if conf.logging is not None: + if conf and conf.logging is not None: print("\n****** Logs from the Callback Server ******\n") log_string = conf.logging.decode("utf-8") for line in log_string.split("\n"): From f515968245e44ba7cbe1ee012acb1d62e3697dbd Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 28 Nov 2023 17:27:34 -0800 Subject: [PATCH 14/30] update callback server related files --- src/loaders/common/callback_server_wrapper.py | 113 ++++++++----- .../workspace_downloader.py | 29 +++- .../workspace_uploader/workspace_uploader.py | 10 +- .../workspace_uploader_helper.py | 153 ------------------ .../workspace_uploader_test.py | 1 - 5 files changed, 107 insertions(+), 199 deletions(-) delete mode 100644 src/loaders/workspace_uploader/workspace_uploader_helper.py diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index fc07e3865..3ed8109a8 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -15,18 +15,22 @@ class Conf: """ - Configuration class for the workspace downloader script. + Configuration class for the workspace downloader and workspace uploader scripts. """ + def __init__( - self, - job_dir: str, - output_dir: str, - worker_function: Callable, - kb_base_url: str = "https://ci.kbase.us/services/", - token_filepath: str | None = None, - workers: int = 5, - retrieve_sample: bool = False, - ignore_no_sample_error: bool = False, + self, + job_dir: str, + output_dir: str, + kb_base_url: str = "https://ci.kbase.us/services/", + token_filepath: str | None = None, + au_service_ver: str = "release", + workers: int = 5, + max_task: int = 20, + worker_function: Callable | None = None, + retrieve_sample: bool = False, + ignore_no_sample_error: bool = False, + workspace_downloader: bool = True, ) -> None: """ Initialize the configuration class. @@ -34,19 +38,26 @@ def __init__( Args: job_dir (str): The directory for SDK jobs per user. output_dir (str): The directory for a specific workspace id under sourcedata/ws. 
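# The failure-recovery path above rebuilds UPAs for assemblies that did land in the
# workspace by formatting Workspace object_info tuples with "{6}_{0}_{4}". A small
# illustration with a made-up object_info (index 0 = object id, 1 = object name,
# 4 = version, 6 = workspace id):
object_info = [
    58, "GCF_000979855.1_genomic.fna.gz", "KBaseGenomeAnnotations.Assembly-6.3",
    "2023-11-27T22:00:00+0000", 1, "someuser", 69046, "some_workspace",
    "somemd5", 12345, None,
]
upa = "{6}_{0}_{4}".format(*object_info)  # -> "69046_58_1"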
- worker_function (Callable): The function that will be called by the workers. kb_base_url (str): The base url of the KBase services. - token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. + token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. + au_service_ver (str): The service verison of AssemblyUtilClient. workers (int): The number of workers to use for multiprocessing. + max_task (int): The maxmium subtasks for the callback server. + worker_function (Callable): The function that will be called by the workers. retrieve_sample (bool): Whether to retrieve sample for each genome object. ignore_no_sample_error (bool): Whether to ignore the error when no sample data is found. + workspace_downloader (bool): Whether to be used for the workspace downloader script. """ + # common instance variables + ipv4 = loader_helper.get_ip() port = loader_helper.find_free_port() token = loader_helper.get_token(token_filepath) - self.retrieve_sample = retrieve_sample - self.ignore_no_sample_error = ignore_no_sample_error - ipv4 = loader_helper.get_ip() + + ws_url = os.path.join(kb_base_url, "ws") + callback_url = "http://" + ipv4 + ":" + str(port) + print("callback_url:", callback_url) + self._start_callback_server( docker.from_env(), uuid.uuid4().hex, @@ -54,31 +65,41 @@ def __init__( kb_base_url, token, port, + max_task, ipv4, ) - ws_url = os.path.join(kb_base_url, "ws") - sample_url = os.path.join(kb_base_url, "sampleservice") - callback_url = "http://" + ipv4 + ":" + str(port) - print("callback_url:", callback_url) - self.ws = Workspace(ws_url, token=token) - self.asu = AssemblyUtil(callback_url, token=token) - self.ss = SampleService(sample_url, token=token) + self.asu = AssemblyUtil(callback_url, service_ver=au_service_ver, token=token) - self.workers = workers self.output_dir = output_dir - self.input_queue = Queue() self.job_data_dir = loader_helper.make_job_data_dir(job_dir) - self.pools = Pool(workers, worker_function, [self]) + + self.logging = None + + # unique to downloader + if workspace_downloader: + if worker_function is None: + raise ValueError( + "worker_function cannot be None for the workspace downloader script" + ) + self.input_queue = Queue() + self.retrieve_sample = retrieve_sample + self.ignore_no_sample_error = ignore_no_sample_error + + sample_url = os.path.join(kb_base_url, "sampleservice") + self.ss = SampleService(sample_url, token=token) + + self.pools = Pool(workers, worker_function, [self]) def _setup_callback_server_envs( - self, - job_dir: str, - kb_base_url: str, - token: str, - port: int, - ipv4: str, + self, + job_dir: str, + kb_base_url: str, + token: str, + port: int, + max_task: int, + ipv4: str, ) -> Tuple[dict[str, Union[int, str]], dict[str, dict[str, str]]]: """ Setup the environment variables and volumes for the callback server. @@ -88,6 +109,8 @@ def _setup_callback_server_envs( kb_base_url (str): The base url of the KBase services. token (str): The KBase token. port (int): The port number for the callback server. + max_task (int): The maxmium subtasks for the callback server. + ipv4: (str): The ipv4 address for the callback server. Returns: tuple: A tuple of the environment variables and volumes for the callback server. 
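# Illustrative shape of the (env, vol) pair documented above; every value here is a
# placeholder assumption, not output from a real run:
env = {
    "KB_AUTH_TOKEN": "<kbase token>",
    "KB_ADMIN_AUTH_TOKEN": "<kbase token>",  # same token, reused for catalog admin access
    "KB_BASE_URL": "https://ci.kbase.us/services/",
    "JOB_DIR": "/global/scratch/<user>/sdk_job_dir",
    "CALLBACK_PORT": 56789,
    "JR_MAX_TASKS": 20,
    "CALLBACK_IP": "10.0.0.5",
}
vol = {
    "/global/scratch/<user>/sdk_job_dir": {"bind": "/global/scratch/<user>/sdk_job_dir", "mode": "rw"},
    "/run/user/<uid>/podman/podman.sock": {"bind": "/run/docker.sock", "mode": "rw"},  # from DOCKER_HOST
}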
@@ -98,9 +121,11 @@ def _setup_callback_server_envs( # used by the callback server env["KB_AUTH_TOKEN"] = token + env["KB_ADMIN_AUTH_TOKEN"] = token # pass in admin_token to get catalog params env["KB_BASE_URL"] = kb_base_url env["JOB_DIR"] = job_dir env["CALLBACK_PORT"] = port + env["JR_MAX_TASKS"] = max_task env["CALLBACK_IP"] = ipv4 # specify an ipv4 address for the callback server # otherwise, the callback container will use the an ipv6 address @@ -115,14 +140,15 @@ def _setup_callback_server_envs( return env, vol def _start_callback_server( - self, - client: docker.client, - container_name: str, - job_dir: str, - kb_base_url: str, - token: str, - port: int, - ipv4: str, + self, + client: docker.client, + container_name: str, + job_dir: str, + kb_base_url: str, + token: str, + port: int, + max_task: int, + ipv4: str, ) -> None: """ Start the callback server. @@ -133,9 +159,13 @@ def _start_callback_server( job_dir (str): The directory for SDK jobs per user. kb_base_url (str): The base url of the KBase services. token (str): The KBase token. + max_task (int): The maxmium subtasks for the callback server. port (int): The port number for the callback server. + ipv4: (str): The ipv4 address for the callback server. """ - env, vol = self._setup_callback_server_envs(job_dir, kb_base_url, token, port, ipv4) + env, vol = self._setup_callback_server_envs( + job_dir, kb_base_url, token, port, max_task, ipv4 + ) self.container = client.containers.run( name=container_name, image=CALLBACK_IMAGE_NAME, @@ -150,5 +180,6 @@ def stop_callback_server(self) -> None: """ Stop the callback server. """ + self.logging = self.container.logs() self.container.stop() - self.container.remove() + self.container.remove() \ No newline at end of file diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 486b6555b..7cda7d60b 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -1,6 +1,8 @@ """ -usage: workspace_downloader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_version SOURCE_VERSION] [--root_dir ROOT_DIR] - [--env {CI,NEXT,APPDEV,PROD}] [--workers WORKERS] [--token_filepath TOKEN_FILEPATH] [--keep_job_dir] [--retrieve_sample] [--ignore_no_sample_error] +usage: workspace_downloader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_version SOURCE_VERSION] + [--root_dir ROOT_DIR] [--env {CI,NEXT,APPDEV,PROD}] [--workers WORKERS] [--token_filepath TOKEN_FILEPATH] + [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver {dev,beta,release}] [--keep_job_dir] [--retrieve_sample] + [--ignore_no_sample_error] PROTOTYPE - Download genome files from the workspace service (WSS). 
@@ -22,6 +24,10 @@ --workers WORKERS Number of workers for multiprocessing (default: 5) --token_filepath TOKEN_FILEPATH A file path that stores KBase token + --jr_max_tasks JR_MAX_TASKS + The maxmium subtasks for the callback server (default: 20) + --au_service_ver {dev,beta,release} + The service version of AssemblyUtil client (default: release) --keep_job_dir Keep SDK job directory after download task is completed --retrieve_sample Retrieve sample for each genome object --ignore_no_sample_error @@ -373,6 +379,19 @@ def main(): type=str, help="A file path that stores KBase token", ) + optional.add_argument( + "--jr_max_tasks", + type=int, + default=20, + help="The maxmium subtasks for the callback server", + ) + optional.add_argument( + "--au_service_ver", + type=str, + choices=loader_common_names.SERVICE_VERSIONS, + default="release", + help="The service version of AssemblyUtil client", + ) optional.add_argument( "--keep_job_dir", action="store_true", @@ -398,6 +417,8 @@ def main(): env = args.env workers = args.workers token_filepath = args.token_filepath + max_task = args.jr_max_tasks + au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir retrieve_sample = args.retrieve_sample ignore_no_sample_error = args.ignore_no_sample_error @@ -438,10 +459,12 @@ def main(): conf = Conf( job_dir, output_dir, - _process_input, kb_base_url, token_filepath, + au_service_ver, workers, + max_task, + _process_input, retrieve_sample, ignore_no_sample_error, ) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 9ffd46506..945c2bb5f 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -512,7 +512,15 @@ def main(): proc = loader_helper.start_podman_service(uid) # set up conf for uploader, start callback server, and upload assemblies to workspace - conf = Conf(job_dir, output_dir, kb_base_url, token_filepath, au_service_ver, max_task) + conf = Conf( + job_dir, + output_dir, + kb_base_url, + token_filepath, + au_service_ver, + max_task=max_task, + workspace_downloader=False, + ) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( env, diff --git a/src/loaders/workspace_uploader/workspace_uploader_helper.py b/src/loaders/workspace_uploader/workspace_uploader_helper.py deleted file mode 100644 index 51e0a83f3..000000000 --- a/src/loaders/workspace_uploader/workspace_uploader_helper.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import time -import uuid -from typing import Tuple, Union - -import docker - -from src.clients.AssemblyUtilClient import AssemblyUtil -from src.clients.workspaceClient import Workspace -from src.loaders.common import loader_helper -from src.loaders.common.loader_common_names import CALLBACK_IMAGE_NAME - - -class Conf: - """ - Configuration class for the workspace uploader script. - """ - def __init__( - self, - job_dir: str, - output_dir: str, - kb_base_url: str = "https://ci.kbase.us/services/", - token_filepath: str | None = None, - au_service_ver: str = "release", - max_task: int = 20, - ) -> None: - """ - Initialize the configuration class. - - Args: - job_dir (str): The directory for SDK jobs per user. - output_dir (str): The directory for a specific workspace id under sourcedata/ws. - kb_base_url (str): The base url of the KBase services. - token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. 
- If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. - au_service_ver (str): The service verison of AssemblyUtilClient. - max_task (int): The maxmium subtasks for the callback server. - """ - port = loader_helper.find_free_port() - token = loader_helper.get_token(token_filepath) - ipv4 = loader_helper.get_ip() - self._start_callback_server( - docker.from_env(), - uuid.uuid4().hex, - job_dir, - kb_base_url, - token, - port, - max_task, - ipv4, - ) - - ws_url = os.path.join(kb_base_url, "ws") - callback_url = "http://" + ipv4 + ":" + str(port) - print("callback_url:", callback_url) - - self.ws = Workspace(ws_url, token=token) - self.asu = AssemblyUtil(callback_url, service_ver=au_service_ver, token=token) - - self.output_dir = output_dir - self.job_data_dir = loader_helper.make_job_data_dir(job_dir) - - self.logging = None - - def _setup_callback_server_envs( - self, - job_dir: str, - kb_base_url: str, - token: str, - port: int, - max_task: int, - ipv4: str, - ) -> Tuple[dict[str, Union[int, str]], dict[str, dict[str, str]]]: - """ - Setup the environment variables and volumes for the callback server. - - Args: - job_dir (str): The directory for SDK jobs per user. - kb_base_url (str): The base url of the KBase services. - token (str): The KBase token. - port (int): The port number for the callback server. - max_task (int): The maxmium subtasks for the callback server. - ipv4: (str): The ipv4 address for the callback server. - - Returns: - tuple: A tuple of the environment variables and volumes for the callback server. - """ - # initiate env and vol - env = {} - vol = {} - - # used by the callback server - env["KB_AUTH_TOKEN"] = token - env["KB_ADMIN_AUTH_TOKEN"] = token # pass in admin_token to get catalog params - env["KB_BASE_URL"] = kb_base_url - env["JOB_DIR"] = job_dir - env["CALLBACK_PORT"] = port - env["JR_MAX_TASKS"] = max_task - env["CALLBACK_IP"] = ipv4 # specify an ipv4 address for the callback server - # otherwise, the callback container will use the an ipv6 address - - # setup volumes required for docker container - docker_host = os.environ["DOCKER_HOST"] - if docker_host.startswith("unix:"): - docker_host = docker_host[5:] - - vol[job_dir] = {"bind": job_dir, "mode": "rw"} - vol[docker_host] = {"bind": "/run/docker.sock", "mode": "rw"} - - return env, vol - - def _start_callback_server( - self, - client: docker.client, - container_name: str, - job_dir: str, - kb_base_url: str, - token: str, - port: int, - max_task: int, - ipv4: str, - ) -> None: - """ - Start the callback server. - - Args: - client (docker.client): The docker client. - container_name (str): The name of the container. - job_dir (str): The directory for SDK jobs per user. - kb_base_url (str): The base url of the KBase services. - token (str): The KBase token. - max_task (int): The maxmium subtasks for the callback server. - port (int): The port number for the callback server. - ipv4: (str): The ipv4 address for the callback server. - """ - env, vol = self._setup_callback_server_envs(job_dir, kb_base_url, token, port, max_task, ipv4) - self.container = client.containers.run( - name=container_name, - image=CALLBACK_IMAGE_NAME, - detach=True, - network_mode="host", - environment=env, - volumes=vol, - ) - time.sleep(2) - - def stop_callback_server(self) -> None: - """ - Stop the callback server. 
- """ - self.logging = self.container.logs() - self.container.stop() - self.container.remove() diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 8e361efbb..4e4f28ae0 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -9,7 +9,6 @@ from src.clients.AssemblyUtilClient import AssemblyUtil from src.loaders.common import loader_helper -from src.loaders.common.callback_server_wrapper import Conf from src.loaders.workspace_uploader import workspace_uploader ASSEMBLY_DIR_NAMES = ["GCF_000979855.1", "GCF_000979175.1"] From 370b154db094d83396e639443c82eb2cba4e2326 Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 1 Dec 2023 13:58:20 -0800 Subject: [PATCH 15/30] update config, admin_token, keynames, logs, aus_version --- src/loaders/common/callback_server_wrapper.py | 21 +++++++--- src/loaders/common/loader_common_names.py | 9 ++-- src/loaders/common/loader_helper.py | 37 +++++++++------- .../workspace_downloader.py | 31 +++++++------- .../workspace_uploader/workspace_uploader.py | 42 ++++++++----------- 5 files changed, 78 insertions(+), 62 deletions(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index 3ed8109a8..b24635092 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -49,7 +49,6 @@ def __init__( ignore_no_sample_error (bool): Whether to ignore the error when no sample data is found. workspace_downloader (bool): Whether to be used for the workspace downloader script. """ - # common instance variables ipv4 = loader_helper.get_ip() port = loader_helper.find_free_port() token = loader_helper.get_token(token_filepath) @@ -69,14 +68,13 @@ def __init__( ipv4, ) + # common instance variables self.ws = Workspace(ws_url, token=token) self.asu = AssemblyUtil(callback_url, service_ver=au_service_ver, token=token) self.output_dir = output_dir self.job_data_dir = loader_helper.make_job_data_dir(job_dir) - self.logging = None - # unique to downloader if workspace_downloader: if worker_function is None: @@ -107,7 +105,7 @@ def _setup_callback_server_envs( Args: job_dir (str): The directory for SDK jobs per user. kb_base_url (str): The base url of the KBase services. - token (str): The KBase token. + token (str): The KBase token. Also used as a catalog admin token if valid. port (int): The port number for the callback server. max_task (int): The maxmium subtasks for the callback server. ipv4: (str): The ipv4 address for the callback server. @@ -121,7 +119,7 @@ def _setup_callback_server_envs( # used by the callback server env["KB_AUTH_TOKEN"] = token - env["KB_ADMIN_AUTH_TOKEN"] = token # pass in admin_token to get catalog params + env["KB_ADMIN_AUTH_TOKEN"] = token # pass in catalog admin token to get catalog params env["KB_BASE_URL"] = kb_base_url env["JOB_DIR"] = job_dir env["CALLBACK_PORT"] = port @@ -176,10 +174,21 @@ def _start_callback_server( ) time.sleep(2) + def _get_container_logs(self) -> None: + """ + Get logs from the callback server container. + """ + logs = self.container.log() + if logs: + print("\n****** Logs from the Callback Server ******\n") + logs = logs.decode("utf-8") + for line in logs.split("\n"): + print(line) + def stop_callback_server(self) -> None: """ Stop the callback server. 
""" - self.logging = self.container.logs() + self._get_container_logs() self.container.stop() self.container.remove() \ No newline at end of file diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 9f2dda224..16c54aa08 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -129,7 +129,10 @@ 'APPDEV': 'https://appdev.kbase.us/services/', 'PROD': 'https://kbase.us/services/'} -# service_vers -SERVICE_VERSIONS = ["dev", "beta", "release"] # containers.conf path -CONTAINERS_CONF_PATH = ".config/containers/containers.conf" \ No newline at end of file +CONTAINERS_CONF_PATH = ".config/containers/containers.conf" +# params in containers.conf file +CONTAINERS_CONF_PARAMS = { + "seccomp_profile": "\"unconfined\"", + "log_driver": "\"k8s-file\"" +} \ No newline at end of file diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index ae87b2669..0b0efd575 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -29,6 +29,7 @@ ) from src.loaders.common.loader_common_names import ( COLLECTION_SOURCE_DIR, + CONTAINERS_CONF_PARAMS, CONTAINERS_CONF_PATH, DOCKER_HOST, FATAL_ERROR, @@ -291,31 +292,39 @@ def start_podman_service(uid: int): return proc -def setup_callback_server_logs(): - """Set up logs config file for the callback server""" +def get_containers_config(): + """Get containers.conf file at home directory.""" home = os.path.expanduser("~") conf_path = os.path.join(home, CONTAINERS_CONF_PATH) config = configparser.ConfigParser() config.read(conf_path) - params = { - "seccomp_profile": "\"unconfined\"", - "log_driver": "\"k8s-file\"" - } + return config, conf_path - modify = False + +def is_config_modification_required(): + """check if the config requires modification.""" + config, _ = get_containers_config() + if not config.has_section("containers"): + return True + for key, val in CONTAINERS_CONF_PARAMS.items(): + if config.get("containers", key, fallback=None) != val: + return True + return False + + +def setup_callback_server_logs(): + """Set up containers.conf file for the callback server logs.""" + config, conf_path = get_containers_config() if not config.has_section("containers"): config.add_section("containers") - for key, val in params.items(): + for key, val in CONTAINERS_CONF_PARAMS.items(): if config.get("containers", key, fallback=None) != val: config.set("containers", key, val) - if not modify: - modify = True - if modify: - with open(conf_path, 'w') as configfile: - config.write(configfile) - print(f"containers.conf is modified and saved to path: {conf_path}") + with open(conf_path, 'w') as configfile: + config.write(configfile) + print(f"containers.conf is modified and saved to path: {conf_path}") def is_upa_info_complete(upa_dir: str): diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 7cda7d60b..f61039ba4 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -1,7 +1,7 @@ """ usage: workspace_downloader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_version SOURCE_VERSION] [--root_dir ROOT_DIR] [--env {CI,NEXT,APPDEV,PROD}] [--workers WORKERS] [--token_filepath TOKEN_FILEPATH] - [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver {dev,beta,release}] [--keep_job_dir] [--retrieve_sample] + [--jr_max_tasks 
JR_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] [--retrieve_sample] [--ignore_no_sample_error] PROTOTYPE - Download genome files from the workspace service (WSS). @@ -26,8 +26,8 @@ A file path that stores KBase token --jr_max_tasks JR_MAX_TASKS The maxmium subtasks for the callback server (default: 20) - --au_service_ver {dev,beta,release} - The service version of AssemblyUtil client (default: release) + --au_service_ver AU_SERVICE_VER + The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) --keep_job_dir Keep SDK job directory after download task is completed --retrieve_sample Retrieve sample for each genome object --ignore_no_sample_error @@ -388,9 +388,9 @@ def main(): optional.add_argument( "--au_service_ver", type=str, - choices=loader_common_names.SERVICE_VERSIONS, default="release", - help="The service version of AssemblyUtil client", + help="The service version of AssemblyUtil client" + "('dev', 'beta', 'release', or a git commit)", ) optional.add_argument( "--keep_job_dir", @@ -457,16 +457,17 @@ def main(): # set up conf and start callback server conf = Conf( - job_dir, - output_dir, - kb_base_url, - token_filepath, - au_service_ver, - workers, - max_task, - _process_input, - retrieve_sample, - ignore_no_sample_error, + job_dir=job_dir, + output_dir=output_dir, + kb_base_url=kb_base_url, + token_filepath=token_filepath, + au_service_ver=au_service_ver, + workers=workers, + max_task=max_task, + worker_function=_process_input, + retrieve_sample=retrieve_sample, + ignore_no_sample_error=ignore_no_sample_error, + workspace_downloader=True, ) genome_objs = loader_helper.list_objects( diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 945c2bb5f..726e6f916 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -2,7 +2,7 @@ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] - [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver {dev,beta,release}] [--keep_job_dir] + [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] PROTOTYPE - Upload assembly files to the workspace service (WSS). @@ -34,8 +34,8 @@ Number of files to upload per batch (default: 1000) --jr_max_tasks JR_MAX_TASKS The maxmium subtasks for the callback server (default: 20) - --au_service_ver {dev,beta,release} - The service version of AssemblyUtil client (default: release) + --au_service_ver AU_SERVICE_VER + The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) --keep_job_dir Keep SDK job directory after upload task is completed e.g. 
@@ -151,9 +151,9 @@ def _get_parser(): optional.add_argument( "--au_service_ver", type=str, - choices=loader_common_names.SERVICE_VERSIONS, default="release", - help="The service version of AssemblyUtil client", + help="The service version of AssemblyUtil client" + "('dev', 'beta', 'release', or a git commit)", ) optional.add_argument( "--keep_job_dir", @@ -500,24 +500,25 @@ def main(): conf = None try: - # setup container.conf file for the callback server logs - if click.confirm(f"Set up a containers.conf file at {loader_common_names.CONTAINERS_CONF_PATH} if does not exist already?\n" - f"Params 'seccomp_profile' and 'log_driver' will be added under section [containers]"): - loader_helper.setup_callback_server_logs() - else: - print("Permission denied and exiting ...") - return + # setup container.conf file for the callback server logs if needed + if loader_helper.is_config_modification_required(): + if click.confirm(f"Set up a containers.conf file at {loader_common_names.CONTAINERS_CONF_PATH} if does not exist already?\n" + f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]"): + loader_helper.setup_callback_server_logs() + else: + print("Permission denied and exiting ...") + return # start podman service proc = loader_helper.start_podman_service(uid) # set up conf for uploader, start callback server, and upload assemblies to workspace conf = Conf( - job_dir, - output_dir, - kb_base_url, - token_filepath, - au_service_ver, + job_dir=job_dir, + output_dir=output_dir, + kb_base_url=kb_base_url, + token_filepath=token_filepath, + au_service_ver=au_service_ver, max_task=max_task, workspace_downloader=False, ) @@ -561,13 +562,6 @@ def main(): if conf: conf.stop_callback_server() - # print out container logs - if conf and conf.logging is not None: - print("\n****** Logs from the Callback Server ******\n") - log_string = conf.logging.decode("utf-8") - for line in log_string.split("\n"): - print(line) - # stop podman service if it is on if proc: proc.terminate() From ac5aee7ad4d79643cee5f445c0ab7ba2dae5b72a Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 1 Dec 2023 14:02:52 -0800 Subject: [PATCH 16/30] update comments --- src/loaders/common/callback_server_wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index b24635092..34ef10d3f 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -41,7 +41,8 @@ def __init__( kb_base_url (str): The base url of the KBase services. token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. - au_service_ver (str): The service verison of AssemblyUtilClient. + au_service_ver (str): The service verison of AssemblyUtilClient + ('dev', 'beta', 'release', or a git commit). workers (int): The number of workers to use for multiprocessing. max_task (int): The maxmium subtasks for the callback server. worker_function (Callable): The function that will be called by the workers. 
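# A hedged sketch of how the uploader script ends up constructing the shared Conf
# wrapper after these changes (paths and the workspace id are placeholders):
from src.loaders.common.callback_server_wrapper import Conf

conf = Conf(
    job_dir="/global/scratch/<user>/sdk_job_dir",
    output_dir="/global/cfs/cdirs/kbase/collections/sourcedata/ws/69046",
    kb_base_url="https://ci.kbase.us/services/",
    token_filepath=None,           # fall back to the KB_AUTH_TOKEN environment variable
    au_service_ver="release",
    max_task=20,                   # forwarded to the callback server as JR_MAX_TASKS
    workspace_downloader=False,    # uploader mode: no worker pool, no sample service client
)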
From 54f89e924ca19fce4d2c316ab8236731c8b96f6e Mon Sep 17 00:00:00 2001 From: Sijie Date: Mon, 4 Dec 2023 15:22:43 -0800 Subject: [PATCH 17/30] move the upload specific code from Conf into the uploader script --- src/loaders/common/callback_server_wrapper.py | 24 ++++++----- src/loaders/common/loader_helper.py | 6 +-- .../workspace_downloader.py | 4 +- .../workspace_uploader/workspace_uploader.py | 29 +++++++++---- .../workspace_uploader_test.py | 41 ++++++++++++------- 5 files changed, 66 insertions(+), 38 deletions(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index 34ef10d3f..57f18f0e6 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -52,26 +52,28 @@ def __init__( """ ipv4 = loader_helper.get_ip() port = loader_helper.find_free_port() - token = loader_helper.get_token(token_filepath) - - ws_url = os.path.join(kb_base_url, "ws") - callback_url = "http://" + ipv4 + ":" + str(port) - print("callback_url:", callback_url) + # common instance variables + self.ws_url = os.path.join(kb_base_url, "ws") + self.token = loader_helper.get_token(token_filepath) self._start_callback_server( docker.from_env(), uuid.uuid4().hex, job_dir, kb_base_url, - token, + self.token, port, max_task, ipv4, ) - # common instance variables - self.ws = Workspace(ws_url, token=token) - self.asu = AssemblyUtil(callback_url, service_ver=au_service_ver, token=token) + self.callback_url = "http://" + ipv4 + ":" + str(port) + print("callback_url:", self.callback_url) + + self.ws = Workspace(self.ws_url, token=self.token) + self.asu = AssemblyUtil( + self.callback_url, service_ver=au_service_ver, token=self.token + ) self.output_dir = output_dir self.job_data_dir = loader_helper.make_job_data_dir(job_dir) @@ -87,7 +89,7 @@ def __init__( self.ignore_no_sample_error = ignore_no_sample_error sample_url = os.path.join(kb_base_url, "sampleservice") - self.ss = SampleService(sample_url, token=token) + self.ss = SampleService(sample_url, token=self.token) self.pools = Pool(workers, worker_function, [self]) @@ -179,7 +181,7 @@ def _get_container_logs(self) -> None: """ Get logs from the callback server container. """ - logs = self.container.log() + logs = self.container.logs() if logs: print("\n****** Logs from the Callback Server ******\n") logs = logs.decode("utf-8") diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 0b0efd575..690ff2a01 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -459,19 +459,19 @@ def create_hardlink_between_files(new_file, target_file): os.link(target_file, new_file) -def list_objects(wsid, conf, object_type, include_metadata=False, batch_size=10000): +def list_objects(wsid, ws, object_type, include_metadata=False, batch_size=10000): """ List all objects information given a workspace ID. 
""" if batch_size > 10000: raise ValueError("Maximum value for listing workspace objects is 10000") - maxObjectID = conf.ws.get_workspace_info({"id": wsid})[4] + maxObjectID = ws.get_workspace_info({"id": wsid})[4] batch_input = [ [idx + 1, idx + batch_size] for idx in range(0, maxObjectID, batch_size) ] objs = [ - conf.ws.list_objects( + ws.list_objects( _list_objects_params(wsid, min_id, max_id, object_type, include_metadata) ) for min_id, max_id in batch_input diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index f61039ba4..3fec27663 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -472,13 +472,13 @@ def main(): genome_objs = loader_helper.list_objects( workspace_id, - conf, + conf.ws, loader_common_names.OBJECTS_NAME_GENOME, include_metadata=True, ) assembly_objs = loader_helper.list_objects( workspace_id, - conf, + conf.ws, loader_common_names.OBJECTS_NAME_ASSEMBLY, ) assembly_genome_map, duplicate_map = _assembly_genome_lookup(genome_objs) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 726e6f916..abfe98b8b 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -65,6 +65,8 @@ import yaml +from src.clients.AssemblyUtilClient import AssemblyUtil +from src.clients.workspaceClient import Workspace from src.loaders.common import loader_common_names, loader_helper from src.loaders.common.callback_server_wrapper import Conf @@ -183,7 +185,7 @@ def _get_source_file(assembly_dir: str, assembly_file: str) -> str: def _upload_assemblies_to_workspace( - conf: Conf, + asu: AssemblyUtil, workspace_id: int, assembly_tuples: list[AssemblyTuple], ) -> tuple[str, ...]: @@ -200,7 +202,7 @@ def _upload_assemblies_to_workspace( ] try: - assembly_ref = conf.asu.save_assemblies_from_fastas( + assembly_ref = asu.save_assemblies_from_fastas( { "workspace_id": workspace_id, "inputs": inputs @@ -356,23 +358,27 @@ def _post_process( def _upload_assembly_files_in_parallel( - conf: Conf, + asu: AssemblyUtil, + ws: Workspace, upload_env_key: str, workspace_id: int, upload_dir: str, wait_to_upload_assemblies: dict[str, str], batch_size: int, + output_dir: str, ) -> list[str]: """ Upload assembly files to the target workspace in parallel using multiprocessing. 
Parameters: - conf: Conf object + asu: AssemblyUtil client + ws: Workspace client upload_env_key: environment variable key in uploaded.yaml file workspace_id: target workspace id upload_dir: a directory in collectionssource that creates new directories linking to sourcedata wait_to_upload_assemblies: a dictionary that maps assembly file name to assembly directory batch_size: a number of files to upload per batch + output_dir: a directory in sourcedata/workspace to store new assembly entries Returns: number of assembly files have been sucessfully uploaded from wait_to_upload_assemblies @@ -385,7 +391,7 @@ def _upload_assembly_files_in_parallel( for assembly_tuples in _gen(wait_to_upload_assemblies, batch_size): try: - batch_upas = _upload_assemblies_to_workspace(conf, workspace_id, assembly_tuples) + batch_upas = _upload_assemblies_to_workspace(asu, workspace_id, assembly_tuples) batch_uploaded_tuples = assembly_tuples except Exception as e: name_assemblyTuple_map = { @@ -393,9 +399,10 @@ def _upload_assembly_files_in_parallel( } assembly_objects_in_ws = loader_helper.list_objects( workspace_id, - conf, + ws, loader_common_names.OBJECTS_NAME_ASSEMBLY, ) + print("assembly_objects_in_ws: ", assembly_objects_in_ws) name_upa_map = { object_info[1]: "{6}_{0}_{4}".format(*object_info) for object_info in assembly_objects_in_ws } @@ -413,7 +420,7 @@ def _upload_assembly_files_in_parallel( assembly_tuple.host_assembly_dir, assembly_tuple.assembly_name, upload_dir, - conf.output_dir, + output_dir, upa ) @@ -541,14 +548,20 @@ def main(): data_dir = _prepare_skd_job_dir_to_upload(conf, wait_to_upload_assemblies) print(f"{wtus_len} assemblies in {data_dir} are ready to upload to workspace {workspace_id}") + ws = Workspace(conf.ws_url, token=conf.token) + asu = AssemblyUtil( + conf.callback_url, service_ver=au_service_ver, token=conf.token + ) start = time.time() uploaded_count = _upload_assembly_files_in_parallel( - conf, + asu, + ws, env, workspace_id, upload_dir, wait_to_upload_assemblies, batch_size, + output_dir, ) upload_time = (time.time() - start) / 60 diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 4e4f28ae0..70527d6a5 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -8,6 +8,7 @@ import pytest from src.clients.AssemblyUtilClient import AssemblyUtil +from src.clients.workspaceClient import Workspace from src.loaders.common import loader_helper from src.loaders.workspace_uploader import workspace_uploader @@ -225,15 +226,14 @@ def test_upload_assembly_to_workspace(setup_and_teardown): assembly_name = ASSEMBLY_NAMES[0] host_assembly_dir = params.assembly_dirs[0] - conf = Mock() - conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - conf.asu.save_assemblies_from_fastas.return_value = {"results":[{"upa": "12345/58/1"}]} + asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) + asu.save_assemblies_from_fastas.return_value = {"results":[{"upa": "12345/58/1"}]} assembly_tuple = workspace_uploader.AssemblyTuple( assembly_name, host_assembly_dir, "/path/to/file/in/AssembilyUtil" ) - upas = workspace_uploader._upload_assemblies_to_workspace(conf, 12345, [assembly_tuple]) + upas = workspace_uploader._upload_assemblies_to_workspace(asu, 12345, [assembly_tuple]) assert upas == tuple(["12345_58_1"]) - conf.asu.save_assemblies_from_fastas.assert_called_once_with( + 
asu.save_assemblies_from_fastas.assert_called_once_with( { "workspace_id": 12345, "inputs": [ @@ -287,10 +287,9 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) } - conf = Mock() - conf.output_dir = output_dir - conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - conf.asu.save_assemblies_from_fastas.return_value = { + ws = create_autospec(Workspace, spec_set=True, instance=True) + asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) + asu.save_assemblies_from_fastas.return_value = { "results":[ {"upa": "12345/58/1"}, {"upa": "12345/60/1"} @@ -298,7 +297,14 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): } uploaded_count = workspace_uploader._upload_assembly_files_in_parallel( - conf, "CI", 12345, upload_dir, wait_to_upload_assemblies, 2 + asu, + ws, + "CI", + 12345, + upload_dir, + wait_to_upload_assemblies, + 2, + output_dir, ) assert uploaded_count == 2 @@ -334,15 +340,22 @@ def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) } - conf = Mock() - conf.asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - conf.asu.save_assemblies_from_fastas.side_effect = Exception("Illegal character in object name") + ws = create_autospec(Workspace, spec_set=True, instance=True) + asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) + asu.save_assemblies_from_fastas.side_effect = Exception("Illegal character in object name") loader_helper.list_objects = create_autospec(loader_helper.list_objects) loader_helper.list_objects.return_value = [] uploaded_count = workspace_uploader._upload_assembly_files_in_parallel( - conf, "CI", 12345, upload_dir, wait_to_upload_assemblies, 2 + asu, + ws, + "CI", + 12345, + upload_dir, + wait_to_upload_assemblies, + 2, + output_dir, ) assert uploaded_count == 0 From 97911b268622695b77b625cad5758c9dffb47f07 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 7 Dec 2023 14:24:22 -0800 Subject: [PATCH 18/30] simplify _gen logic; add descfor cbs wrapper; fix race condition --- src/loaders/common/callback_server_wrapper.py | 32 +++++++++++++---- src/loaders/common/loader_helper.py | 33 ++++++++++++------ .../workspace_uploader/workspace_uploader.py | 34 +++++-------------- 3 files changed, 57 insertions(+), 42 deletions(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index 57f18f0e6..742b683ef 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -54,8 +54,13 @@ def __init__( port = loader_helper.find_free_port() # common instance variables + # the url of the workspace to contact self.ws_url = os.path.join(kb_base_url, "ws") + + # a KBase token appropriate for the KBase environment self.token = loader_helper.get_token(token_filepath) + + # setup and run callback server container self._start_callback_server( docker.from_env(), uuid.uuid4().hex, @@ -67,15 +72,11 @@ def __init__( ipv4, ) + # the url of the callback service to contact self.callback_url = "http://" + ipv4 + ":" + str(port) print("callback_url:", self.callback_url) - self.ws = Workspace(self.ws_url, token=self.token) - self.asu = AssemblyUtil( - self.callback_url, service_ver=au_service_ver, token=self.token - ) - - self.output_dir = output_dir + # the directory for SDK jobs per user self.job_data_dir = loader_helper.make_job_data_dir(job_dir) # unique 
to downloader @@ -84,13 +85,32 @@ def __init__( raise ValueError( "worker_function cannot be None for the workspace downloader script" ) + + # queue for the workspace downloader tasks self.input_queue = Queue() + + # the directory for a specific workspace id under sourcedata/ws + self.output_dir = output_dir + + # whether to retrieve sample for each genome object self.retrieve_sample = retrieve_sample + + # whether to ignore the error when no sample data is found self.ignore_no_sample_error = ignore_no_sample_error + # Workspace client + self.ws = Workspace(self.ws_url, token=self.token) + + # AssemblyUtil client + self.asu = AssemblyUtil( + self.callback_url, service_ver=au_service_ver, token=self.token + ) + + # SampleService client sample_url = os.path.join(kb_base_url, "sampleservice") self.ss = SampleService(sample_url, token=self.token) + # a pool of worker processes. self.pools = Pool(workers, worker_function, [self]) def _setup_callback_server_envs( diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 690ff2a01..1e9342e20 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -6,6 +6,7 @@ import socket import stat import subprocess +import threading import time import uuid from collections import defaultdict @@ -292,7 +293,7 @@ def start_podman_service(uid: int): return proc -def get_containers_config(): +def _get_containers_config(): """Get containers.conf file at home directory.""" home = os.path.expanduser("~") conf_path = os.path.join(home, CONTAINERS_CONF_PATH) @@ -303,7 +304,7 @@ def get_containers_config(): def is_config_modification_required(): """check if the config requires modification.""" - config, _ = get_containers_config() + config, _ = _get_containers_config() if not config.has_section("containers"): return True for key, val in CONTAINERS_CONF_PARAMS.items(): @@ -314,17 +315,18 @@ def is_config_modification_required(): def setup_callback_server_logs(): """Set up containers.conf file for the callback server logs.""" - config, conf_path = get_containers_config() - if not config.has_section("containers"): - config.add_section("containers") + config, conf_path = _get_containers_config() + lock = threading.Lock() + with lock: + if not config.has_section("containers"): + config.add_section("containers") - for key, val in CONTAINERS_CONF_PARAMS.items(): - if config.get("containers", key, fallback=None) != val: + for key, val in CONTAINERS_CONF_PARAMS.items(): config.set("containers", key, val) - with open(conf_path, 'w') as configfile: - config.write(configfile) - print(f"containers.conf is modified and saved to path: {conf_path}") + with open(conf_path, 'w') as configfile: + config.write(configfile) + print(f"containers.conf is modified and saved to path: {conf_path}") def is_upa_info_complete(upa_dir: str): @@ -462,6 +464,17 @@ def create_hardlink_between_files(new_file, target_file): def list_objects(wsid, ws, object_type, include_metadata=False, batch_size=10000): """ List all objects information given a workspace ID. 
+ + Args: + wsid (int): Target workspace addressed by the permanent ID + ws (Workspace): Workspace client + object_type (str): Type of the objects to be listed + include_metadata (boolean): Whether to include the user provided metadata in the returned object_info + batch_size (int): Number of objects to process in each batch + + Returns: + list: a list of objects on the target workspace + """ if batch_size > 10000: raise ValueError("Maximum value for listing workspace objects is 10000") diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index abfe98b8b..ca8e7400a 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -31,7 +31,7 @@ --upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...] Upload only files that match given extensions (default: ['genomic.fna.gz']) --batch_size BATCH_SIZE - Number of files to upload per batch (default: 1000) + Number of files to upload per batch (default: 2500) --jr_max_tasks JR_MAX_TASKS The maxmium subtasks for the callback server (default: 20) --au_service_ver AU_SERVICE_VER @@ -443,28 +443,12 @@ def _gen( """ Generator function to yield the assembly files to upload. """ - assembly_files_len = len(wait_to_upload_assemblies) - assembly_names = tuple(wait_to_upload_assemblies.keys()) - host_assembly_dirs = tuple(wait_to_upload_assemblies.values()) - container_internal_assembly_paths = tuple( - [ - os.path.join(JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, assembly_name) - for assembly_name in assembly_names - ] - ) - - # construct a list of nametuples to ensure consistency assemblyTuple_list = [ - AssemblyTuple( - assembly_names[idx], - host_assembly_dirs[idx], - container_internal_assembly_paths[idx], - ) - for idx in range(assembly_files_len) + AssemblyTuple(i[0], i[1], os.path.join(JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, i[0])) + for i in wait_to_upload_assemblies.items() ] - # yield AssemblyTuples in batch - for idx in range(0, assembly_files_len, batch_size): + for idx in range(0, len(wait_to_upload_assemblies), batch_size): yield assemblyTuple_list[idx: idx + batch_size] @@ -509,8 +493,8 @@ def main(): try: # setup container.conf file for the callback server logs if needed if loader_helper.is_config_modification_required(): - if click.confirm(f"Set up a containers.conf file at {loader_common_names.CONTAINERS_CONF_PATH} if does not exist already?\n" - f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]"): + if click.confirm(f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" + f"needs to be modified to allow for container logging. 
Do so now?\n"): loader_helper.setup_callback_server_logs() else: print("Permission denied and exiting ...") @@ -522,17 +506,15 @@ def main(): # set up conf for uploader, start callback server, and upload assemblies to workspace conf = Conf( job_dir=job_dir, - output_dir=output_dir, kb_base_url=kb_base_url, token_filepath=token_filepath, - au_service_ver=au_service_ver, max_task=max_task, workspace_downloader=False, ) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( - env, - workspace_id, + env, + workspace_id, collection_source_dir, upload_file_ext, ) From e14b49d2490d734b6e9b35f8764eea0f62cadca7 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 7 Dec 2023 14:44:55 -0800 Subject: [PATCH 19/30] change output_dir to keyword argument --- src/loaders/common/callback_server_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index 742b683ef..f2cd7c1d6 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -21,7 +21,7 @@ class Conf: def __init__( self, job_dir: str, - output_dir: str, + output_dir: str | None = None, kb_base_url: str = "https://ci.kbase.us/services/", token_filepath: str | None = None, au_service_ver: str = "release", From 2e01b64bbf4adc3dc5fab6a298a1290d29f17114 Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 19 Dec 2023 14:44:12 -0800 Subject: [PATCH 20/30] 1. add load_version; 2. update failure recovery logic; 3.rename CLI and update desc&comments --- src/loaders/common/callback_server_wrapper.py | 58 +++++---- src/loaders/common/loader_common_names.py | 2 +- src/loaders/common/loader_helper.py | 14 +- .../workspace_downloader.py | 10 +- .../workspace_uploader/workspace_uploader.py | 121 ++++++++++++------ .../workspace_uploader_test.py | 33 +++-- 6 files changed, 146 insertions(+), 92 deletions(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index f2cd7c1d6..29794559e 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -16,8 +16,23 @@ class Conf: """ Configuration class for the workspace downloader and workspace uploader scripts. - """ + Instance variables: + + token - a KBase token appropriate for the KBase environment + Also use as an admin token if the user has admin permission to catalog params + callback_url - the url of the callback service to contact + job_data_dir - the directory for SDK jobs per user + input_queue - queue for the workspace downloader tasks + output_dir - the directory for a specific workspace id under sourcedata/ws + retrieve_sample - whether to retrieve sample for each genome object + ignore_no_sample_error - whether to ignore the error when no sample data is found + ws - workspace client + asu - assemblyUtil client + ss - sampleService client + pools - a pool of worker processes + + """ def __init__( self, job_dir: str, @@ -26,11 +41,11 @@ def __init__( token_filepath: str | None = None, au_service_ver: str = "release", workers: int = 5, - max_task: int = 20, + max_callback_server_tasks: int = 20, worker_function: Callable | None = None, retrieve_sample: bool = False, ignore_no_sample_error: bool = False, - workspace_downloader: bool = True, + workspace_downloader: bool = False, ) -> None: """ Initialize the configuration class. @@ -41,10 +56,11 @@ def __init__( kb_base_url (str): The base url of the KBase services. 
token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. + Also use as an admin token if the user has admin permission to catalog params. au_service_ver (str): The service verison of AssemblyUtilClient ('dev', 'beta', 'release', or a git commit). workers (int): The number of workers to use for multiprocessing. - max_task (int): The maxmium subtasks for the callback server. + max_callback_server_tasks (int): The maxmium subtasks for the callback server. worker_function (Callable): The function that will be called by the workers. retrieve_sample (bool): Whether to retrieve sample for each genome object. ignore_no_sample_error (bool): Whether to ignore the error when no sample data is found. @@ -54,10 +70,7 @@ def __init__( port = loader_helper.find_free_port() # common instance variables - # the url of the workspace to contact - self.ws_url = os.path.join(kb_base_url, "ws") - # a KBase token appropriate for the KBase environment self.token = loader_helper.get_token(token_filepath) # setup and run callback server container @@ -68,15 +81,13 @@ def __init__( kb_base_url, self.token, port, - max_task, + max_callback_server_tasks, ipv4, ) - # the url of the callback service to contact self.callback_url = "http://" + ipv4 + ":" + str(port) print("callback_url:", self.callback_url) - # the directory for SDK jobs per user self.job_data_dir = loader_helper.make_job_data_dir(job_dir) # unique to downloader @@ -86,31 +97,22 @@ def __init__( "worker_function cannot be None for the workspace downloader script" ) - # queue for the workspace downloader tasks self.input_queue = Queue() - - # the directory for a specific workspace id under sourcedata/ws self.output_dir = output_dir - # whether to retrieve sample for each genome object self.retrieve_sample = retrieve_sample - - # whether to ignore the error when no sample data is found self.ignore_no_sample_error = ignore_no_sample_error - # Workspace client - self.ws = Workspace(self.ws_url, token=self.token) + ws_url = os.path.join(kb_base_url, "ws") + self.ws = Workspace(ws_url, token=self.token) - # AssemblyUtil client self.asu = AssemblyUtil( self.callback_url, service_ver=au_service_ver, token=self.token ) - # SampleService client sample_url = os.path.join(kb_base_url, "sampleservice") self.ss = SampleService(sample_url, token=self.token) - # a pool of worker processes. self.pools = Pool(workers, worker_function, [self]) def _setup_callback_server_envs( @@ -119,7 +121,7 @@ def _setup_callback_server_envs( kb_base_url: str, token: str, port: int, - max_task: int, + max_callback_server_tasks: int, ipv4: str, ) -> Tuple[dict[str, Union[int, str]], dict[str, dict[str, str]]]: """ @@ -130,7 +132,7 @@ def _setup_callback_server_envs( kb_base_url (str): The base url of the KBase services. token (str): The KBase token. Also used as a catalog admin token if valid. port (int): The port number for the callback server. - max_task (int): The maxmium subtasks for the callback server. + max_callback_server_tasks (int): The maxmium subtasks for the callback server. ipv4: (str): The ipv4 address for the callback server. 
Returns: @@ -146,7 +148,7 @@ def _setup_callback_server_envs( env["KB_BASE_URL"] = kb_base_url env["JOB_DIR"] = job_dir env["CALLBACK_PORT"] = port - env["JR_MAX_TASKS"] = max_task + env["JR_MAX_TASKS"] = max_callback_server_tasks env["CALLBACK_IP"] = ipv4 # specify an ipv4 address for the callback server # otherwise, the callback container will use the an ipv6 address @@ -168,7 +170,7 @@ def _start_callback_server( kb_base_url: str, token: str, port: int, - max_task: int, + max_callback_server_tasks: int, ipv4: str, ) -> None: """ @@ -180,12 +182,12 @@ def _start_callback_server( job_dir (str): The directory for SDK jobs per user. kb_base_url (str): The base url of the KBase services. token (str): The KBase token. - max_task (int): The maxmium subtasks for the callback server. + max_callback_server_tasks (int): The maxmium subtasks for the callback server. port (int): The port number for the callback server. ipv4: (str): The ipv4 address for the callback server. """ env, vol = self._setup_callback_server_envs( - job_dir, kb_base_url, token, port, max_task, ipv4 + job_dir, kb_base_url, token, port, max_callback_server_tasks, ipv4 ) self.container = client.containers.run( name=container_name, @@ -214,4 +216,4 @@ def stop_callback_server(self) -> None: """ self._get_container_logs() self.container.stop() - self.container.remove() \ No newline at end of file + self.container.remove() diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index b8f32fef7..c376ab1c0 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -130,7 +130,7 @@ 'PROD': 'https://kbase.us/services/'} # containers.conf path -CONTAINERS_CONF_PATH = ".config/containers/containers.conf" +CONTAINERS_CONF_PATH = "~/.config/containers/containers.conf" # params in containers.conf file CONTAINERS_CONF_PARAMS = { "seccomp_profile": "\"unconfined\"", diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 1e9342e20..f19387e86 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -1,5 +1,6 @@ import argparse import configparser +import fcntl import itertools import json import os @@ -295,8 +296,7 @@ def start_podman_service(uid: int): def _get_containers_config(): """Get containers.conf file at home directory.""" - home = os.path.expanduser("~") - conf_path = os.path.join(home, CONTAINERS_CONF_PATH) + conf_path = os.path.expanduser(CONTAINERS_CONF_PATH) config = configparser.ConfigParser() config.read(conf_path) return config, conf_path @@ -316,18 +316,20 @@ def is_config_modification_required(): def setup_callback_server_logs(): """Set up containers.conf file for the callback server logs.""" config, conf_path = _get_containers_config() - lock = threading.Lock() - with lock: + with open(conf_path, 'w') as writer: + fcntl.flock(writer.fileno(), fcntl.LOCK_EX) + if not config.has_section("containers"): config.add_section("containers") for key, val in CONTAINERS_CONF_PARAMS.items(): config.set("containers", key, val) - with open(conf_path, 'w') as configfile: - config.write(configfile) + config.write(writer) print(f"containers.conf is modified and saved to path: {conf_path}") + fcntl.flock(writer.fileno(), fcntl.LOCK_UN) + def is_upa_info_complete(upa_dir: str): """ diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 8c10d25d2..1e8fdf530 100644 --- 
a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -1,7 +1,7 @@ """ usage: workspace_downloader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_version SOURCE_VERSION] [--root_dir ROOT_DIR] [--env {CI,NEXT,APPDEV,PROD}] [--workers WORKERS] [--token_filepath TOKEN_FILEPATH] - [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] [--retrieve_sample] + [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] [--retrieve_sample] [--ignore_no_sample_error] PROTOTYPE - Download genome files from the workspace service (WSS). @@ -24,7 +24,7 @@ --workers WORKERS Number of workers for multiprocessing (default: 5) --token_filepath TOKEN_FILEPATH A file path that stores KBase token - --jr_max_tasks JR_MAX_TASKS + --cbs_max_tasks CBS_MAX_TASKS The maxmium subtasks for the callback server (default: 20) --au_service_ver AU_SERVICE_VER The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) @@ -380,7 +380,7 @@ def main(): help="A file path that stores KBase token", ) optional.add_argument( - "--jr_max_tasks", + "--cbs_max_tasks", type=int, default=20, help="The maxmium subtasks for the callback server", @@ -417,7 +417,7 @@ def main(): env = args.env workers = args.workers token_filepath = args.token_filepath - max_task = args.jr_max_tasks + cbs_max_tasks = args.cbs_max_tasks au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir retrieve_sample = args.retrieve_sample @@ -463,7 +463,7 @@ def main(): token_filepath=token_filepath, au_service_ver=au_service_ver, workers=workers, - max_task=max_task, + max_callback_server_tasks=cbs_max_tasks, worker_function=_process_input, retrieve_sample=retrieve_sample, ignore_no_sample_error=ignore_no_sample_error, diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index ca8e7400a..4d9db8fc2 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -1,8 +1,8 @@ """ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] - [--root_dir ROOT_DIR] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] + [--root_dir ROOT_DIR] [--load_ver LOAD_VER] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] - [--jr_max_tasks JR_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] + [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] PROTOTYPE - Upload assembly files to the workspace service (WSS). @@ -23,6 +23,7 @@ optional arguments: --root_dir ROOT_DIR Root directory for the collections project. (default: /global/cfs/cdirs/kbase/collections) + --load_ver LOAD_VER KBase load version (e.g. r207.kbase.1). 
(default: source version) --token_filepath TOKEN_FILEPATH A file path that stores a KBase token appropriate for the KBase environment If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable @@ -32,7 +33,7 @@ Upload only files that match given extensions (default: ['genomic.fna.gz']) --batch_size BATCH_SIZE Number of files to upload per batch (default: 2500) - --jr_max_tasks JR_MAX_TASKS + --cbs_max_tasks CBS_MAX_TASKS The maxmium subtasks for the callback server (default: 20) --au_service_ver AU_SERVICE_VER The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) @@ -118,11 +119,17 @@ def _get_parser(): default=loader_common_names.ROOT_DIR, help=loader_common_names.ROOT_DIR_DESCR ) + optional.add_argument( + f"--{loader_common_names.LOAD_VER_ARG_NAME}", + type=str, + help=loader_common_names.LOAD_VER_DESCR + " (default: source version)" + ) optional.add_argument( "--token_filepath", type=str, help="A file path that stores a KBase token appropriate for the KBase environment. " - "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable.", + "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. " + "Also use as an admin token if the user has admin permission to catalog params." ) optional.add_argument( "--env", @@ -145,7 +152,7 @@ def _get_parser(): help="Number of files to upload per batch", ) optional.add_argument( - "--jr_max_tasks", + "--cbs_max_tasks", type=int, default=20, help="The maxmium subtasks for the callback server", @@ -209,7 +216,6 @@ def _upload_assemblies_to_workspace( } ) except Exception as e: - print(e) raise e upas = tuple([result_dict["upa"].replace("/", "_") @@ -220,6 +226,7 @@ def _upload_assemblies_to_workspace( def _read_upload_status_yaml_file( upload_env_key: str, workspace_id: int, + load_version: str, assembly_dir: str, assembly_name: str, ) -> tuple[dict[str, dict[int, list[str]]], bool]: @@ -242,7 +249,10 @@ def _read_upload_status_yaml_file( if workspace_id not in data[upload_env_key]: data[upload_env_key][workspace_id] = dict() - assembly_dict = data[upload_env_key][workspace_id] + if load_version not in data[upload_env_key][workspace_id]: + data[upload_env_key][workspace_id][load_version] = dict() + + assembly_dict = data[upload_env_key][workspace_id][load_version] if assembly_dict and assembly_dict["file_name"] == assembly_name: uploaded = True return data, uploaded @@ -251,6 +261,7 @@ def _read_upload_status_yaml_file( def _update_upload_status_yaml_file( upload_env_key: str, workspace_id: int, + load_version: str, upa: str, assembly_dir: str, assembly_name: str, @@ -258,12 +269,20 @@ def _update_upload_status_yaml_file( """ Update the uploaded.yaml file in target genome_dir with newly uploaded assembly names and upa info. 
""" - data, uploaded = _read_upload_status_yaml_file(upload_env_key, workspace_id, assembly_dir, assembly_name) + data, uploaded = _read_upload_status_yaml_file( + upload_env_key, + workspace_id, + load_version, + assembly_dir, + assembly_name, + ) if uploaded: raise ValueError(f"Assembly {assembly_name} already exists in workspace {workspace_id}") - data[upload_env_key][workspace_id] = {"file_name": assembly_name, "upa": upa} + data[upload_env_key][workspace_id][load_version] = { + "file_name": assembly_name, "upa": upa + } file_path = _get_yaml_file_path(assembly_dir) with open(file_path, "w") as file: @@ -273,6 +292,7 @@ def _update_upload_status_yaml_file( def _fetch_assemblies_to_upload( upload_env_key: str, workspace_id: int, + load_version: str, collection_source_dir: str, upload_file_ext: list[str], ) -> tuple[int, dict[str, str]]: @@ -301,11 +321,18 @@ def _fetch_assemblies_to_upload( count += 1 assembly_name = assembly_file_list[0] - _, uploaded = _read_upload_status_yaml_file(upload_env_key, workspace_id, assembly_dir, assembly_name) + _, uploaded = _read_upload_status_yaml_file( + upload_env_key, + workspace_id, + load_version, + assembly_dir, + assembly_name, + ) if uploaded: print( - f"Assembly {assembly_name} already exists in workspace {workspace_id}. Skipping." + f"Assembly {assembly_name} already exists in " + f"workspace {workspace_id} load {load_version}. Skipping." ) continue @@ -329,8 +356,8 @@ def _prepare_skd_job_dir_to_upload(conf: Conf, wait_to_upload_assemblies: dict[s def _post_process( upload_env_key: str, workspace_id: int, - host_assembly_dir: str, - assembly_name: str, + load_version: str, + assembly_tuple: AssemblyTuple, upload_dir: str, output_dir: str, upa: str, @@ -343,14 +370,24 @@ def _post_process( """ # Create a standard entry in sourcedata/workspace # hardlink to the original assembly file in sourcedata - src_file = _get_source_file(host_assembly_dir, assembly_name) + src_file = _get_source_file( + assembly_tuple.host_assembly_dir, + assembly_tuple.assembly_name, + ) target_dir = os.path.join(output_dir, upa) os.makedirs(target_dir, exist_ok=True) dest_file = os.path.join(target_dir, f"{upa}.fna.gz") loader_helper.create_hardlink_between_files(dest_file, src_file) # Update the uploaded.yaml file - _update_upload_status_yaml_file(upload_env_key, workspace_id, upa, host_assembly_dir, assembly_name) + _update_upload_status_yaml_file( + upload_env_key, + workspace_id, + load_version, + upa, + assembly_tuple.host_assembly_dir, + assembly_tuple.assembly_name, + ) # Creates a softlink from new_dir to the contents of upa_dir. 
new_dir = os.path.join(upload_dir, upa) @@ -362,6 +399,7 @@ def _upload_assembly_files_in_parallel( ws: Workspace, upload_env_key: str, workspace_id: int, + load_version: str, upload_dir: str, wait_to_upload_assemblies: dict[str, str], batch_size: int, @@ -375,6 +413,7 @@ def _upload_assembly_files_in_parallel( ws: Workspace client upload_env_key: environment variable key in uploaded.yaml file workspace_id: target workspace id + load_version: load version upload_dir: a directory in collectionssource that creates new directories linking to sourcedata wait_to_upload_assemblies: a dictionary that maps assembly file name to assembly directory batch_size: a number of files to upload per batch @@ -394,31 +433,23 @@ def _upload_assembly_files_in_parallel( batch_upas = _upload_assemblies_to_workspace(asu, workspace_id, assembly_tuples) batch_uploaded_tuples = assembly_tuples except Exception as e: - name_assemblyTuple_map = { - assembly_tuple.assembly_name: assembly_tuple for assembly_tuple in assembly_tuples - } - assembly_objects_in_ws = loader_helper.list_objects( - workspace_id, - ws, - loader_common_names.OBJECTS_NAME_ASSEMBLY, - ) - print("assembly_objects_in_ws: ", assembly_objects_in_ws) - name_upa_map = { - object_info[1]: "{6}_{0}_{4}".format(*object_info) for object_info in assembly_objects_in_ws - } - batch_uploaded_objects_names = set(name_upa_map.keys()) & set(name_assemblyTuple_map.keys()) - - # update batch_uploaded_tuples and batch_upas - batch_upas = tuple([name_upa_map[name] for name in batch_uploaded_objects_names]) - batch_uploaded_tuples = [name_assemblyTuple_map[name] for name in batch_uploaded_objects_names] + print(e) + name2tuple = {assembly_tuple.assembly_name: assembly_tuple for assembly_tuple in assembly_tuples} + refs = [{"wsid": workspace_id, "name": name} for name in name2tuple] + res = ws.get_object_info3({"objects": refs, "ignoreErrors": 1}) + + # figure out which uploads succeeded + batch_upas = tuple([path[0].replace("/", "_") for path in res["paths"] if path]) + batch_uploaded_tuples = [name2tuple[info[1]] for info in res["infos"] if info] uploaded_fail = True + # post process on sucessful uploads for assembly_tuple, upa in zip(batch_uploaded_tuples, batch_upas): _post_process( upload_env_key, workspace_id, - assembly_tuple.host_assembly_dir, - assembly_tuple.assembly_name, + load_version, + assembly_tuple, upload_dir, output_dir, upa @@ -459,13 +490,18 @@ def main(): workspace_id = args.workspace_id kbase_collection = getattr(args, loader_common_names.KBASE_COLLECTION_ARG_NAME) source_version = getattr(args, loader_common_names.SOURCE_VER_ARG_NAME) + load_version = getattr(args, loader_common_names.LOAD_VER_ARG_NAME) root_dir = getattr(args, loader_common_names.ROOT_DIR_ARG_NAME) token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext batch_size = args.batch_size - max_task = args.jr_max_tasks + cbs_max_tasks = args.cbs_max_tasks au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir + if not load_version: + load_version = source_version + print(f"load version is {load_version}.\n" + f"Please keep using this load version until the load is complete!") env = args.env kb_base_url = loader_common_names.KB_BASE_URL_MAP[env] @@ -474,8 +510,8 @@ def main(): parser.error(f"workspace_id needs to be > 0") if batch_size <= 0: parser.error(f"batch_size needs to be > 0") - if max_task <= 0: - parser.error(f"jr_max_tasks needs to be > 0") + if cbs_max_tasks <= 0: + parser.error(f"cbs_max_tasks needs to be > 0") uid = os.getuid() username = 
os.getlogin() @@ -494,7 +530,8 @@ def main(): # setup container.conf file for the callback server logs if needed if loader_helper.is_config_modification_required(): if click.confirm(f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" - f"needs to be modified to allow for container logging. Do so now?\n"): + f"needs to be modified to allow for container logging.\n" + f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. Do so now?\n"): loader_helper.setup_callback_server_logs() else: print("Permission denied and exiting ...") @@ -508,13 +545,13 @@ def main(): job_dir=job_dir, kb_base_url=kb_base_url, token_filepath=token_filepath, - max_task=max_task, - workspace_downloader=False, + max_callback_server_tasks=cbs_max_tasks, ) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( env, workspace_id, + load_version, collection_source_dir, upload_file_ext, ) @@ -530,7 +567,8 @@ def main(): data_dir = _prepare_skd_job_dir_to_upload(conf, wait_to_upload_assemblies) print(f"{wtus_len} assemblies in {data_dir} are ready to upload to workspace {workspace_id}") - ws = Workspace(conf.ws_url, token=conf.token) + ws_url = os.path.join(kb_base_url, "ws") + ws = Workspace(ws_url, token=conf.token) asu = AssemblyUtil( conf.callback_url, service_ver=au_service_ver, token=conf.token ) @@ -540,6 +578,7 @@ def main(): ws, env, workspace_id, + load_version, upload_dir, wait_to_upload_assemblies, batch_size, diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 70527d6a5..1f39088ee 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -85,10 +85,10 @@ def test_read_upload_status_yaml_file(setup_and_teardown): # test empty yaml file in assembly_dir data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 12345, assembly_dir, assembly_name + "CI", 12345, "214", assembly_dir, assembly_name ) - expected_data = {"CI": {12345: dict()}} + expected_data = {"CI": {12345: {"214": dict()}}} assert not uploaded assert expected_data == data @@ -100,20 +100,22 @@ def test_update_upload_status_yaml_file(setup_and_teardown): assembly_name = ASSEMBLY_NAMES[0] workspace_uploader._update_upload_status_yaml_file( - "CI", 12345, "12345_58_1", assembly_dir, assembly_name + "CI", 12345, "214", "12345_58_1", assembly_dir, assembly_name ) data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 12345, assembly_dir, assembly_name + "CI", 12345, "214", assembly_dir, assembly_name ) - expected_data = {"CI": {12345: {"file_name": assembly_name, "upa": "12345_58_1"}}} + expected_data = { + "CI": {12345: {"214": {"file_name": assembly_name, "upa": "12345_58_1"}}} + } assert uploaded assert expected_data == data with pytest.raises(ValueError, match=f"already exists in workspace"): workspace_uploader._update_upload_status_yaml_file( - "CI", 12345, "12345_58_1", assembly_dir, assembly_name + "CI", 12345, "214", "12345_58_1", assembly_dir, assembly_name ) @@ -125,6 +127,7 @@ def test_fetch_assemblies_to_upload(setup_and_teardown): count, wait_to_upload_assemblies = workspace_uploader._fetch_assemblies_to_upload( "CI", 12345, + "214", collection_source_dir, workspace_uploader.UPLOAD_FILE_EXT, ) @@ -144,7 +147,7 @@ def test_fetch_assemblies_to_upload(setup_and_teardown): upas = ["12345_58_1", "12345_58_2"] for assembly_name, assembly_dir, upa in zip(ASSEMBLY_NAMES, 
assembly_dirs, upas): workspace_uploader._update_upload_status_yaml_file( - "CI", 12345, upa, assembly_dir, assembly_name + "CI", 12345, "214", upa, assembly_dir, assembly_name ) ( @@ -153,6 +156,7 @@ def test_fetch_assemblies_to_upload(setup_and_teardown): ) = workspace_uploader._fetch_assemblies_to_upload( "CI", 12345, + "214", collection_source_dir, workspace_uploader.UPLOAD_FILE_EXT, ) @@ -191,21 +195,26 @@ def test_post_process(setup_and_teardown): host_assembly_dir = params.assembly_dirs[0] assembly_name = ASSEMBLY_NAMES[0] src_file = params.target_files[0] + assembly_tuple = workspace_uploader.AssemblyTuple( + assembly_name, host_assembly_dir, "/path/to/file/in/AssembilyUtil" + ) workspace_uploader._post_process( "CI", 88888, - host_assembly_dir, - assembly_name, + "214", + assembly_tuple, upload_dir, output_dir, "12345_58_1", ) data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 88888, host_assembly_dir, assembly_name + "CI", 88888, "214", host_assembly_dir, assembly_name ) - expected_data = {"CI": {88888: {"file_name": assembly_name, "upa": "12345_58_1"}}} + expected_data = { + "CI": {88888: {"214": {"file_name": assembly_name, "upa": "12345_58_1"}}} + } dest_file = os.path.join( os.path.join(output_dir, "12345_58_1"), f"12345_58_1.fna.gz" @@ -301,6 +310,7 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): ws, "CI", 12345, + "214", upload_dir, wait_to_upload_assemblies, 2, @@ -352,6 +362,7 @@ def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): ws, "CI", 12345, + "214", upload_dir, wait_to_upload_assemblies, 2, From d0f4d45f9365e5c1bbb4b7b9e1499677c093b91b Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 19 Jan 2024 15:49:34 -0800 Subject: [PATCH 21/30] fix logic in workspace_uploader.py only --- .../workspace_uploader/workspace_uploader.py | 217 +++++++++++++----- 1 file changed, 162 insertions(+), 55 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 4d9db8fc2..86dde1395 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -1,6 +1,6 @@ """ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] - [--root_dir ROOT_DIR] [--load_ver LOAD_VER] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] + [--root_dir ROOT_DIR] [--load_id LOAD_ID] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] @@ -23,7 +23,8 @@ optional arguments: --root_dir ROOT_DIR Root directory for the collections project. (default: /global/cfs/cdirs/kbase/collections) - --load_ver LOAD_VER KBase load version (e.g. r207.kbase.1). 
(default: source version) + --load_id LOAD_ID The load id of the objects being uploaded to a workspace + If not provided, a random load_id will be generated --token_filepath TOKEN_FILEPATH A file path that stores a KBase token appropriate for the KBase environment If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable @@ -59,6 +60,7 @@ import os import shutil import time +import uuid from collections import namedtuple from datetime import datetime from pathlib import Path @@ -78,7 +80,7 @@ JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" UPLOADED_YAML = "uploaded.yaml" -AssemblyTuple = namedtuple("AssemblyTuple", ["assembly_name", "host_assembly_dir", "container_internal_assembly_path"]) +_AssemblyTuple = namedtuple("AssemblyTuple", ["assembly_name", "host_assembly_dir", "container_internal_assembly_path"]) def _get_parser(): @@ -120,9 +122,10 @@ def _get_parser(): help=loader_common_names.ROOT_DIR_DESCR ) optional.add_argument( - f"--{loader_common_names.LOAD_VER_ARG_NAME}", + "--load_id", type=str, - help=loader_common_names.LOAD_VER_DESCR + " (default: source version)" + help="The load id of the objects being uploaded to a workspace. " + "If not provided, a random load_id will be generated." ) optional.add_argument( "--token_filepath", @@ -194,7 +197,8 @@ def _get_source_file(assembly_dir: str, assembly_file: str) -> str: def _upload_assemblies_to_workspace( asu: AssemblyUtil, workspace_id: int, - assembly_tuples: list[AssemblyTuple], + load_id: int, + assembly_tuples: list[_AssemblyTuple], ) -> tuple[str, ...]: """ Upload assembly files to the target workspace in batch. The bulk method fails @@ -204,19 +208,17 @@ def _upload_assemblies_to_workspace( { "file": assembly_tuple.container_internal_assembly_path, "assembly_name": assembly_tuple.assembly_name, + "object_metadata": {"load_id": load_id}, } for assembly_tuple in assembly_tuples ] - try: - assembly_ref = asu.save_assemblies_from_fastas( - { - "workspace_id": workspace_id, - "inputs": inputs - } - ) - except Exception as e: - raise e + assembly_ref = asu.save_assemblies_from_fastas( + { + "workspace_id": workspace_id, + "inputs": inputs + } + ) upas = tuple([result_dict["upa"].replace("/", "_") for result_dict in assembly_ref["results"]]) @@ -226,7 +228,7 @@ def _upload_assemblies_to_workspace( def _read_upload_status_yaml_file( upload_env_key: str, workspace_id: int, - load_version: str, + load_id: str, assembly_dir: str, assembly_name: str, ) -> tuple[dict[str, dict[int, list[str]]], bool]: @@ -249,19 +251,24 @@ def _read_upload_status_yaml_file( if workspace_id not in data[upload_env_key]: data[upload_env_key][workspace_id] = dict() - if load_version not in data[upload_env_key][workspace_id]: - data[upload_env_key][workspace_id][load_version] = dict() + workspace_dict = data[upload_env_key][workspace_id] + + if "file_name" not in workspace_dict: + workspace_dict["file_name"] = assembly_name - assembly_dict = data[upload_env_key][workspace_id][load_version] - if assembly_dict and assembly_dict["file_name"] == assembly_name: + if "loads" not in workspace_dict: + workspace_dict["loads"] = dict() + + if load_id in workspace_dict["loads"]: uploaded = True + return data, uploaded def _update_upload_status_yaml_file( upload_env_key: str, workspace_id: int, - load_version: str, + load_id: str, upa: str, assembly_dir: str, assembly_name: str, @@ -272,7 +279,7 @@ def _update_upload_status_yaml_file( data, uploaded = _read_upload_status_yaml_file( upload_env_key, workspace_id, - load_version, + 
load_id, assembly_dir, assembly_name, ) @@ -280,9 +287,7 @@ def _update_upload_status_yaml_file( if uploaded: raise ValueError(f"Assembly {assembly_name} already exists in workspace {workspace_id}") - data[upload_env_key][workspace_id][load_version] = { - "file_name": assembly_name, "upa": upa - } + data[upload_env_key][workspace_id]["loads"][load_id] = {"upa": upa} file_path = _get_yaml_file_path(assembly_dir) with open(file_path, "w") as file: @@ -292,7 +297,7 @@ def _update_upload_status_yaml_file( def _fetch_assemblies_to_upload( upload_env_key: str, workspace_id: int, - load_version: str, + load_id: str, collection_source_dir: str, upload_file_ext: list[str], ) -> tuple[int, dict[str, str]]: @@ -324,7 +329,7 @@ def _fetch_assemblies_to_upload( _, uploaded = _read_upload_status_yaml_file( upload_env_key, workspace_id, - load_version, + load_id, assembly_dir, assembly_name, ) @@ -332,7 +337,7 @@ def _fetch_assemblies_to_upload( if uploaded: print( f"Assembly {assembly_name} already exists in " - f"workspace {workspace_id} load {load_version}. Skipping." + f"workspace {workspace_id} load {load_id}. Skipping." ) continue @@ -341,6 +346,50 @@ def _fetch_assemblies_to_upload( return count, wait_to_upload_assemblies +def _query_workspace_with_load_id( + ws: Workspace, + workspace_id: int, + load_id: str, + assembly_names: list[str], +) -> tuple[list[str], list[str]]: + + if len(assembly_names) > 10000: + raise ValueError("The effective max batch size must be <= 10000") + + refs = [{"wsid": workspace_id, "name": name} for name in assembly_names] + res = ws.get_object_info3({"objects": refs, "ignoreErrors": 1, "includeMetadata": 1}) + uploaded_obj_names_batch = [ + info[1] + for info in res["infos"] + if info is not None and "load_id" in info[10] and info[10]["load_id"] == load_id + ] + uploaded_obj_upas_batch = [ + path[0].replace("/", "_") for path in res["paths"] if path is not None + ] + return uploaded_obj_names_batch, uploaded_obj_upas_batch + + +def _query_workspace_with_load_id_mass( + ws: Workspace, + workspace_id: int, + load_id: str, + assembly_names: list[str], + batch_size: int = 10000, +) -> tuple[list[str], list[str]]: + + uploaded_obj_names = [] + uploaded_obj_upas = [] + + for idx in range(0, len(assembly_names), batch_size): + obj_names_batch, obj_upas_batch = _query_workspace_with_load_id( + ws, workspace_id, load_id, assembly_names[idx: idx + batch_size] + ) + uploaded_obj_names.extend(obj_names_batch) + uploaded_obj_upas.extend(obj_upas_batch) + + return uploaded_obj_names, uploaded_obj_upas + + def _prepare_skd_job_dir_to_upload(conf: Conf, wait_to_upload_assemblies: dict[str, str]) -> str: """ Prepare SDK job directory to upload. 
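# What the job-dir prep amounts to: hardlink every pending assembly file into the
# shared SDK job directory so AssemblyUtil inside the callback container can read
# it. A minimal sketch, reusing helpers defined in this module (the real body is
# outside this hunk):
#
#     for assembly_name, assembly_dir in wait_to_upload_assemblies.items():
#         src_file = _get_source_file(assembly_dir, assembly_name)
#         dest_file = os.path.join(conf.job_data_dir, assembly_name)
#         loader_helper.create_hardlink_between_files(dest_file, src_file)
#     return conf.job_data_dir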
@@ -356,8 +405,8 @@ def _prepare_skd_job_dir_to_upload(conf: Conf, wait_to_upload_assemblies: dict[s def _post_process( upload_env_key: str, workspace_id: int, - load_version: str, - assembly_tuple: AssemblyTuple, + load_id: str, + assembly_tuple: _AssemblyTuple, upload_dir: str, output_dir: str, upa: str, @@ -383,7 +432,7 @@ def _post_process( _update_upload_status_yaml_file( upload_env_key, workspace_id, - load_version, + load_id, upa, assembly_tuple.host_assembly_dir, assembly_tuple.assembly_name, @@ -399,7 +448,7 @@ def _upload_assembly_files_in_parallel( ws: Workspace, upload_env_key: str, workspace_id: int, - load_version: str, + load_id: str, upload_dir: str, wait_to_upload_assemblies: dict[str, str], batch_size: int, @@ -413,7 +462,7 @@ def _upload_assembly_files_in_parallel( ws: Workspace client upload_env_key: environment variable key in uploaded.yaml file workspace_id: target workspace id - load_version: load version + load_id: load id upload_dir: a directory in collectionssource that creates new directories linking to sourcedata wait_to_upload_assemblies: a dictionary that maps assembly file name to assembly directory batch_size: a number of files to upload per batch @@ -429,26 +478,39 @@ def _upload_assembly_files_in_parallel( uploaded_fail = False for assembly_tuples in _gen(wait_to_upload_assemblies, batch_size): + batch_upas = tuple() + batch_uploaded_tuples = [] + try: - batch_upas = _upload_assemblies_to_workspace(asu, workspace_id, assembly_tuples) + batch_upas = _upload_assemblies_to_workspace( + asu, workspace_id, load_id, assembly_tuples + ) batch_uploaded_tuples = assembly_tuples except Exception as e: print(e) - name2tuple = {assembly_tuple.assembly_name: assembly_tuple for assembly_tuple in assembly_tuples} - refs = [{"wsid": workspace_id, "name": name} for name in name2tuple] - res = ws.get_object_info3({"objects": refs, "ignoreErrors": 1}) - - # figure out which uploads succeeded - batch_upas = tuple([path[0].replace("/", "_") for path in res["paths"] if path]) - batch_uploaded_tuples = [name2tuple[info[1]] for info in res["infos"] if info] uploaded_fail = True + try: + # figure out uploads that succeeded + name2tuple = {assembly_tuple.assembly_name: assembly_tuple + for assembly_tuple in assembly_tuples} + uploaded_obj_names, uploaded_obj_upas = _query_workspace_with_load_id_mass( + ws, workspace_id, load_id, list(name2tuple.keys()) + ) + + batch_upas = tuple(uploaded_obj_upas) + batch_uploaded_tuples = [name2tuple[name] for name in uploaded_obj_names] + + except Exception as e: + print(f"WARNING: There are inconsistencies between " + f"the workspace and the yaml files as the result of {e}") + # post process on sucessful uploads for assembly_tuple, upa in zip(batch_uploaded_tuples, batch_upas): _post_process( upload_env_key, workspace_id, - load_version, + load_id, assembly_tuple, upload_dir, output_dir, @@ -467,17 +529,26 @@ def _upload_assembly_files_in_parallel( return uploaded_count +def _dict2tuple_list(assemblies_dict: dict[str, str]) -> list[_AssemblyTuple]: + assemblyTuple_list = [ + _AssemblyTuple( + i[0], + i[1], + os.path.join(JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, i[0]) + ) + for i in assemblies_dict.items() + ] + return assemblyTuple_list + + def _gen( wait_to_upload_assemblies: dict[str, str], batch_size: int, -) -> Generator[list[AssemblyTuple], None, None]: +) -> Generator[list[_AssemblyTuple], None, None]: """ Generator function to yield the assembly files to upload. 
""" - assemblyTuple_list = [ - AssemblyTuple(i[0], i[1], os.path.join(JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, i[0])) - for i in wait_to_upload_assemblies.items() - ] + assemblyTuple_list = _dict2tuple_list(wait_to_upload_assemblies) # yield AssemblyTuples in batch for idx in range(0, len(wait_to_upload_assemblies), batch_size): yield assemblyTuple_list[idx: idx + batch_size] @@ -490,7 +561,6 @@ def main(): workspace_id = args.workspace_id kbase_collection = getattr(args, loader_common_names.KBASE_COLLECTION_ARG_NAME) source_version = getattr(args, loader_common_names.SOURCE_VER_ARG_NAME) - load_version = getattr(args, loader_common_names.LOAD_VER_ARG_NAME) root_dir = getattr(args, loader_common_names.ROOT_DIR_ARG_NAME) token_filepath = args.token_filepath upload_file_ext = args.upload_file_ext @@ -498,9 +568,11 @@ def main(): cbs_max_tasks = args.cbs_max_tasks au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir - if not load_version: - load_version = source_version - print(f"load version is {load_version}.\n" + load_id = args.load_id + if not load_id: + print("load_id is not provided. Generating a load_id ...") + load_id = uuid.uuid4().hex + print(f"load_id is {load_id}.\n" f"Please keep using this load version until the load is complete!") env = args.env @@ -551,11 +623,48 @@ def main(): count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( env, workspace_id, - load_version, + load_id, collection_source_dir, upload_file_ext, ) + # set up workspace client + ws_url = os.path.join(kb_base_url, "ws") + ws = Workspace(ws_url, token=conf.token) + + uploaded_obj_names, uploaded_obj_upas = _query_workspace_with_load_id_mass( + ws, workspace_id, load_id, list(wait_to_upload_assemblies.keys()) + ) + # fix inconsistencies between the workspace and the local yaml files + if uploaded_obj_names: + print(uploaded_obj_names) + if click.confirm(f"\nThese objects had been successfully uploaded to\n" + f"workspace {workspace_id} per the load_id {load_id} in their metadata,\n" + f"but missing from the local uploaded.yaml files. 
Start failure recovery now?\n"): + wait_to_update_assemblies = { + assembly_name: wait_to_upload_assemblies[assembly_name] + for assembly_name in uploaded_obj_names + } + uploaded_tuples = _dict2tuple_list(wait_to_update_assemblies) + for assembly_tuple, upa in zip( + uploaded_tuples, uploaded_obj_upas + ): + _post_process( + env, + workspace_id, + load_id, + assembly_tuple, + upload_dir, + output_dir, + upa, + ) + # remove assemblies that are already uploaded + for assembly_name in uploaded_obj_names: + wait_to_upload_assemblies.pop(assembly_name) + else: + print("Failure recovery permission denied and exiting ...") + return + if not wait_to_upload_assemblies: print(f"All {count} assembly files already exist in workspace {workspace_id}") return @@ -567,8 +676,6 @@ def main(): data_dir = _prepare_skd_job_dir_to_upload(conf, wait_to_upload_assemblies) print(f"{wtus_len} assemblies in {data_dir} are ready to upload to workspace {workspace_id}") - ws_url = os.path.join(kb_base_url, "ws") - ws = Workspace(ws_url, token=conf.token) asu = AssemblyUtil( conf.callback_url, service_ver=au_service_ver, token=conf.token ) @@ -578,7 +685,7 @@ def main(): ws, env, workspace_id, - load_version, + load_id, upload_dir, wait_to_upload_assemblies, batch_size, From 744fe56fcd0d00b9f0a0bbaa39c2a2af9272db69 Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 19 Jan 2024 17:01:47 -0800 Subject: [PATCH 22/30] fix all the tests --- .../workspace_uploader_test.py | 100 ++++++++++++++++-- 1 file changed, 89 insertions(+), 11 deletions(-) diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 1f39088ee..5b7cc6091 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -87,9 +87,9 @@ def test_read_upload_status_yaml_file(setup_and_teardown): data, uploaded = workspace_uploader._read_upload_status_yaml_file( "CI", 12345, "214", assembly_dir, assembly_name ) - - expected_data = {"CI": {12345: {"214": dict()}}} - + expected_data = { + "CI": {12345: {"file_name": assembly_name, "loads": {}}} + } assert not uploaded assert expected_data == data @@ -107,7 +107,7 @@ def test_update_upload_status_yaml_file(setup_and_teardown): ) expected_data = { - "CI": {12345: {"214": {"file_name": assembly_name, "upa": "12345_58_1"}}} + "CI": {12345: {"file_name": assembly_name, "loads": {"214": {"upa": "12345_58_1"}}}} } assert uploaded @@ -195,7 +195,7 @@ def test_post_process(setup_and_teardown): host_assembly_dir = params.assembly_dirs[0] assembly_name = ASSEMBLY_NAMES[0] src_file = params.target_files[0] - assembly_tuple = workspace_uploader.AssemblyTuple( + assembly_tuple = workspace_uploader._AssemblyTuple( assembly_name, host_assembly_dir, "/path/to/file/in/AssembilyUtil" ) @@ -213,7 +213,7 @@ def test_post_process(setup_and_teardown): "CI", 88888, "214", host_assembly_dir, assembly_name ) expected_data = { - "CI": {88888: {"214": {"file_name": assembly_name, "upa": "12345_58_1"}}} + "CI": {88888: {"file_name": assembly_name, "loads": {"214": {"upa": "12345_58_1"}}}} } dest_file = os.path.join( @@ -237,10 +237,12 @@ def test_upload_assembly_to_workspace(setup_and_teardown): asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) asu.save_assemblies_from_fastas.return_value = {"results":[{"upa": "12345/58/1"}]} - assembly_tuple = workspace_uploader.AssemblyTuple( + assembly_tuple = workspace_uploader._AssemblyTuple( assembly_name, 
host_assembly_dir, "/path/to/file/in/AssembilyUtil" ) - upas = workspace_uploader._upload_assemblies_to_workspace(asu, 12345, [assembly_tuple]) + upas = workspace_uploader._upload_assemblies_to_workspace( + asu, 12345, "214", [assembly_tuple] + ) assert upas == tuple(["12345_58_1"]) asu.save_assemblies_from_fastas.assert_called_once_with( { @@ -248,7 +250,8 @@ def test_upload_assembly_to_workspace(setup_and_teardown): "inputs": [ { "file": assembly_tuple.container_internal_assembly_path, - "assembly_name": assembly_tuple.assembly_name + "assembly_name": assembly_tuple.assembly_name, + "object_metadata": {"load_id": "214"}, } ] } @@ -265,14 +268,14 @@ def test_generator(setup_and_teardown): assemblyTuple_list = list(workspace_uploader._gen(wait_to_upload_assemblies, 1)) expected_assemblyTuple_list = [ [ - workspace_uploader.AssemblyTuple( + workspace_uploader._AssemblyTuple( "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", assembly_dirs[0], "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", ) ], [ - workspace_uploader.AssemblyTuple( + workspace_uploader._AssemblyTuple( "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", assembly_dirs[1], "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", @@ -370,3 +373,78 @@ def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): ) assert uploaded_count == 0 + + +def test_query_workspace_with_load_id_mass(setup_and_teardown): + ws = create_autospec(Workspace, spec_set=True, instance=True) + with pytest.raises( + Exception, match="The effective max batch size must be <= 10000" + ): + workspace_uploader._query_workspace_with_load_id_mass( + ws, + 12345, + "214", + [str(num) for num in range(100001)], + 10001, + ) + + # happy test + ws.get_object_info3.return_value = { + 'infos': [ + [ + 1086, + 'GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz', + 'KBaseGenomeAnnotations.Assembly-6.3', + '2024-01-18T23:12:44+0000', + 18, + 'sijiex', + 69046, + 'sijiex:narrative_1688077625427', + 'aaa726d2b976e27e729ac288812e81f6', + 71823, + { + 'GC content': '0.41571', + 'Size': '4079204', + 'N Contigs': '260', + 'MD5': '8aa6b1244e18c4f93bb3307902bd3a4d', + "load_id": "998" + } + ], + [ + 1068, + 'GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz', + 'KBaseGenomeAnnotations.Assembly-6.3', + '2024-01-18T23:12:35+0000', + 18, + 'sijiex', + 69046, + 'sijiex:narrative_1688077625427', + '866033827fd54569c953e8b3dd58d0aa', + 38242, + { + 'GC content': '0.41526', + 'Size': '4092300', + 'N Contigs': '136', + 'MD5': '1e007bad0811a6d6e09a882d3bf802ab', + "load_id": "998" + } + ], + None], + 'paths': [['69046/1086/18'], ['69046/1068/18'], None] + } + + obj_names, obj_upas = workspace_uploader._query_workspace_with_load_id_mass( + ws, + 69046, + "998", + [ + "GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz", + "GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz", + "aloha", + ] + ) + assert obj_names == [ + "GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz", + "GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz", + ] + assert obj_upas == ["69046_1086_18", "69046_1068_18"] From c64ce1dad81617509900a5779d432d62697363d9 Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 19 Jan 2024 17:43:01 -0800 Subject: [PATCH 23/30] fix format --- .../workspace_uploader/workspace_uploader.py | 236 ++++++++++-------- .../workspace_uploader_test.py | 6 +- 2 files changed, 137 insertions(+), 105 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 86dde1395..0632e12ec 100644 --- 
a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -76,11 +76,15 @@ # setup KB_AUTH_TOKEN as env or provide a token_filepath in --token_filepath # export KB_AUTH_TOKEN="your-kb-auth-token" -UPLOAD_FILE_EXT = ["genomic.fna.gz"] # uplaod only files that match given extensions -JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" -UPLOADED_YAML = "uploaded.yaml" +_UPLOAD_FILE_EXT = ["genomic.fna.gz"] # uplaod only files that match given extensions +_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" +_UPLOADED_YAML = "uploaded.yaml" +_WS_MAX_BATCH_SIZE = 10000 -_AssemblyTuple = namedtuple("AssemblyTuple", ["assembly_name", "host_assembly_dir", "container_internal_assembly_path"]) +_AssemblyTuple = namedtuple( + "AssemblyTuple", + ["assembly_name", "host_assembly_dir", "container_internal_assembly_path"], +) def _get_parser(): @@ -103,15 +107,15 @@ def _get_parser(): f"--{loader_common_names.KBASE_COLLECTION_ARG_NAME}", type=str, help="The name of the collection being processed. " - "Specifies where the files to be uploaded exist (in the default NONE environment) " - "and the name of the collection to be created in the specific KBase environment in the 'collectionsource' directory.", + "Specifies where the files to be uploaded exist (in the default NONE environment) " + "and the name of the collection to be created in the specific KBase environment in the 'collectionsource' directory.", ) required.add_argument( f"--{loader_common_names.SOURCE_VER_ARG_NAME}", type=str, help="The source version of the collection being processed. " - "Specifies where the files to be uploaded exist (in the default NONE environment) " - "and the source version of the collection to be created in the specific KBase environment in the 'collectionsource' directory.", + "Specifies where the files to be uploaded exist (in the default NONE environment) " + "and the source version of the collection to be created in the specific KBase environment in the 'collectionsource' directory.", ) # Optional argument @@ -119,20 +123,20 @@ def _get_parser(): f"--{loader_common_names.ROOT_DIR_ARG_NAME}", type=str, default=loader_common_names.ROOT_DIR, - help=loader_common_names.ROOT_DIR_DESCR + help=loader_common_names.ROOT_DIR_DESCR, ) optional.add_argument( "--load_id", type=str, help="The load id of the objects being uploaded to a workspace. " - "If not provided, a random load_id will be generated." + "If not provided, a random load_id will be generated.", ) optional.add_argument( "--token_filepath", type=str, help="A file path that stores a KBase token appropriate for the KBase environment. " - "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. " - "Also use as an admin token if the user has admin permission to catalog params." + "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. 
" + "Also use as an admin token if the user has admin permission to catalog params.", ) optional.add_argument( "--env", @@ -145,7 +149,7 @@ def _get_parser(): "--upload_file_ext", type=str, nargs="+", - default=UPLOAD_FILE_EXT, + default=_UPLOAD_FILE_EXT, help="Upload only files that match given extensions", ) optional.add_argument( @@ -165,7 +169,7 @@ def _get_parser(): type=str, default="release", help="The service version of AssemblyUtil client" - "('dev', 'beta', 'release', or a git commit)", + "('dev', 'beta', 'release', or a git commit)", ) optional.add_argument( "--keep_job_dir", @@ -179,7 +183,7 @@ def _get_yaml_file_path(assembly_dir: str) -> str: """ Get the uploaded.yaml file path from collections source directory. """ - file_path = os.path.join(assembly_dir, UPLOADED_YAML) + file_path = os.path.join(assembly_dir, _UPLOADED_YAML) Path(file_path).touch(exist_ok=True) return file_path @@ -195,10 +199,10 @@ def _get_source_file(assembly_dir: str, assembly_file: str) -> str: def _upload_assemblies_to_workspace( - asu: AssemblyUtil, - workspace_id: int, - load_id: int, - assembly_tuples: list[_AssemblyTuple], + asu: AssemblyUtil, + workspace_id: int, + load_id: int, + assembly_tuples: list[_AssemblyTuple], ) -> tuple[str, ...]: """ Upload assembly files to the target workspace in batch. The bulk method fails @@ -214,23 +218,24 @@ def _upload_assemblies_to_workspace( ] assembly_ref = asu.save_assemblies_from_fastas( - { - "workspace_id": workspace_id, - "inputs": inputs - } + {"workspace_id": workspace_id, "inputs": inputs} ) - upas = tuple([result_dict["upa"].replace("/", "_") - for result_dict in assembly_ref["results"]]) + upas = tuple( + [ + result_dict["upa"].replace("/", "_") + for result_dict in assembly_ref["results"] + ] + ) return upas def _read_upload_status_yaml_file( - upload_env_key: str, - workspace_id: int, - load_id: str, - assembly_dir: str, - assembly_name: str, + upload_env_key: str, + workspace_id: int, + load_id: str, + assembly_dir: str, + assembly_name: str, ) -> tuple[dict[str, dict[int, list[str]]], bool]: """ Get metadata and upload status of an assembly from the uploaded.yaml file. @@ -238,7 +243,9 @@ def _read_upload_status_yaml_file( uploaded = False if upload_env_key not in loader_common_names.KB_ENV: - raise ValueError(f"Currently only support these {loader_common_names.KB_ENV} envs for upload") + raise ValueError( + f"Currently only support these {loader_common_names.KB_ENV} envs for upload" + ) file_path = _get_yaml_file_path(assembly_dir) @@ -266,12 +273,12 @@ def _read_upload_status_yaml_file( def _update_upload_status_yaml_file( - upload_env_key: str, - workspace_id: int, - load_id: str, - upa: str, - assembly_dir: str, - assembly_name: str, + upload_env_key: str, + workspace_id: int, + load_id: str, + upa: str, + assembly_dir: str, + assembly_name: str, ) -> None: """ Update the uploaded.yaml file in target genome_dir with newly uploaded assembly names and upa info. 
@@ -285,7 +292,9 @@ def _update_upload_status_yaml_file( ) if uploaded: - raise ValueError(f"Assembly {assembly_name} already exists in workspace {workspace_id}") + raise ValueError( + f"Assembly {assembly_name} already exists in workspace {workspace_id}" + ) data[upload_env_key][workspace_id]["loads"][load_id] = {"upa": upa} @@ -295,11 +304,11 @@ def _update_upload_status_yaml_file( def _fetch_assemblies_to_upload( - upload_env_key: str, - workspace_id: int, - load_id: str, - collection_source_dir: str, - upload_file_ext: list[str], + upload_env_key: str, + workspace_id: int, + load_id: str, + collection_source_dir: str, + upload_file_ext: list[str], ) -> tuple[int, dict[str, str]]: count = 0 wait_to_upload_assemblies = dict() @@ -311,7 +320,6 @@ def _fetch_assemblies_to_upload( ] for assembly_dir in assembly_dirs: - assembly_file_list = [ f for f in os.listdir(assembly_dir) @@ -320,8 +328,10 @@ def _fetch_assemblies_to_upload( ] if len(assembly_file_list) != 1: - raise ValueError(f"One and only one assembly file that ends with {upload_file_ext} " - f"must be present in {assembly_dir} directory") + raise ValueError( + f"One and only one assembly file that ends with {upload_file_ext} " + f"must be present in {assembly_dir} directory" + ) count += 1 assembly_name = assembly_file_list[0] @@ -352,12 +362,13 @@ def _query_workspace_with_load_id( load_id: str, assembly_names: list[str], ) -> tuple[list[str], list[str]]: - - if len(assembly_names) > 10000: + if len(assembly_names) > _WS_MAX_BATCH_SIZE: raise ValueError("The effective max batch size must be <= 10000") refs = [{"wsid": workspace_id, "name": name} for name in assembly_names] - res = ws.get_object_info3({"objects": refs, "ignoreErrors": 1, "includeMetadata": 1}) + res = ws.get_object_info3( + {"objects": refs, "ignoreErrors": 1, "includeMetadata": 1} + ) uploaded_obj_names_batch = [ info[1] for info in res["infos"] @@ -374,15 +385,14 @@ def _query_workspace_with_load_id_mass( workspace_id: int, load_id: str, assembly_names: list[str], - batch_size: int = 10000, + batch_size: int = _WS_MAX_BATCH_SIZE, ) -> tuple[list[str], list[str]]: - uploaded_obj_names = [] uploaded_obj_upas = [] for idx in range(0, len(assembly_names), batch_size): obj_names_batch, obj_upas_batch = _query_workspace_with_load_id( - ws, workspace_id, load_id, assembly_names[idx: idx + batch_size] + ws, workspace_id, load_id, assembly_names[idx : idx + batch_size] ) uploaded_obj_names.extend(obj_names_batch) uploaded_obj_upas.extend(obj_upas_batch) @@ -390,7 +400,9 @@ def _query_workspace_with_load_id_mass( return uploaded_obj_names, uploaded_obj_upas -def _prepare_skd_job_dir_to_upload(conf: Conf, wait_to_upload_assemblies: dict[str, str]) -> str: +def _prepare_skd_job_dir_to_upload( + conf: Conf, wait_to_upload_assemblies: dict[str, str] +) -> str: """ Prepare SDK job directory to upload. """ @@ -403,13 +415,13 @@ def _prepare_skd_job_dir_to_upload(conf: Conf, wait_to_upload_assemblies: dict[s def _post_process( - upload_env_key: str, - workspace_id: int, - load_id: str, - assembly_tuple: _AssemblyTuple, - upload_dir: str, - output_dir: str, - upa: str, + upload_env_key: str, + workspace_id: int, + load_id: str, + assembly_tuple: _AssemblyTuple, + upload_dir: str, + output_dir: str, + upa: str, ) -> None: """ Create a standard entry in sourcedata/workspace for each assembly. 
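# On a successful upload of one assembly, _post_process leaves behind, roughly
# (paths illustrative, derived from the output_dir / upload_dir arguments):
#
#     <output_dir>/<upa>/<upa>.fna.gz           hardlink to the original assembly file
#     <upload_dir>/<upa> -> <output_dir>/<upa>   softlink in collectionssource
#     <host_assembly_dir>/uploaded.yaml          records loads[<load_id>] = {"upa": <upa>}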
@@ -444,15 +456,15 @@ def _post_process( def _upload_assembly_files_in_parallel( - asu: AssemblyUtil, - ws: Workspace, - upload_env_key: str, - workspace_id: int, - load_id: str, - upload_dir: str, - wait_to_upload_assemblies: dict[str, str], - batch_size: int, - output_dir: str, + asu: AssemblyUtil, + ws: Workspace, + upload_env_key: str, + workspace_id: int, + load_id: str, + upload_dir: str, + wait_to_upload_assemblies: dict[str, str], + batch_size: int, + output_dir: str, ) -> list[str]: """ Upload assembly files to the target workspace in parallel using multiprocessing. @@ -477,7 +489,6 @@ def _upload_assembly_files_in_parallel( uploaded_count = 0 uploaded_fail = False for assembly_tuples in _gen(wait_to_upload_assemblies, batch_size): - batch_upas = tuple() batch_uploaded_tuples = [] @@ -492,18 +503,27 @@ def _upload_assembly_files_in_parallel( try: # figure out uploads that succeeded - name2tuple = {assembly_tuple.assembly_name: assembly_tuple - for assembly_tuple in assembly_tuples} - uploaded_obj_names, uploaded_obj_upas = _query_workspace_with_load_id_mass( + name2tuple = { + assembly_tuple.assembly_name: assembly_tuple + for assembly_tuple in assembly_tuples + } + ( + uploaded_obj_names, + uploaded_obj_upas, + ) = _query_workspace_with_load_id_mass( ws, workspace_id, load_id, list(name2tuple.keys()) ) batch_upas = tuple(uploaded_obj_upas) - batch_uploaded_tuples = [name2tuple[name] for name in uploaded_obj_names] + batch_uploaded_tuples = [ + name2tuple[name] for name in uploaded_obj_names + ] except Exception as e: - print(f"WARNING: There are inconsistencies between " - f"the workspace and the yaml files as the result of {e}") + print( + f"WARNING: There are inconsistencies between " + f"the workspace and the yaml files as the result of {e}" + ) # post process on sucessful uploads for assembly_tuple, upa in zip(batch_uploaded_tuples, batch_upas): @@ -514,14 +534,16 @@ def _upload_assembly_files_in_parallel( assembly_tuple, upload_dir, output_dir, - upa + upa, ) uploaded_count += len(batch_uploaded_tuples) if uploaded_count % 100 == 0: - print(f"Assemblies uploaded: {uploaded_count}/{assembly_files_len}, " - f"Percentage: {uploaded_count / assembly_files_len * 100:.2f}%, " - f"Time: {datetime.now()}") + print( + f"Assemblies uploaded: {uploaded_count}/{assembly_files_len}, " + f"Percentage: {uploaded_count / assembly_files_len * 100:.2f}%, " + f"Time: {datetime.now()}" + ) if uploaded_fail: return uploaded_count @@ -532,9 +554,7 @@ def _upload_assembly_files_in_parallel( def _dict2tuple_list(assemblies_dict: dict[str, str]) -> list[_AssemblyTuple]: assemblyTuple_list = [ _AssemblyTuple( - i[0], - i[1], - os.path.join(JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, i[0]) + i[0], i[1], os.path.join(_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, i[0]) ) for i in assemblies_dict.items() ] @@ -542,8 +562,8 @@ def _dict2tuple_list(assemblies_dict: dict[str, str]) -> list[_AssemblyTuple]: def _gen( - wait_to_upload_assemblies: dict[str, str], - batch_size: int, + wait_to_upload_assemblies: dict[str, str], + batch_size: int, ) -> Generator[list[_AssemblyTuple], None, None]: """ Generator function to yield the assembly files to upload. 
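# For a two-entry dict with batch_size=1 the generator yields two singleton
# batches; values below are illustrative and mirror the unit test:
#
#     d = {"a.fna.gz": "/host/a", "b.fna.gz": "/host/b"}
#     list(_gen(d, 1))
#     # -> [[_AssemblyTuple("a.fna.gz", "/host/a", "/kb/module/work/tmp/a.fna.gz")],
#     #     [_AssemblyTuple("b.fna.gz", "/host/b", "/kb/module/work/tmp/b.fna.gz")]]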
@@ -551,7 +571,7 @@ def _gen( assemblyTuple_list = _dict2tuple_list(wait_to_upload_assemblies) # yield AssemblyTuples in batch for idx in range(0, len(wait_to_upload_assemblies), batch_size): - yield assemblyTuple_list[idx: idx + batch_size] + yield assemblyTuple_list[idx : idx + batch_size] def main(): @@ -572,8 +592,10 @@ def main(): if not load_id: print("load_id is not provided. Generating a load_id ...") load_id = uuid.uuid4().hex - print(f"load_id is {load_id}.\n" - f"Please keep using this load version until the load is complete!") + print( + f"load_id is {load_id}.\n" + f"Please keep using this load version until the load is complete!" + ) env = args.env kb_base_url = loader_common_names.KB_BASE_URL_MAP[env] @@ -592,7 +614,9 @@ def main(): collection_source_dir = loader_helper.make_collection_source_dir( root_dir, loader_common_names.DEFAULT_ENV, kbase_collection, source_version ) - upload_dir = loader_helper.make_collection_source_dir(root_dir, env, kbase_collection, source_version) + upload_dir = loader_helper.make_collection_source_dir( + root_dir, env, kbase_collection, source_version + ) output_dir = loader_helper.make_sourcedata_ws_dir(root_dir, env, workspace_id) proc = None @@ -601,9 +625,11 @@ def main(): try: # setup container.conf file for the callback server logs if needed if loader_helper.is_config_modification_required(): - if click.confirm(f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" - f"needs to be modified to allow for container logging.\n" - f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. Do so now?\n"): + if click.confirm( + f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" + f"needs to be modified to allow for container logging.\n" + f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. Do so now?\n" + ): loader_helper.setup_callback_server_logs() else: print("Permission denied and exiting ...") @@ -638,17 +664,17 @@ def main(): # fix inconsistencies between the workspace and the local yaml files if uploaded_obj_names: print(uploaded_obj_names) - if click.confirm(f"\nThese objects had been successfully uploaded to\n" - f"workspace {workspace_id} per the load_id {load_id} in their metadata,\n" - f"but missing from the local uploaded.yaml files. Start failure recovery now?\n"): + if click.confirm( + f"\nThese objects had been successfully uploaded to\n" + f"workspace {workspace_id} per the load_id {load_id} in their metadata,\n" + f"but missing from the local uploaded.yaml files. 
Start failure recovery now?\n" + ): wait_to_update_assemblies = { assembly_name: wait_to_upload_assemblies[assembly_name] for assembly_name in uploaded_obj_names } uploaded_tuples = _dict2tuple_list(wait_to_update_assemblies) - for assembly_tuple, upa in zip( - uploaded_tuples, uploaded_obj_upas - ): + for assembly_tuple, upa in zip(uploaded_tuples, uploaded_obj_upas): _post_process( env, workspace_id, @@ -666,7 +692,9 @@ def main(): return if not wait_to_upload_assemblies: - print(f"All {count} assembly files already exist in workspace {workspace_id}") + print( + f"All {count} assembly files already exist in workspace {workspace_id}" + ) return wtus_len = len(wait_to_upload_assemblies) @@ -674,7 +702,9 @@ def main(): print(f"Detected {count - wtus_len} assembly files already exist in workspace") data_dir = _prepare_skd_job_dir_to_upload(conf, wait_to_upload_assemblies) - print(f"{wtus_len} assemblies in {data_dir} are ready to upload to workspace {workspace_id}") + print( + f"{wtus_len} assemblies in {data_dir} are ready to upload to workspace {workspace_id}" + ) asu = AssemblyUtil( conf.callback_url, service_ver=au_service_ver, token=conf.token @@ -695,8 +725,10 @@ def main(): upload_time = (time.time() - start) / 60 assy_per_min = uploaded_count / upload_time - print(f"\ntook {upload_time:.2f} minutes to upload {uploaded_count} assemblies, " - f"averaging {assy_per_min:.2f} assemblies per minute") + print( + f"\ntook {upload_time:.2f} minutes to upload {uploaded_count} assemblies, " + f"averaging {assy_per_min:.2f} assemblies per minute" + ) finally: # stop callback server if it is on diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 5b7cc6091..0d21f2b34 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -64,7 +64,7 @@ def test_get_yaml_file_path(setup_and_teardown): assembly_dir = params.assembly_dirs[0] yaml_path = workspace_uploader._get_yaml_file_path(assembly_dir) - expected_yaml_path = os.path.join(assembly_dir, workspace_uploader.UPLOADED_YAML) + expected_yaml_path = os.path.join(assembly_dir, workspace_uploader._UPLOADED_YAML) assert expected_yaml_path == yaml_path assert os.path.exists(yaml_path) @@ -129,7 +129,7 @@ def test_fetch_assemblies_to_upload(setup_and_teardown): 12345, "214", collection_source_dir, - workspace_uploader.UPLOAD_FILE_EXT, + workspace_uploader._UPLOAD_FILE_EXT, ) expected_count = len(ASSEMBLY_NAMES) @@ -158,7 +158,7 @@ def test_fetch_assemblies_to_upload(setup_and_teardown): 12345, "214", collection_source_dir, - workspace_uploader.UPLOAD_FILE_EXT, + workspace_uploader._UPLOAD_FILE_EXT, ) assert expected_count == new_count From e3a9b4cf44fe9294c761489483224ffb0f25c722 Mon Sep 17 00:00:00 2001 From: Sijie Date: Sun, 21 Jan 2024 19:15:20 -0800 Subject: [PATCH 24/30] fix race condition on containers.conf --- src/loaders/common/loader_helper.py | 31 ++++++---------- .../workspace_uploader/workspace_uploader.py | 36 ++++++++++++------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index f19387e86..f9a9c4ac5 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -1,13 +1,11 @@ import argparse import configparser -import fcntl import itertools import json import os import socket import stat import subprocess -import threading import time 
import uuid from collections import defaultdict @@ -32,7 +30,6 @@ from src.loaders.common.loader_common_names import ( COLLECTION_SOURCE_DIR, CONTAINERS_CONF_PARAMS, - CONTAINERS_CONF_PATH, DOCKER_HOST, FATAL_ERROR, FATAL_STACKTRACE, @@ -294,17 +291,16 @@ def start_podman_service(uid: int): return proc -def _get_containers_config(): +def _get_containers_config(conf_path: str): """Get containers.conf file at home directory.""" - conf_path = os.path.expanduser(CONTAINERS_CONF_PATH) config = configparser.ConfigParser() config.read(conf_path) - return config, conf_path + return config -def is_config_modification_required(): +def is_config_modification_required(conf_path: str): """check if the config requires modification.""" - config, _ = _get_containers_config() + config = _get_containers_config(conf_path) if not config.has_section("containers"): return True for key, val in CONTAINERS_CONF_PARAMS.items(): @@ -313,22 +309,17 @@ def is_config_modification_required(): return False -def setup_callback_server_logs(): +def setup_callback_server_logs(conf_path: str): """Set up containers.conf file for the callback server logs.""" - config, conf_path = _get_containers_config() - with open(conf_path, 'w') as writer: - fcntl.flock(writer.fileno(), fcntl.LOCK_EX) + config = _get_containers_config(conf_path) - if not config.has_section("containers"): - config.add_section("containers") - - for key, val in CONTAINERS_CONF_PARAMS.items(): - config.set("containers", key, val) + if not config.has_section("containers"): + config.add_section("containers") - config.write(writer) - print(f"containers.conf is modified and saved to path: {conf_path}") + for key, val in CONTAINERS_CONF_PARAMS.items(): + config.set("containers", key, val) - fcntl.flock(writer.fileno(), fcntl.LOCK_UN) + return config def is_upa_info_complete(upa_dir: str): diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 0632e12ec..09e6342d0 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -57,6 +57,7 @@ """ import argparse import click +import fcntl import os import shutil import time @@ -363,8 +364,9 @@ def _query_workspace_with_load_id( assembly_names: list[str], ) -> tuple[list[str], list[str]]: if len(assembly_names) > _WS_MAX_BATCH_SIZE: - raise ValueError("The effective max batch size must be <= 10000") - + raise ValueError( + f"The effective max batch size must be <= {_WS_MAX_BATCH_SIZE}" + ) refs = [{"wsid": workspace_id, "name": name} for name in assembly_names] res = ws.get_object_info3( {"objects": refs, "ignoreErrors": 1, "includeMetadata": 1} @@ -624,16 +626,26 @@ def main(): try: # setup container.conf file for the callback server logs if needed - if loader_helper.is_config_modification_required(): - if click.confirm( - f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" - f"needs to be modified to allow for container logging.\n" - f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. 
Do so now?\n" - ): - loader_helper.setup_callback_server_logs() - else: - print("Permission denied and exiting ...") - return + setup_permission = True + conf_path = os.path.expanduser(loader_common_names.CONTAINERS_CONF_PATH) + with open(conf_path, "w") as writer: + fcntl.flock(writer.fileno(), fcntl.LOCK_EX) + if loader_helper.is_config_modification_required(conf_path): + if click.confirm( + f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" + f"needs to be modified to allow for container logging.\n" + f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. Do so now?\n" + ): + config = loader_helper.setup_callback_server_logs(conf_path) + config.write(writer) + print(f"containers.conf is modified and saved to path: {conf_path}") + else: + setup_permission = False + fcntl.flock(writer.fileno(), fcntl.LOCK_UN) + + if not setup_permission: + print("Permission denied and exiting ...") + return # start podman service proc = loader_helper.start_podman_service(uid) From 51d00da4182b09e40b4cb44a2d32dc6bcc70a6d6 Mon Sep 17 00:00:00 2001 From: Sijie Date: Sun, 21 Jan 2024 23:48:06 -0800 Subject: [PATCH 25/30] add --as_catalog_admin CLI param --- src/loaders/common/callback_server_wrapper.py | 29 ++++++++++++++----- .../workspace_uploader/workspace_uploader.py | 21 +++++++++++--- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index 29794559e..085de71bc 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -20,7 +20,6 @@ class Conf: Instance variables: token - a KBase token appropriate for the KBase environment - Also use as an admin token if the user has admin permission to catalog params callback_url - the url of the callback service to contact job_data_dir - the directory for SDK jobs per user input_queue - queue for the workspace downloader tasks @@ -46,6 +45,7 @@ def __init__( retrieve_sample: bool = False, ignore_no_sample_error: bool = False, workspace_downloader: bool = False, + catalog_admin: bool = False, ) -> None: """ Initialize the configuration class. @@ -56,7 +56,7 @@ def __init__( kb_base_url (str): The base url of the KBase services. token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. - Also use as an admin token if the user has admin permission to catalog params. + The KB_ADMIN_AUTH_TOKEN environment variable will get set by this token if the user runs as catalog admin. au_service_ver (str): The service verison of AssemblyUtilClient ('dev', 'beta', 'release', or a git commit). workers (int): The number of workers to use for multiprocessing. @@ -65,6 +65,7 @@ def __init__( retrieve_sample (bool): Whether to retrieve sample for each genome object. ignore_no_sample_error (bool): Whether to ignore the error when no sample data is found. workspace_downloader (bool): Whether to be used for the workspace downloader script. + catalog_admin (bool): Whether to run the callback server as catalog admin. 
""" ipv4 = loader_helper.get_ip() port = loader_helper.find_free_port() @@ -83,6 +84,7 @@ def __init__( port, max_callback_server_tasks, ipv4, + catalog_admin, ) self.callback_url = "http://" + ipv4 + ":" + str(port) @@ -123,6 +125,7 @@ def _setup_callback_server_envs( port: int, max_callback_server_tasks: int, ipv4: str, + catalog_admin: bool, ) -> Tuple[dict[str, Union[int, str]], dict[str, dict[str, str]]]: """ Setup the environment variables and volumes for the callback server. @@ -130,10 +133,11 @@ def _setup_callback_server_envs( Args: job_dir (str): The directory for SDK jobs per user. kb_base_url (str): The base url of the KBase services. - token (str): The KBase token. Also used as a catalog admin token if valid. + token (str): The KBase token. port (int): The port number for the callback server. max_callback_server_tasks (int): The maxmium subtasks for the callback server. - ipv4: (str): The ipv4 address for the callback server. + ipv4 (str): The ipv4 address for the callback server. + catalog_admin (bool): Whether to run the callback server as catalog admin. Returns: tuple: A tuple of the environment variables and volumes for the callback server. @@ -144,7 +148,6 @@ def _setup_callback_server_envs( # used by the callback server env["KB_AUTH_TOKEN"] = token - env["KB_ADMIN_AUTH_TOKEN"] = token # pass in catalog admin token to get catalog params env["KB_BASE_URL"] = kb_base_url env["JOB_DIR"] = job_dir env["CALLBACK_PORT"] = port @@ -152,6 +155,10 @@ def _setup_callback_server_envs( env["CALLBACK_IP"] = ipv4 # specify an ipv4 address for the callback server # otherwise, the callback container will use the an ipv6 address + # set admin token to get catalog secure params + if catalog_admin: + env["KB_ADMIN_AUTH_TOKEN"] = token + # setup volumes required for docker container docker_host = os.environ["DOCKER_HOST"] if docker_host.startswith("unix:"): @@ -172,6 +179,7 @@ def _start_callback_server( port: int, max_callback_server_tasks: int, ipv4: str, + catalog_admin: bool, ) -> None: """ Start the callback server. @@ -184,10 +192,17 @@ def _start_callback_server( token (str): The KBase token. max_callback_server_tasks (int): The maxmium subtasks for the callback server. port (int): The port number for the callback server. - ipv4: (str): The ipv4 address for the callback server. + ipv4 (str): The ipv4 address for the callback server. + catalog_admin (bool): Whether to run the callback server as catalog admin. 
""" env, vol = self._setup_callback_server_envs( - job_dir, kb_base_url, token, port, max_callback_server_tasks, ipv4 + job_dir, + kb_base_url, + token, + port, + max_callback_server_tasks, + ipv4, + catalog_admin, ) self.container = client.containers.run( name=container_name, diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 09e6342d0..55ac1279c 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -2,7 +2,7 @@ usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--load_id LOAD_ID] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] - [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] + [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] [--as_catalog_admin] PROTOTYPE - Upload assembly files to the workspace service (WSS). @@ -26,8 +26,9 @@ --load_id LOAD_ID The load id of the objects being uploaded to a workspace If not provided, a random load_id will be generated --token_filepath TOKEN_FILEPATH - A file path that stores a KBase token appropriate for the KBase environment - If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable + A file path that stores a KBase token appropriate for the KBase environment. If not provided, the token must be + provided in the `KB_AUTH_TOKEN` environment variable. The `KB_ADMIN_AUTH_TOKEN` environment variable will get set by + this token if running as catalog admin. --env {CI,NEXT,APPDEV,PROD} KBase environment (default: PROD) --upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...] @@ -39,6 +40,9 @@ --au_service_ver AU_SERVICE_VER The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) --keep_job_dir Keep SDK job directory after upload task is completed + --as_catalog_admin Run the callback server with catalog admin privileges. If provided, the `KB_ADMIN_AUTH_TOKEN` environment variable + will get set for the catalog token. Non-catalog admins can still run the uploader with the default catalog secure + params. e.g. PYTHONPATH=. python src/loaders/workspace_uploader/workspace_uploader.py --workspace_id 69046 --kbase_collection GTDB --source_ver 207 --env CI --keep_job_dir @@ -137,7 +141,7 @@ def _get_parser(): type=str, help="A file path that stores a KBase token appropriate for the KBase environment. " "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. " - "Also use as an admin token if the user has admin permission to catalog params.", + "The `KB_ADMIN_AUTH_TOKEN` environment variable will get set by this token if running as catalog admin. ", ) optional.add_argument( "--env", @@ -177,6 +181,13 @@ def _get_parser(): action="store_true", help="Keep SDK job directory after upload task is completed", ) + optional.add_argument( + "--as_catalog_admin", + action="store_true", + help="Run the callback server with catalog admin privileges. " + "If provided, the `KB_ADMIN_AUTH_TOKEN` environment variable will get set for the catalog token. " + "Non-catalog admins can still run the uploader with the default catalog secure params. 
", + ) return parser @@ -590,6 +601,7 @@ def main(): cbs_max_tasks = args.cbs_max_tasks au_service_ver = args.au_service_ver keep_job_dir = args.keep_job_dir + catalog_admin = args.as_catalog_admin load_id = args.load_id if not load_id: print("load_id is not provided. Generating a load_id ...") @@ -656,6 +668,7 @@ def main(): kb_base_url=kb_base_url, token_filepath=token_filepath, max_callback_server_tasks=cbs_max_tasks, + catalog_admin=catalog_admin, ) count, wait_to_upload_assemblies = _fetch_assemblies_to_upload( From 1b09e3fda77a1cd0dabbefc379374094c5a8e39b Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 24 Jan 2024 17:13:04 -0800 Subject: [PATCH 26/30] update CLI help message && file lock --- src/loaders/common/loader_helper.py | 19 ++- .../workspace_downloader.py | 6 +- .../workspace_uploader/workspace_uploader.py | 123 ++++++++---------- .../workspace_uploader_test.py | 12 +- 4 files changed, 79 insertions(+), 81 deletions(-) diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index f9a9c4ac5..346415883 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -1,5 +1,6 @@ import argparse import configparser +import fcntl import itertools import json import os @@ -311,15 +312,21 @@ def is_config_modification_required(conf_path: str): def setup_callback_server_logs(conf_path: str): """Set up containers.conf file for the callback server logs.""" - config = _get_containers_config(conf_path) + with open(conf_path, "w") as writer: - if not config.has_section("containers"): - config.add_section("containers") + fcntl.flock(writer.fileno(), fcntl.LOCK_EX) + config = _get_containers_config(conf_path) - for key, val in CONTAINERS_CONF_PARAMS.items(): - config.set("containers", key, val) + if not config.has_section("containers"): + config.add_section("containers") - return config + for key, val in CONTAINERS_CONF_PARAMS.items(): + config.set("containers", key, val) + + config.write(writer) + print(f"containers.conf is modified and saved to path: {conf_path}") + + fcntl.flock(writer.fileno(), fcntl.LOCK_UN) def is_upa_info_complete(upa_dir: str): diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 4d9dbc917..b4a8143ec 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -23,7 +23,8 @@ KBase environment, defaulting to PROD (default: PROD) --workers WORKERS Number of workers for multiprocessing (default: 5) --token_filepath TOKEN_FILEPATH - A file path that stores KBase token + A file path that stores a KBase token appropriate for the KBase environment + If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable --cbs_max_tasks CBS_MAX_TASKS The maxmium subtasks for the callback server (default: 20) --au_service_ver AU_SERVICE_VER @@ -387,7 +388,8 @@ def main(): optional.add_argument( "--token_filepath", type=str, - help="A file path that stores KBase token", + help="A file path that stores a KBase token appropriate for the KBase environment. " + "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. 
" ) optional.add_argument( "--cbs_max_tasks", diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 55ac1279c..0ce0d784d 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -23,12 +23,12 @@ optional arguments: --root_dir ROOT_DIR Root directory for the collections project. (default: /global/cfs/cdirs/kbase/collections) - --load_id LOAD_ID The load id of the objects being uploaded to a workspace - If not provided, a random load_id will be generated + --load_id LOAD_ID The load id of the objects being uploaded to a workspace. If not provided, a random load_id will be generated. For a + particular load, any restarts / resumes of the load should use the same load ID to prevent reuploading the same data. + A new load ID will make new versions of all the objects from the prior upload. --token_filepath TOKEN_FILEPATH - A file path that stores a KBase token appropriate for the KBase environment. If not provided, the token must be - provided in the `KB_AUTH_TOKEN` environment variable. The `KB_ADMIN_AUTH_TOKEN` environment variable will get set by - this token if running as catalog admin. + A file path that stores a KBase token appropriate for the KBase environment. + If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. --env {CI,NEXT,APPDEV,PROD} KBase environment (default: PROD) --upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...] @@ -40,9 +40,9 @@ --au_service_ver AU_SERVICE_VER The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) --keep_job_dir Keep SDK job directory after upload task is completed - --as_catalog_admin Run the callback server with catalog admin privileges. If provided, the `KB_ADMIN_AUTH_TOKEN` environment variable - will get set for the catalog token. Non-catalog admins can still run the uploader with the default catalog secure - params. + --as_catalog_admin True means the provided user token has catalog admin privileges and will be used to retrieve secure SDK app + parameters from the catalog. If false, the default, SDK apps run as part of this application will not have access to + catalog secure parameters. e.g. PYTHONPATH=. python src/loaders/workspace_uploader/workspace_uploader.py --workspace_id 69046 --kbase_collection GTDB --source_ver 207 --env CI --keep_job_dir @@ -61,7 +61,6 @@ """ import argparse import click -import fcntl import os import shutil import time @@ -134,14 +133,15 @@ def _get_parser(): "--load_id", type=str, help="The load id of the objects being uploaded to a workspace. " - "If not provided, a random load_id will be generated.", + "If not provided, a random load_id will be generated. " + "For a particular load, any restarts / resumes of the load should use the same load ID to prevent reuploading the same data. " + "A new load ID will make new versions of all the objects from the prior upload.", ) optional.add_argument( "--token_filepath", type=str, help="A file path that stores a KBase token appropriate for the KBase environment. " - "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. " - "The `KB_ADMIN_AUTH_TOKEN` environment variable will get set by this token if running as catalog admin. 
", + "If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable.", ) optional.add_argument( "--env", @@ -184,9 +184,9 @@ def _get_parser(): optional.add_argument( "--as_catalog_admin", action="store_true", - help="Run the callback server with catalog admin privileges. " - "If provided, the `KB_ADMIN_AUTH_TOKEN` environment variable will get set for the catalog token. " - "Non-catalog admins can still run the uploader with the default catalog secure params. ", + help="True means the provided user token has catalog admin privileges and will " + "be used to retrieve secure SDK app parameters from the catalog. If false, the default, " + "SDK apps run as part of this application will not have access to catalog secure parameters.", ) return parser @@ -289,8 +289,7 @@ def _update_upload_status_yaml_file( workspace_id: int, load_id: str, upa: str, - assembly_dir: str, - assembly_name: str, + assembly_tuple: _AssemblyTuple, ) -> None: """ Update the uploaded.yaml file in target genome_dir with newly uploaded assembly names and upa info. @@ -299,18 +298,18 @@ def _update_upload_status_yaml_file( upload_env_key, workspace_id, load_id, - assembly_dir, - assembly_name, + assembly_tuple.host_assembly_dir, + assembly_tuple.assembly_name, ) if uploaded: raise ValueError( - f"Assembly {assembly_name} already exists in workspace {workspace_id}" + f"Assembly {assembly_tuple.assembly_name} already exists in workspace {workspace_id}" ) data[upload_env_key][workspace_id]["loads"][load_id] = {"upa": upa} - file_path = _get_yaml_file_path(assembly_dir) + file_path = _get_yaml_file_path(assembly_tuple.host_assembly_dir) with open(file_path, "w") as file: yaml.dump(data, file) @@ -459,8 +458,7 @@ def _post_process( workspace_id, load_id, upa, - assembly_tuple.host_assembly_dir, - assembly_tuple.assembly_name, + assembly_tuple, ) # Creates a softlink from new_dir to the contents of upa_dir. @@ -535,7 +533,8 @@ def _upload_assembly_files_in_parallel( except Exception as e: print( f"WARNING: There are inconsistencies between " - f"the workspace and the yaml files as the result of {e}" + f"the workspace and the yaml files as the result of {e}\n" + f"Run the script again to attempt resolution." ) # post process on sucessful uploads @@ -638,26 +637,17 @@ def main(): try: # setup container.conf file for the callback server logs if needed - setup_permission = True conf_path = os.path.expanduser(loader_common_names.CONTAINERS_CONF_PATH) - with open(conf_path, "w") as writer: - fcntl.flock(writer.fileno(), fcntl.LOCK_EX) - if loader_helper.is_config_modification_required(conf_path): - if click.confirm( - f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" - f"needs to be modified to allow for container logging.\n" - f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. Do so now?\n" - ): - config = loader_helper.setup_callback_server_logs(conf_path) - config.write(writer) - print(f"containers.conf is modified and saved to path: {conf_path}") - else: - setup_permission = False - fcntl.flock(writer.fileno(), fcntl.LOCK_UN) - - if not setup_permission: - print("Permission denied and exiting ...") - return + if loader_helper.is_config_modification_required(conf_path): + if click.confirm( + f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" + f"needs to be modified to allow for container logging.\n" + f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. 
Do so now?\n" + ): + loader_helper.setup_callback_server_logs(conf_path) + else: + print("Permission denied and exiting ...") + return # start podman service proc = loader_helper.start_podman_service(uid) @@ -688,33 +678,26 @@ def main(): ) # fix inconsistencies between the workspace and the local yaml files if uploaded_obj_names: - print(uploaded_obj_names) - if click.confirm( - f"\nThese objects had been successfully uploaded to\n" - f"workspace {workspace_id} per the load_id {load_id} in their metadata,\n" - f"but missing from the local uploaded.yaml files. Start failure recovery now?\n" - ): - wait_to_update_assemblies = { - assembly_name: wait_to_upload_assemblies[assembly_name] - for assembly_name in uploaded_obj_names - } - uploaded_tuples = _dict2tuple_list(wait_to_update_assemblies) - for assembly_tuple, upa in zip(uploaded_tuples, uploaded_obj_upas): - _post_process( - env, - workspace_id, - load_id, - assembly_tuple, - upload_dir, - output_dir, - upa, - ) - # remove assemblies that are already uploaded - for assembly_name in uploaded_obj_names: - wait_to_upload_assemblies.pop(assembly_name) - else: - print("Failure recovery permission denied and exiting ...") - return + print("Start failure recovery process ...") + wait_to_update_assemblies = { + assembly_name: wait_to_upload_assemblies[assembly_name] + for assembly_name in uploaded_obj_names + } + uploaded_tuples = _dict2tuple_list(wait_to_update_assemblies) + for assembly_tuple, upa in zip(uploaded_tuples, uploaded_obj_upas): + _post_process( + env, + workspace_id, + load_id, + assembly_tuple, + upload_dir, + output_dir, + upa, + ) + # remove assemblies that are already uploaded + for assembly_name in uploaded_obj_names: + wait_to_upload_assemblies.pop(assembly_name) + print("Recovery process completed ...") if not wait_to_upload_assemblies: print( diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 0d21f2b34..ced0c02d7 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -98,9 +98,12 @@ def test_update_upload_status_yaml_file(setup_and_teardown): params = setup_and_teardown assembly_dir = params.assembly_dirs[0] assembly_name = ASSEMBLY_NAMES[0] + assembly_tuple = workspace_uploader._AssemblyTuple( + assembly_name, assembly_dir, "/path/to/file/in/AssembilyUtil" + ) workspace_uploader._update_upload_status_yaml_file( - "CI", 12345, "214", "12345_58_1", assembly_dir, assembly_name + "CI", 12345, "214", "12345_58_1", assembly_tuple ) data, uploaded = workspace_uploader._read_upload_status_yaml_file( "CI", 12345, "214", assembly_dir, assembly_name @@ -115,7 +118,7 @@ def test_update_upload_status_yaml_file(setup_and_teardown): with pytest.raises(ValueError, match=f"already exists in workspace"): workspace_uploader._update_upload_status_yaml_file( - "CI", 12345, "214", "12345_58_1", assembly_dir, assembly_name + "CI", 12345, "214", "12345_58_1", assembly_tuple ) @@ -146,8 +149,11 @@ def test_fetch_assemblies_to_upload(setup_and_teardown): # Both assemnly files will be skipped in the next fetch_assemblies_to_upload call upas = ["12345_58_1", "12345_58_2"] for assembly_name, assembly_dir, upa in zip(ASSEMBLY_NAMES, assembly_dirs, upas): + assembly_tuple = workspace_uploader._AssemblyTuple( + assembly_name, assembly_dir, "/path/to/file/in/AssembilyUtil" + ) workspace_uploader._update_upload_status_yaml_file( - "CI", 12345, "214", upa, 
assembly_dir, assembly_name + "CI", 12345, "214", upa, assembly_tuple ) ( From 5caae42666f26304e1b9795afc3b4c6e4d053dce Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 25 Jan 2024 16:00:44 -0800 Subject: [PATCH 27/30] update description && add try finally block for file lock --- src/loaders/common/loader_helper.py | 22 +++++++++---------- .../workspace_uploader/workspace_uploader.py | 13 +++++++++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 346415883..5e11b21b7 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -313,20 +313,20 @@ def is_config_modification_required(conf_path: str): def setup_callback_server_logs(conf_path: str): """Set up containers.conf file for the callback server logs.""" with open(conf_path, "w") as writer: + try: + fcntl.flock(writer.fileno(), fcntl.LOCK_EX) + config = _get_containers_config(conf_path) - fcntl.flock(writer.fileno(), fcntl.LOCK_EX) - config = _get_containers_config(conf_path) - - if not config.has_section("containers"): - config.add_section("containers") - - for key, val in CONTAINERS_CONF_PARAMS.items(): - config.set("containers", key, val) + if not config.has_section("containers"): + config.add_section("containers") - config.write(writer) - print(f"containers.conf is modified and saved to path: {conf_path}") + for key, val in CONTAINERS_CONF_PARAMS.items(): + config.set("containers", key, val) - fcntl.flock(writer.fileno(), fcntl.LOCK_UN) + config.write(writer) + print(f"containers.conf is modified and saved to path: {conf_path}") + finally: + fcntl.flock(writer.fileno(), fcntl.LOCK_UN) def is_upa_info_complete(upa_dir: str): diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 0ce0d784d..bd6685d24 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -4,7 +4,10 @@ [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--keep_job_dir] [--as_catalog_admin] -PROTOTYPE - Upload assembly files to the workspace service (WSS). +PROTOTYPE - Upload assembly files to the workspace service (WSS). Note that the uploader determines whether a genome is already uploaded in +one of two ways. First it consults the *.yaml files in each genomes directory; if that file shows the genome has been uploaded it skips it +regardless of the current state of the workspace. Second, it checks that the most recent version of the genome object in the workspace, if it +exists, was part of the current load ID (see the load ID parameter description below). If so, the genome is skipped. options: -h, --help show this help message and exit @@ -93,7 +96,13 @@ def _get_parser(): parser = argparse.ArgumentParser( - description="PROTOTYPE - Upload assembly files to the workspace service (WSS).", + description="PROTOTYPE - Upload assembly files to the workspace service (WSS).\n\n" + "Note that the uploader determines whether a genome is already uploaded in one of two ways. " + "First it consults the *.yaml files in each genomes directory; if that file shows the genome " + "has been uploaded it skips it regardless of the current state of the workspace. 
" + "Second, it checks that the most recent version of the genome object in the workspace, " + "if it exists, was part of the current load ID (see the load ID parameter description below). " + "If so, the genome is skipped.", formatter_class=loader_helper.ExplicitDefaultsHelpFormatter, ) From 9a91857fe389a71bcad7f71e21a36ffa9a686a38 Mon Sep 17 00:00:00 2001 From: Sijie Date: Mon, 29 Jan 2024 16:54:54 -0800 Subject: [PATCH 28/30] fix tests --- src/loaders/common/loader_helper.py | 7 +- .../workspace_uploader/workspace_uploader.py | 7 +- .../workspace_uploader_test.py | 89 +++++++++++++++++-- 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py index 5e11b21b7..0296c8ba2 100644 --- a/src/loaders/common/loader_helper.py +++ b/src/loaders/common/loader_helper.py @@ -31,6 +31,7 @@ from src.loaders.common.loader_common_names import ( COLLECTION_SOURCE_DIR, CONTAINERS_CONF_PARAMS, + CONTAINERS_CONF_PATH, DOCKER_HOST, FATAL_ERROR, FATAL_STACKTRACE, @@ -299,8 +300,9 @@ def _get_containers_config(conf_path: str): return config -def is_config_modification_required(conf_path: str): +def is_config_modification_required(): """check if the config requires modification.""" + conf_path = os.path.expanduser(CONTAINERS_CONF_PATH) config = _get_containers_config(conf_path) if not config.has_section("containers"): return True @@ -310,8 +312,9 @@ def is_config_modification_required(conf_path: str): return False -def setup_callback_server_logs(conf_path: str): +def setup_callback_server_logs(): """Set up containers.conf file for the callback server logs.""" + conf_path = os.path.expanduser(CONTAINERS_CONF_PATH) with open(conf_path, "w") as writer: try: fcntl.flock(writer.fileno(), fcntl.LOCK_EX) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index bd6685d24..0cac43668 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -228,6 +228,8 @@ def _upload_assemblies_to_workspace( """ Upload assembly files to the target workspace in batch. The bulk method fails and an error will be thrown if any of the assembly files in batch fails to upload. + Returns the list of workspace UPAs for the created objects in the same order as + `assembly_tuples`. """ inputs = [ { @@ -646,14 +648,13 @@ def main(): try: # setup container.conf file for the callback server logs if needed - conf_path = os.path.expanduser(loader_common_names.CONTAINERS_CONF_PATH) - if loader_helper.is_config_modification_required(conf_path): + if loader_helper.is_config_modification_required(): if click.confirm( f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" f"needs to be modified to allow for container logging.\n" f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. 
Do so now?\n" ): - loader_helper.setup_callback_server_logs(conf_path) + loader_helper.setup_callback_server_logs() else: print("Permission denied and exiting ...") return diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index ced0c02d7..be664adba 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -305,10 +305,11 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) } + # ws.get_object_info3() is unused in this test case ws = create_autospec(Workspace, spec_set=True, instance=True) asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) asu.save_assemblies_from_fastas.return_value = { - "results":[ + "results": [ {"upa": "12345/58/1"}, {"upa": "12345/60/1"} ] @@ -328,6 +329,28 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): assert uploaded_count == 2 + # assert that no interactions occurred with ws + ws.get_object_info3.assert_not_called() + + # assert that asu was called correctly + asu.save_assemblies_from_fastas.assert_called_once_with( + { + "workspace_id": 12345, + "inputs": [ + { + "file": "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + "assembly_name": "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + "object_metadata": {"load_id": "214"}, + }, + { + "file": "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + "assembly_name": "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + "object_metadata": {"load_id": "214"}, + } + ] + } + ) + # check softlink for post_process assert os.readlink(os.path.join(upload_dir, "12345_58_1")) == os.path.join( output_dir, "12345_58_1" @@ -338,11 +361,13 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): # check hardlink for post_process assert os.path.samefile( - src_files[0], os.path.join(os.path.join(output_dir, "12345_58_1"), f"12345_58_1.fna.gz") + src_files[0], + os.path.join(os.path.join(output_dir, "12345_58_1"), "12345_58_1.fna.gz") ) assert os.path.samefile( - src_files[1], os.path.join(os.path.join(output_dir, "12345_60_1"), f"12345_60_1.fna.gz") + src_files[1], + os.path.join(os.path.join(output_dir, "12345_60_1"), "12345_60_1.fna.gz") ) @@ -362,9 +387,9 @@ def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): ws = create_autospec(Workspace, spec_set=True, instance=True) asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) asu.save_assemblies_from_fastas.side_effect = Exception("Illegal character in object name") - - loader_helper.list_objects = create_autospec(loader_helper.list_objects) - loader_helper.list_objects.return_value = [] + ws.get_object_info3.return_value = { + 'infos': [None, None], 'paths': [None, None] + } uploaded_count = workspace_uploader._upload_assembly_files_in_parallel( asu, @@ -380,8 +405,39 @@ def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): assert uploaded_count == 0 + # assert that asu was called correctly + asu.save_assemblies_from_fastas.assert_called_once_with( + { + "workspace_id": 12345, + "inputs": [ + { + "file": "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + "assembly_name": "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + "object_metadata": {"load_id": "214"}, + }, + { + "file": "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + "assembly_name": 
"GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + "object_metadata": {"load_id": "214"}, + } + ] + } + ) -def test_query_workspace_with_load_id_mass(setup_and_teardown): + # assert that ws was called correctly + ws.get_object_info3.assert_called_once_with( + { + "objects": [ + {"wsid": 12345, "name": "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz"}, + {"wsid": 12345, "name": "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz"} + ], + "ignoreErrors": 1, + "includeMetadata": 1 + } + ) + + +def test_fail_query_workspace_with_load_id_mass(setup_and_teardown): ws = create_autospec(Workspace, spec_set=True, instance=True) with pytest.raises( Exception, match="The effective max batch size must be <= 10000" @@ -394,7 +450,13 @@ def test_query_workspace_with_load_id_mass(setup_and_teardown): 10001, ) + # assert that no interactions occurred with ws + ws.get_object_info3.assert_not_called() + + +def test_query_workspace_with_load_id_mass(setup_and_teardown): # happy test + ws = create_autospec(Workspace, spec_set=True, instance=True) ws.get_object_info3.return_value = { 'infos': [ [ @@ -454,3 +516,16 @@ def test_query_workspace_with_load_id_mass(setup_and_teardown): "GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz", ] assert obj_upas == ["69046_1086_18", "69046_1068_18"] + + # assert that ws was called correctly + ws.get_object_info3.assert_called_once_with( + { + "objects": [ + {"wsid": 69046, "name": "GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz"}, + {"wsid": 69046, "name": "GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz"}, + {"wsid": 69046, "name": "aloha"} + ], + "ignoreErrors": 1, + "includeMetadata": 1 + } + ) From a412797bb122d13aeed4b508cbd6bf1132ae0531 Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 30 Jan 2024 17:14:03 -0800 Subject: [PATCH 29/30] remove redudant code --- .../src/loaders/workspace_uploader/workspace_uploader_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index be664adba..ce5534ea7 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -362,12 +362,12 @@ def test_upload_assembly_files_in_parallel(setup_and_teardown): # check hardlink for post_process assert os.path.samefile( src_files[0], - os.path.join(os.path.join(output_dir, "12345_58_1"), "12345_58_1.fna.gz") + os.path.join(output_dir, "12345_58_1", "12345_58_1.fna.gz") ) assert os.path.samefile( src_files[1], - os.path.join(os.path.join(output_dir, "12345_60_1"), "12345_60_1.fna.gz") + os.path.join(output_dir, "12345_60_1", "12345_60_1.fna.gz") ) From c92fe2f61a1d492d77d402e2681416068b23dba7 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 31 Jan 2024 15:58:28 -0800 Subject: [PATCH 30/30] correct typos --- src/loaders/common/callback_server_wrapper.py | 8 ++++---- src/loaders/workspace_downloader/workspace_downloader.py | 4 ++-- src/loaders/workspace_uploader/workspace_uploader.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/loaders/common/callback_server_wrapper.py b/src/loaders/common/callback_server_wrapper.py index 085de71bc..7431bb975 100644 --- a/src/loaders/common/callback_server_wrapper.py +++ b/src/loaders/common/callback_server_wrapper.py @@ -57,10 +57,10 @@ def __init__( token_filepath (str): The file path that stores a KBase token appropriate for the KBase environment. 
If not supplied, the token must be provided in the environment variable KB_AUTH_TOKEN. The KB_ADMIN_AUTH_TOKEN environment variable will get set by this token if the user runs as catalog admin. - au_service_ver (str): The service verison of AssemblyUtilClient + au_service_ver (str): The service version of AssemblyUtilClient ('dev', 'beta', 'release', or a git commit). workers (int): The number of workers to use for multiprocessing. - max_callback_server_tasks (int): The maxmium subtasks for the callback server. + max_callback_server_tasks (int): The maximum number of subtasks for the callback server. worker_function (Callable): The function that will be called by the workers. retrieve_sample (bool): Whether to retrieve sample for each genome object. ignore_no_sample_error (bool): Whether to ignore the error when no sample data is found. @@ -135,7 +135,7 @@ def _setup_callback_server_envs( kb_base_url (str): The base url of the KBase services. token (str): The KBase token. port (int): The port number for the callback server. - max_callback_server_tasks (int): The maxmium subtasks for the callback server. + max_callback_server_tasks (int): The maximum number of subtasks for the callback server. ipv4 (str): The ipv4 address for the callback server. catalog_admin (bool): Whether to run the callback server as catalog admin. @@ -190,7 +190,7 @@ def _start_callback_server( job_dir (str): The directory for SDK jobs per user. kb_base_url (str): The base url of the KBase services. token (str): The KBase token. - max_callback_server_tasks (int): The maxmium subtasks for the callback server. + max_callback_server_tasks (int): The maximum number of subtasks for the callback server. port (int): The port number for the callback server. ipv4 (str): The ipv4 address for the callback server. catalog_admin (bool): Whether to run the callback server as catalog admin. 
diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 69b9b2e99..4ce670ae3 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -26,7 +26,7 @@ A file path that stores a KBase token appropriate for the KBase environment If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable --cbs_max_tasks CBS_MAX_TASKS - The maxmium subtasks for the callback server (default: 20) + The maximum number of subtasks for the callback server (default: 20) --au_service_ver AU_SERVICE_VER The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) --keep_job_dir Keep SDK job directory after download task is completed @@ -396,7 +396,7 @@ def main(): "--cbs_max_tasks", type=int, default=20, - help="The maxmium subtasks for the callback server", + help="The maximum number of subtasks for the callback server", ) optional.add_argument( "--au_service_ver", diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 0cac43668..65cd27b97 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -39,7 +39,7 @@ --batch_size BATCH_SIZE Number of files to upload per batch (default: 2500) --cbs_max_tasks CBS_MAX_TASKS - The maxmium subtasks for the callback server (default: 20) + The maximum number of subtasks for the callback server (default: 20) --au_service_ver AU_SERVICE_VER The service version of AssemblyUtil client('dev', 'beta', 'release', or a git commit) (default: release) --keep_job_dir Keep SDK job directory after upload task is completed @@ -176,7 +176,7 @@ def _get_parser(): "--cbs_max_tasks", type=int, default=20, - help="The maxmium subtasks for the callback server", + help="The maximum number of subtasks for the callback server", ) optional.add_argument( "--au_service_ver",