From 448f120d13df845ff85b4f57efd57ec3955a1926 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Thu, 24 Oct 2024 15:02:25 -0500 Subject: [PATCH 1/6] Allow users to specify custom Dependency jars Signed-off-by: Ahmed Hussein Fixes #1359 Add a new input argument that takes a path to a yaml file `--tools_config_file` The config file allows the users to define their own binaries that need to be added to the classpath of the tools jar cmd. This change is important because users can use the user-tools wrapper with their custom spark. --- .../rapids/rapids_tool.py | 74 ++++++++++------ .../resources/databricks_aws-configs.json | 40 +++++---- .../resources/databricks_azure-configs.json | 34 ++++---- .../resources/dataproc-configs.json | 34 ++++---- .../resources/dataproc_gke-configs.json | 34 ++++---- .../resources/dev/prepackage_mgr.py | 38 ++++---- .../resources/emr-configs.json | 48 +++++----- .../resources/onprem-configs.json | 30 ++++--- .../spark_rapids_tools/cmdli/argprocessor.py | 26 +++++- .../src/spark_rapids_tools/cmdli/tools_cli.py | 10 ++- .../configuration/__init__.py | 23 +++++ .../configuration/common.py | 87 +++++++++++++++++++ .../configuration/runtime_conf.py | 28 ++++++ .../configuration/tools_config.py | 63 ++++++++++++++ user_tools/src/spark_rapids_tools/enums.py | 15 ++++ 15 files changed, 435 insertions(+), 149 deletions(-) create mode 100644 user_tools/src/spark_rapids_tools/configuration/__init__.py create mode 100644 user_tools/src/spark_rapids_tools/configuration/common.py create mode 100644 user_tools/src/spark_rapids_tools/configuration/runtime_conf.py create mode 100644 user_tools/src/spark_rapids_tools/configuration/tools_config.py diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index 55b91d81c..a8544d0ec 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -38,9 +38,11 @@ from spark_rapids_pytools.rapids.rapids_job import RapidsJobPropContainer from spark_rapids_pytools.rapids.tool_ctxt import ToolContext from spark_rapids_tools import CspEnv -from spark_rapids_tools.enums import HashAlgorithm +from spark_rapids_tools.configuration.common import RuntimeDependency +from spark_rapids_tools.configuration.tools_config import ToolsConfig +from spark_rapids_tools.enums import DependencyType from spark_rapids_tools.storagelib import LocalPath, CspFs -from spark_rapids_tools.storagelib.tools.fs_utils import untar_file, FileHashAlgorithm +from spark_rapids_tools.storagelib.tools.fs_utils import untar_file from spark_rapids_tools.utils import Utilities from spark_rapids_tools.utils.net_utils import DownloadTask @@ -70,6 +72,13 @@ class RapidsTool(object): logger: Logger = field(default=None, init=False) spinner: ToolsSpinner = field(default=None, init=False) + def get_tools_config_obj(self) -> Optional['ToolsConfig']: + """ + Get the tools configuration object if provided in the CLI arguments. + :return: An object containing all the tools configuration or None if not provided. 
+ """ + return self.wrapper_options.get('toolsConfig') + def pretty_name(self): return self.name.capitalize() @@ -136,7 +145,7 @@ def _check_environment(self) -> None: def _process_output_args(self): self.logger.debug('Processing Output Arguments') - # make sure that output_folder is being absolute + # make sure output_folder is absolute if self.output_folder is None: self.output_folder = Utils.get_rapids_tools_env('OUTPUT_DIRECTORY', os.getcwd()) try: @@ -393,7 +402,8 @@ def _calculate_spark_settings(self, worker_info: NodeHWInfo) -> dict: return res @classmethod - def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPropertiesContainer) -> Optional[list]: + def get_rapids_tools_dependencies(cls, deploy_mode: str, + json_props: AbstractPropertiesContainer) -> Optional[list]: """ Get the tools dependencies from the platform configuration. """ @@ -403,7 +413,9 @@ def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPro depend_arr = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, active_buildver) if depend_arr is None: raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]') - return depend_arr + # convert the json array to a list of RuntimeDependency objects + runtime_dep_arr = [RuntimeDependency(**dep) for dep in depend_arr] + return runtime_dep_arr @dataclass @@ -532,47 +544,46 @@ def exception_handler(future): if exception: self.logger.error('Error while downloading dependency: %s', exception) - def cache_single_dependency(dep: dict) -> str: + def cache_single_dependency(dep: RuntimeDependency) -> str: """ Downloads the specified URL and saves it to disk """ - self.logger.info('Checking dependency %s', dep['name']) + self.logger.info('Checking dependency %s', dep.name) dest_folder = self.ctxt.get_cache_folder() verify_opts = {} - dep_verification = dep.get('verification') - if dep_verification is not None: - if 'size' in dep_verification: - verify_opts['size'] = dep_verification['size'] - hash_lib_alg = dep_verification.get('hashLib') - if hash_lib_alg: - verify_opts['file_hash'] = FileHashAlgorithm(HashAlgorithm(hash_lib_alg['type']), - hash_lib_alg['value']) - download_task = DownloadTask(src_url=dep['uri'], # pylint: disable=no-value-for-parameter) + if dep.verification is not None: + verify_opts = dict(dep.verification) + download_task = DownloadTask(src_url=dep.uri, # pylint: disable=no-value-for-parameter) dest_folder=dest_folder, verification=verify_opts) download_result = download_task.run_task() self.logger.info('Completed downloading of dependency [%s] => %s', - dep['name'], + dep.name, f'{download_result.pretty_print()}') if not download_result.success: - msg = f'Failed to download dependency {dep["name"]}, reason: {download_result.download_error}' + msg = f'Failed to download dependency {dep.name}, reason: {download_result.download_error}' raise RuntimeError(f'Could not download all dependencies. 
Aborting Executions.\n\t{msg}') destination_path = self.ctxt.get_local_work_dir() destination_cspath = LocalPath(destination_path) - if dep['type'] == 'archive': + # set the default dependency type to jar + defined_dep_type = DependencyType.get_default() + if dep.dependency_type: + defined_dep_type = dep.dependency_type.dep_type + if defined_dep_type == DependencyType.ARCHIVE: uncompressed_cspath = untar_file(download_result.resource, destination_cspath) dep_item = uncompressed_cspath.no_scheme - relative_path = dep.get('relativePath') - if relative_path is not None: - dep_item = f'{dep_item}/{relative_path}' - else: + if dep.dependency_type.relative_path is not None: + dep_item = f'{dep_item}/{dep.dependency_type.relative_path}' + elif defined_dep_type == DependencyType.JAR: # copy the jar into dependency folder CspFs.copy_resources(download_result.resource, destination_cspath) final_dep_csp = destination_cspath.create_sub_path(download_result.resource.base_name()) dep_item = final_dep_csp.no_scheme + else: + raise ValueError(f'Invalid dependency type [{defined_dep_type}]') return dep_item - def cache_all_dependencies(dep_arr: List[dict]): + def cache_all_dependencies(dep_arr: List[RuntimeDependency]): """ Create a thread pool and download specified urls """ @@ -593,8 +604,19 @@ def cache_all_dependencies(dep_arr: List[dict]): raise ex return results - deploy_mode = DeployMode.tostring(self.ctxt.get_deploy_mode()) - depend_arr = self.get_rapids_tools_dependencies(deploy_mode, self.ctxt.platform.configs) + def populate_dependency_list() -> List[RuntimeDependency]: + # check if the dependencies is defined in a config file + config_obj = self.get_tools_config_obj() + if config_obj is not None: + if config_obj.runtime.dependencies: + return config_obj.runtime.dependencies + self.logger.info('The ToolsConfig did not specify the dependencies. 
' + 'Falling back to the default dependencies.') + # load dependency list from the platform configuration + deploy_mode = DeployMode.tostring(self.ctxt.get_deploy_mode()) + return self.get_rapids_tools_dependencies(deploy_mode, self.ctxt.platform.configs) + + depend_arr = populate_dependency_list() if depend_arr: dep_list = cache_all_dependencies(depend_arr) if any(dep_item is None for dep_item in dep_list): diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json index 7d64dfb44..85e123bad 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json @@ -8,21 +8,23 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319" }, "size": 400395283 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "a65839fbf1869f81a1632e09f415e586922e4f80" }, "size": 962685 @@ -33,8 +35,8 @@ "name": "AWS Java SDK Bundled", "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "02deec3a0ad83d13d032b1812421b23d7a961eea" }, "size": 280645251 @@ -47,38 +49,38 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { - "hashLib": { + "fileHash": { "type": "sha512", "value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "a65839fbf1869f81a1632e09f415e586922e4f80" }, "size": 962685 - }, - "type": "jar" + } }, { "name": "AWS Java SDK Bundled", "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "02deec3a0ad83d13d032b1812421b23d7a961eea" }, "size": 280645251 - }, - "type": "jar" + } } ] } diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json index 0e41fb1ae..4716a60d7 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json @@ -8,26 +8,27 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", "verification": { 
- "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319" }, "size": 400395283 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "Hadoop Azure", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "a23f621bca9b2100554150f6b0b521f94b8b419e" }, "size": 574116 - }, - "type": "jar" + } } ], "333": [ @@ -35,26 +36,27 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "Hadoop Azure", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "a23f621bca9b2100554150f6b0b521f94b8b419e" }, "size": 574116 - }, - "type": "jar" + } } ] } diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json index 1f99e7bac..87034191b 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json @@ -8,26 +8,27 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319" }, "size": 400395283 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170" }, "size": 39359477 - }, - "type": "jar" + } } ], "333": [ @@ -35,26 +36,27 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": 
"06438f562692ff8fae5e8555eba2b9f95cb74f66" }, "size": 38413466 - }, - "type": "jar" + } } ] } diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json index 861b127d7..2fd4f2344 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json @@ -8,26 +8,27 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319" }, "size": 400395283 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170" }, "size": 39359477 - }, - "type": "jar" + } } ], "333": [ @@ -35,26 +36,27 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "06438f562692ff8fae5e8555eba2b9f95cb74f66" }, "size": 38413466 - }, - "type": "jar" + } } ] } diff --git a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py index 1920c134b..1a196cc93 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py +++ b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py @@ -18,7 +18,6 @@ """ import os -import shutil import tarfile from typing import Optional @@ -27,7 +26,8 @@ from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer from spark_rapids_pytools.common.sys_storage import FSUtil from spark_rapids_pytools.rapids.rapids_tool import RapidsTool -from spark_rapids_tools import CspEnv +from spark_rapids_tools import CspEnv, CspPath +from spark_rapids_tools.configuration.common import RuntimeDependency from spark_rapids_tools.utils import Utilities from spark_rapids_tools.utils.net_utils import DownloadManager, DownloadTask @@ -100,16 +100,18 @@ def _fetch_resources(self) -> dict: # Add RAPIDS JAR as dependency if self.tools_jar: - # copy from existing file. replace snapshot - jar_file_name = FSUtil.get_resource_name(self.tools_jar) - FSUtil.make_dirs(self.dest_dir) - dest_file = FSUtil.build_path(self.dest_dir, jar_file_name) - shutil.copy2(self.tools_jar, dest_file) + # copy from existing file. 
+ tools_jar_cspath = CspPath(self.tools_jar) + tools_jar_url = str(tools_jar_cspath) + jar_file_name = tools_jar_cspath.base_name() + print(f'Using the provided tools_jar {tools_jar_url}') else: - # get the latest tools_jar from mvn - rapids_url = self._get_spark_rapids_jar_url() - rapids_name = FSUtil.get_resource_name(rapids_url) - resource_uris[rapids_url] = {'name': rapids_name, 'pbar_enabled': False} + tools_jar_url = self._get_spark_rapids_jar_url() + jar_file_name = FSUtil.get_resource_name(tools_jar_url) + resource_uris[tools_jar_url] = { + 'depItem': RuntimeDependency(name=jar_file_name, uri=tools_jar_url), + 'prettyName': jar_file_name + } for platform in self._supported_platforms: # pylint: disable=no-member config_file = FSUtil.build_full_path(self.resource_dir, @@ -117,17 +119,19 @@ def _fetch_resources(self) -> dict: platform_conf = JSONPropertiesContainer(config_file) dependency_list = RapidsTool.get_rapids_tools_dependencies('LOCAL', platform_conf) for dependency in dependency_list: - uri = dependency.get('uri') - name = FSUtil.get_resource_name(uri) - if uri: - resource_uris[uri] = {'name': name, 'pbar_enabled': False} - resource_uris[uri + '.asc'] = {'name': name + '.asc', 'pbar_enabled': False} + if dependency.uri: + uri_str = str(dependency.uri) + pretty_name = FSUtil.get_resource_name(uri_str) + resource_uris[uri_str] = { + 'depItem': dependency, + 'prettyName': pretty_name + } return resource_uris def _download_resources(self, resource_uris: dict): download_tasks = [] for res_uri, res_info in resource_uris.items(): - resource_name = res_info.get('name') + resource_name = res_info.get('prettyName') print(f'Creating download task: {resource_name}') # All the downloadTasks enforces download download_tasks.append(DownloadTask(src_url=res_uri, # pylint: disable=no-value-for-parameter) diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json index 5b6a34bd2..52b5097c2 100644 --- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json @@ -8,38 +8,38 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319" }, "size": 400395283 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "a65839fbf1869f81a1632e09f415e586922e4f80" }, "size": 962685 - }, - "type": "jar" + } }, { "name": "AWS Java SDK Bundled", "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "02deec3a0ad83d13d032b1812421b23d7a961eea" }, "size": 280645251 - }, - "type": "jar" + } } ], "333": [ @@ -47,38 +47,38 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", 
"value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "a65839fbf1869f81a1632e09f415e586922e4f80" }, "size": 962685 - }, - "type": "jar" + } }, { "name": "AWS Java SDK Bundled", "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", "verification": { - "hashLib": { - "type": "sha1", + "fileHash": { + "algorithm": "sha1", "value": "02deec3a0ad83d13d032b1812421b23d7a961eea" }, "size": 280645251 - }, - "type": "jar" + } } ] } diff --git a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json index f7e9af218..fe8f07d09 100644 --- a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json @@ -8,14 +8,16 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319" }, "size": 400395283 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } } ], "342": [ @@ -23,14 +25,16 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "c9470a557c96fe899dd1c9ea8d0dda3310eaf0155b2bb972f70a6d97fee8cdaf838b425c30df3d5856b2c31fc2be933537c111db72d0427eabb76c6abd92c1f1" }, "size": 388664780 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } } ], "333": [ @@ -38,14 +42,16 @@ "name": "Apache Spark", "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { - "hashLib": { - "type": "sha512", + "fileHash": { + "algorithm": "sha512", "value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 }, - "type": "archive", - "relativePath": "jars/*" + "dependencyType": { + "depType": "archive", + "relativePath": "jars/*" + } } ] } diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index e578d23b6..0913b557f 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -29,6 +29,7 @@ from spark_rapids_pytools.common.utilities import ToolLogging, Utils from spark_rapids_tools.cloud import ClientCluster from spark_rapids_tools.utils import AbstractPropContainer, is_http_file +from ..configuration.tools_config import ToolsConfig from ..enums import QualFilterApp, CspEnv, QualEstimationModel from ..storagelib.csppath import CspPath from ..tools.autotuner import AutoTunerPropMgr @@ 
-349,6 +350,7 @@ class ToolUserArgModel(AbsToolUserArgModel): eventlogs: Optional[str] = None jvm_heap_size: Optional[int] = None jvm_threads: Optional[int] = None + tools_config_path: Optional[str] = None def is_concurrent_submission(self) -> bool: return False @@ -370,6 +372,23 @@ def process_jvm_args(self) -> None: self.p_args['toolArgs']['jobResources'] = adjusted_resources self.p_args['toolArgs']['log4jPath'] = Utils.resource_path('dev/log4j.properties') + def process_tools_config(self) -> None: + """ + Load the tools config file if it is provided. it creates a ToolsConfig object and sets it + in the toolArgs without processing the actual dependencies. + :return: None + """ + self.p_args['toolArgs']['toolsConfig'] = None + if self.tools_config_path is not None: + # the CLI provides a tools config file + try: + self.p_args['toolArgs']['toolsConfig'] = ToolsConfig.load_from_file(self.tools_config_path) + except ValidationError as ve: + raise PydanticCustomError( + 'invalid_argument', + f'Tools config file path {self.tools_config_path} could not be loaded. ' + 'It is expected to be a valid YAML file.\n Error:') from ve + def init_extra_arg_cases(self) -> list: if self.eventlogs is None: return [ArgValueCase.UNDEFINED] @@ -481,11 +500,14 @@ def build_tools_args(self) -> dict: runtime_platform = self.get_or_set_platform() # process JVM arguments self.process_jvm_args() + # process the tools config file + self.process_tools_config() # finally generate the final values wrapped_args = { 'runtimePlatform': runtime_platform, 'outputFolder': self.output_folder, + 'toolsConfig': self.p_args['toolArgs']['toolsConfig'], 'platformOpts': { 'credentialFile': None, 'deployMode': DeployMode.LOCAL @@ -601,10 +623,12 @@ def build_tools_args(self) -> dict: # process JVM arguments self.process_jvm_args() - + # process the tools config file + self.process_tools_config() # finally generate the final values wrapped_args = { 'runtimePlatform': runtime_platform, + 'toolsConfig': self.p_args['toolArgs']['toolsConfig'], 'outputFolder': self.output_folder, 'platformOpts': { 'credentialFile': None, diff --git a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py index 9c6d6c214..ba39e0a35 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py +++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py @@ -45,6 +45,7 @@ def qualification(self, jvm_heap_size: int = None, jvm_threads: int = None, verbose: bool = None, + tools_config_file: str = None, **rapids_options) -> None: """The Qualification cmd provides estimated speedups by migrating Apache Spark applications to GPU accelerated clusters. @@ -79,6 +80,7 @@ def qualification(self, :param jvm_threads: Number of threads to use for parallel processing on the eventlogs batch. Default is calculated as a function of the total number of cores and the heap size on the host. :param verbose: True or False to enable verbosity of the script. + :param tools_config_file: Path to a configuration file that contains the tools' options. :param rapids_options: A list of valid Qualification tool options. Note that the wrapper ignores ["output-directory", "platform"] flags, and it does not support multiple "spark-property" arguments. 
@@ -112,7 +114,8 @@ def qualification(self, jvm_heap_size=jvm_heap_size, jvm_threads=jvm_threads, filter_apps=filter_apps, - estimation_model_args=estimation_model_args) + estimation_model_args=estimation_model_args, + tools_config_path=tools_config_file) if qual_args: tool_obj = QualificationAsLocal(platform_type=qual_args['runtimePlatform'], output_folder=qual_args['outputFolder'], @@ -131,6 +134,7 @@ def profiling(self, jvm_heap_size: int = None, jvm_threads: int = None, verbose: bool = None, + tools_config_file: str = None, **rapids_options): """The Profiling cmd provides information which can be used for debugging and profiling Apache Spark applications running on GPU accelerated clusters. @@ -158,6 +162,7 @@ def profiling(self, :param jvm_threads: Number of thread to use for parallel processing on the eventlogs batch. Default is calculated as a function of the total number of cores and the heap size on the host. :param verbose: True or False to enable verbosity of the script. + :param tools_config_file: Path to a configuration file that contains the tools' options. :param rapids_options: A list of valid Profiling tool options. Note that the wrapper ignores ["output-directory", "worker-info"] flags, and it does not support multiple "spark-property" arguments. @@ -182,7 +187,8 @@ def profiling(self, jvm_heap_size=jvm_heap_size, jvm_threads=jvm_threads, output_folder=output_folder, - tools_jar=tools_jar) + tools_jar=tools_jar, + tools_config_path=tools_config_file) if prof_args: rapids_options.update(prof_args['rapidOptions']) tool_obj = ProfilingAsLocal(platform_type=prof_args['runtimePlatform'], diff --git a/user_tools/src/spark_rapids_tools/configuration/__init__.py b/user_tools/src/spark_rapids_tools/configuration/__init__.py new file mode 100644 index 000000000..de974a8e6 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/configuration/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""init file of the tools configurations module""" + +from .common import RuntimeDependencyType, DependencyVerification, RuntimeDependency + +__all__ = [ + 'RuntimeDependency', + 'RuntimeDependencyType', + 'DependencyVerification' +] diff --git a/user_tools/src/spark_rapids_tools/configuration/common.py b/user_tools/src/spark_rapids_tools/configuration/common.py new file mode 100644 index 000000000..fc7bcf434 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/configuration/common.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common types and definitions used by the configurations. This module is used by other +modules as well.""" + +from typing import Union +from pydantic import BaseModel, Field, AnyUrl, FilePath, AliasChoices + +from spark_rapids_tools.enums import DependencyType +from spark_rapids_tools.storagelib.tools.fs_utils import FileHashAlgorithm + + +class RuntimeDependencyType(BaseModel): + """Defines the type of dependency. It can be one of the following: + - Archived file (.tgz) + - Simple JAR file (*.jar) + - Classpath directory (not yet supported) + + Note: The 'classpath' type is reserved for future use, allowing users to point to a directory + in the classpath without needing to download or copy any binaries.""" + + dep_type: DependencyType = Field( + description='The type of the dependency', + validation_alias=AliasChoices('dep_type', 'depType')) + relative_path: str = Field( + default=None, + description='The relative path of the dependency in the classpath. This is relevant for tar files', + validation_alias=AliasChoices('relative_path', 'relativePath'), + examples=['jars/*']) + + +class DependencyVerification(BaseModel): + """The verification information of the dependency.""" + size: int = Field( + default=0, + description='The size of the dependency file.', + examples=[3265393]) + file_hash: FileHashAlgorithm = Field( + default=None, + description='The hash function to verify the file', + validation_alias=AliasChoices('file_hash', 'fileHash'), + examples=[ + { + 'algorithm': 'md5', + 'value': 'bc9bf7fedde0e700b974426fbd8d869c' + }]) + + +class RuntimeDependency(BaseModel): + """The runtime dependency required by the tools Jar cmd. All elements are downloaded and added + to the classPath.""" + name: str = Field(description='The name of the dependency') + uri: Union[AnyUrl, FilePath] = Field( + description='The FileURI of the dependency. It can be a local file or a remote file', + examples=['file:///path/to/file.jar', + 'https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar', + 'gs://bucket-name/path/to/file.jar']) + dependency_type: RuntimeDependencyType = Field( + default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.JAR), + description='The type of the dependency and how to find the lib files after decompression.', + validation_alias=AliasChoices('dependency_type', 'dependencyType')) + verification: DependencyVerification = Field( + default=None, + description='The verification information of the dependency.', + examples=[ + { + 'size': 3265393 + }, + { + 'fileHash': { + 'algorithm': 'md5', + 'value': 'bc9bf7fedde0e700b974426fbd8d869c' + } + } + ]) diff --git a/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py b/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py new file mode 100644 index 000000000..91ab4e1b6 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""The runtime configurations of the tools as defined by the user.""" + +from typing import List + +from pydantic import BaseModel, Field + +from spark_rapids_tools.configuration.common import RuntimeDependency + + +class ToolsRuntimeConfig(BaseModel): + """The runtime configurations of the tools as defined by the user.""" + dependencies: List[RuntimeDependency] = Field( + description='The list of runtime dependencies required by the tools Jar cmd. ' + 'All elements are downloaded and added to the classPath') diff --git a/user_tools/src/spark_rapids_tools/configuration/tools_config.py b/user_tools/src/spark_rapids_tools/configuration/tools_config.py new file mode 100644 index 000000000..bee6d625f --- /dev/null +++ b/user_tools/src/spark_rapids_tools/configuration/tools_config.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Container for the custom tools configurations. This is the parts of the configuration that can +be passed as an input to the CLI""" + +import json +from typing import Union, Optional + +from pydantic import BaseModel, Field, ValidationError +from pydantic_core import PydanticCustomError + +from spark_rapids_tools import CspPathT +from spark_rapids_tools.configuration.runtime_conf import ToolsRuntimeConfig +from spark_rapids_tools.utils import AbstractPropContainer + + +class ToolsConfigInfo(BaseModel): + """ + High level metadata about the tools configurations (i.e., the api-version and any other relevant + metadata). + """ + api_version: float = Field( + description='The version of the API that the tools are using. ' + 'This is used to test the compatibility of the ' + 'configuration file against the current tools release', + examples=['1.0'], + le=1.0, # minimum version compatible with the current tools implementation + ge=1.0) + + +class ToolsConfig(BaseModel): + """Main container for the user's defined tools configuration""" + info: ToolsConfigInfo = Field( + description='Metadata information of the tools configuration') + runtime: ToolsRuntimeConfig = Field( + description='Configuration related to the runtime environment of the tools') + + @classmethod + def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfig']: + """Load the tools configuration from a file""" + try: + prop_container = AbstractPropContainer.load_from_file(file_path) + return cls(**prop_container.props) + except ValidationError as e: + raise PydanticCustomError('Invalid Tools Configuration File', e) from e + + @classmethod + def get_schema(cls) -> str: + """Returns a JSON schema of the tools' configuration. 
This is useful for generating an API + documentation of the model.""" + return json.dumps(cls.model_json_schema(), indent=2) diff --git a/user_tools/src/spark_rapids_tools/enums.py b/user_tools/src/spark_rapids_tools/enums.py index 61dca23a6..f51bcdeef 100644 --- a/user_tools/src/spark_rapids_tools/enums.py +++ b/user_tools/src/spark_rapids_tools/enums.py @@ -101,6 +101,21 @@ def get_hash_func(self) -> Callable: } return hash_functions[self] + +class DependencyType(EnumeratedType): + """Represents the dependency type for the jar cmd""" + JAR = 'jar' + ARCHIVE = 'archive' + # When classpath is used, it means that the url of a dependency is used as is. + # i.e., it is a folder, or a path that is added to the classPath for java CLI. + CLASSPATH = 'classpath' + + @classmethod + def get_default(cls): + """Returns the default dependency type""" + return cls.JAR + + ########### # CSP Enums ########### From 82a89efd9a7fc4b1bdad5dc78cb5de833d4171cd Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Thu, 24 Oct 2024 15:36:11 -0500 Subject: [PATCH 2/6] fix typo in aws configs Signed-off-by: Ahmed Hussein --- .../spark_rapids_pytools/resources/databricks_aws-configs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json index 85e123bad..d0cde1298 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json @@ -50,7 +50,7 @@ "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", "verification": { "fileHash": { - "type": "sha512", + "algorithm": "sha512", "value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9" }, "size": 299426263 From 505273d34f144f36397907ede23da92e6fa8c32c Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Fri, 25 Oct 2024 15:28:17 -0500 Subject: [PATCH 3/6] fix default dependencyType Signed-off-by: Ahmed Hussein --- user_tools/src/spark_rapids_tools/configuration/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user_tools/src/spark_rapids_tools/configuration/common.py b/user_tools/src/spark_rapids_tools/configuration/common.py index fc7bcf434..c199977a1 100644 --- a/user_tools/src/spark_rapids_tools/configuration/common.py +++ b/user_tools/src/spark_rapids_tools/configuration/common.py @@ -68,7 +68,7 @@ class RuntimeDependency(BaseModel): 'https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar', 'gs://bucket-name/path/to/file.jar']) dependency_type: RuntimeDependencyType = Field( - default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.JAR), + default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.get_default()), description='The type of the dependency and how to find the lib files after decompression.', validation_alias=AliasChoices('dependency_type', 'dependencyType')) verification: DependencyVerification = Field( From 97b2371e27a7f40c88146fb07c8fe880383c1c71 Mon Sep 17 00:00:00 2001 From: "Ahmed Hussein (amahussein)" Date: Thu, 31 Oct 2024 12:36:40 -0500 Subject: [PATCH 4/6] add unit tests for toolsconfig Signed-off-by: Ahmed Hussein (amahussein) --- .../spark_rapids_tools/cmdli/argprocessor.py | 4 +- .../configuration/tools_config.py | 17 +++------ .../tests/spark_rapids_tools_ut/conftest.py | 22 ++++++++++- 
.../invalid/tools_config_inv_00.yaml | 11 ++++++ .../invalid/tools_config_inv_01.yaml | 8 ++++ .../invalid/tools_config_inv_02.yaml | 17 +++++++++ .../tools_config/valid/tools_config_00.yaml | 11 ++++++ .../test_tool_argprocessor.py | 38 ++++++++++++++++--- 8 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_00.yaml create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_01.yaml create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_02.yaml create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid/tools_config_00.yaml diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index 0913b557f..439abd546 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -374,7 +374,7 @@ def process_jvm_args(self) -> None: def process_tools_config(self) -> None: """ - Load the tools config file if it is provided. it creates a ToolsConfig object and sets it + Load the tools config file if it is provided. It creates a ToolsConfig object and sets it in the toolArgs without processing the actual dependencies. :return: None """ @@ -387,7 +387,7 @@ def process_tools_config(self) -> None: raise PydanticCustomError( 'invalid_argument', f'Tools config file path {self.tools_config_path} could not be loaded. ' - 'It is expected to be a valid YAML file.\n Error:') from ve + f'It is expected to be a valid YAML file.\n Error:{ve}') def init_extra_arg_cases(self) -> list: if self.eventlogs is None: diff --git a/user_tools/src/spark_rapids_tools/configuration/tools_config.py b/user_tools/src/spark_rapids_tools/configuration/tools_config.py index bee6d625f..b226efdcb 100644 --- a/user_tools/src/spark_rapids_tools/configuration/tools_config.py +++ b/user_tools/src/spark_rapids_tools/configuration/tools_config.py @@ -26,11 +26,8 @@ from spark_rapids_tools.utils import AbstractPropContainer -class ToolsConfigInfo(BaseModel): - """ - High level metadata about the tools configurations (i.e., the api-version and any other relevant - metadata). - """ +class ToolsConfig(BaseModel): + """Main container for the user's defined tools configuration""" api_version: float = Field( description='The version of the API that the tools are using. 
' 'This is used to test the compatibility of the ' @@ -38,12 +35,6 @@ class ToolsConfigInfo(BaseModel): examples=['1.0'], le=1.0, # minimum version compatible with the current tools implementation ge=1.0) - - -class ToolsConfig(BaseModel): - """Main container for the user's defined tools configuration""" - info: ToolsConfigInfo = Field( - description='Metadata information of the tools configuration') runtime: ToolsRuntimeConfig = Field( description='Configuration related to the runtime environment of the tools') @@ -54,7 +45,9 @@ def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfi prop_container = AbstractPropContainer.load_from_file(file_path) return cls(**prop_container.props) except ValidationError as e: - raise PydanticCustomError('Invalid Tools Configuration File', e) from e + raise e + # raise PydanticCustomError('invalid_argument', + # 'Invalid Tools Configuration File\n Error:') from e @classmethod def get_schema(cls) -> str: diff --git a/user_tools/tests/spark_rapids_tools_ut/conftest.py b/user_tools/tests/spark_rapids_tools_ut/conftest.py index 145355f24..de3f2da12 100644 --- a/user_tools/tests/spark_rapids_tools_ut/conftest.py +++ b/user_tools/tests/spark_rapids_tools_ut/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -47,6 +47,26 @@ def gen_cpu_cluster_props(): csps = ['dataproc', 'dataproc_gke', 'emr', 'databricks_aws', 'databricks_azure'] all_csps = csps + ['onprem'] autotuner_prop_path = 'worker_info.yaml' +# valid tools config files +valid_tools_conf_files = ['tools_config_00.yaml'] +# invalid tools config files +invalid_tools_conf_files = [ + # test older API_version + # Error:1 validation error for ToolsConfig + # api_version + # Input should be greater than or equal to 1 [type=greater_than_equal, input_value='0.9', input_type=str] + 'tools_config_inv_00.yaml', + # test empty runtime configs + # Error:1 validation error for ToolsConfig + # runtime.dependencies + # Input should be a valid list [type=list_type, input_value=None, input_type=NoneType] + 'tools_config_inv_01.yaml', + # test local dependency file does not exist + # Error:1 validation error for ToolsConfig + # runtime.dependencies + # Input should be a valid list [type=list_type, input_value=None, input_type=NoneType] + 'tools_config_inv_02.yaml' +] class SparkRapidsToolsUT: # pylint: disable=too-few-public-methods diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_00.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_00.yaml new file mode 100644 index 000000000..aa566a71f --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_00.yaml @@ -0,0 +1,11 @@ +# This yaml file is a configuration file for a tool that uses Spark 3.5.0 as dependency +# invalid: API version is smaller than accepted API version +api_version: '0.9' +runtime: + dependencies: + - name: my-spark350 + uri: https:///archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz + dependency_type: + dep_type: archive + # for tgz files, it is required to give the subfolder where the jars are located + relative_path: jars/* diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_01.yaml 
b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_01.yaml new file mode 100644 index 000000000..a723f2e66 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_01.yaml @@ -0,0 +1,8 @@ +# This yaml file is a configuration file +# invalid: empty file +# Error:1 validation error for ToolsConfig +# runtime.dependencies +# Input should be a valid list [type=list_type, input_value=None, input_type=NoneType] +api_version: '1.0' +runtime: + dependencies: diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_02.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_02.yaml new file mode 100644 index 000000000..210b4419a --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/invalid/tools_config_inv_02.yaml @@ -0,0 +1,17 @@ +# This yaml file is a configuration file for a tool that uses Spark 3.5.0 as dependency +# invalid: Local file can be verified during the initialization and it should fail because no such +# file exists. +# Error:2 validation errors for ToolsConfig +# runtime.dependencies.0.uri.url +# Input should be a valid URL, relative URL without a base [type=url_parsing, input_value='/home/user/workdir...k-3.5.0-bin-hadoop3.tgz', input_type=str] +# For further information visit https://errors.pydantic.dev/2.9/v/url_parsing +# runtime.dependencies.0.uri.function-after[validate_file(), lax-or-strict[lax=union[json-or-python[json=function-after[path_validator(), str],python=is-instance[Path]],function-after[path_validator(), str]],strict=json-or-python[json=function-after[path_validator(), str],python=is-instance[Path]]]] +# Path does not point to a file [type=path_not_file, input_value='/home/archive.apache.org...k-3.5.0-bin-hadoop3.tgz', input_type=str] +api_version: '1.0' +runtime: + dependencies: + - name: my-spark350 + uri: /home/user/workdir/spark-3.5.0-bin-hadoop3.tgz + dependency_type: + dep_type: archive + relative_path: jars/* diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid/tools_config_00.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid/tools_config_00.yaml new file mode 100644 index 000000000..f7dccf951 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid/tools_config_00.yaml @@ -0,0 +1,11 @@ +# This yaml file is a configuration file for a tool that uses Spark 3.5.0 as dependency +# Minimal file content. 
+api_version: '1.0' +runtime: + dependencies: + - name: my-spark350 + uri: https:///archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz + dependency_type: + dep_type: archive + # for tgz files, it is required to give the subfolder where the jars are located + relative_path: jars/* diff --git a/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py index b899038b7..300751242 100644 --- a/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py +++ b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py @@ -24,7 +24,8 @@ from spark_rapids_tools import CspEnv from spark_rapids_tools.cmdli.argprocessor import AbsToolUserArgModel, ArgValueCase from spark_rapids_tools.enums import QualFilterApp -from .conftest import SparkRapidsToolsUT, autotuner_prop_path, all_cpu_cluster_props, all_csps +from .conftest import SparkRapidsToolsUT, autotuner_prop_path, all_cpu_cluster_props, all_csps, \ + valid_tools_conf_files, invalid_tools_conf_files @dataclasses.dataclass @@ -77,22 +78,24 @@ def validate_args_w_savings_disabled(tool_name: str, t_args: dict): @staticmethod def create_tool_args_should_pass(tool_name: str, platform=None, cluster=None, - eventlogs=None, tools_jar=None): + eventlogs=None, tools_jar=None, tools_config_path=None): return AbsToolUserArgModel.create_tool_args(tool_name, platform=platform, cluster=cluster, eventlogs=eventlogs, - tools_jar=tools_jar) + tools_jar=tools_jar, + tools_config_path=tools_config_path) @staticmethod def create_tool_args_should_fail(tool_name: str, platform=None, cluster=None, - eventlogs=None, tools_jar=None): + eventlogs=None, tools_jar=None, tools_config_path=None): with pytest.raises(SystemExit) as pytest_wrapped_e: AbsToolUserArgModel.create_tool_args(tool_name, platform=platform, cluster=cluster, eventlogs=eventlogs, - tools_jar=tools_jar) + tools_jar=tools_jar, + tools_config_path=tools_config_path) assert pytest_wrapped_e.type == SystemExit @staticmethod @@ -321,6 +324,31 @@ def test_profiler_with_driverlog(self, get_ut_data_dir, prop_path): assert not prof_args['requiresEventlogs'] assert prof_args['rapidOptions']['driverlog'] == f'{get_ut_data_dir}/{prop_path}' + @pytest.mark.parametrize('tool_name', ['profiling', 'qualification']) + @pytest.mark.parametrize('csp', all_csps) + @pytest.mark.parametrize('tools_conf_fname', valid_tools_conf_files) + def test_tools_configs(self, get_ut_data_dir, tool_name, csp, tools_conf_fname): + tools_conf_path = f'{get_ut_data_dir}/tools_config/valid/{tools_conf_fname}' + # should pass: tools config file is provided + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + eventlogs=f'{get_ut_data_dir}/eventlogs', + tools_config_path=tools_conf_path) + assert tool_args['toolsConfig'] is not None + + @pytest.mark.parametrize('tool_name', ['profiling', 'qualification']) + @pytest.mark.parametrize('csp', all_csps) + @pytest.mark.parametrize('tools_conf_fname', invalid_tools_conf_files) + def test_invalid_tools_configs(self, get_ut_data_dir, tool_name, csp, tools_conf_fname): + tools_conf_path = f'{get_ut_data_dir}/tools_config/invalid/{tools_conf_fname}' + # should pass: tools config file is provided + with pytest.raises(SystemExit) as pytest_wrapped_e: + AbsToolUserArgModel.create_tool_args(tool_name, + platform=csp, + eventlogs=f'{get_ut_data_dir}/eventlogs', + tools_config_path=tools_conf_path) + assert pytest_wrapped_e.type == SystemExit + def test_arg_cases_coverage(self): """ This test 
ensures that above tests have covered all possible states of the `platform`, `cluster`, From 153661e2e623adabb08d970c022f24137cb9c7bb Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Thu, 31 Oct 2024 16:09:24 -0500 Subject: [PATCH 5/6] improve field descriptions Signed-off-by: Ahmed Hussein --- .../rapids/rapids_tool.py | 2 +- .../spark_rapids_tools/cmdli/argprocessor.py | 13 +- .../src/spark_rapids_tools/cmdli/tools_cli.py | 4 + .../configuration/common.py | 47 ++--- .../configuration/runtime_conf.py | 6 +- .../configuration/tools_config.py | 9 +- user_tools/src/spark_rapids_tools/enums.py | 11 +- .../storagelib/tools/fs_utils.py | 20 +- .../sample-config-specification.json | 182 ++++++++++++++++++ 9 files changed, 235 insertions(+), 59 deletions(-) create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index a8544d0ec..c666381b0 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -583,7 +583,7 @@ def cache_single_dependency(dep: RuntimeDependency) -> str: raise ValueError(f'Invalid dependency type [{defined_dep_type}]') return dep_item - def cache_all_dependencies(dep_arr: List[RuntimeDependency]): + def cache_all_dependencies(dep_arr: List[RuntimeDependency]) -> List[str]: """ Create a thread pool and download specified urls """ diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index 439abd546..5b9ce5297 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -372,7 +372,7 @@ def process_jvm_args(self) -> None: self.p_args['toolArgs']['jobResources'] = adjusted_resources self.p_args['toolArgs']['log4jPath'] = Utils.resource_path('dev/log4j.properties') - def process_tools_config(self) -> None: + def load_tools_config(self) -> None: """ Load the tools config file if it is provided. It creates a ToolsConfig object and sets it in the toolArgs without processing the actual dependencies. @@ -384,10 +384,13 @@ def process_tools_config(self) -> None: try: self.p_args['toolArgs']['toolsConfig'] = ToolsConfig.load_from_file(self.tools_config_path) except ValidationError as ve: + # If required, we can dump the expected specification by appending + # 'ToolsConfig.get_schema()' to the error message raise PydanticCustomError( - 'invalid_argument', + 'invalid_config', f'Tools config file path {self.tools_config_path} could not be loaded. ' - f'It is expected to be a valid YAML file.\n Error:{ve}') + f'It is expected to be a valid configuration YAML file.' 
+ f'\n Error:{ve}\n') from ve def init_extra_arg_cases(self) -> list: if self.eventlogs is None: @@ -501,7 +504,7 @@ def build_tools_args(self) -> dict: # process JVM arguments self.process_jvm_args() # process the tools config file - self.process_tools_config() + self.load_tools_config() # finally generate the final values wrapped_args = { @@ -624,7 +627,7 @@ def build_tools_args(self) -> dict: # process JVM arguments self.process_jvm_args() # process the tools config file - self.process_tools_config() + self.load_tools_config() # finally generate the final values wrapped_args = { 'runtimePlatform': runtime_platform, diff --git a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py index ba39e0a35..0d46e5025 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py +++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py @@ -81,6 +81,8 @@ def qualification(self, Default is calculated as a function of the total number of cores and the heap size on the host. :param verbose: True or False to enable verbosity of the script. :param tools_config_file: Path to a configuration file that contains the tools' options. + For sample configuration files, please visit + https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid :param rapids_options: A list of valid Qualification tool options. Note that the wrapper ignores ["output-directory", "platform"] flags, and it does not support multiple "spark-property" arguments. @@ -163,6 +165,8 @@ def profiling(self, Default is calculated as a function of the total number of cores and the heap size on the host. :param verbose: True or False to enable verbosity of the script. :param tools_config_file: Path to a configuration file that contains the tools' options. + For sample configuration files, please visit + https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid :param rapids_options: A list of valid Profiling tool options. Note that the wrapper ignores ["output-directory", "worker-info"] flags, and it does not support multiple "spark-property" arguments. diff --git a/user_tools/src/spark_rapids_tools/configuration/common.py b/user_tools/src/spark_rapids_tools/configuration/common.py index c199977a1..9ac02313c 100644 --- a/user_tools/src/spark_rapids_tools/configuration/common.py +++ b/user_tools/src/spark_rapids_tools/configuration/common.py @@ -23,33 +23,28 @@ class RuntimeDependencyType(BaseModel): - """Defines the type of dependency. It can be one of the following: - - Archived file (.tgz) - - Simple JAR file (*.jar) - - Classpath directory (not yet supported) - - Note: The 'classpath' type is reserved for future use, allowing users to point to a directory - in the classpath without needing to download or copy any binaries.""" + """Defines the type of runtime dependency required by the tools' java cmd.""" dep_type: DependencyType = Field( - description='The type of the dependency', + description='The type of the dependency.', validation_alias=AliasChoices('dep_type', 'depType')) relative_path: str = Field( default=None, - description='The relative path of the dependency in the classpath. This is relevant for tar files', + description='Specifies the relative path from within the archive file which will be added to the java cmd. 
' + 'Requires field dep_type to be set to (archive).', validation_alias=AliasChoices('relative_path', 'relativePath'), - examples=['jars/*']) + examples=['*']) class DependencyVerification(BaseModel): - """The verification information of the dependency.""" + """The verification information of a runtime dependency required by the tools' java cmd.""" size: int = Field( default=0, description='The size of the dependency file.', examples=[3265393]) file_hash: FileHashAlgorithm = Field( default=None, - description='The hash function to verify the file', + description='The hash function to verify the file.', validation_alias=AliasChoices('file_hash', 'fileHash'), examples=[ { @@ -59,29 +54,21 @@ class DependencyVerification(BaseModel): class RuntimeDependency(BaseModel): - """The runtime dependency required by the tools Jar cmd. All elements are downloaded and added - to the classPath.""" - name: str = Field(description='The name of the dependency') + """Holds information about a runtime dependency required by the tools' java cmd.""" + name: str = Field( + description='The name of the dependency.', + examples=['Spark-3.5.0', 'AWS Java SDK']) uri: Union[AnyUrl, FilePath] = Field( - description='The FileURI of the dependency. It can be a local file or a remote file', - examples=['file:///path/to/file.jar', + description='The location of the dependency file. It can be a URL to a remote web/storage or a file path.', + examples=['file:///path/to/file.tgz', 'https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar', 'gs://bucket-name/path/to/file.jar']) dependency_type: RuntimeDependencyType = Field( default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.get_default()), - description='The type of the dependency and how to find the lib files after decompression.', + description='Specifies the dependency type to determine how the item is processed. ' + 'For example, jar files are appended to the java classpath while archive files ' + 'such as spark are extracted first before adding subdirectory _/jars/* to the classpath.', validation_alias=AliasChoices('dependency_type', 'dependencyType')) verification: DependencyVerification = Field( default=None, - description='The verification information of the dependency.', - examples=[ - { - 'size': 3265393 - }, - { - 'fileHash': { - 'algorithm': 'md5', - 'value': 'bc9bf7fedde0e700b974426fbd8d869c' - } - } - ]) + description='Optional specification to verify the dependency file.') diff --git a/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py b/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py index 91ab4e1b6..40ab68cf7 100644 --- a/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py +++ b/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py @@ -24,5 +24,7 @@ class ToolsRuntimeConfig(BaseModel): """The runtime configurations of the tools as defined by the user.""" dependencies: List[RuntimeDependency] = Field( - description='The list of runtime dependencies required by the tools Jar cmd. ' - 'All elements are downloaded and added to the classPath') + description='The list of runtime dependencies required by the tools java cmd. ' + 'Set this list to specify Spark binaries along with any other required jar ' + 'files (i.e., hadoop jars, gcp connectors,..etc.). 
' + 'When specified, the default predefined dependencies will be ignored.') diff --git a/user_tools/src/spark_rapids_tools/configuration/tools_config.py b/user_tools/src/spark_rapids_tools/configuration/tools_config.py index b226efdcb..b330cfc28 100644 --- a/user_tools/src/spark_rapids_tools/configuration/tools_config.py +++ b/user_tools/src/spark_rapids_tools/configuration/tools_config.py @@ -19,7 +19,6 @@ from typing import Union, Optional from pydantic import BaseModel, Field, ValidationError -from pydantic_core import PydanticCustomError from spark_rapids_tools import CspPathT from spark_rapids_tools.configuration.runtime_conf import ToolsRuntimeConfig @@ -31,12 +30,12 @@ class ToolsConfig(BaseModel): api_version: float = Field( description='The version of the API that the tools are using. ' 'This is used to test the compatibility of the ' - 'configuration file against the current tools release', + 'configuration file against the current tools release.', examples=['1.0'], le=1.0, # minimum version compatible with the current tools implementation ge=1.0) runtime: ToolsRuntimeConfig = Field( - description='Configuration related to the runtime environment of the tools') + description='Configuration related to the runtime environment of the tools.') @classmethod def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfig']: @@ -45,9 +44,9 @@ def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfi prop_container = AbstractPropContainer.load_from_file(file_path) return cls(**prop_container.props) except ValidationError as e: + # Do nothing. This is kept as a place holder if we want to log the error inside the + # class first raise e - # raise PydanticCustomError('invalid_argument', - # 'Invalid Tools Configuration File\n Error:') from e @classmethod def get_schema(cls) -> str: diff --git a/user_tools/src/spark_rapids_tools/enums.py b/user_tools/src/spark_rapids_tools/enums.py index f51bcdeef..46db8aad1 100644 --- a/user_tools/src/spark_rapids_tools/enums.py +++ b/user_tools/src/spark_rapids_tools/enums.py @@ -92,7 +92,7 @@ def _missing_(cls, value): return None def get_hash_func(self) -> Callable: - """Maps the hash function to the appropriate hashing algorithm""" + """Maps the hash function to the appropriate hashing algorithm.""" hash_functions = { self.MD5: hashlib.md5, self.SHA1: hashlib.sha1, @@ -103,16 +103,13 @@ def get_hash_func(self) -> Callable: class DependencyType(EnumeratedType): - """Represents the dependency type for the jar cmd""" + """Represents the dependency type for the tools' java cmd.""" JAR = 'jar' ARCHIVE = 'archive' - # When classpath is used, it means that the url of a dependency is used as is. - # i.e., it is a folder, or a path that is added to the classPath for java CLI. - CLASSPATH = 'classpath' @classmethod - def get_default(cls): - """Returns the default dependency type""" + def get_default(cls) -> 'DependencyType': + """Returns the default dependency type.""" return cls.JAR diff --git a/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py b/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py index 58dc56ebc..f51bbec8d 100644 --- a/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py +++ b/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py @@ -67,19 +67,21 @@ def raise_invalid_file(file_path: CspPath, msg: str, error_type: str = 'invalid_ @dataclass class FileHashAlgorithm: """ - Represents a file hash algorithm and its value. 
Used for verification against an - existing file. - ```py - try: - file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...') - file_algo.verify_file(CspPath('file://path/to/file')) - except ValidationError as e: - print(e) - ``` + Represents a file hash algorithm and its value. Used for verification against an existing file. """ algorithm: HashAlgorithm value: str + """ + Example usage for the class: + ```py + try: + file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...') + file_algo.verify_file(CspPath('file://path/to/file')) + except ValidationError as e: + print(e) + ``` + """ def verify_file(self, file_path: CspPath) -> bool: cb = self.algorithm.get_hash_func() with file_path.open_input_stream() as stream: diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json new file mode 100644 index 000000000..009863963 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json @@ -0,0 +1,182 @@ +{ + "$defs": { + "DependencyType": { + "description": "Represents the dependency type for the tools' java cmd.", + "enum": [ + "jar", + "archive" + ], + "title": "DependencyType", + "type": "string" + }, + "DependencyVerification": { + "description": "The verification information of a runtime dependency required by the tools' java cmd.", + "properties": { + "size": { + "default": 0, + "description": "The size of the dependency file.", + "examples": [ + 3265393 + ], + "title": "Size", + "type": "integer" + }, + "file_hash": { + "$ref": "#/$defs/FileHashAlgorithm", + "default": null, + "description": "The hash function to verify the file.", + "examples": [ + { + "algorithm": "md5", + "value": "bc9bf7fedde0e700b974426fbd8d869c" + } + ] + } + }, + "title": "DependencyVerification", + "type": "object" + }, + "FileHashAlgorithm": { + "description": "Represents a file hash algorithm and its value. Used for verification against an existing file.", + "properties": { + "algorithm": { + "$ref": "#/$defs/HashAlgorithm" + }, + "value": { + "title": "Value", + "type": "string" + } + }, + "required": [ + "algorithm", + "value" + ], + "title": "FileHashAlgorithm", + "type": "object" + }, + "HashAlgorithm": { + "description": "Represents the supported hashing algorithms", + "enum": [ + "md5", + "sha1", + "sha256", + "sha512" + ], + "title": "HashAlgorithm", + "type": "string" + }, + "RuntimeDependency": { + "description": "Holds information about a runtime dependency required by the tools' java cmd.", + "properties": { + "name": { + "description": "The name of the dependency.", + "examples": [ + "Spark-3.5.0", + "AWS Java SDK" + ], + "title": "Name", + "type": "string" + }, + "uri": { + "anyOf": [ + { + "format": "uri", + "minLength": 1, + "type": "string" + }, + { + "format": "file-path", + "type": "string" + } + ], + "description": "The location of the dependency file. It can be a URL to a remote web/storage or a file path.", + "examples": [ + "file:///path/to/file.tgz", + "https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar", + "gs://bucket-name/path/to/file.jar" + ], + "title": "Uri" + }, + "dependency_type": { + "$ref": "#/$defs/RuntimeDependencyType", + "description": "Specifies the dependency type to determine how the item is processed. 
For example, jar files are appended to the java classpath while archive files such as spark are extracted first before adding subdirectory _/jars/* to the classpath." + }, + "verification": { + "$ref": "#/$defs/DependencyVerification", + "default": null, + "description": "Optional specification to verify the dependency file." + } + }, + "required": [ + "name", + "uri" + ], + "title": "RuntimeDependency", + "type": "object" + }, + "RuntimeDependencyType": { + "description": "Defines the type of runtime dependency required by the tools' java cmd.", + "properties": { + "dep_type": { + "$ref": "#/$defs/DependencyType", + "description": "The type of the dependency." + }, + "relative_path": { + "default": null, + "description": "Specifies the relative path from within the archive file which will be added to the java cmd. Requires field dep_type to be set to (archive).", + "examples": [ + "*" + ], + "title": "Relative Path", + "type": "string" + } + }, + "required": [ + "dep_type" + ], + "title": "RuntimeDependencyType", + "type": "object" + }, + "ToolsRuntimeConfig": { + "description": "The runtime configurations of the tools as defined by the user.", + "properties": { + "dependencies": { + "description": "The list of runtime dependencies required by the tools java cmd. Set this list to specify Spark binaries along with any other required jar files (i.e., hadoop jars, gcp connectors,..etc.). When specified, the default predefined dependencies will be ignored.", + "items": { + "$ref": "#/$defs/RuntimeDependency" + }, + "title": "Dependencies", + "type": "array" + } + }, + "required": [ + "dependencies" + ], + "title": "ToolsRuntimeConfig", + "type": "object" + } + }, + "description": "Main container for the user's defined tools configuration", + "properties": { + "api_version": { + "description": "The version of the API that the tools are using. This is used to test the compatibility of the configuration file against the current tools release.", + "examples": [ + "1.0" + ], + "maximum": 1.0, + "minimum": 1.0, + "title": "Api Version", + "type": "number" + }, + "runtime": { + "$ref": "#/$defs/ToolsRuntimeConfig", + "description": "Configuration related to the runtime environment of the tools." + } + }, + "required": [ + "api_version", + "runtime" + ], + "title": "ToolsConfig", + "type": "object" +} From cf9e295aedeabfca7b9e1030300575f758180371 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Fri, 1 Nov 2024 09:09:28 -0500 Subject: [PATCH 6/6] address review comments Signed-off-by: Ahmed Hussein --- user_tools/src/spark_rapids_tools/cmdli/argprocessor.py | 2 +- user_tools/src/spark_rapids_tools/configuration/common.py | 2 +- .../resources/tools_config/sample-config-specification.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index 5b9ce5297..00e7e51f3 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -389,7 +389,7 @@ def load_tools_config(self) -> None: raise PydanticCustomError( 'invalid_config', f'Tools config file path {self.tools_config_path} could not be loaded. ' - f'It is expected to be a valid configuration YAML file.' + 'It is expected to be a valid configuration YAML file.' 
f'\n Error:{ve}\n') from ve def init_extra_arg_cases(self) -> list: diff --git a/user_tools/src/spark_rapids_tools/configuration/common.py b/user_tools/src/spark_rapids_tools/configuration/common.py index 9ac02313c..439904cae 100644 --- a/user_tools/src/spark_rapids_tools/configuration/common.py +++ b/user_tools/src/spark_rapids_tools/configuration/common.py @@ -33,7 +33,7 @@ class RuntimeDependencyType(BaseModel): description='Specifies the relative path from within the archive file which will be added to the java cmd. ' 'Requires field dep_type to be set to (archive).', validation_alias=AliasChoices('relative_path', 'relativePath'), - examples=['*']) + examples=['jars/*']) class DependencyVerification(BaseModel): diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json index 009863963..9dbef10ab 100644 --- a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json +++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json @@ -125,7 +125,7 @@ "default": null, "description": "Specifies the relative path from within the archive file which will be added to the java cmd. Requires field dep_type to be set to (archive).", "examples": [ - "*" + "jars/*" ], "title": "Relative Path", "type": "string"
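---

Reviewer note (not part of the patch series): the sketch below shows one way to exercise the new configuration model directly in Python, mirroring what `load_tools_config()` does in the CLI. It assumes the modules introduced above are importable; the file name `my-tools-config.yaml` is purely illustrative and stands in for a YAML file shaped like the valid samples under `tests/spark_rapids_tools_ut/resources/tools_config/valid`.

```py
# Hedged sketch: validate a user-supplied tools config outside the CLI wrapper,
# using only the classes added in this series.
from pydantic import ValidationError

from spark_rapids_tools.configuration.tools_config import ToolsConfig

config_path = 'my-tools-config.yaml'  # hypothetical path to a config like the valid samples

try:
    tools_cfg = ToolsConfig.load_from_file(config_path)
    for dep in tools_cfg.runtime.dependencies:
        # dependency_type falls back to the default 'jar' type when omitted in the YAML
        print(f'{dep.name}: {dep.uri} [{dep.dependency_type.dep_type}]')
except ValidationError as err:
    # The CLI surfaces the same failure as an 'invalid_config' PydanticCustomError
    print(f'Could not load tools config: {err}')

# ToolsConfig.get_schema() dumps the expected specification, which is what the
# checked-in sample-config-specification.json reflects.
print(ToolsConfig.get_schema())
```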