diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
index a8544d0ec..c666381b0 100644
--- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
+++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
@@ -583,7 +583,7 @@ def cache_single_dependency(dep: RuntimeDependency) -> str:
                     raise ValueError(f'Invalid dependency type [{defined_dep_type}]')
             return dep_item
 
-        def cache_all_dependencies(dep_arr: List[RuntimeDependency]):
+        def cache_all_dependencies(dep_arr: List[RuntimeDependency]) -> List[str]:
             """
             Create a thread pool and download specified urls
             """
diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
index 439abd546..5b9ce5297 100644
--- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
@@ -372,7 +372,7 @@ def process_jvm_args(self) -> None:
         self.p_args['toolArgs']['jobResources'] = adjusted_resources
         self.p_args['toolArgs']['log4jPath'] = Utils.resource_path('dev/log4j.properties')
 
-    def process_tools_config(self) -> None:
+    def load_tools_config(self) -> None:
         """
         Load the tools config file if it is provided. It creates a ToolsConfig object and
         sets it in the toolArgs without processing the actual dependencies.
         """
@@ -384,10 +384,13 @@
             try:
                 self.p_args['toolArgs']['toolsConfig'] = ToolsConfig.load_from_file(self.tools_config_path)
             except ValidationError as ve:
+                # If required, we can dump the expected specification by appending
+                # 'ToolsConfig.get_schema()' to the error message
                 raise PydanticCustomError(
-                    'invalid_argument',
+                    'invalid_config',
                     f'Tools config file path {self.tools_config_path} could not be loaded. '
-                    f'It is expected to be a valid YAML file.\n Error:{ve}')
+                    f'It is expected to be a valid configuration YAML file.'
+                    f'\n Error:{ve}\n') from ve
 
     def init_extra_arg_cases(self) -> list:
         if self.eventlogs is None:
@@ -501,7 +504,7 @@ def build_tools_args(self) -> dict:
         # process JVM arguments
         self.process_jvm_args()
         # process the tools config file
-        self.process_tools_config()
+        self.load_tools_config()
 
         # finally generate the final values
         wrapped_args = {
@@ -624,7 +627,7 @@ def build_tools_args(self) -> dict:
         # process JVM arguments
         self.process_jvm_args()
         # process the tools config file
-        self.process_tools_config()
+        self.load_tools_config()
         # finally generate the final values
         wrapped_args = {
             'runtimePlatform': runtime_platform,
diff --git a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
index ba39e0a35..0d46e5025 100644
--- a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
+++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -81,6 +81,8 @@ def qualification(self,
                Default is calculated as a function of the total number of cores and the heap size on the host.
         :param verbose: True or False to enable verbosity of the script.
         :param tools_config_file: Path to a configuration file that contains the tools' options.
+               For sample configuration files, please visit
+               https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid
         :param rapids_options: A list of valid Qualification tool options.
                Note that the wrapper ignores ["output-directory", "platform"] flags, and
                it does not support multiple "spark-property" arguments.
@@ -163,6 +165,8 @@ def profiling(self,
                Default is calculated as a function of the total number of cores and the heap size on the host.
         :param verbose: True or False to enable verbosity of the script.
         :param tools_config_file: Path to a configuration file that contains the tools' options.
+               For sample configuration files, please visit
+               https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid
         :param rapids_options: A list of valid Profiling tool options.
                Note that the wrapper ignores ["output-directory", "worker-info"] flags, and
                it does not support multiple "spark-property" arguments.
diff --git a/user_tools/src/spark_rapids_tools/configuration/common.py b/user_tools/src/spark_rapids_tools/configuration/common.py
index c199977a1..9ac02313c 100644
--- a/user_tools/src/spark_rapids_tools/configuration/common.py
+++ b/user_tools/src/spark_rapids_tools/configuration/common.py
@@ -23,33 +23,28 @@ class RuntimeDependencyType(BaseModel):
-    """Defines the type of dependency. It can be one of the following:
-    - Archived file (.tgz)
-    - Simple JAR file (*.jar)
-    - Classpath directory (not yet supported)
-
-    Note: The 'classpath' type is reserved for future use, allowing users to point to a directory
-    in the classpath without needing to download or copy any binaries."""
+    """Defines the type of runtime dependency required by the tools' java cmd."""
     dep_type: DependencyType = Field(
-        description='The type of the dependency',
+        description='The type of the dependency.',
         validation_alias=AliasChoices('dep_type', 'depType'))
     relative_path: str = Field(
         default=None,
-        description='The relative path of the dependency in the classpath. This is relevant for tar files',
+        description='Specifies the relative path from within the archive file which will be added to the java cmd. '
+                    'Requires field dep_type to be set to (archive).',
         validation_alias=AliasChoices('relative_path', 'relativePath'),
-        examples=['jars/*'])
+        examples=['*'])
 
 
 class DependencyVerification(BaseModel):
-    """The verification information of the dependency."""
+    """The verification information of a runtime dependency required by the tools' java cmd."""
     size: int = Field(
         default=0,
         description='The size of the dependency file.',
         examples=[3265393])
     file_hash: FileHashAlgorithm = Field(
         default=None,
-        description='The hash function to verify the file',
+        description='The hash function to verify the file.',
         validation_alias=AliasChoices('file_hash', 'fileHash'),
         examples=[
             {
@@ -59,29 +54,21 @@ class RuntimeDependency(BaseModel):
-    """The runtime dependency required by the tools Jar cmd. All elements are downloaded and added
-    to the classPath."""
-    name: str = Field(description='The name of the dependency')
+    """Holds information about a runtime dependency required by the tools' java cmd."""
+    name: str = Field(
+        description='The name of the dependency.',
+        examples=['Spark-3.5.0', 'AWS Java SDK'])
     uri: Union[AnyUrl, FilePath] = Field(
-        description='The FileURI of the dependency. It can be a local file or a remote file',
-        examples=['file:///path/to/file.jar',
+        description='The location of the dependency file. It can be a URL to a remote web/storage or a file path.',
+        examples=['file:///path/to/file.tgz',
                   'https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar',
                   'gs://bucket-name/path/to/file.jar'])
     dependency_type: RuntimeDependencyType = Field(
         default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.get_default()),
-        description='The type of the dependency and how to find the lib files after decompression.',
+        description='Specifies the dependency type to determine how the item is processed. '
+                    'For example, jar files are appended to the java classpath while archive files '
+                    'such as spark are extracted first before adding subdirectory _/jars/* to the classpath.',
         validation_alias=AliasChoices('dependency_type', 'dependencyType'))
     verification: DependencyVerification = Field(
         default=None,
-        description='The verification information of the dependency.',
-        examples=[
-            {
-                'size': 3265393
-            },
-            {
-                'fileHash': {
-                    'algorithm': 'md5',
-                    'value': 'bc9bf7fedde0e700b974426fbd8d869c'
-                }
-            }
-        ])
+        description='Optional specification to verify the dependency file.')
diff --git a/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py b/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py
index 91ab4e1b6..40ab68cf7 100644
--- a/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py
+++ b/user_tools/src/spark_rapids_tools/configuration/runtime_conf.py
@@ -24,5 +24,7 @@ class ToolsRuntimeConfig(BaseModel):
     """The runtime configurations of the tools as defined by the user."""
     dependencies: List[RuntimeDependency] = Field(
-        description='The list of runtime dependencies required by the tools Jar cmd. '
-                    'All elements are downloaded and added to the classPath')
+        description='The list of runtime dependencies required by the tools java cmd. '
+                    'Set this list to specify Spark binaries along with any other required jar '
+                    'files (e.g., hadoop jars, gcp connectors, etc.). '
+                    'When specified, the default predefined dependencies will be ignored.')
diff --git a/user_tools/src/spark_rapids_tools/configuration/tools_config.py b/user_tools/src/spark_rapids_tools/configuration/tools_config.py
index b226efdcb..b330cfc28 100644
--- a/user_tools/src/spark_rapids_tools/configuration/tools_config.py
+++ b/user_tools/src/spark_rapids_tools/configuration/tools_config.py
@@ -19,7 +19,6 @@
 from typing import Union, Optional
 
 from pydantic import BaseModel, Field, ValidationError
-from pydantic_core import PydanticCustomError
 
 from spark_rapids_tools import CspPathT
 from spark_rapids_tools.configuration.runtime_conf import ToolsRuntimeConfig
@@ -31,12 +30,12 @@ class ToolsConfig(BaseModel):
     api_version: float = Field(
         description='The version of the API that the tools are using. '
                     'This is used to test the compatibility of the '
-                    'configuration file against the current tools release',
+                    'configuration file against the current tools release.',
         examples=['1.0'],
         le=1.0,  # minimum version compatible with the current tools implementation
         ge=1.0)
 
     runtime: ToolsRuntimeConfig = Field(
-        description='Configuration related to the runtime environment of the tools')
+        description='Configuration related to the runtime environment of the tools.')
 
     @classmethod
     def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfig']:
@@ -45,9 +44,9 @@ def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfi
             prop_container = AbstractPropContainer.load_from_file(file_path)
             return cls(**prop_container.props)
         except ValidationError as e:
+            # Do nothing. This is kept as a placeholder if we want to log the error inside the
+            # class first
             raise e
-            # raise PydanticCustomError('invalid_argument',
-            #                           'Invalid Tools Configuration File\n Error:') from e
 
     @classmethod
     def get_schema(cls) -> str:
diff --git a/user_tools/src/spark_rapids_tools/enums.py b/user_tools/src/spark_rapids_tools/enums.py
index f51bcdeef..46db8aad1 100644
--- a/user_tools/src/spark_rapids_tools/enums.py
+++ b/user_tools/src/spark_rapids_tools/enums.py
@@ -92,7 +92,7 @@ def _missing_(cls, value):
         return None
 
     def get_hash_func(self) -> Callable:
-        """Maps the hash function to the appropriate hashing algorithm"""
+        """Maps the hash function to the appropriate hashing algorithm."""
         hash_functions = {
             self.MD5: hashlib.md5,
             self.SHA1: hashlib.sha1,
@@ -103,16 +103,13 @@ class DependencyType(EnumeratedType):
-    """Represents the dependency type for the jar cmd"""
+    """Represents the dependency type for the tools' java cmd."""
     JAR = 'jar'
     ARCHIVE = 'archive'
-    # When classpath is used, it means that the url of a dependency is used as is.
-    # i.e., it is a folder, or a path that is added to the classPath for java CLI.
-    CLASSPATH = 'classpath'
 
     @classmethod
-    def get_default(cls):
-        """Returns the default dependency type"""
+    def get_default(cls) -> 'DependencyType':
+        """Returns the default dependency type."""
         return cls.JAR
diff --git a/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py b/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py
index 58dc56ebc..f51bbec8d 100644
--- a/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py
+++ b/user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py
@@ -67,19 +67,21 @@ def raise_invalid_file(file_path: CspPath, msg: str, error_type: str = 'invalid_
 @dataclass
 class FileHashAlgorithm:
     """
-    Represents a file hash algorithm and its value. Used for verification against an
-    existing file.
-    ```py
-    try:
-        file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...')
-        file_algo.verify_file(CspPath('file://path/to/file'))
-    except ValidationError as e:
-        print(e)
-    ```
+    Represents a file hash algorithm and its value. Used for verification against an existing file.
     """
     algorithm: HashAlgorithm
     value: str
+    """
+    Example usage for the class:
+    ```py
+    try:
+        file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...')
+        file_algo.verify_file(CspPath('file://path/to/file'))
+    except ValidationError as e:
+        print(e)
+    ```
+    """
 
     def verify_file(self, file_path: CspPath) -> bool:
         cb = self.algorithm.get_hash_func()
         with file_path.open_input_stream() as stream:
diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json
new file mode 100644
index 000000000..009863963
--- /dev/null
+++ b/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/sample-config-specification.json
@@ -0,0 +1,182 @@
+{
+  "$defs": {
+    "DependencyType": {
+      "description": "Represents the dependency type for the tools' java cmd.",
+      "enum": [
+        "jar",
+        "archive"
+      ],
+      "title": "DependencyType",
+      "type": "string"
+    },
+    "DependencyVerification": {
+      "description": "The verification information of a runtime dependency required by the tools' java cmd.",
+      "properties": {
+        "size": {
+          "default": 0,
+          "description": "The size of the dependency file.",
+          "examples": [
+            3265393
+          ],
+          "title": "Size",
+          "type": "integer"
+        },
+        "file_hash": {
+          "$ref": "#/$defs/FileHashAlgorithm",
+          "default": null,
+          "description": "The hash function to verify the file.",
+          "examples": [
+            {
+              "algorithm": "md5",
+              "value": "bc9bf7fedde0e700b974426fbd8d869c"
+            }
+          ]
+        }
+      },
+      "title": "DependencyVerification",
+      "type": "object"
+    },
+    "FileHashAlgorithm": {
+      "description": "Represents a file hash algorithm and its value. Used for verification against an existing file.",
+      "properties": {
+        "algorithm": {
+          "$ref": "#/$defs/HashAlgorithm"
+        },
+        "value": {
+          "title": "Value",
+          "type": "string"
+        }
+      },
+      "required": [
+        "algorithm",
+        "value"
+      ],
+      "title": "FileHashAlgorithm",
+      "type": "object"
+    },
+    "HashAlgorithm": {
+      "description": "Represents the supported hashing algorithms",
+      "enum": [
+        "md5",
+        "sha1",
+        "sha256",
+        "sha512"
+      ],
+      "title": "HashAlgorithm",
+      "type": "string"
+    },
+    "RuntimeDependency": {
+      "description": "Holds information about a runtime dependency required by the tools' java cmd.",
+      "properties": {
+        "name": {
+          "description": "The name of the dependency.",
+          "examples": [
+            "Spark-3.5.0",
+            "AWS Java SDK"
+          ],
+          "title": "Name",
+          "type": "string"
+        },
+        "uri": {
+          "anyOf": [
+            {
+              "format": "uri",
+              "minLength": 1,
+              "type": "string"
+            },
+            {
+              "format": "file-path",
+              "type": "string"
+            }
+          ],
+          "description": "The location of the dependency file. It can be a URL to a remote web/storage or a file path.",
+          "examples": [
+            "file:///path/to/file.tgz",
+            "https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar",
+            "gs://bucket-name/path/to/file.jar"
+          ],
+          "title": "Uri"
+        },
+        "dependency_type": {
+          "$ref": "#/$defs/RuntimeDependencyType",
+          "description": "Specifies the dependency type to determine how the item is processed. For example, jar files are appended to the java classpath while archive files such as spark are extracted first before adding subdirectory _/jars/* to the classpath."
+        },
+        "verification": {
+          "$ref": "#/$defs/DependencyVerification",
+          "default": null,
+          "description": "Optional specification to verify the dependency file."
+        }
+      },
+      "required": [
+        "name",
+        "uri"
+      ],
+      "title": "RuntimeDependency",
+      "type": "object"
+    },
+    "RuntimeDependencyType": {
+      "description": "Defines the type of runtime dependency required by the tools' java cmd.",
+      "properties": {
+        "dep_type": {
+          "$ref": "#/$defs/DependencyType",
+          "description": "The type of the dependency."
+        },
+        "relative_path": {
+          "default": null,
+          "description": "Specifies the relative path from within the archive file which will be added to the java cmd. Requires field dep_type to be set to (archive).",
+          "examples": [
+            "*"
+          ],
+          "title": "Relative Path",
+          "type": "string"
+        }
+      },
+      "required": [
+        "dep_type"
+      ],
+      "title": "RuntimeDependencyType",
+      "type": "object"
+    },
+    "ToolsRuntimeConfig": {
+      "description": "The runtime configurations of the tools as defined by the user.",
+      "properties": {
+        "dependencies": {
+          "description": "The list of runtime dependencies required by the tools java cmd. Set this list to specify Spark binaries along with any other required jar files (e.g., hadoop jars, gcp connectors, etc.). When specified, the default predefined dependencies will be ignored.",
+          "items": {
+            "$ref": "#/$defs/RuntimeDependency"
+          },
+          "title": "Dependencies",
+          "type": "array"
+        }
+      },
+      "required": [
+        "dependencies"
+      ],
+      "title": "ToolsRuntimeConfig",
+      "type": "object"
+    }
+  },
+  "description": "Main container for the user's defined tools configuration",
+  "properties": {
+    "api_version": {
+      "description": "The version of the API that the tools are using. This is used to test the compatibility of the configuration file against the current tools release.",
+      "examples": [
+        "1.0"
+      ],
+      "maximum": 1.0,
+      "minimum": 1.0,
+      "title": "Api Version",
+      "type": "number"
+    },
+    "runtime": {
+      "$ref": "#/$defs/ToolsRuntimeConfig",
+      "description": "Configuration related to the runtime environment of the tools."
+    }
+  },
+  "required": [
+    "api_version",
+    "runtime"
+  ],
+  "title": "ToolsConfig",
+  "type": "object"
+}
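
To make the schema above concrete, here is a minimal, hypothetical `tools_config.yaml` that would satisfy it. It is illustrative only (it is not one of the sample files under `tests/spark_rapids_tools_ut/resources/tools_config/valid`), and the Spark download URL, jar path, and hash value are placeholders rather than verified artifacts:

```yaml
# Hypothetical tools configuration; field names follow sample-config-specification.json above.
api_version: 1.0
runtime:
  dependencies:
    - name: Spark-3.5.0
      uri: https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
      dependency_type:
        dep_type: archive
        # relative path inside the extracted archive that is added to the java cmd
        relative_path: jars/*
      verification:
        file_hash:
          algorithm: sha256
          value: <sha256-of-the-archive>   # placeholder value
    - name: AWS Java SDK
      # dependency_type defaults to 'jar', so it is omitted for plain jar files
      uri: file:///path/to/aws-java-sdk-bundle.jar   # placeholder local path
```

Because the models declare camelCase aliases via `AliasChoices` (`dependencyType`, `depType`, `relativePath`, `fileHash`), the camelCase spellings are accepted in the YAML as well.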
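
And a minimal sketch of how the new `ToolsConfig` API consumes such a file, assuming the import path `spark_rapids_tools.configuration.tools_config` implied by the source layout in this diff; the config file name is hypothetical:

```py
# Sketch only: load and inspect a tools configuration file through the new API.
from pydantic import ValidationError

from spark_rapids_tools.configuration.tools_config import ToolsConfig

try:
    cfg = ToolsConfig.load_from_file('tools_config.yaml')  # hypothetical local file
    for dep in cfg.runtime.dependencies:
        print(f'{dep.name}: {dep.uri} [{dep.dependency_type.dep_type}]')
except ValidationError as err:
    # Invalid files surface as a pydantic ValidationError; the CLI layer wraps it in a
    # PydanticCustomError('invalid_config', ...) and could append ToolsConfig.get_schema().
    print(err)
```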