improve field descriptions
Signed-off-by: Ahmed Hussein <[email protected]>
amahussein committed Oct 31, 2024
1 parent 97b2371 commit 153661e
Showing 9 changed files with 235 additions and 59 deletions.
2 changes: 1 addition & 1 deletion user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
@@ -583,7 +583,7 @@ def cache_single_dependency(dep: RuntimeDependency) -> str:
raise ValueError(f'Invalid dependency type [{defined_dep_type}]')
return dep_item

def cache_all_dependencies(dep_arr: List[RuntimeDependency]):
def cache_all_dependencies(dep_arr: List[RuntimeDependency]) -> List[str]:
"""
Create a thread pool and download specified urls
"""
13 changes: 8 additions & 5 deletions user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
@@ -372,7 +372,7 @@ def process_jvm_args(self) -> None:
self.p_args['toolArgs']['jobResources'] = adjusted_resources
self.p_args['toolArgs']['log4jPath'] = Utils.resource_path('dev/log4j.properties')

def process_tools_config(self) -> None:
def load_tools_config(self) -> None:
"""
Load the tools config file if it is provided. It creates a ToolsConfig object and sets it
in the toolArgs without processing the actual dependencies.
@@ -384,10 +384,13 @@ def process_tools_config(self) -> None:
try:
self.p_args['toolArgs']['toolsConfig'] = ToolsConfig.load_from_file(self.tools_config_path)
except ValidationError as ve:
# If required, we can dump the expected specification by appending
# 'ToolsConfig.get_schema()' to the error message
raise PydanticCustomError(
'invalid_argument',
'invalid_config',
f'Tools config file path {self.tools_config_path} could not be loaded. '
f'It is expected to be a valid YAML file.\n Error:{ve}')
f'It is expected to be a valid configuration YAML file.'
f'\n Error:{ve}\n') from ve

def init_extra_arg_cases(self) -> list:
if self.eventlogs is None:
@@ -501,7 +504,7 @@ def build_tools_args(self) -> dict:
# process JVM arguments
self.process_jvm_args()
# process the tools config file
self.process_tools_config()
self.load_tools_config()

# finally generate the final values
wrapped_args = {
@@ -624,7 +627,7 @@ def build_tools_args(self) -> dict:
# process JVM arguments
self.process_jvm_args()
# process the tools config file
self.process_tools_config()
self.load_tools_config()
# finally generate the final values
wrapped_args = {
'runtimePlatform': runtime_platform,
4 changes: 4 additions & 0 deletions user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -81,6 +81,8 @@ def qualification(self,
Default is calculated as a function of the total number of cores and the heap size on the host.
:param verbose: True or False to enable verbosity of the script.
:param tools_config_file: Path to a configuration file that contains the tools' options.
For sample configuration files, please visit
https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid
:param rapids_options: A list of valid Qualification tool options.
Note that the wrapper ignores ["output-directory", "platform"] flags, and it does not support
multiple "spark-property" arguments.
@@ -163,6 +165,8 @@ def profiling(self,
Default is calculated as a function of the total number of cores and the heap size on the host.
:param verbose: True or False to enable verbosity of the script.
:param tools_config_file: Path to a configuration file that contains the tools' options.
For sample configuration files, please visit
https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid
:param rapids_options: A list of valid Profiling tool options.
Note that the wrapper ignores ["output-directory", "worker-info"] flags, and it does not support
multiple "spark-property" arguments.
47 changes: 17 additions & 30 deletions user_tools/src/spark_rapids_tools/configuration/common.py
@@ -23,33 +23,28 @@


class RuntimeDependencyType(BaseModel):
"""Defines the type of dependency. It can be one of the following:
- Archived file (.tgz)
- Simple JAR file (*.jar)
- Classpath directory (not yet supported)
Note: The 'classpath' type is reserved for future use, allowing users to point to a directory
in the classpath without needing to download or copy any binaries."""
"""Defines the type of runtime dependency required by the tools' java cmd."""

dep_type: DependencyType = Field(
description='The type of the dependency',
description='The type of the dependency.',
validation_alias=AliasChoices('dep_type', 'depType'))
relative_path: str = Field(
default=None,
description='The relative path of the dependency in the classpath. This is relevant for tar files',
description='Specifies the relative path from within the archive file which will be added to the java cmd. '
'Requires field dep_type to be set to (archive).',
validation_alias=AliasChoices('relative_path', 'relativePath'),
examples=['jars/*'])
examples=['*'])


class DependencyVerification(BaseModel):
"""The verification information of the dependency."""
"""The verification information of a runtime dependency required by the tools' java cmd."""
size: int = Field(
default=0,
description='The size of the dependency file.',
examples=[3265393])
file_hash: FileHashAlgorithm = Field(
default=None,
description='The hash function to verify the file',
description='The hash function to verify the file.',
validation_alias=AliasChoices('file_hash', 'fileHash'),
examples=[
{
@@ -59,29 +54,21 @@ class DependencyVerification(BaseModel):


class RuntimeDependency(BaseModel):
"""The runtime dependency required by the tools Jar cmd. All elements are downloaded and added
to the classPath."""
name: str = Field(description='The name of the dependency')
"""Holds information about a runtime dependency required by the tools' java cmd."""
name: str = Field(
description='The name of the dependency.',
examples=['Spark-3.5.0', 'AWS Java SDK'])
uri: Union[AnyUrl, FilePath] = Field(
description='The FileURI of the dependency. It can be a local file or a remote file',
examples=['file:///path/to/file.jar',
description='The location of the dependency file. It can be a URL to a remote web/storage or a file path.',
examples=['file:///path/to/file.tgz',
'https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar',
'gs://bucket-name/path/to/file.jar'])
dependency_type: RuntimeDependencyType = Field(
default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.get_default()),
description='The type of the dependency and how to find the lib files after decompression.',
description='Specifies the dependency type to determine how the item is processed. '
'For example, jar files are appended to the java classpath while archive files '
'such as spark are extracted first before adding subdirectory _/jars/* to the classpath.',
validation_alias=AliasChoices('dependency_type', 'dependencyType'))
verification: DependencyVerification = Field(
default=None,
description='The verification information of the dependency.',
examples=[
{
'size': 3265393
},
{
'fileHash': {
'algorithm': 'md5',
'value': 'bc9bf7fedde0e700b974426fbd8d869c'
}
}
])
description='Optional specification to verify the dependency file.')
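To make the relationship between these models concrete, here is a small hedged sketch that assembles a RuntimeDependency in code with a hash-based verification. The import paths are inferred from the file layout in this diff, the jar URI reuses the example from the uri field above, and the md5 value is the example shown for file_hash.

```python
# Hedged sketch: building a RuntimeDependency with hash verification.
# Import paths are inferred; the URI and md5 value reuse the field examples above.
from spark_rapids_tools.configuration.common import (
    DependencyVerification,
    RuntimeDependency,
    RuntimeDependencyType,
)
from spark_rapids_tools.enums import DependencyType, HashAlgorithm
from spark_rapids_tools.storagelib.tools.fs_utils import FileHashAlgorithm

tools_jar = RuntimeDependency(
    name='rapids-4-spark-tools_2.12-24.08.1',
    uri='https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar',
    dependency_type=RuntimeDependencyType(dep_type=DependencyType.JAR),
    verification=DependencyVerification(
        file_hash=FileHashAlgorithm(
            algorithm=HashAlgorithm.MD5,
            value='bc9bf7fedde0e700b974426fbd8d869c',
        ),
    ),
)
print(tools_jar.dependency_type.dep_type)  # e.g., DependencyType.JAR
```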
user_tools/src/spark_rapids_tools/configuration/runtime_conf.py
@@ -24,5 +24,7 @@
class ToolsRuntimeConfig(BaseModel):
"""The runtime configurations of the tools as defined by the user."""
dependencies: List[RuntimeDependency] = Field(
description='The list of runtime dependencies required by the tools Jar cmd. '
'All elements are downloaded and added to the classPath')
description='The list of runtime dependencies required by the tools java cmd. '
'Set this list to specify Spark binaries along with any other required jar '
'files (i.e., hadoop jars, gcp connectors,..etc.). '
'When specified, the default predefined dependencies will be ignored.')
user_tools/src/spark_rapids_tools/configuration/tools_config.py
@@ -19,7 +19,6 @@
from typing import Union, Optional

from pydantic import BaseModel, Field, ValidationError
from pydantic_core import PydanticCustomError

from spark_rapids_tools import CspPathT
from spark_rapids_tools.configuration.runtime_conf import ToolsRuntimeConfig
@@ -31,12 +30,12 @@ class ToolsConfig(BaseModel):
api_version: float = Field(
description='The version of the API that the tools are using. '
'This is used to test the compatibility of the '
'configuration file against the current tools release',
'configuration file against the current tools release.',
examples=['1.0'],
le=1.0, # minimum version compatible with the current tools implementation
ge=1.0)
runtime: ToolsRuntimeConfig = Field(
description='Configuration related to the runtime environment of the tools')
description='Configuration related to the runtime environment of the tools.')

@classmethod
def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfig']:
@@ -45,9 +44,9 @@ def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfi
prop_container = AbstractPropContainer.load_from_file(file_path)
return cls(**prop_container.props)
except ValidationError as e:
# Do nothing. This is kept as a place holder if we want to log the error inside the
# class first
raise e
# raise PydanticCustomError('invalid_argument',
# 'Invalid Tools Configuration File\n Error:') from e

@classmethod
def get_schema(cls) -> str:
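The load_tools_config step in argprocessor.py shown earlier is the main consumer of this entry point. A simplified, hedged sketch of that call pattern follows; the file path is a placeholder and the import path is inferred from the repository layout.

```python
# Hedged sketch: loading and validating a user-provided tools config file.
# The path is a placeholder; the import path is inferred from this diff.
from pydantic import ValidationError

from spark_rapids_tools.configuration.tools_config import ToolsConfig

try:
    tools_config = ToolsConfig.load_from_file('/path/to/tools_config.yaml')
    print(f'Loaded tools config, api_version={tools_config.api_version}')
except ValidationError as ve:
    # The CLI wrapper turns this into a PydanticCustomError('invalid_config', ...);
    # ToolsConfig.get_schema() can be appended to show the expected specification.
    print(f'Not a valid configuration YAML file:\n{ve}')
```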
11 changes: 4 additions & 7 deletions user_tools/src/spark_rapids_tools/enums.py
@@ -92,7 +92,7 @@ def _missing_(cls, value):
return None

def get_hash_func(self) -> Callable:
"""Maps the hash function to the appropriate hashing algorithm"""
"""Maps the hash function to the appropriate hashing algorithm."""
hash_functions = {
self.MD5: hashlib.md5,
self.SHA1: hashlib.sha1,
@@ -103,16 +103,13 @@ def get_hash_func(self) -> Callable:


class DependencyType(EnumeratedType):
"""Represents the dependency type for the jar cmd"""
"""Represents the dependency type for the tools' java cmd."""
JAR = 'jar'
ARCHIVE = 'archive'
# When classpath is used, it means that the url of a dependency is used as is.
# i.e., it is a folder, or a path that is added to the classPath for java CLI.
CLASSPATH = 'classpath'

@classmethod
def get_default(cls):
"""Returns the default dependency type"""
def get_default(cls) -> 'DependencyType':
"""Returns the default dependency type."""
return cls.JAR


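The descriptions added to common.py in this commit summarize how each dependency type is handled: jar files go on the classpath as-is, while archives are extracted before a subdirectory such as jars/* is added. The sketch below restates that behavior purely for illustration; it is not the actual download logic in rapids_tool.py, and the extraction layout is an assumption.

```python
# Illustrative sketch only: mapping a dependency type to a classpath entry,
# mirroring the field descriptions in common.py. Not the tool's actual code.
import os

from spark_rapids_tools.enums import DependencyType


def classpath_entry(dep_type: DependencyType, downloaded_path: str,
                    relative_path: str = 'jars/*') -> str:
    if dep_type == DependencyType.JAR:
        # jar files are appended to the java classpath without further processing
        return downloaded_path
    if dep_type == DependencyType.ARCHIVE:
        # archives (e.g., a Spark tgz) are assumed to be extracted first; the
        # configured relative path inside the extracted directory is then added
        extracted_dir = downloaded_path[:-len('.tgz')] if downloaded_path.endswith('.tgz') else downloaded_path
        return os.path.join(extracted_dir, relative_path)
    raise ValueError(f'Unsupported dependency type: {dep_type}')


print(classpath_entry(DependencyType.JAR, '/cache/rapids-4-spark-tools.jar'))
print(classpath_entry(DependencyType.ARCHIVE, '/cache/spark-3.5.0-bin-hadoop3.tgz'))
```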
20 changes: 11 additions & 9 deletions user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py
@@ -67,19 +67,21 @@ def raise_invalid_file(file_path: CspPath, msg: str, error_type: str = 'invalid_
@dataclass
class FileHashAlgorithm:
"""
Represents a file hash algorithm and its value. Used for verification against an
existing file.
```py
try:
file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...')
file_algo.verify_file(CspPath('file://path/to/file'))
except ValidationError as e:
print(e)
```
Represents a file hash algorithm and its value. Used for verification against an existing file.
"""
algorithm: HashAlgorithm
value: str

"""
Example usage for the class:
```py
try:
file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...')
file_algo.verify_file(CspPath('file://path/to/file'))
except ValidationError as e:
print(e)
```
"""
def verify_file(self, file_path: CspPath) -> bool:
cb = self.algorithm.get_hash_func()
with file_path.open_input_stream() as stream: