improve field descriptions
Signed-off-by: Ahmed Hussein <[email protected]>
amahussein committed Oct 31, 2024
1 parent 97b2371 commit 153661e
Showing 9 changed files with 235 additions and 59 deletions.
2 changes: 1 addition & 1 deletion user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
@@ -583,7 +583,7 @@ def cache_single_dependency(dep: RuntimeDependency) -> str:
raise ValueError(f'Invalid dependency type [{defined_dep_type}]')
return dep_item

def cache_all_dependencies(dep_arr: List[RuntimeDependency]):
def cache_all_dependencies(dep_arr: List[RuntimeDependency]) -> List[str]:
"""
Create a thread pool and download specified urls
"""
13 changes: 8 additions & 5 deletions user_tools/src/spark_rapids_tools/cmdli/argprocessor.py
@@ -372,7 +372,7 @@ def process_jvm_args(self) -> None:
self.p_args['toolArgs']['jobResources'] = adjusted_resources
self.p_args['toolArgs']['log4jPath'] = Utils.resource_path('dev/log4j.properties')

def process_tools_config(self) -> None:
def load_tools_config(self) -> None:
"""
Load the tools config file if it is provided. It creates a ToolsConfig object and sets it
in the toolArgs without processing the actual dependencies.
@@ -384,10 +384,13 @@ def process_tools_config(self) -> None:
try:
self.p_args['toolArgs']['toolsConfig'] = ToolsConfig.load_from_file(self.tools_config_path)
except ValidationError as ve:
# If required, we can dump the expected specification by appending
# 'ToolsConfig.get_schema()' to the error message
raise PydanticCustomError(
'invalid_argument',
'invalid_config',
f'Tools config file path {self.tools_config_path} could not be loaded. '
f'It is expected to be a valid YAML file.\n Error:{ve}')
f'It is expected to be a valid configuration YAML file.'
f'\n Error:{ve}\n') from ve

def init_extra_arg_cases(self) -> list:
if self.eventlogs is None:
@@ -501,7 +504,7 @@ def build_tools_args(self) -> dict:
# process JVM arguments
self.process_jvm_args()
# process the tools config file
self.process_tools_config()
self.load_tools_config()

# finally generate the final values
wrapped_args = {
@@ -624,7 +627,7 @@ def build_tools_args(self) -> dict:
# process JVM arguments
self.process_jvm_args()
# process the tools config file
self.process_tools_config()
self.load_tools_config()
# finally generate the final values
wrapped_args = {
'runtimePlatform': runtime_platform,
4 changes: 4 additions & 0 deletions user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -81,6 +81,8 @@ def qualification(self,
Default is calculated as a function of the total number of cores and the heap size on the host.
:param verbose: True or False to enable verbosity of the script.
:param tools_config_file: Path to a configuration file that contains the tools' options.
For sample configuration files, please visit
https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid
:param rapids_options: A list of valid Qualification tool options.
Note that the wrapper ignores ["output-directory", "platform"] flags, and it does not support
multiple "spark-property" arguments.
@@ -163,6 +165,8 @@ def profiling(self,
Default is calculated as a function of the total number of cores and the heap size on the host.
:param verbose: True or False to enable verbosity of the script.
:param tools_config_file: Path to a configuration file that contains the tools' options.
For sample configuration files, please visit
https://github.com/NVIDIA/spark-rapids-tools/tree/main/user_tools/tests/spark_rapids_tools_ut/resources/tools_config/valid
:param rapids_options: A list of valid Profiling tool options.
Note that the wrapper ignores ["output-directory", "worker-info"] flags, and it does not support
multiple "spark-property" arguments.
47 changes: 17 additions & 30 deletions user_tools/src/spark_rapids_tools/configuration/common.py
@@ -23,33 +23,28 @@


class RuntimeDependencyType(BaseModel):
"""Defines the type of dependency. It can be one of the following:
- Archived file (.tgz)
- Simple JAR file (*.jar)
- Classpath directory (not yet supported)
Note: The 'classpath' type is reserved for future use, allowing users to point to a directory
in the classpath without needing to download or copy any binaries."""
"""Defines the type of runtime dependency required by the tools' java cmd."""

dep_type: DependencyType = Field(
description='The type of the dependency',
description='The type of the dependency.',
validation_alias=AliasChoices('dep_type', 'depType'))
relative_path: str = Field(
default=None,
description='The relative path of the dependency in the classpath. This is relevant for tar files',
description='Specifies the relative path from within the archive file which will be added to the java cmd. '
'Requires field dep_type to be set to (archive).',
validation_alias=AliasChoices('relative_path', 'relativePath'),
examples=['jars/*'])
examples=['*'])


class DependencyVerification(BaseModel):
"""The verification information of the dependency."""
"""The verification information of a runtime dependency required by the tools' java cmd."""
size: int = Field(
default=0,
description='The size of the dependency file.',
examples=[3265393])
file_hash: FileHashAlgorithm = Field(
default=None,
description='The hash function to verify the file',
description='The hash function to verify the file.',
validation_alias=AliasChoices('file_hash', 'fileHash'),
examples=[
{
@@ -59,29 +54,21 @@ class DependencyVerification(BaseModel):


class RuntimeDependency(BaseModel):
"""The runtime dependency required by the tools Jar cmd. All elements are downloaded and added
to the classPath."""
name: str = Field(description='The name of the dependency')
"""Holds information about a runtime dependency required by the tools' java cmd."""
name: str = Field(
description='The name of the dependency.',
examples=['Spark-3.5.0', 'AWS Java SDK'])
uri: Union[AnyUrl, FilePath] = Field(
description='The FileURI of the dependency. It can be a local file or a remote file',
examples=['file:///path/to/file.jar',
description='The location of the dependency file. It can be a URL to a remote web/storage or a file path.',
examples=['file:///path/to/file.tgz',
'https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar',
'gs://bucket-name/path/to/file.jar'])
dependency_type: RuntimeDependencyType = Field(
default_factory=lambda: RuntimeDependencyType(dep_type=DependencyType.get_default()),
description='The type of the dependency and how to find the lib files after decompression.',
description='Specifies the dependency type to determine how the item is processed. '
'For example, jar files are appended to the java classpath while archive files '
'such as spark are extracted first before adding subdirectory _/jars/* to the classpath.',
validation_alias=AliasChoices('dependency_type', 'dependencyType'))
verification: DependencyVerification = Field(
default=None,
description='The verification information of the dependency.',
examples=[
{
'size': 3265393
},
{
'fileHash': {
'algorithm': 'md5',
'value': 'bc9bf7fedde0e700b974426fbd8d869c'
}
}
])
description='Optional specification to verify the dependency file.')
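To make the relationship between these models concrete, here is a small hedged sketch that assembles a RuntimeDependency in code with a hash-based verification. The import paths are inferred from the file layout in this diff, the jar URI reuses the example from the uri field above, and the md5 value is the example shown for file_hash.

```python
# Hedged sketch: building a RuntimeDependency with hash verification.
# Import paths are inferred; the URI and md5 value reuse the field examples above.
from spark_rapids_tools.configuration.common import (
    DependencyVerification,
    RuntimeDependency,
    RuntimeDependencyType,
)
from spark_rapids_tools.enums import DependencyType, HashAlgorithm
from spark_rapids_tools.storagelib.tools.fs_utils import FileHashAlgorithm

tools_jar = RuntimeDependency(
    name='rapids-4-spark-tools_2.12-24.08.1',
    uri='https://mvn-url/24.08.1/rapids-4-spark-tools_2.12-24.08.1.jar',
    dependency_type=RuntimeDependencyType(dep_type=DependencyType.JAR),
    verification=DependencyVerification(
        file_hash=FileHashAlgorithm(
            algorithm=HashAlgorithm.MD5,
            value='bc9bf7fedde0e700b974426fbd8d869c',
        ),
    ),
)
print(tools_jar.dependency_type.dep_type)  # e.g., DependencyType.JAR
```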
user_tools/src/spark_rapids_tools/configuration/runtime_conf.py
@@ -24,5 +24,7 @@
class ToolsRuntimeConfig(BaseModel):
"""The runtime configurations of the tools as defined by the user."""
dependencies: List[RuntimeDependency] = Field(
description='The list of runtime dependencies required by the tools Jar cmd. '
'All elements are downloaded and added to the classPath')
description='The list of runtime dependencies required by the tools java cmd. '
'Set this list to specify Spark binaries along with any other required jar '
'files (i.e., hadoop jars, gcp connectors,..etc.). '
'When specified, the default predefined dependencies will be ignored.')
user_tools/src/spark_rapids_tools/configuration/tools_config.py
@@ -19,7 +19,6 @@
from typing import Union, Optional

from pydantic import BaseModel, Field, ValidationError
from pydantic_core import PydanticCustomError

from spark_rapids_tools import CspPathT
from spark_rapids_tools.configuration.runtime_conf import ToolsRuntimeConfig
@@ -31,12 +30,12 @@ class ToolsConfig(BaseModel):
api_version: float = Field(
description='The version of the API that the tools are using. '
'This is used to test the compatibility of the '
'configuration file against the current tools release',
'configuration file against the current tools release.',
examples=['1.0'],
le=1.0, # minimum version compatible with the current tools implementation
ge=1.0)
runtime: ToolsRuntimeConfig = Field(
description='Configuration related to the runtime environment of the tools')
description='Configuration related to the runtime environment of the tools.')

@classmethod
def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfig']:
@@ -45,9 +44,9 @@ def load_from_file(cls, file_path: Union[str, CspPathT]) -> Optional['ToolsConfi
prop_container = AbstractPropContainer.load_from_file(file_path)
return cls(**prop_container.props)
except ValidationError as e:
# Do nothing. This is kept as a place holder if we want to log the error inside the
# class first
raise e
# raise PydanticCustomError('invalid_argument',
# 'Invalid Tools Configuration File\n Error:') from e

@classmethod
def get_schema(cls) -> str:
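The load_tools_config step in argprocessor.py shown earlier is the main consumer of this entry point. A simplified, hedged sketch of that call pattern follows; the file path is a placeholder and the import path is inferred from the repository layout.

```python
# Hedged sketch: loading and validating a user-provided tools config file.
# The path is a placeholder; the import path is inferred from this diff.
from pydantic import ValidationError

from spark_rapids_tools.configuration.tools_config import ToolsConfig

try:
    tools_config = ToolsConfig.load_from_file('/path/to/tools_config.yaml')
    print(f'Loaded tools config, api_version={tools_config.api_version}')
except ValidationError as ve:
    # The CLI wrapper turns this into a PydanticCustomError('invalid_config', ...);
    # ToolsConfig.get_schema() can be appended to show the expected specification.
    print(f'Not a valid configuration YAML file:\n{ve}')
```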
11 changes: 4 additions & 7 deletions user_tools/src/spark_rapids_tools/enums.py
@@ -92,7 +92,7 @@ def _missing_(cls, value):
return None

def get_hash_func(self) -> Callable:
"""Maps the hash function to the appropriate hashing algorithm"""
"""Maps the hash function to the appropriate hashing algorithm."""
hash_functions = {
self.MD5: hashlib.md5,
self.SHA1: hashlib.sha1,
@@ -103,16 +103,13 @@ def get_hash_func(self) -> Callable:


class DependencyType(EnumeratedType):
"""Represents the dependency type for the jar cmd"""
"""Represents the dependency type for the tools' java cmd."""
JAR = 'jar'
ARCHIVE = 'archive'
# When classpath is used, it means that the url of a dependency is used as is.
# i.e., it is a folder, or a path that is added to the classPath for java CLI.
CLASSPATH = 'classpath'

@classmethod
def get_default(cls):
"""Returns the default dependency type"""
def get_default(cls) -> 'DependencyType':
"""Returns the default dependency type."""
return cls.JAR


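The descriptions added to common.py in this commit summarize how each dependency type is handled: jar files go on the classpath as-is, while archives are extracted before a subdirectory such as jars/* is added. The sketch below restates that behavior purely for illustration; it is not the actual download logic in rapids_tool.py, and the extraction layout is an assumption.

```python
# Illustrative sketch only: mapping a dependency type to a classpath entry,
# mirroring the field descriptions in common.py. Not the tool's actual code.
import os

from spark_rapids_tools.enums import DependencyType


def classpath_entry(dep_type: DependencyType, downloaded_path: str,
                    relative_path: str = 'jars/*') -> str:
    if dep_type == DependencyType.JAR:
        # jar files are appended to the java classpath without further processing
        return downloaded_path
    if dep_type == DependencyType.ARCHIVE:
        # archives (e.g., a Spark tgz) are assumed to be extracted first; the
        # configured relative path inside the extracted directory is then added
        extracted_dir = downloaded_path[:-len('.tgz')] if downloaded_path.endswith('.tgz') else downloaded_path
        return os.path.join(extracted_dir, relative_path)
    raise ValueError(f'Unsupported dependency type: {dep_type}')


print(classpath_entry(DependencyType.JAR, '/cache/rapids-4-spark-tools.jar'))
print(classpath_entry(DependencyType.ARCHIVE, '/cache/spark-3.5.0-bin-hadoop3.tgz'))
```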
20 changes: 11 additions & 9 deletions user_tools/src/spark_rapids_tools/storagelib/tools/fs_utils.py
@@ -67,19 +67,21 @@ def raise_invalid_file(file_path: CspPath, msg: str, error_type: str = 'invalid_
@dataclass
class FileHashAlgorithm:
"""
Represents a file hash algorithm and its value. Used for verification against an
existing file.
```py
try:
file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...')
file_algo.verify_file(CspPath('file://path/to/file'))
except ValidationError as e:
print(e)
```
Represents a file hash algorithm and its value. Used for verification against an existing file.
"""
algorithm: HashAlgorithm
value: str

"""
Example usage for the class:
```py
try:
file_algo = FileHashAlgorithm(algorithm=HashAlgorithm.SHA256, value='...')
file_algo.verify_file(CspPath('file://path/to/file'))
except ValidationError as e:
print(e)
```
"""
def verify_file(self, file_path: CspPath) -> bool:
cb = self.algorithm.get_hash_func()
with file_path.open_input_stream() as stream: