Skip to content

Commit

Permalink
Merge branch 'opensearch-project:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
dhrubo-os authored Aug 8, 2024
2 parents 420d550 + 04c34c6 commit ebda9b5
Show file tree
Hide file tree
Showing 27 changed files with 1,593 additions and 297 deletions.
20 changes: 17 additions & 3 deletions .ci/run-repository.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ elif [[ "$TASK_TYPE" == "doc" ]]; then

docker cp opensearch-py-ml-doc-runner:/code/opensearch-py-ml/docs/build/ ./docs/
docker rm opensearch-py-ml-doc-runner
elif [[ "$TASK_TYPE" == "trace" ]]; then
elif [[ "$TASK_TYPE" == "SentenceTransformerTrace" || "$TASK_TYPE" == "SparseTrace" ]]; then
# Set up OpenSearch cluster & Run model autotracing (Invoked by model_uploader.yml workflow)
echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m"
echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m"
Expand All @@ -74,6 +74,17 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then
echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m"
echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m"

if [[ "$TASK_TYPE" == "SentenceTransformerTrace" ]]; then
NOX_TRACE_TYPE="trace"
EXTRA_ARGS="-ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE}"
elif [[ "$TASK_TYPE" == "SparseTrace" ]]; then
NOX_TRACE_TYPE="sparsetrace"
EXTRA_ARGS=""
else
echo "Unknown TASK_TYPE: $TASK_TYPE"
exit 1
fi

docker run \
--network=${network_name} \
--env "STACK_VERSION=${STACK_VERSION}" \
Expand All @@ -84,9 +95,12 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then
--env "TEST_TYPE=server" \
--name opensearch-py-ml-trace-runner \
opensearch-project/opensearch-py-ml \
nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"}

nox -s "${NOX_TRACE_TYPE}-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} ${EXTRA_ARGS} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"}

# To upload a model, we need the model artifact, description, license files into local path
# trace_output should include description and license file.
docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/
docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/
# Delete the docker image
docker rm opensearch-py-ml-trace-runner
fi
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1 +1 @@
* @dhrubo-os @greaa-aws @ylwu-amzn @b4sjoo @jngz-es @rbhavna
* @dhrubo-os @greaa-aws @ylwu-amzn @b4sjoo @jngz-es @rbhavna
52 changes: 40 additions & 12 deletions .github/workflows/model_uploader.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,25 @@ on:
required: true
type: string
tracing_format:
description: "Model format for auto-tracing (torch_script/onnx)"
description: "Model format for auto-tracing (torch_script/onnx); currently, sparse models only support the torch_script format."
required: true
type: choice
options:
- "BOTH"
- "TORCH_SCRIPT"
- "ONNX"
upload_prefix:
description: "Specifies the model prefix for uploading. For example, transforming the default path from '.../sentence-transformers/msmarco-distilbert-base-tas-b' to '.../{prefix}/msmarco-distilbert-base-tas-b'."
required: false
type: string
model_type:
description: "Model type for auto-tracing (SentenceTransformer/Sparse)"
required: true
type: choice
options:
- "SentenceTransformer"
- "Sparse"
default: "SentenceTransformer"
embedding_dimension:
description: "(Optional) Embedding Dimension (Specify here if it does not exist in original config.json file, or you want to overwrite it.)"
required: false
Expand Down Expand Up @@ -62,18 +74,33 @@ jobs:
echo "This workflow should only be triggered on 'main' branch"
exit 1
- name: Initiate folders
# This script initializes the folder path variables.
# 1. Retrieves the input model_id.
# 2. If upload_prefix is provided, constructs model_prefix using upload_prefix and model_source.
# - model_prefix: "ml-models/{model_source}/{upload_prefix}"
# 3. If upload_prefix is not provided, it constructs model_prefix using model_source and the prefix part of model_id.
# - The prefix part is the substring before the first '/' in model_id.
# Example:
# - Given model_id: "opensearch-project/opensearch-neural-sparse-encoding-v1"
# - model_prefix: "ml-models/{model_source}/opensearch-project"
# 4. Constructs model_folder and model_prefix_folder.
id: init_folders
run: |
model_id=${{ github.event.inputs.model_id }}
echo "model_folder=ml-models/${{github.event.inputs.model_source}}/${model_id}" >> $GITHUB_OUTPUT
echo "sentence_transformer_folder=ml-models/${{github.event.inputs.model_source}}/${model_id%%/*}/" >> $GITHUB_OUTPUT
if [[ -n "${{ github.event.inputs.upload_prefix }}" ]]; then
model_prefix="ml-models/${{ github.event.inputs.model_source }}/${{ github.event.inputs.upload_prefix }}"
else
model_prefix="ml-models/${{ github.event.inputs.model_source }}/${model_id%%/*}"
fi
echo "model_folder=$model_prefix/${model_id##*/}" >> $GITHUB_OUTPUT
echo "model_prefix_folder=$model_prefix" >> $GITHUB_OUTPUT
- name: Initiate workflow_info
id: init_workflow_info
run: |
embedding_dimension=${{ github.event.inputs.embedding_dimension }}
pooling_mode=${{ github.event.inputs.pooling_mode }}
model_description="${{ github.event.inputs.model_description }}"
model_type=${{ github.event.inputs.model_type }}
workflow_info="
============= Workflow Details ==============
- Workflow Name: ${{ github.workflow }}
Expand All @@ -84,6 +111,7 @@ jobs:
========= Workflow Input Information =========
- Model ID: ${{ github.event.inputs.model_id }}
- Model Version: ${{ github.event.inputs.model_version }}
- Model Type: ${{ github.event.inputs.model_type }}
- Tracing Format: ${{ github.event.inputs.tracing_format }}
- Embedding Dimension: ${embedding_dimension:-N/A}
- Pooling Mode: ${pooling_mode:-N/A}
Expand All @@ -103,7 +131,7 @@ jobs:
echo "unverified=- [ ] :warning: The license cannot be verified. Please confirm by yourself that the model is licensed under Apache 2.0 :warning:" >> $GITHUB_OUTPUT
outputs:
model_folder: ${{ steps.init_folders.outputs.model_folder }}
sentence_transformer_folder: ${{ steps.init_folders.outputs.sentence_transformer_folder }}
model_prefix_folder: ${{ steps.init_folders.outputs.model_prefix_folder }}
workflow_info: ${{ steps.init_workflow_info.outputs.workflow_info }}
verified_license_line: ${{ steps.init_license_line.outputs.verified }}
unverified_license_line: ${{ steps.init_license_line.outputs.unverified }}
Expand Down Expand Up @@ -133,7 +161,7 @@ jobs:
if: github.event.inputs.allow_overwrite == 'NO' && (github.event.inputs.tracing_format == 'TORCH_SCRIPT' || github.event.inputs.tracing_format == 'BOTH')
run: |
TORCH_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \
${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \
${{ needs.init-workflow-var.outputs.model_prefix_folder }} ${{ github.event.inputs.model_id }} \
${{ github.event.inputs.model_version }} TORCH_SCRIPT)
aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $TORCH_FILE_PATH > /dev/null 2>&1 || TORCH_MODEL_NOT_EXIST=true
if [[ -z $TORCH_MODEL_NOT_EXIST ]]
Expand All @@ -145,7 +173,7 @@ jobs:
if: github.event.inputs.allow_overwrite == 'NO' && (github.event.inputs.tracing_format == 'ONNX' || github.event.inputs.tracing_format == 'BOTH')
run: |
ONNX_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \
${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \
${{ needs.init-workflow-var.outputs.model_prefix_folder }} ${{ github.event.inputs.model_id }} \
${{ github.event.inputs.model_version }} ONNX)
aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $ONNX_FILE_PATH > /dev/null 2>&1 || ONNX_MODEL_NOT_EXIST=true
if [[ -z $ONNX_MODEL_NOT_EXIST ]]
Expand All @@ -168,7 +196,7 @@ jobs:
cluster: ["opensearch"]
secured: ["true"]
entry:
- { opensearch_version: 2.7.0 }
- { opensearch_version: 2.11.0 }
steps:
- name: Checkout
uses: actions/checkout@v3
Expand All @@ -181,7 +209,7 @@ jobs:
echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV
echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV
- name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}}
run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace"
run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} ${{github.event.inputs.model_type}}Trace"
- name: Limit Model Size to 2GB
run: |
upload_size_in_binary_bytes=$(ls -lR ./upload/ | awk '{ SUM += $5} END {print SUM}')
Expand Down Expand Up @@ -226,7 +254,7 @@ jobs:
- name: Dryrun model uploading
id: dryrun_model_uploading
run: |
dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} --dryrun \
dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.model_prefix_folder }} --dryrun \
| sed 's|s3://${{ secrets.MODEL_BUCKET }}/|s3://(MODEL_BUCKET)/|'
)
echo "dryrun_output<<EOF" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -301,7 +329,7 @@ jobs:
- name: Copy Files to the Bucket
id: copying_to_bucket
run: |
aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }}
aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.model_prefix_folder }}
echo "upload_time=$(TZ='America/Los_Angeles' date "+%Y-%m-%d %T")" >> $GITHUB_OUTPUT
outputs:
upload_time: ${{ steps.copying_to_bucket.outputs.upload_time }}
Expand Down Expand Up @@ -428,4 +456,4 @@ jobs:
version=${{ github.event.inputs.model_version }}
format=${{ github.event.inputs.tracing_format }}
jenkins_params="{\"BASE_DOWNLOAD_PATH\":\"$base_download_path\", \"VERSION\":\"$version\", \"FORMAT\":\"$format\"}"
sh utils/model_uploader/trigger_ml_models_release.sh $jenkins_trigger_token "$jenkins_params"
sh utils/model_uploader/trigger_ml_models_release.sh $jenkins_trigger_token "$jenkins_params"
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358))
- Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365))
- adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378))

- Add workflows and scripts for sparse encoding model tracing and uploading process by @conggguan in ([#394](https://github.com/opensearch-project/opensearch-py-ml/pull/394))

### Changed
- Add a parameter for customize the upload folder prefix ([#398](https://github.com/opensearch-project/opensearch-py-ml/pull/398))
- Modify ml-models.JenkinsFile so that it takes model format into account and can be triggered with generic webhook by @thanawan-atc in ([#211](https://github.com/opensearch-project/opensearch-py-ml/pull/211))
- Update demo_tracing_model_torchscript_onnx.ipynb to use make_model_config_json by @thanawan-atc in ([#220](https://github.com/opensearch-project/opensearch-py-ml/pull/220))
- Bump torch from 1.13.1 to 2.0.1 and add onnx dependency by @thanawan-atc ([#237](https://github.com/opensearch-project/opensearch-py-ml/pull/237))
Expand All @@ -38,8 +39,13 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
- Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
- Switch AL2 to AL2023 agent and DockerHub to ECR images in ml-models.JenkinsFile ([#377](https://github.com/opensearch-project/opensearch-py-ml/pull/377))
- Refactored validators in ML Commons' client([#385](https://github.com/opensearch-project/opensearch-py-ml/pull/385))
- Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#407](https://github.com/opensearch-project/opensearch-py-ml/pull/407))
- Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#405](https://github.com/opensearch-project/opensearch-py-ml/pull/405))
- Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-v2-distill (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#410](https://github.com/opensearch-project/opensearch-py-ml/pull/410))

### Fixed
- Fix the wrong input parameter for model_uploader's base_download_path in Jenkins trigger.([#402](https://github.com/opensearch-project/opensearch-py-ml/pull/402))
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
- Correct demo_ml_commons_integration.ipynb by @thanawan-atc in ([#208](https://github.com/opensearch-project/opensearch-py-ml/pull/208))
- Handle the case when the model max length is undefined in tokenizer by @thanawan-atc in ([#219](https://github.com/opensearch-project/opensearch-py-ml/pull/219))
Expand All @@ -52,6 +58,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Fix conditional job execution issue in model upload workflow by @thanawan-atc in ([#294](https://github.com/opensearch-project/opensearch-py-ml/pull/294))
- fix bug in `MLCommonClient_client.upload_model` by @rawwar in ([#336](https://github.com/opensearch-project/opensearch-py-ml/pull/336))
- fix lint issues on main by @rawwar in ([#374](https://github.com/opensearch-project/opensearch-py-ml/pull/374))
- fix CVE vulnerability by @rawwar in ([#383](https://github.com/opensearch-project/opensearch-py-ml/pull/383))

## [1.1.0]

Expand Down
2 changes: 1 addition & 1 deletion SECURITY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
## Reporting a Vulnerability

If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/) or directly via email to aws-security@amazon.com. Please do **not** create a public GitHub issue.
If you discover a potential security issue in this project we ask that you notify OpenSearch Security directly via email to security@opensearch.org. Please do **not** create a public GitHub issue.
17 changes: 17 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,20 @@ def trace(session):
"utils/model_uploader/model_autotracing.py",
*(session.posargs),
)


@nox.session(python=["3.9"])
def sparsetrace(session):
    """Install dev requirements plus this package, then run the
    sparse-encoding model autotracing script, forwarding any extra
    CLI arguments (model id, version, tracing format, etc.)."""
    dev_install_args = (
        "-r",
        "requirements-dev.txt",
        "--timeout",
        "1500",
    )
    session.install(*dev_install_args)
    # Install opensearch-py-ml itself from the repository root.
    session.install(".")
    script_args = ["python", "utils/model_uploader/sparse_model_autotracing.py"]
    script_args.extend(session.posargs)
    session.run(*script_args)
8 changes: 7 additions & 1 deletion opensearch_py_ml/ml_commons/ml_common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
MODEL_CHUNK_MAX_SIZE = 10_000_000
MODEL_MAX_SIZE = 4_000_000_000
BUF_SIZE = 65536 # lets read stuff in 64kb chunks!
TIMEOUT = 120 # timeout for synchronous method calls in seconds
TIMEOUT = 240 # timeout for synchronous method calls in seconds
META_API_ENDPOINT = "models/meta"
MODEL_NAME_FIELD = "name"
MODEL_VERSION_FIELD = "version"
Expand All @@ -24,6 +24,12 @@
FRAMEWORK_TYPE = "framework_type"
MODEL_CONTENT_HASH_VALUE = "model_content_hash_value"
MODEL_GROUP_ID = "model_group_id"
MODEL_FUNCTION_NAME = "function_name"
MODEL_TASK_TYPE = "model_task_type"
# URL of the license file for the OpenSearch project
LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE"
# Name of the function used for sparse encoding
SPARSE_ENCODING_FUNCTION_NAME = "SPARSE_ENCODING"


def _generate_model_content_hash_value(model_file_path: str) -> str:
Expand Down
20 changes: 19 additions & 1 deletion opensearch_py_ml/ml_commons/ml_commons_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from opensearch_py_ml.ml_commons.model_connector import Connector
from opensearch_py_ml.ml_commons.model_execute import ModelExecute
from opensearch_py_ml.ml_commons.model_uploader import ModelUploader
from opensearch_py_ml.ml_commons.validators.profile import validate_profile_input
from opensearch_py_ml.ml_commons.validators import validate_profile_input


class MLCommonClient:
Expand Down Expand Up @@ -498,6 +498,24 @@ def get_model_info(self, model_id: str) -> object:
url=API_URL,
)

def generate_model_inference(self, model_id: str, request_body: dict) -> object:
    """
    Run inference on a deployed model via the ml-commons predict API
    (POST {ML_BASE_URI}/models/{model_id}/_predict/).

    :param model_id: Unique ID of the model to run inference against.
    :type model_id: string
    :param request_body: Request body forwarded verbatim to the predict API;
        its expected shape depends on the model's function type
        (e.g. text docs for embedding models) — see the ml-commons API docs.
    :type request_body: dict
    :return: Returns a JSON object `inference_results` containing the results
        for the given input.
    :rtype: object
    """
    API_URL = f"{ML_BASE_URI}/models/{model_id}/_predict/"
    return self._client.transport.perform_request(
        method="POST",
        url=API_URL,
        body=request_body,
    )

def generate_embedding(self, model_id: str, sentences: List[str]) -> object:
"""
This method return embedding for given sentences (using ml commons _predict api)
Expand Down
2 changes: 1 addition & 1 deletion opensearch_py_ml/ml_commons/model_access_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from opensearchpy.exceptions import NotFoundError

from opensearch_py_ml.ml_commons.ml_common_utils import ML_BASE_URI
from opensearch_py_ml.ml_commons.validators.model_access_control import (
from opensearch_py_ml.ml_commons.validators import (
validate_create_model_group_parameters,
validate_delete_model_group_parameters,
validate_search_model_group_parameters,
Expand Down
9 changes: 8 additions & 1 deletion opensearch_py_ml/ml_commons/model_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
MODEL_CONTENT_HASH_VALUE,
MODEL_CONTENT_SIZE_IN_BYTES_FIELD,
MODEL_FORMAT_FIELD,
MODEL_FUNCTION_NAME,
MODEL_GROUP_ID,
MODEL_MAX_SIZE,
MODEL_NAME_FIELD,
MODEL_TASK_TYPE,
MODEL_TYPE,
MODEL_VERSION_FIELD,
TOTAL_CHUNKS_FIELD,
Expand Down Expand Up @@ -167,6 +169,7 @@ def _check_mandatory_field(self, model_meta: dict) -> bool:
"""

if model_meta:

if not model_meta.get(MODEL_NAME_FIELD):
raise ValueError(f"{MODEL_NAME_FIELD} can not be empty")
if not model_meta.get(MODEL_VERSION_FIELD):
Expand All @@ -178,7 +181,11 @@ def _check_mandatory_field(self, model_meta: dict) -> bool:
if not model_meta.get(TOTAL_CHUNKS_FIELD):
raise ValueError(f"{TOTAL_CHUNKS_FIELD} can not be empty")
if not model_meta.get(MODEL_CONFIG_FIELD):
raise ValueError(f"{MODEL_CONFIG_FIELD} can not be empty")
if (
model_meta.get(MODEL_FUNCTION_NAME) != "SPARSE_ENCODING"
and model_meta.get(MODEL_TASK_TYPE) != "SPARSE_ENCODING"
):
raise ValueError(f"{MODEL_CONFIG_FIELD} can not be empty")
else:
if not isinstance(model_meta.get(MODEL_CONFIG_FIELD), dict):
raise TypeError(
Expand Down
Loading

0 comments on commit ebda9b5

Please sign in to comment.