Merge pull request #144 from IcyFeather233/dev
OSPP: Implementation of Domain-Specific Large Model Benchmarking Based on KubeEdge-Ianvs
Showing 23 changed files with 1,050 additions and 9 deletions.
examples/government/singletask_learning_bench/README.md (104 additions, 0 deletions)
# Government Benchmark

## Introduction

This is the work for the Domain-Specific Large Model Benchmark: it constructs a benchmark suite for the government sector, including test datasets, evaluation metrics, testing environments, and usage guidelines.

This benchmark consists of two parts: subjective evaluation data and objective evaluation data.
## Design

### Metadata Format

| Name | Field Name | Option | Description |
| --- | --- | --- | --- |
| Data Name | dataset | Required | Name of the dataset |
| Data Description | description | Optional | Dataset description, such as usage scope, sample size, etc. |
| First-level Dimension | level_1_dim | Required | Should fill in "Single Modal" or "Multi-Modal" |
| Second-level Dimension | level_2_dim | Required | For "Single Modal", fill in "Text", "Image", or "Audio". For "Multi-Modal", fill in "Text-Image", "Text-Audio", "Image-Audio", or "Text-Image-Audio" |
| Third-level Dimension | level_3_dim | Optional | Should be filled if all samples in the dataset have the same third-level dimension. If filled, content should be based on the standards shown in the normative reference document |
| Fourth-level Dimension | level_4_dim | Optional | Should be filled if all samples in the dataset have the same fourth-level dimension. If filled, content should be based on the standards shown in the normative reference document |

Metadata example:

```json
{
    "dataset": "Government Benchmark",
    "description": "xxx",
    "level_1_dim": "single-modal",
    "level_2_dim": "text",
    "level_3_dim": "Q&A",
    "level_4_dim": "government"
}
```
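
As a quick sanity check, here is a minimal sketch (not part of the benchmark code; the required/optional field split follows the table above, and the file path is just the objective test set from the dataset layout shown later) that validates a metadata file:

```python
import json

REQUIRED_FIELDS = {"dataset", "level_1_dim", "level_2_dim"}
OPTIONAL_FIELDS = {"description", "level_3_dim", "level_4_dim"}

def validate_metadata(path):
    """Check that a metadata.json file has the required fields."""
    with open(path, encoding="utf-8") as f:
        meta = json.load(f)
    missing = REQUIRED_FIELDS - meta.keys()
    if missing:
        raise ValueError(f"missing required fields: {sorted(missing)}")
    unknown = meta.keys() - REQUIRED_FIELDS - OPTIONAL_FIELDS
    if unknown:
        print(f"warning: unexpected fields: {sorted(unknown)}")

validate_metadata("dataset/government/objective/test_data/metadata.json")
```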

### Data Format

| Name | Option | Information |
| --- | --- | --- |
| prompt | Optional | the background for the LLM test |
| query | Required | the test question |
| response | Required | the answer to the question |
| explanation | Optional | the explanation of the answer |
| judge_prompt | Optional | the prompt for the judge model |
| level_1_dim | Optional | single-modal or multi-modal |
| level_2_dim | Optional | single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video |
| level_3_dim | Required | details |
| level_4_dim | Required | details |

Data example:
```json
{
    "prompt": "Please think step by step and answer the question.",
    "query": "Which one is the correct answer of xxx? A. xxx B. xxx C. xxx D. xxx",
    "response": "C",
    "explanation": "xxx",
    "level_1_dim": "single-modal",
    "level_2_dim": "text",
    "level_3_dim": "knowledge Q&A",
    "level_4_dim": "government knowledge"
}
```
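
The test data itself is stored as JSON Lines, one sample per line in `data.jsonl`. A minimal loading sketch, assuming the field names from the data format table above:

```python
import json

def load_samples(path):
    """Read one JSON object per line and check required fields."""
    samples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            sample = json.loads(line)
            # "query" and "response" are required per the data format table.
            for field in ("query", "response"):
                if field not in sample:
                    raise ValueError(f"sample missing required field: {field}")
            samples.append(sample)
    return samples

samples = load_samples("dataset/government/objective/test_data/data.jsonl")
print(f"loaded {len(samples)} samples")
```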

## Changes to Core Code

![](./imgs/structure.png)

## Prepare Datasets

You can download the dataset from [Kaggle](https://www.kaggle.com/datasets/kubeedgeianvs/the-government-affairs-dataset-govaff/data?select=government_benchmark).
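
If you prefer to script the download, the following sketch uses the official `kaggle` Python package (an assumption, not part of this repo: it requires your Kaggle API credentials in `~/.kaggle/kaggle.json`, and the extracted files may need rearranging to match the layout shown below):

```python
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()  # reads credentials from ~/.kaggle/kaggle.json

# Download the dataset archive and unzip it under dataset/government.
api.dataset_download_files(
    "kubeedgeianvs/the-government-affairs-dataset-govaff",
    path="dataset/government",
    unzip=True,
)
```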

```
dataset/government
├── objective
│   ├── test_data
│   │   ├── data.jsonl
│   │   └── metadata.json
│   └── train_data
└── subjective
    ├── test_data
    │   ├── data_full.jsonl
    │   ├── data.jsonl
    │   └── metadata.json
    └── train_data
```

## Prepare Environment

You should modify your sedna package as shown in this commit: [my sedna repo commit](https://github.com/IcyFeather233/sedna/commit/e13b82363c03dc771fca4922a24798554ca32a9f)

Alternatively, you can replace the files in `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with the contents of `examples/resources/sedna-llm.zip`.
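
To locate the installed sedna package (rather than guessing the `yourpath/.../python3.x` prefix), a small sketch you can run inside the ianvs environment:

```python
# Run inside the ianvs conda environment to print the directory whose
# contents should be replaced with the patched sedna files.
import sedna

print(sedna.__path__[0])
```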

## Run Ianvs

### Objective

`ianvs -f examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml`

### Subjective

`ianvs -f examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml`
examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml (72 additions, 0 deletions)
benchmarkingjob:
  # job name of benchmarking; string type;
  name: "benchmarkingjob"
  # the url address of the job workspace that will store the output of tests; string type;
  workspace: "/home/icyfeather/project/ianvs/workspace"

  # the url address of the test environment configuration file; string type;
  # the file format supports yaml/yml;
  testenv: "./examples/government/singletask_learning_bench/objective/testenv/testenv.yaml"

  # the configuration of the test object
  test_object:
    # test type; string type;
    # currently the only option is "algorithms"; others will be added in succession.
    type: "algorithms"
    # test algorithm configuration files; list type;
    algorithms:
      # algorithm name; string type;
      - name: "politic_bench_singletask_learning"
        # the url address of the test algorithm configuration file; string type;
        # the file format supports yaml/yml;
        url: "./examples/government/singletask_learning_bench/objective/testalgorithms/gen/gen_algorithm.yaml"

  # the configuration of the ranking leaderboard
  rank:
    # rank the leaderboard by each test case's evaluation metric and sort order; list type;
    # the sorting priority is based on the sequence of metrics in the list from front to back;
    sort_by: [ { "acc": "descend" } ]
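    # For example, to sort by accuracy first and break ties with a second metric
    # (illustrative only; assumes such a metric is defined in the test environment):
    # sort_by: [ { "acc": "descend" }, { "f1_score": "descend" } ]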

    # visualization configuration
    visualization:
      # mode of visualization in the leaderboard; string type;
      # There are quite a few possible dataitems in the leaderboard. Not all of them can be shown simultaneously on the screen.
      # In the leaderboard, we provide the "selected_only" mode for the user to configure what is shown or is not shown.
      mode: "selected_only"
      # method of visualization for selected dataitems; string type;
      # currently the options of value are as follows:
      #   1> "print_table": print selected dataitems;
      method: "print_table"

    # selected dataitem configuration
    # The user can add his/her interested dataitems in terms of "paradigms", "modules", "hyperparameters" and "metrics",
    # so that the selected columns will be shown.
    selected_dataitem:
      # currently the options of value are as follows:
      #   1> "all": select all paradigms in the leaderboard;
      #   2> paradigms in the leaderboard, e.g., "singletasklearning"
      paradigms: [ "all" ]
      # currently the options of value are as follows:
      #   1> "all": select all modules in the leaderboard;
      #   2> modules in the leaderboard, e.g., "basemodel"
      modules: [ "all" ]
      # currently the options of value are as follows:
      #   1> "all": select all hyperparameters in the leaderboard;
      #   2> hyperparameters in the leaderboard, e.g., "momentum"
      hyperparameters: [ "all" ]
      # currently the options of value are as follows:
      #   1> "all": select all metrics in the leaderboard;
      #   2> metrics in the leaderboard, e.g., "f1_score"
      metrics: [ "acc" ]

    # mode of saving selected and all dataitems in the workspace; string type;
    # currently the options of value are as follows:
    #   1> "selected_and_all": save selected and all dataitems;
    #   2> "selected_only": save selected dataitems;
    save_mode: "selected_and_all"