some changes

microsoft · Nov 29, 2024 · fc4e063 · fc4e063
1 parent ea5dc12
commit fc4e063
Show file tree

Hide file tree

Showing 7 changed files with 94 additions and 135 deletions.
diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py
@@ -8,8 +8,7 @@
 from rdagent.core.experiment import Experiment, FBWorkspace
 from rdagent.core.utils import cache_with_pickle
 from rdagent.oai.llm_utils import md5_hash
-from rdagent.utils.env import KGDockerEnv, QTDockerEnv
-
+from rdagent.utils.env import DockerEnv, DSDockerConf
 # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace
 
 
@@ -38,26 +37,13 @@ def __repr__(self) -> str:
 
 
 class DataLoaderFBWorkspace(FBWorkspace):
-    def hash_func(
-        self,
-        batch_size: int = 8,
-        num_features: int = 10,
-        num_timesteps: int = 4,
-        num_edges: int = 20,
-        input_value: float = 1.0,
-        param_init_value: float = 1.0,
-    ) -> str:
-        target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}"
-        for code_file_name in sorted(list(self.code_dict.keys())):
-            target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}"
-        return md5_hash(target_file_name)
 
-    @cache_with_pickle(hash_func)
+    # TODO: use the cache_with_pickle decorator.
     def execute(self):
         super().execute()
         try:
-            qtde = QTDockerEnv() if self.target_task.version == 1 else KGDockerEnv()
-            qtde.prepare()
+            de = DockerEnv(conf=DSDockerConf())
+            de.prepare()
 
             # TODO: UNIT TEST for data loader
             dump_code = (Path(__file__).parent / "data_loader_unit_test.txt").read_text()
@@ -68,5 +54,4 @@ def execute(self):
         except Exception as e:
             pass
 
-
-DataLoaderExperiment = Experiment
+        return "data_loader.py and spec.md executed successfully", "content of spec.md", "pkl generated by data_loader.py"
diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py
@@ -6,14 +6,24 @@
 - it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop)
 """
 
-from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask, DataLoaderExperiment
+from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
+from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER
+from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment
 
-def build_dummpy_exp(): # -> experiment
+def develop_one_competition(competition: str): # -> experiment
+    scen = DataScienceScen(competition=competition)
+    data_loader_coder = DataLoaderCoSTEER(scen)
+
+    # Create the experiment
     dlt = DataLoaderTask(name="DataLoaderTask", description="")
     exp = DataLoaderExperiment(
         sub_tasks=[dlt],
     )
 
+    # Develop the experiment
+    exp = data_loader_coder.develop(exp)
+
 
-def get_developer():
-    ...
+if __name__ == "__main__":
+    develop_one_competition("aerial-cactus-identification")
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -105,10 +105,7 @@ def run_pipeline(self, **files: str):
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
-        self.code_dict: dict[str, Any] = {}
-        self.code_dict = (
-            {}
-        )  # The code injected into the folder, store them in the variable to reproduce the former result
+        self.code_dict: dict[str, Any] = {} # The code injected into the folder; it is stored in this variable so that the former result can be reproduced
         self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex
 
     @property

diff --git a/...a_science/experiment/kaggle_experiment.py → ...ios/data_science/experiment/experiment.py b/...a_science/experiment/kaggle_experiment.py → ...ios/data_science/experiment/experiment.py
@@ -1,60 +1,53 @@
 from copy import deepcopy
 from pathlib import Path
 
+from rdagent.core.experiment import Experiment
 from rdagent.app.data_science.conf import DS_RD_SETTING
-from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import (
-    DataLoaderExperiment,
+from rdagent.components.coder.data_science.raw_data_loader.exp import (
     DataLoaderFBWorkspace,
     DataLoaderTask,
 )
 from rdagent.components.coder.factor_coder.factor import (
     FactorFBWorkspace,
     FactorTask,
-    FeatureExperiment,
 )
 from rdagent.components.coder.model_coder.model import (
-    ModelExperiment,
     ModelFBWorkspace,
     ModelTask,
 )
-from rdagent.scenarios.data_science.experiment.workspace import KGFBWorkspace
+from rdagent.scenarios.data_science.experiment.workspace import DSFBWorkspace
 
-KG_MODEL_TYPE_XGBOOST = "XGBoost"
-KG_MODEL_TYPE_RANDOMFOREST = "RandomForest"
-KG_MODEL_TYPE_LIGHTGBM = "LightGBM"
-KG_MODEL_TYPE_NN = "NN"
+# KG_MODEL_TYPE_XGBOOST = "XGBoost"
+# KG_MODEL_TYPE_RANDOMFOREST = "RandomForest"
+# KG_MODEL_TYPE_LIGHTGBM = "LightGBM"
+# KG_MODEL_TYPE_NN = "NN"
 
-KG_MODEL_MAPPING = {
-    KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py",
-    KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py",
-    KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py",
-    KG_MODEL_TYPE_NN: "model/model_nn.py",
-}
+# KG_MODEL_MAPPING = {
+#     KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py",
+#     KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py",
+#     KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py",
+#     KG_MODEL_TYPE_NN: "model/model_nn.py",
+# }
 
-KG_SELECT_MAPPING = {
-    KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py",
-    KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py",
-    KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py",
-    KG_MODEL_TYPE_NN: "model/select_nn.py",
-}
+# KG_SELECT_MAPPING = {
+#     KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py",
+#     KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py",
+#     KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py",
+#     KG_MODEL_TYPE_NN: "model/select_nn.py",
+# }
 
 
-class KGDataLoaderExperiment(DataLoaderExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
-    # TODO: complete the implementation
-    def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
+
+class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DataLoaderFBWorkspace]):
+    def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        # TODO: It seems there are some problems as the folder has not been created.
-        # self.experiment_workspace = KGFBWorkspace(
-        #     template_folder_path=Path(__file__).resolve()
-        #     / Path(DS_RD_SETTING.template_path).resolve()
-        #     / DS_RD_SETTING.competition
-        # )
+        self.experiment_workspace = DataLoaderFBWorkspace()
 
 
-class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
+class ModelExperiment(Experiment[ModelTask, DSFBWorkspace, ModelFBWorkspace]):
     def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(
+        self.experiment_workspace = DSFBWorkspace(
             template_folder_path=Path(__file__).resolve()
             / Path(DS_RD_SETTING.template_path).resolve()
             / DS_RD_SETTING.competition
@@ -77,10 +70,10 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
             ]
 
 
-class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
+class FactorExperiment(Experiment[FactorTask, DSFBWorkspace, FactorFBWorkspace]):
     def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(
+        self.experiment_workspace = DSFBWorkspace(
             template_folder_path=Path(__file__).resolve()
             / Path(DS_RD_SETTING.template_path).resolve()
             / DS_RD_SETTING.competition
@@ -100,4 +93,4 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
                     ).get_task_information(),
                     source_feature_size,
                 )
-            ]
+            ]
diff --git a/rdagent/scenarios/data_science/experiment/workspace.py b/rdagent/scenarios/data_science/experiment/workspace.py
@@ -1,97 +1,55 @@
-import subprocess
-import zipfile
 from pathlib import Path
 from typing import Any, List, Tuple
 
 import pandas as pd
 
-from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.core.experiment import FBWorkspace
 from rdagent.log import rdagent_logger as logger
-from rdagent.utils.env import KGDockerEnv
-
-KG_FEATURE_PREPROCESS_SCRIPT = """import pickle
-
-from fea_share_preprocess import preprocess_script
-
-X_train, X_valid, y_train, y_valid, X_test, *others = preprocess_script()
-
-pickle.dump(X_train, open("X_train.pkl", "wb"))
-pickle.dump(X_valid, open("X_valid.pkl", "wb"))
-pickle.dump(y_train, open("y_train.pkl", "wb"))
-pickle.dump(y_valid, open("y_valid.pkl", "wb"))
-pickle.dump(X_test, open("X_test.pkl", "wb"))
-pickle.dump(others, open("others.pkl", "wb"))
-"""
-
-
-class KGFBWorkspace(FBWorkspace):
-    def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.inject_code_from_folder(template_folder_path)
-        self.data_description: List[Tuple[str, int]] = []
-
-    @property
-    def model_description(self) -> dict[str, str]:
-        model_description = {}
-        for k, v in self.code_dict.items():
-            if k.startswith("model/"):
-                model_description[k] = v
-        return model_description
-
-    def generate_preprocess_data(
-        self,
-    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Any]:
-        kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
-        kgde.prepare()
-
-        execute_log, results = kgde.dump_python_code_run_and_get_results(
-            code=KG_FEATURE_PREPROCESS_SCRIPT,
-            local_path=str(self.workspace_path),
-            dump_file_names=[
-                "X_train.pkl",
-                "X_valid.pkl",
-                "y_train.pkl",
-                "y_valid.pkl",
-                "X_test.pkl",
-                "others.pkl",
-            ],
-            running_extra_volume=(
-                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
-                if KAGGLE_IMPLEMENT_SETTING.competition
-                else None
-            ),
-        )
-        if results is None:
-            logger.error("Feature preprocess failed.")
-            raise Exception("Feature preprocess failed.")
-        else:
-            X_train, X_valid, y_train, y_valid, X_test, others = results
-            return X_train, X_valid, y_train, y_valid, X_test, *others
-
-    def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
+from rdagent.utils.env import DockerEnv, DSDockerConf
+
+
+class DSFBWorkspace(FBWorkspace):
+
+    # TODO: use the cache_with_pickle decorator.
+    def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame:
+        """
+        Executes the experiment (a competition) within the specified workspace.
+
+        Args:
+            run_env (dict): The runtime environment variables.
+
+        Returns:
+            pd.DataFrame: Scores of each model and of the ensemble model (None is returned if scores.csv does not exist).
+            Example:
+            | Model                 | <Metric like ACC/AUROC/MAE...> |
+            |-----------------------|--------------------------------|
+            | model1                | 0.9                            |
+            | model2                | 0.8                            |
+            | <ensemble model name> | 0.95                           |
+        """
         logger.info(f"Running the experiment in {self.workspace_path}")
 
-        kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
-        kgde.prepare()
+        de = DockerEnv(DSDockerConf())
+        de.prepare()
 
         running_extra_volume = {}
-        if KAGGLE_IMPLEMENT_SETTING.competition:
+        if DS_RD_SETTING.competition:
             running_extra_volume = {
-                KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"
+                DS_RD_SETTING.local_data_path + "/" + DS_RD_SETTING.competition: "/kaggle/input"
             }
         else:
             running_extra_volume = {}
 
-        execute_log = kgde.run(
+        execute_log = de.run(
             local_path=str(self.workspace_path),
             env=run_env,
             running_extra_volume=running_extra_volume,
         )
 
-        csv_path = self.workspace_path / "submission_score.csv"
+        csv_path = self.workspace_path / "scores.csv"
 
         if not csv_path.exists():
             logger.error(f"File {csv_path} does not exist.")
             return None
-        return pd.read_csv(csv_path, index_col=0).iloc[:, 0]
+        return pd.read_csv(csv_path, index_col=0)
diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py
@@ -187,6 +187,18 @@ class KGDockerConf(DockerConf):
         "48g"  # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory
     )
 
+class DSDockerConf(DockerConf):
+    model_config = ExtendedSettingsConfigDict(env_prefix="DS_DOCKER_")
+
+    build_from_dockerfile: bool = False
+    image: str = "gcr.io/kaggle-gpu-images/python:latest"
+    mount_path: str = "/kaggle/workspace"
+    default_entry: str = "python main.py"
+
+    running_timeout_period: int = 600
+    mem_limit: str | None = (
+        "48g"  # Memory limit; new-york-city-taxi-fare-prediction may need more memory
+    )
 
 class MLEBDockerConf(DockerConf):
     model_config = ExtendedSettingsConfigDict(env_prefix="MLEB_DOCKER_")

diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py
@@ -18,6 +18,10 @@ def test_data_loader(self):
         # 3) test the results
         # - check spec.md
         # - check data_loader.py
+        from rdagent.components.coder.data_science.raw_data_loader.test import develop_one_competition
+
+        exp = develop_one_competition("aerial-cactus-identification")
+
         pass
 
     def test_model(self):