Skip to content

Commit

Permalink
some changes
Browse files Browse the repository at this point in the history
  • Loading branch information
XianBW committed Nov 29, 2024
1 parent ea5dc12 commit fc4e063
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 135 deletions.
25 changes: 5 additions & 20 deletions rdagent/components/coder/data_science/raw_data_loader/exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
from rdagent.core.experiment import Experiment, FBWorkspace
from rdagent.core.utils import cache_with_pickle
from rdagent.oai.llm_utils import md5_hash
from rdagent.utils.env import KGDockerEnv, QTDockerEnv

from rdagent.utils.env import DockerEnv, DSDockerConf
# TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace


Expand Down Expand Up @@ -38,26 +37,13 @@ def __repr__(self) -> str:


class DataLoaderFBWorkspace(FBWorkspace):
def hash_func(
self,
batch_size: int = 8,
num_features: int = 10,
num_timesteps: int = 4,
num_edges: int = 20,
input_value: float = 1.0,
param_init_value: float = 1.0,
) -> str:
target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}"
for code_file_name in sorted(list(self.code_dict.keys())):
target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}"
return md5_hash(target_file_name)

@cache_with_pickle(hash_func)
# TODO: use the cache_with_pickle decorator.
def execute(self):
super().execute()
try:
qtde = QTDockerEnv() if self.target_task.version == 1 else KGDockerEnv()
qtde.prepare()
de = DockerEnv(conf=DSDockerConf())
de.prepare()

# TODO: UNIT TEST for data loader
dump_code = (Path(__file__).parent / "data_loader_unit_test.txt").read_text()
Expand All @@ -68,5 +54,4 @@ def execute(self):
except Exception as e:
pass


DataLoaderExperiment = Experiment
return "data_loader.py and spec.md executed successfully", "content of spec.md", "pkl generated by data_loader.py"
18 changes: 14 additions & 4 deletions rdagent/components/coder/data_science/raw_data_loader/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,24 @@
- it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop)
"""

from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask, DataLoaderExperiment
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment

def build_dummpy_exp(): # -> experiment
def develop_one_competition(competition: str):  # -> experiment
    """Create a data-loader experiment for ``competition`` and develop it.

    A ``DataScienceScen`` is built for the competition and handed to a
    ``DataLoaderCoSTEER`` coder. A single ``DataLoaderTask`` is wrapped in a
    ``DataLoaderExperiment`` and passed to the coder's ``develop`` method.

    Args:
        competition: Competition identifier forwarded to ``DataScienceScen``
            (e.g. ``"aerial-cactus-identification"``).

    Returns:
        Whatever ``DataLoaderCoSTEER.develop`` returns for the experiment.
        Previously the result was dropped and callers binding the return
        value received ``None``.
    """
    scen = DataScienceScen(competition=competition)
    data_loader_coder = DataLoaderCoSTEER(scen)

    # Create the experiment
    dlt = DataLoaderTask(name="DataLoaderTask", description="")
    exp = DataLoaderExperiment(
        sub_tasks=[dlt],
    )

    # Develop the experiment and hand the result back so callers can inspect it.
    return data_loader_coder.develop(exp)


def get_developer():
...
if __name__ == "__main__":
develop_one_competition("aerial-cactus-identification")
5 changes: 1 addition & 4 deletions rdagent/core/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,7 @@ def run_pipeline(self, **files: str):

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.code_dict: dict[str, Any] = {}
self.code_dict = (
{}
) # The code injected into the folder, store them in the variable to reproduce the former result
self.code_dict: dict[str, Any] = {} # The code injected into the folder, store them in the variable to reproduce the former result
self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex

@property
Expand Down
Original file line number Diff line number Diff line change
@@ -1,60 +1,53 @@
from copy import deepcopy
from pathlib import Path

from rdagent.core.experiment import Experiment
from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import (
DataLoaderExperiment,
from rdagent.components.coder.data_science.raw_data_loader.exp import (
DataLoaderFBWorkspace,
DataLoaderTask,
)
from rdagent.components.coder.factor_coder.factor import (
FactorFBWorkspace,
FactorTask,
FeatureExperiment,
)
from rdagent.components.coder.model_coder.model import (
ModelExperiment,
ModelFBWorkspace,
ModelTask,
)
from rdagent.scenarios.data_science.experiment.workspace import KGFBWorkspace
from rdagent.scenarios.data_science.experiment.workspace import DSFBWorkspace

KG_MODEL_TYPE_XGBOOST = "XGBoost"
KG_MODEL_TYPE_RANDOMFOREST = "RandomForest"
KG_MODEL_TYPE_LIGHTGBM = "LightGBM"
KG_MODEL_TYPE_NN = "NN"
# KG_MODEL_TYPE_XGBOOST = "XGBoost"
# KG_MODEL_TYPE_RANDOMFOREST = "RandomForest"
# KG_MODEL_TYPE_LIGHTGBM = "LightGBM"
# KG_MODEL_TYPE_NN = "NN"

KG_MODEL_MAPPING = {
KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py",
KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py",
KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py",
KG_MODEL_TYPE_NN: "model/model_nn.py",
}
# KG_MODEL_MAPPING = {
# KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py",
# KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py",
# KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py",
# KG_MODEL_TYPE_NN: "model/model_nn.py",
# }

KG_SELECT_MAPPING = {
KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py",
KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py",
KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py",
KG_MODEL_TYPE_NN: "model/select_nn.py",
}
# KG_SELECT_MAPPING = {
# KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py",
# KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py",
# KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py",
# KG_MODEL_TYPE_NN: "model/select_nn.py",
# }


class KGDataLoaderExperiment(DataLoaderExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
# TODO: complete the implementation
def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:

class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DataLoaderFBWorkspace]):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
# TODO: It seems there are some problems as the folder has not been created.
# self.experiment_workspace = KGFBWorkspace(
# template_folder_path=Path(__file__).resolve()
# / Path(DS_RD_SETTING.template_path).resolve()
# / DS_RD_SETTING.competition
# )
self.experiment_workspace = DataLoaderFBWorkspace()


class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
class ModelExperiment(Experiment[ModelTask, DSFBWorkspace, ModelFBWorkspace]):
def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.experiment_workspace = KGFBWorkspace(
self.experiment_workspace = DSFBWorkspace(
template_folder_path=Path(__file__).resolve()
/ Path(DS_RD_SETTING.template_path).resolve()
/ DS_RD_SETTING.competition
Expand All @@ -77,10 +70,10 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
]


class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
class FactorExperiment(Experiment[FactorTask, DSFBWorkspace, FactorFBWorkspace]):
def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.experiment_workspace = KGFBWorkspace(
self.experiment_workspace = DSFBWorkspace(
template_folder_path=Path(__file__).resolve()
/ Path(DS_RD_SETTING.template_path).resolve()
/ DS_RD_SETTING.competition
Expand All @@ -100,4 +93,4 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
).get_task_information(),
source_feature_size,
)
]
]
102 changes: 30 additions & 72 deletions rdagent/scenarios/data_science/experiment/workspace.py
Original file line number Diff line number Diff line change
@@ -1,97 +1,55 @@
import subprocess
import zipfile
from pathlib import Path
from typing import Any, List, Tuple

import pandas as pd

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.experiment import FBWorkspace
from rdagent.log import rdagent_logger as logger
from rdagent.utils.env import KGDockerEnv

KG_FEATURE_PREPROCESS_SCRIPT = """import pickle
from fea_share_preprocess import preprocess_script
X_train, X_valid, y_train, y_valid, X_test, *others = preprocess_script()
pickle.dump(X_train, open("X_train.pkl", "wb"))
pickle.dump(X_valid, open("X_valid.pkl", "wb"))
pickle.dump(y_train, open("y_train.pkl", "wb"))
pickle.dump(y_valid, open("y_valid.pkl", "wb"))
pickle.dump(X_test, open("X_test.pkl", "wb"))
pickle.dump(others, open("others.pkl", "wb"))
"""


class KGFBWorkspace(FBWorkspace):
def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.inject_code_from_folder(template_folder_path)
self.data_description: List[Tuple[str, int]] = []

@property
def model_description(self) -> dict[str, str]:
model_description = {}
for k, v in self.code_dict.items():
if k.startswith("model/"):
model_description[k] = v
return model_description

def generate_preprocess_data(
self,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Any]:
kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()

execute_log, results = kgde.dump_python_code_run_and_get_results(
code=KG_FEATURE_PREPROCESS_SCRIPT,
local_path=str(self.workspace_path),
dump_file_names=[
"X_train.pkl",
"X_valid.pkl",
"y_train.pkl",
"y_valid.pkl",
"X_test.pkl",
"others.pkl",
],
running_extra_volume=(
{KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
if KAGGLE_IMPLEMENT_SETTING.competition
else None
),
)
if results is None:
logger.error("Feature preprocess failed.")
raise Exception("Feature preprocess failed.")
else:
X_train, X_valid, y_train, y_valid, X_test, others = results
return X_train, X_valid, y_train, y_valid, X_test, *others

def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
from rdagent.utils.env import DockerEnv, DSDockerConf


class DSFBWorkspace(FBWorkspace):

# TODO: use the cache_with_pickle decorator.
def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame:
"""
Executes the experiment(a competition) within the specified workspace.
Args:
run_env (dict): The runtime environment variables.
Returns:
pd.DataFrame: Scores of each Model and ensemble Model.
Example:
| Model | <Metric like ACC/AUROC/MAE...> |
|-----------------------|--------------------------------|
| model1 | 0.9 |
| model2 | 0.8 |
| <ensemble model name> | 0.95 |
"""
logger.info(f"Running the experiment in {self.workspace_path}")

kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()
de = DockerEnv(DSDockerConf())
de.prepare()

running_extra_volume = {}
if KAGGLE_IMPLEMENT_SETTING.competition:
if DS_RD_SETTING.competition:
running_extra_volume = {
KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"
DS_RD_SETTING.local_data_path + "/" + DS_RD_SETTING.competition: "/kaggle/input"
}
else:
running_extra_volume = {}

execute_log = kgde.run(
execute_log = de.run(
local_path=str(self.workspace_path),
env=run_env,
running_extra_volume=running_extra_volume,
)

csv_path = self.workspace_path / "submission_score.csv"
csv_path = self.workspace_path / "scores.csv"

if not csv_path.exists():
logger.error(f"File {csv_path} does not exist.")
return None
return pd.read_csv(csv_path, index_col=0).iloc[:, 0]
return pd.read_csv(csv_path, index_col=0)
12 changes: 12 additions & 0 deletions rdagent/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,18 @@ class KGDockerConf(DockerConf):
"48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory
)

class DSDockerConf(DockerConf):
    """Docker configuration used by the data-science workspaces.

    ``model_config`` sets the ``DS_DOCKER_`` env prefix; presumably each field
    below can be overridden via an environment variable with that prefix
    (confirm against ``ExtendedSettingsConfigDict`` / ``DockerConf``).
    """

    model_config = ExtendedSettingsConfigDict(env_prefix="DS_DOCKER_")

    # Use a pre-built image rather than building one from a Dockerfile.
    build_from_dockerfile: bool = False
    image: str = "gcr.io/kaggle-gpu-images/python:latest"
    # Path inside the container where the workspace is mounted, and the
    # command run there by default.
    mount_path: str = "/kaggle/workspace"
    default_entry: str = "python main.py"

    # Presumably seconds -- confirm against how DockerConf consumes it.
    running_timeout_period: int = 600
    # Memory limit; the original note says
    # new-york-city-taxi-fare-prediction may need more memory.
    mem_limit: str | None = "48g"

class MLEBDockerConf(DockerConf):
model_config = ExtendedSettingsConfigDict(env_prefix="MLEB_DOCKER_")
Expand Down
4 changes: 4 additions & 0 deletions test/utils/coder/test_CoSTEER.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def test_data_loader(self):
# 3) test the results
# - check spec.md
# - check data_loader.py
from rdagent.components.coder.data_science.raw_data_loader.test import develop_one_competition

exp = develop_one_competition("aerial-cactus-identification")

pass

def test_model(self):
Expand Down

0 comments on commit fc4e063

Please sign in to comment.