Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cf7902c
Replace print statements with logging.debug for improved debugging an…
j-irion Jun 19, 2025
047e5f5
Update .gitignore to exclude .DS_Store and .idea directories
j-irion Jun 19, 2025
f41c133
Add command-line argument parsing and refactor main function for clarity
j-irion Jun 19, 2025
dda1140
Add docstrings to classes and methods for improved documentation and …
j-irion Jun 25, 2025
c34e61c
Refactor write_result_to_csv function to remove unused parameters and…
j-irion Jun 25, 2025
7b10822
Refactor logging statements to improve clarity and remove unnecessary…
j-irion Jun 25, 2025
af9c89c
Refactor predictors to ensure input data is consistently shaped as co…
j-irion Jun 25, 2025
215cfca
Refactor LinearPredictor to use SGDRegressor and improve model update…
j-irion Jul 2, 2025
e4869c9
Refactor NeuralNetworkPredictor to support incremental model updates …
j-irion Jul 2, 2025
a53636d
Refactor KMeansPredictor to support incremental updates and improve d…
j-irion Jul 2, 2025
901e5dd
Refactor RandomForestPredictor to support incremental updates and imp…
j-irion Jul 2, 2025
faaa8ae
Refactor KNNRegressor to support incremental updates and improve hype…
j-irion Jul 2, 2025
53c4ba2
Refactor sizing_tasks to replace duplicate neural network model updat…
j-irion Jul 2, 2025
19dc251
Refactor LinearRegressionPredictor and NeuralNetworkPredictor to supp…
j-irion Jul 9, 2025
4a1fd2a
Refactor LinearRegressionPredictor to use mini-batch data for scaling…
j-irion Jul 9, 2025
5d5b5e8
Refactor predictors to support batch size and retrain interval parame…
j-irion Jul 16, 2025
5030fe1
Refactor NeuralNetworkPredictor to support online grid search and inc…
j-irion Jul 16, 2025
3bb49c7
Refactor predictors to implement online grid search for hyperparamete…
j-irion Jul 24, 2025
e30209f
Add experiment script for hyperparameter tuning and logging
j-irion Aug 25, 2025
cb54e93
Refactor predictors to remove batch size and retrain interval paramet…
j-irion Aug 27, 2025
fe2f3f2
Refactor predictors to streamline data handling and remove batch size…
j-irion Sep 17, 2025
25f1052
Refactor experiment script to remove batch size and retrain interval …
j-irion Sep 17, 2025
9e6daa4
Update README to reflect changes in command-line parameters for onlin…
j-irion Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
results/*
!results/*.gitkeep
.DS_Store
.idea
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,17 @@

### Run Sizey

1. Create a Python virtual environment and install the dependencies
2. Run `python3 main.py filename alpha softmax error_metric seed`
1. Create a Python virtual environment and install the dependencies
2. Run `python3 main.py filename alpha softmax error_metric seed [--use_online_grid]`

Where:

- `filename` describes the workflow from the data folder. For instance `./data/trace_methylseq.csv`
- `alpha` sets the alpha you want to execute Sizey with. It has to be between 0.0 and 1.0
- `interpolation` actives the interpolation strategy. It is either False or True. If set to False, the Argmax strategy is used.
- `softmax` toggles the softmax ensemble strategy. Set to `True` to use it, otherwise `False` for the argmax strategy.
- `error_metric` defines the error metric used for model evaluation. Currently, it is either `smoothed_mape` or `neg_mean_squared_error`, whereas `smoothed_mape` should be used; other error metrics are experimental and may change the impact on the RAQ score.
- `seed` defines the seed for splitting up the initial data in training and test data and also defines the order of online task input.
- `--use_online_grid` (optional) toggles the online grid search.

Here is an example command: `python3 main.py ./data/trace_methylseq.csv 0.0 True smoothed_mape 1996`

Expand Down
90 changes: 90 additions & 0 deletions approach/abstract_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,33 @@


class PredictionModel(metaclass=ABCMeta):
"""
PredictionModel is an abstract base class for implementing machine learning models for regression tasks.

This class provides a structure for defining methods related to model training, prediction, and updating.
It includes attributes for storing training data, scalers, and error metrics, and requires subclasses to
implement specific methods for model functionality.

Attributes:
workflow_name (str): Name of the workflow this model is associated with.
task_name (str): Name of the task this model is associated with.
err_metr (str): Error metric used for model evaluation (e.g., 'smoothed_mape').
regressor (Optional[object]): The machine learning model used for predictions.
train_X_scaler (Optional[object]): Scaler for normalizing training features.
train_y_scaler (Optional[object]): Scaler for normalizing training labels.
X_train_full (Optional[np.ndarray]): Historical training features.
y_train_full (Optional[np.ndarray]): Historical training labels.
model_error (Optional[float]): Error score of the model.
"""

def __init__(self, workflow_name: str, task_name: str, err_metr: str):
"""
Initializes the PredictionModel with workflow name, task name, and error metric.

:param workflow_name: Name of the workflow this model is associated with.
:param task_name: Name of the task this model is associated with.
:param err_metr: Error metric to be used for model evaluation (e.g., 'smoothed_mape').
"""
self.workflow_name = workflow_name
self.task_name = task_name
self.err_metr = err_metr
Expand All @@ -18,30 +43,95 @@ def __init__(self, workflow_name: str, task_name: str, err_metr: str):
self.y_train_full = None
self.model_error = None

def _ensure_column_vector(self, array_like: Union[pd.Series, np.ndarray, list]) -> np.ndarray:
"""
Converts the input array-like object into a numpy array shaped as a column vector.

This method ensures that the input, whether it is a pandas Series, a 1-D numpy array, or a list,
is transformed into a two-dimensional numpy array with a single column. This format is required
by scikit-learn for certain operations.

:param array_like: Input data to be converted. Can be a pandas Series, numpy array, or list.
:return: A numpy array reshaped into a column vector (2-D array with shape (n, 1)).
"""
arr = np.asarray(array_like)
if arr.ndim == 1:
arr = arr.reshape(-1, 1)
return arr

def initial_model_training(self, X_train, y_train) -> None:
    """
    Fits the model on the initial batch of training data.

    Subclasses must override this method with their concrete training logic.

    :param X_train: Training features.
    :param y_train: Training labels.
    :raises NotImplementedError: Always, unless overridden by a subclass.
    """
    # Fixed copy-pasted message: this is the training stub, not the prediction stub.
    raise NotImplementedError('Model training method has not been implemented.')

def predict_task(self, task_features: pd.Series) -> np.ndarray:
    """
    Predicts the output for a single task from its feature vector.

    Subclasses must override this method with their concrete inference logic.

    :param task_features: Features of the task to predict.
    :raises NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError('Model prediction method has not been implemented.')

def predict_tasks(self, taskDataframe: pd.DataFrame) -> float:
    """
    Predicts the output for a batch of tasks at once.

    Subclasses must override this method with their concrete batch-inference logic.

    :param taskDataframe: DataFrame containing the features of multiple tasks.
    :raises NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError('Predicting multiple tasks has not been implemented.')

def update_model(self, X_train: pd.Series, y_train: float) -> None:
    """
    Incorporates a new observation into the trained model.

    Subclasses must override this method with their concrete update logic.

    :param X_train: New training features.
    :param y_train: New training labels.
    :raises NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError('Model update method has not been implemented.')



class PredictionMethod(metaclass=ABCMeta):
    """
    Abstract interface for a complete prediction strategy.

    A PredictionMethod wraps one or more underlying models and defines how
    predictions are made, how the models are updated with new observations,
    and how underpredictions are compensated.
    """

    def predict(self, X_test, y_test, user_estimate):
        """
        Predicts the output for the given test data and user estimate.

        :param X_test: Test features.
        :param y_test: Test labels.
        :param user_estimate: User's estimate for the task.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError('Model prediction method has not been implemented.')

    def update_model(self, X_train: pd.Series, y_train: float):
        """
        Updates the underlying model(s) with new training data.

        :param X_train: New training features.
        :param y_train: New training labels.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        # Fixed copy-pasted message: this is the update stub, not the prediction stub.
        raise NotImplementedError('Model update method has not been implemented.')

    def handle_underprediction(self, input_size: float, predicted: float, user_estimate: float, retry_number: int, actual_memory: float):
        """
        Handles underprediction scenarios by adjusting the predicted value.

        :param input_size: Size of the input task.
        :param predicted: Predicted value from the model.
        :param user_estimate: User's estimate for the task.
        :param retry_number: Number of retries attempted.
        :param actual_memory: Actual memory used for the task.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        # Fixed copy-pasted message so failures point at the right stub.
        raise NotImplementedError('Underprediction handling method has not been implemented.')

    def get_number_subModels(self) -> dict[str, int]:
        """
        Returns the number of sub-models used in the prediction method.

        :return: Dictionary with model names as keys and their counts as values.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        # Fixed copy-pasted message so failures point at the right stub.
        raise NotImplementedError('Sub-model counting method has not been implemented.')
17 changes: 17 additions & 0 deletions approach/experiment_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,28 @@


class ERROR_STRATEGY(Enum):
    """
    Enumerates the strategies for handling errors in the experiment.

    Members:
        DOUBLE: Double the error value.
        MAX_EVER_OBSERVED: Use the maximum error value ever observed.
    """
    DOUBLE = 1
    MAX_EVER_OBSERVED = 2


class OFFSET_STRATEGY(Enum):
"""
Defines the strategy for handling offsets in the experiment.

Attributes:
STD (int): Strategy to use the standard deviation.
MED_UNDER (int): Strategy to use the median of a subset of data.
MED_ALL (int): Strategy to use the median of all data.
STDUNDER (int): Strategy to use a modified standard deviation.
DYNAMIC (int): Strategy to dynamically adjust the offset.
"""
STD = 1
MED_UNDER = 2
MED_ALL = 5
Expand Down
Loading