Reg cocktails pytorch embedding roc #490

Open · wants to merge 23 commits into base: reg_cocktails
Changes from 1 commit
Commits (23)
0bd7d5f  update eta for experiments (ravinkohli, Mar 10, 2022)
76dae54  add check if True is in value range (ravinkohli, Mar 11, 2022)
d26c611  Reg cocktails common paper modifications 2 (#417) (ravinkohli, Mar 14, 2022)
bc60e31  have working embedding from pytorch (ravinkohli, Mar 23, 2022)
13fad76  divide columns to encode and embed based on threshold (ravinkohli, Mar 31, 2022)
cf4fd98  cleanup unwanted changes (ravinkohli, Mar 31, 2022)
af41dd7  use shape after preprocessing in base network backbone (ravinkohli, Mar 31, 2022)
9706875  remove redundant call to load datamanager (ravinkohli, Apr 5, 2022)
def144c  add init file for column splitting (ravinkohli, Apr 11, 2022)
926a757  fix tests (ravinkohli, Jun 14, 2022)
7567d26  fix precommit and add test changes (ravinkohli, Jun 14, 2022)
09fdc0d  [ADD] Calculate memory of dataset after one hot encoding (pytorch emb… (ravinkohli, Jul 16, 2022)
3aef02e  suggestions from review (ravinkohli, Jul 18, 2022)
8e3dbef  add preprocessed_dtype to determine double or float (ravinkohli, Aug 9, 2022)
52427bc  test fix in progress (ravinkohli, Aug 16, 2022)
90512ee  TODO: fix errors after rebase (ravinkohli, Aug 17, 2022)
895b904  Reg cocktails apt1.0+reg cocktails pytorch embedding reduced (#454) (ravinkohli, Aug 17, 2022)
033bca7  fix embeddings after rebase (ravinkohli, Aug 17, 2022)
d4cd8b4  fix error with pytorch embeddings (ravinkohli, Aug 18, 2022)
a5807cb  fix redundant code (ravinkohli, Aug 18, 2022)
960e1ef  change userdefined to False (ravinkohli, Aug 18, 2022)
1be80d5  remove using categorical columns (ravinkohli, Aug 19, 2022)
a616ecb  Add fix for ROC (ravinkohli, Feb 24, 2023)
Commit 52427bc65cf9fe4cb5b77f010d419e92f5cb0b87: test fix in progress (ravinkohli committed Oct 25, 2022)
7 changes: 3 additions & 4 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -77,10 +77,9 @@ class TabularFeatureValidator(BaseFeatureValidator):
             transformer.

     Attributes:
-        categories (List[List[str]]):
-            List for which an element at each index is a
-            list containing the categories for the respective
-            categorical column.
+        num_categories_per_col (List[int]):
+            List for which an element at each index is the number
+            of categories for the respective categorical column.
         transformed_columns (List[str])
             List of columns that were transformed.
         column_transformer (Optional[BaseEstimator])
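
The rewritten docstring reflects a representation change: the validator now stores one integer per categorical column instead of the full category lists. A minimal standalone sketch of how such counts can be derived (not code from this PR; it assumes a fitted scikit-learn OrdinalEncoder):

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Two categorical columns with 2 and 3 distinct values respectively.
df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'M', 'L']})
encoder = OrdinalEncoder().fit(df)

# One integer per column replaces the full per-column category lists.
num_categories_per_col = [len(cats) for cats in encoder.categories_]
print(num_categories_per_col)  # [2, 3]
```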
4 changes: 2 additions & 2 deletions autoPyTorch/datasets/time_series_dataset.py
@@ -559,7 +559,7 @@ def __init__(self,
         self.num_features: int = self.validator.feature_validator.num_features  # type: ignore[assignment]
         self.num_targets: int = self.validator.target_validator.out_dimensionality  # type: ignore[assignment]

-        self.categories = self.validator.feature_validator.categories
+        self.num_categories_per_col = self.validator.feature_validator.num_categories_per_col

         self.feature_shapes = self.validator.feature_shapes
         self.feature_names = tuple(self.validator.feature_names)
@@ -1072,7 +1072,7 @@ def get_required_dataset_info(self) -> Dict[str, Any]:
             'categorical_features': self.categorical_features,
             'numerical_columns': self.numerical_columns,
             'categorical_columns': self.categorical_columns,
-            'categories': self.categories,
+            'num_categories_per_col': self.num_categories_per_col,
         })
         return info
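
The renamed entry keeps `dataset_properties` lean while still carrying everything an embedding layer needs. As a sketch of why counts suffice downstream (standalone, not the PR's embedding component; the `embedding_dim` rule below is the common fastai heuristic, assumed here purely for illustration):

```python
import torch
import torch.nn as nn

num_categories_per_col = [2, 3, 7]  # hypothetical dataset_properties entry

# One embedding per categorical column; nn.Embedding only needs the
# category count, never the category values themselves.
embeddings = nn.ModuleList(
    nn.Embedding(num_embeddings=n, embedding_dim=min(600, round(1.6 * n ** 0.56)))
    for n in num_categories_per_col
)

x_cat = torch.tensor([[1, 2, 5]])  # one row of ordinal-encoded categoricals
embedded = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(embeddings)], dim=1)
print(embedded.shape)  # torch.Size([1, 10]) with the dimensions above
```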

@@ -16,7 +16,7 @@

 class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):
     """
-    Removes features that have the same value in the training data.
+    Splits categorical columns into embed or encode columns based on a hyperparameter.
     """
     def __init__(
         self,
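
To make the corrected docstring concrete (commit 13fad76 introduced the threshold-based split), here is an illustrative sketch; the helper and the hyperparameter name `min_categories_for_embedding` are assumptions, not the actual `ColumnSplitter` code:

```python
from typing import List, Tuple

def split_columns(
    categorical_columns: List[int],
    num_categories_per_col: List[int],
    min_categories_for_embedding: int = 5,  # assumed hyperparameter name
) -> Tuple[List[int], List[int]]:
    """Route high-cardinality columns to embeddings, the rest to encoding."""
    embed_columns: List[int] = []
    encode_columns: List[int] = []
    for col, n_cat in zip(categorical_columns, num_categories_per_col):
        if n_cat > min_categories_for_embedding:
            embed_columns.append(col)
        else:
            encode_columns.append(col)
    return embed_columns, encode_columns

# split_columns([0, 1, 2], [2, 12, 3]) -> ([1], [0, 2])
```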
@@ -19,14 +19,14 @@ def __init__(self,
     def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder:
         OneHotEncoder.fit(self, X, y)
         categorical_columns = X['dataset_properties']['categorical_columns']
-        n_features_cat = X['dataset_properties']['categories']
+        num_categories_per_col = X['dataset_properties']['num_categories_per_col']
         feature_names = X['dataset_properties']['feature_names']
         feature_shapes = X['dataset_properties']['feature_shapes']

-        if len(n_features_cat) == 0:
-            n_features_cat = self.preprocessor['categorical'].categories  # type: ignore
+        if len(num_categories_per_col) == 0:
+            num_categories_per_col = [len(cat) for cat in self.preprocessor['categorical'].categories]  # type: ignore
         for i, cat_column in enumerate(categorical_columns):
-            feature_shapes[feature_names[cat_column]] = len(n_features_cat[i])
+            feature_shapes[feature_names[cat_column]] = num_categories_per_col[i]
         self.feature_shapes = feature_shapes
         return self
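
The fallback branch rebuilds the counts from the fitted encoder when `dataset_properties` carries none. The same idea in a standalone sketch, with a plain scikit-learn `OneHotEncoder` standing in for autoPyTorch's preprocessor dict:

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X_cat = np.array([['a', 'x'], ['b', 'y'], ['a', 'z']])
ohe = OneHotEncoder().fit(X_cat)

# A column's one-hot width equals its category count, which is why
# feature_shapes can be filled directly from these numbers.
num_categories_per_col = [len(cats) for cats in ohe.categories_]
print(num_categories_per_col)  # [2, 3]
```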

@@ -15,11 +15,11 @@ def __init__(self) -> None:
         super(TimeSeriesBaseEncoder, self).__init__()
         self.add_fit_requirements([
             FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('categories', (List,), user_defined=True, dataset_property=True),
+            FitRequirement('num_categories_per_col', (List,), user_defined=True, dataset_property=True),
             FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True),
             FitRequirement('feature_shapes', (Dict, ), user_defined=True, dataset_property=True),
         ])
-        self.feature_shapes: Union[Dict[str, int]] = {}
+        self.feature_shapes: Dict[str, int] = {}

     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
9 changes: 7 additions & 2 deletions autoPyTorch/pipeline/components/training/trainer/__init__.py
@@ -447,7 +447,13 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice':
             raise RuntimeError("Budget exhausted without finishing an epoch.")

         if self.choice.use_stochastic_weight_averaging and self.choice.swa_updated:
-            use_double = 'float64' in X['preprocessed_dtype']
+            # By default, we assume the data is double. Only if the data was preprocessed,
+            # we check the dtype and use it accordingly
+            preprocessed_dtype = X.get('preprocessed_dtype', None)
+            if preprocessed_dtype is None:
+                use_double = True
+            else:
+                use_double = 'float64' in preprocessed_dtype

             # update batch norm statistics
             swa_model = self.choice.swa_model.double() if use_double else self.choice.swa_model
@@ -458,7 +464,6 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice':
             # we update only the last network which pertains to the stochastic weight averaging model
             snapshot_model = self.choice.model_snapshots[-1].double() if use_double else self.choice.model_snapshots[-1]
             swa_utils.update_bn(X['train_data_loader'], snapshot_model)
-            update_model_state_dict_from_swa(X['network_snapshots'][-1], self.choice.swa_model.state_dict())

             # wrap up -- add score if not evaluating every epoch
             if not self.eval_valid_each_epoch(X):
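
For reference, a simplified, self-contained sketch of the batch-norm refresh this hunk guards; the toy model, loader, and hard-coded `use_double` flag stand in for autoPyTorch's network, `train_data_loader`, and the `preprocessed_dtype` check:

```python
import torch
import torch.nn as nn
from torch.optim import swa_utils
from torch.utils.data import DataLoader, TensorDataset

model = nn.Sequential(nn.Linear(4, 8), nn.BatchNorm1d(8), nn.ReLU(), nn.Linear(8, 1))
swa_model = swa_utils.AveragedModel(model)

# Double-precision data, as assumed when no preprocessed dtype is recorded.
loader = DataLoader(TensorDataset(torch.randn(32, 4).double()), batch_size=8)

# Cast the averaged model to double only when the data is float64, so that
# update_bn's forward passes see matching parameter and input dtypes.
use_double = True
bn_model = swa_model.double() if use_double else swa_model
swa_utils.update_bn(loader, bn_model)  # recompute BatchNorm running statistics
```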