From d5ff0dfa3a99aa479767f8b4c949ba167aaaa2fa Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 25 May 2024 08:06:31 +0000 Subject: [PATCH 001/106] added test case and initial testing and and code for shardingoutput tabular data --- configs/tabularize.yaml | 2 +- src/MEDS_tabular_automl/tabularize.py | 679 ++++++++++++++++++++++++++ tests/test_tabularize.py | 136 ++++++ 3 files changed, 816 insertions(+), 1 deletion(-) create mode 100644 src/MEDS_tabular_automl/tabularize.py create mode 100644 tests/test_tabularize.py diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 5d94c75..86f8369 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -23,12 +23,12 @@ aggs: - "value/residual/sum" - "value/residual/sum_sqd" - # Sharding n_patients_per_sub_shard: null # Misc do_overwrite: False +do_update: True seed: 1 # Hydra diff --git a/src/MEDS_tabular_automl/tabularize.py b/src/MEDS_tabular_automl/tabularize.py new file mode 100644 index 0000000..d2b8d79 --- /dev/null +++ b/src/MEDS_tabular_automl/tabularize.py @@ -0,0 +1,679 @@ +"""The base class for core dataset processing logic. + +Attributes: + INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, + dataframes, etc. + DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. +""" + +import copy +import json +from collections.abc import Callable, Mapping, Sequence +from pathlib import Path + +import numpy as np +import polars as pl +import polars.selectors as cs +from omegaconf import DictConfig +from tqdm.auto import tqdm + +DF_T = pl.DataFrame + + +def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: + """Loads the MEDS dataset from disk. + + Args: + MEDS_cohort_dir: The directory containing the MEDS datasets split by subfolders. + We expect `train` to be a split so `MEDS_cohort_dir/train` should exist. + + Returns: + Mapping[str, pl.DataFrame]: Mapping from split name to a polars DataFrame containing the MEDS dataset. + """ + MEDS_cohort_dir = Path(MEDS_cohort_dir) + meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) + splits = {fp.parent.stem for fp in meds_fps} + assert "train" in splits, f"Expected 'train' split in {splits}." + split_to_fps = {split: [fp for fp in meds_fps if fp.parent.stem == split] for split in splits} + split_to_df = { + split: pl.concat([pl.scan_parquet(fp) for fp in split_fps]) + for split, split_fps in split_to_fps.items() + } + return split_to_df + + +def store_params_json(params_fp: Path, cfg: DictConfig, sp_subjects: Mapping[str, Sequence[Sequence[int]]]): + """Stores configuration parameters into a JSON file. + + This function writes a dictionary of parameters, which includes patient partitioning + information and configuration details, to a specified JSON file. If the file already exists, + the function can update it with new values depending on the configuration settings provided. + + Parameters: + - params_fp (Path): The file path for the JSON file where parameters should be stored. + - cfg (DictConfig): A configuration object containing settings like the number of patients + per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. + - sp_subjects (Mapping[str, Sequence[Sequence[int]]]): A mapping of split names to sequences + representing patient IDs, structured in sub-shards. + + Behavior: + - If params_fp exists and cfg.do_update is True, the function checks for differences + between existing and new parameters. 
If discrepancies are found, it will raise an error detailing + the differences. The number of patients per sub-shard will be standardized to match the existing record. + - If params_fp exists and cfg.do_overwrite is False (without do_update being True), a + FileExistsError is raised to prevent unintentional data loss. + + Raises: + - ValueError: If there are discrepancies between old and new parameters during an update. + - FileExistsError: If the file exists and neither updating nor overwriting is allowed. + + Example: + >>> cfg = DictConfig({ + >>> "n_patients_per_sub_shard": 100, + >>> "min_code_inclusion_frequency": 5, + >>> "do_update": False, + >>> "do_overwrite": True + >>> }) + >>> sp_subjects = {"train": [[1, 2, 3], [4, 5]], "test": [[6, 7]]} + >>> params = store_params_json(Path("/path/to/params.json"), cfg, sp_subjects) + """ + params = { + "n_patients_per_sub_shard": cfg.n_patients_per_sub_shard, + "min_code_inclusion_frequency": cfg.min_code_inclusion_frequency, + "patient_shard_by_split": sp_subjects, + } + if params_fp.exists(): + if cfg.do_update: + with open(params_fp) as f: + old_params = json.load(f) + + if old_params["n_patients_per_sub_shard"] != params["n_patients_per_sub_shard"]: + print( + "Standardizing chunk size to existing record " + f"({old_params['n_patients_per_sub_shard']})." + ) + params["n_patients_per_sub_shard"] = old_params["n_patients_per_sub_shard"] + params["patient_shard_by_split"] = old_params["patient_shard_by_split"] + + if old_params != params: + err_strings = ["Asked to update but parameters differ:"] + old = set(old_params.keys()) + new = set(params.keys()) + if old != new: + err_strings.append("Keys differ: ") + if old - new: + err_strings.append(f" old - new = {old - new}") + if new - old: + err_strings.append(f" new - old = {old - new}") + + for k in old & new: + old_val = old_params[k] + new_val = params[k] + + if old_val != new_val: + err_strings.append(f"Values differ for {k}:") + err_strings.append(f" Old: {old_val}") + err_strings.append(f" New: {new_val}") + + raise ValueError("\n".join(err_strings)) + elif not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {params_fp} exists!") + with open(params_fp, mode="w") as f: + json.dump(params, f) + return params + + +def _write_df(cls, df: DF_T, fp: Path, **kwargs): + """Write shard to disk.""" + do_overwrite = kwargs.get("do_overwrite", False) + + if not do_overwrite and fp.is_file(): + raise FileExistsError(f"{fp} exists and do_overwrite is {do_overwrite}!") + + fp.parent.mkdir(exist_ok=True, parents=True) + + if isinstance(df, pl.LazyFrame): + df.collect().write_parquet(fp, use_pyarrow=cls.WRITE_USE_PYARROW) + else: + df.write_parquet(fp, use_pyarrow=cls.WRITE_USE_PYARROW) + + +def _get_flat_col_dtype(self, col: str) -> pl.DataType: + """Gets the appropriate minimal dtype for the given flat representation column string.""" + + parts = col.split("/") + if len(parts) < 4: + raise ValueError(f"Malformed column {col}. 
Should be temporal/measurement/feature/agg") + + temp, meas = parts[0], parts[1] + agg = parts[-1] + feature = "/".join(parts[2:-1]) + + cfg = self.measurement_configs[meas] + + match agg: + case "sum" | "sum_sqd" | "min" | "max" | "value": + return pl.Float32 + case "present": + return pl.Boolean + case "count" | "has_values_count": + # config.observation_rate_over_cases = total_observed / total_possible + # config.observation_rate_per_case = raw_total_observed / total_observed + + match temp: + case "static": + n_possible = len(self.subject_ids) + case str() | "dynamic": + n_possible = sum(self.n_events_per_subject.values()) + case _: + raise ValueError( + f"Column name {col} malformed: Temporality {temp} not in `static` or `dynamic" + ) + + if cfg.vocabulary is None: + observation_frequency = cfg.observation_rate_per_case * cfg.observation_rate_over_cases + else: + if feature not in cfg.vocabulary.idxmap: + raise ValueError(f"Column name {col} malformed: Feature {feature} not in {meas}!") + else: + observation_frequency = cfg.vocabulary.obs_frequencies[cfg.vocabulary[feature]] + + total_observations = int(np.ceil(observation_frequency * n_possible)) + + return self.get_smallest_valid_uint_type(total_observations) + case _: + raise ValueError(f"Column name {col} malformed!") + + +def _normalize_flat_rep_df_cols( + flat_df: DF_T, feature_columns: list[str] | None = None, set_count_0_to_null: bool = False +) -> DF_T: + if feature_columns is None: + feature_columns = [x for x in flat_df.columns if x not in ("subject_id", "timestamp")] + cols_to_add = set() + cols_to_retype = set(feature_columns) + else: + cols_to_add = set(feature_columns) - set(flat_df.columns) + cols_to_retype = set(feature_columns).intersection(set(flat_df.columns)) + + cols_to_add = [(c, _get_flat_col_dtype(c)) for c in cols_to_add] + cols_to_retype = [(c, _get_flat_col_dtype(c)) for c in cols_to_retype] + + if "timestamp" in flat_df.columns: + key_cols = ["subject_id", "timestamp"] + else: + key_cols = ["subject_id"] + + flat_df = flat_df.with_columns( + *[pl.lit(None, dtype=dt).alias(c) for c, dt in cols_to_add], + *[pl.col(c).cast(dt).alias(c) for c, dt in cols_to_retype], + ).select(*key_cols, *feature_columns) + + if not set_count_0_to_null: + return flat_df + + flat_df = flat_df.collect() + + flat_df = flat_df.with_columns( + pl.when(cs.ends_with("count") != 0).then(cs.ends_with("count")).keep_name() + ).lazy() + return flat_df + + +def _summarize_dynamic_measurements( + self, + feature_columns: list[str], + include_only_subjects: set[int] | None = None, +) -> pl.LazyFrame: + if include_only_subjects is None: + df = self.dynamic_measurements_df + else: + df = self.dynamic_measurements_df.join( + self.events_df.filter(pl.col("subject_id").is_in(list(include_only_subjects))).select("event_id"), + on="event_id", + how="inner", + ) + + valid_measures = {} + for feat_col in feature_columns: + temp, meas, feat, _ = self._parse_flat_feature_column(feat_col) + + if temp != "dynamic": + continue + + if meas not in valid_measures: + valid_measures[meas] = set() + valid_measures[meas].add(feat) + + out_dfs = {} + for m, allowed_vocab in valid_measures.items(): + cfg = self.measurement_configs[m] + + total_observations = int( + np.ceil( + cfg.observation_rate_per_case + * cfg.observation_rate_over_cases + * sum(self.n_events_per_subject.values()) + ) + ) + + count_type = self.get_smallest_valid_uint_type(total_observations) + + if cfg.modality == "univariate_regression" and cfg.vocabulary is None: + prefix = 
f"dynamic/{m}/{m}" + + key_col = pl.col(m) + val_col = pl.col(m).drop_nans().cast(pl.Float32) + + out_dfs[m] = ( + df.lazy() + .select("measurement_id", "event_id", m) + .filter(pl.col(m).is_not_null()) + .groupby("event_id") + .agg( + pl.col(m).is_not_null().sum().cast(count_type).alias(f"{prefix}/count"), + ( + (pl.col(m).is_not_nan() & pl.col(m).is_not_null()) + .sum() + .cast(count_type) + .alias(f"{prefix}/has_values_count") + ), + val_col.sum().alias(f"{prefix}/sum"), + (val_col**2).sum().alias(f"{prefix}/sum_sqd"), + val_col.min().alias(f"{prefix}/min"), + val_col.max().alias(f"{prefix}/max"), + ) + ) + continue + elif cfg.modality == "multivariate_regression": + column_cols = [m, m] + values_cols = [m, cfg.values_column] + key_prefix = f"{m}_{m}_" + val_prefix = f"{cfg.values_column}_{m}_" + + key_col = cs.starts_with(key_prefix) + val_col = cs.starts_with(val_prefix).drop_nans().cast(pl.Float32) + + aggs = [ + key_col.is_not_null() + .sum() + .cast(count_type) + .map_alias(lambda c: f"dynamic/{m}/{c.replace(key_prefix, '')}/count"), + ( + (cs.starts_with(val_prefix).is_not_null() & cs.starts_with(val_prefix).is_not_nan()) + .sum() + .map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/has_values_count") + ), + val_col.sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum"), + (val_col**2).sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum_sqd"), + val_col.min().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/min"), + val_col.max().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/max"), + ] + else: + column_cols = [m] + values_cols = [m] + aggs = [ + pl.all().is_not_null().sum().cast(count_type).map_alias(lambda c: f"dynamic/{m}/{c}/count") + ] + + ID_cols = ["measurement_id", "event_id"] + out_dfs[m] = ( + df.select(*ID_cols, *set(column_cols + values_cols)) + .filter(pl.col(m).is_in(allowed_vocab)) + .pivot( + index=ID_cols, + columns=column_cols, + values=values_cols, + aggregate_function=None, + ) + .lazy() + .drop("measurement_id") + .groupby("event_id") + .agg(*aggs) + ) + + return pl.concat(list(out_dfs.values()), how="align") + + +def _summarize_over_window(df: DF_T, window_size: str) -> pl.LazyFrame: + """Apply aggregations to the raw representation over a window size.""" + if isinstance(df, Path): + df = pl.scan_parquet(df) + + def time_aggd_col_alias_fntr(new_agg: str | None = None) -> Callable[[str], str]: + if new_agg is None: + + def f(c: str) -> str: + return "/".join([window_size] + c.split("/")[1:]) + + else: + + def f(c: str) -> str: + return "/".join([window_size] + c.split("/")[1:-1] + [new_agg]) + + return f + + # Columns to convert to counts: + present_indicator_cols = cs.ends_with("/present") + + # Columns to convert to value aggregations: + value_cols = cs.ends_with("/value") + + # Columns to aggregate via other operations + cnt_cols = (cs.ends_with("/count") | cs.ends_with("/has_values_count")).fill_null(0) + + cols_to_sum = cs.ends_with("/sum") | cs.ends_with("/sum_sqd") + cols_to_min = cs.ends_with("/min") + cols_to_max = cs.ends_with("/max") + + if window_size == "FULL": + df = df.groupby("subject_id").agg( + "timestamp", + # present to counts + present_indicator_cols.cumsum().map_alias(time_aggd_col_alias_fntr("count")), + # values to stats + value_cols.is_not_null().cumsum().map_alias(time_aggd_col_alias_fntr("count")), + ( + (value_cols.is_not_null() & value_cols.is_not_nan()) + .cumsum() + .map_alias(time_aggd_col_alias_fntr("has_values_count")) + ), + 
value_cols.cumsum().map_alias(time_aggd_col_alias_fntr("sum")), + (value_cols**2).cumsum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), + value_cols.cummin().map_alias(time_aggd_col_alias_fntr("min")), + value_cols.cummax().map_alias(time_aggd_col_alias_fntr("max")), + # Raw aggregations + cnt_cols.cumsum().map_alias(time_aggd_col_alias_fntr()), + cols_to_sum.cumsum().map_alias(time_aggd_col_alias_fntr()), + cols_to_min.cummin().map_alias(time_aggd_col_alias_fntr()), + cols_to_max.cummax().map_alias(time_aggd_col_alias_fntr()), + ) + df = df.explode(*[c for c in df.columns if c != "subject_id"]) + else: + df = df.groupby_rolling( + index_column="timestamp", + by="subject_id", + period=window_size, + ).agg( + # present to counts + present_indicator_cols.sum().map_alias(time_aggd_col_alias_fntr("count")), + # values to stats + value_cols.is_not_null().sum().map_alias(time_aggd_col_alias_fntr("count")), + ( + (value_cols.is_not_null() & value_cols.is_not_nan()) + .sum() + .map_alias(time_aggd_col_alias_fntr("has_values_count")) + ), + value_cols.sum().map_alias(time_aggd_col_alias_fntr("sum")), + (value_cols**2).sum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), + value_cols.min().map_alias(time_aggd_col_alias_fntr("min")), + value_cols.max().map_alias(time_aggd_col_alias_fntr("max")), + # Raw aggregations + cnt_cols.sum().map_alias(time_aggd_col_alias_fntr()), + cols_to_sum.sum().map_alias(time_aggd_col_alias_fntr()), + cols_to_min.min().map_alias(time_aggd_col_alias_fntr()), + cols_to_max.max().map_alias(time_aggd_col_alias_fntr()), + ) + + return _normalize_flat_rep_df_cols(df, set_count_0_to_null=True) + + +def _get_flat_ts_rep( + self, + feature_columns: list[str], + **kwargs, +) -> pl.LazyFrame: + """Produce raw representation for dynamic data.""" + return _normalize_flat_rep_df_cols( + _summarize_dynamic_measurements(feature_columns, **kwargs) + .drop("event_id") + .sort(by=["subject_id", "timestamp"]) + .collect() + .lazy(), + [c for c in feature_columns if not c.startswith("static/")], + ) + # The above .collect().lazy() shouldn't be necessary but it appears to be for some reason... + + +def _get_flat_static_rep( + self, + feature_columns: list[str], + **kwargs, +) -> pl.LazyFrame: + """Produce raw representation for static data.""" + static_features = [c for c in feature_columns if c.startswith("static/")] + return self._normalize_flat_rep_df_cols( + self._summarize_static_measurements(static_features, **kwargs).collect().lazy(), + static_features, + set_count_0_to_null=False, + ) + + +def _get_flat_rep_feature_cols( + self, + feature_inclusion_frequency: float | dict[str, float] | None = None, + window_sizes: list[str] | None = None, + include_only_measurements: set[str] | None = None, +) -> list[str]: + """ + process aggregations and select which columns get which aggregations + 1. static + 1. code & no numerical_values + 2. numerical_values + 2. dynamic + 1. codes -> aggs applied to all codes + 2. 
numerical_values -> continuous aggs + """ + feature_inclusion_frequency, include_only_measurements = self._resolve_flat_rep_cache_params( + feature_inclusion_frequency, include_only_measurements + ) + feature_columns = [] + for m, cfg in self.measurement_configs.items(): + if m not in include_only_measurements: + continue + + features = None + if cfg.vocabulary is not None: + vocab = copy.deepcopy(cfg.vocabulary) + if feature_inclusion_frequency is not None: + m_freq = feature_inclusion_frequency[m] + vocab.filter(total_observations=None, min_valid_element_freq=m_freq) + features = vocab.vocabulary + # elif cfg.modality == DataModality.UNIVARIATE_REGRESSION: + # features = [m] + else: + raise ValueError(f"Config with modality {cfg.modality} should have a Vocabulary!") + temps = [] + aggs = [] + + # match cfg.temporality: + # case TemporalityType.STATIC: + # temps = [str(cfg.temporality)] + # match cfg.modality: + # case DataModality.UNIVARIATE_REGRESSION: + # aggs = ["value"] + # case DataModality.SINGLE_LABEL_CLASSIFICATION: + # aggs = ["present"] + # case _: + # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") + # case TemporalityType.FUNCTIONAL_TIME_DEPENDENT if window_sizes is None: + # temps = [str(cfg.temporality)] + # match cfg.modality: + # case DataModality.UNIVARIATE_REGRESSION: + # aggs = ["value"] + # case DataModality.SINGLE_LABEL_CLASSIFICATION: + # aggs = ["present"] + # case _: + # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") + # case TemporalityType.FUNCTIONAL_TIME_DEPENDENT if window_sizes is not None: + # temps = window_sizes + # match cfg.modality: + # case DataModality.UNIVARIATE_REGRESSION: + # aggs = ["count", "has_values_count", "sum", "sum_sqd", "min", "max"] + # case DataModality.SINGLE_LABEL_CLASSIFICATION: + # aggs = ["count"] + # case _: + # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") + # case TemporalityType.DYNAMIC: + # temps = [str(cfg.temporality)] if window_sizes is None else window_sizes + # match cfg.modality: + # case DataModality.UNIVARIATE_REGRESSION | DataModality.MULTIVARIATE_REGRESSION: + # aggs = ["count", "has_values_count", "sum", "sum_sqd", "min", "max"] + # case DataModality.MULTI_LABEL_CLASSIFICATION: + # aggs = ["count"] + # case _: + # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") + + for temp in temps: + for feature in features: + for agg in aggs: + feature_columns.append(f"{temp}/{m}/{feature}/{agg}") + + return sorted(feature_columns) + + +def cache_flat_representation( + cfg: DictConfig, +): + """Writes a flat (historically summarized) representation of the dataset to disk. + + This file caches a set of files useful for building flat representations of the dataset to disk, + suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: + + * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: + * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a + set of parquet files containing flat (e.g., wide) representations of summarized events per subject, + broken out by split and subject chunk. + * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period + per subject per event, for all time periods in ``window_sizes``, if any. + + Args: + cfg: + MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + tabularized_data_dir: output directory of tabularized data. 
+ min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + what features can be included in the flat representation. It can either be a float, in which + case it applies across all measurements, or `None`, in which case no filtering is applied, or + a dictionary from measurement type to a float dictating a per-measurement-type inclusion + cutoff. + window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has + the capability to summarize these flattened representations over the historical windows + specified in this argument. These are strings specifying time deltas, using this syntax: + `link`_. Each window size will be summarized to a separate directory, and will share the same + subject file split as is used in the raw representation files. + codes: A list of codes to include in the flat representation. If `None`, all codes will be included + in the flat representation. + aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. + n_patients_per_sub_shard: The number of subjects that should be included in each output file. + Lowering this number increases the number of files written, making the process of creating and + leveraging these files slower but more memory efficient. + do_overwrite: If `True`, this function will overwrite the data already stored in the target save + directory. + do_update: bool = True + seed: The seed to use for random number generation. + + .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + """ + # setup rng seed + rng = np.random.default_rng(cfg.seed) + + # create output dir + flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" + flat_dir.mkdir(exist_ok=True, parents=True) + + # load MEDS data + split_to_df = load_meds_data(cfg.MEDS_cohort_dir) + + # for every dataset split, create shards to output flat representations to + sp_subjects = {} + for split_name, split_df in split_to_df.items(): + split_patient_ids = ( + split_df.select(pl.col("patient_id").cast(pl.Int32).unique()).collect().to_series().to_list() + ) + print(len(split_patient_ids)) + if cfg.n_patients_per_sub_shard is None: + sp_subjects[split_name] = split_patient_ids + else: + shuffled_patient_ids = rng.permutation(split_patient_ids) + num_shards = max(len(split_patient_ids) // cfg.n_patients_per_sub_shard, 1) # must be 1 or larger + sharded_patient_ids = np.array_split(shuffled_patient_ids, num_shards) + sp_subjects[split_name] = [shard.tolist() for shard in sharded_patient_ids] + + # store params in json file + params_fp = flat_dir / "params.json" + params = store_params_json(params_fp, cfg, sp_subjects) + + # 0. Identify Output Columns + # We set window_sizes to None here because we want to get the feature column names for the raw flat + # representation, not the summarized one. + feature_columns = _get_flat_rep_feature_cols( + min_code_inclusion_frequency=cfg.min_code_inclusion_frequency, + window_sizes=None, + ) + + # 1. 
Produce static representation + static_subdir = flat_dir / "static" + + static_dfs = {} + for sp, subjects in tqdm(list(params["patient_shard_by_split"].items()), desc="Flattening Splits"): + static_dfs[sp] = [] + sp_dir = static_subdir / sp + + for i, subjects_list in enumerate(tqdm(subjects, desc="Subject chunks", leave=False)): + fp = sp_dir / f"{i}.parquet" + static_dfs[sp].append(fp) + if fp.exists(): + if cfg.do_update: + continue + elif not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + df = _get_flat_static_rep( + feature_columns=feature_columns, + include_only_subjects=subjects_list, + ) + + _write_df(df, fp, do_overwrite=cfg.do_overwrite) + + # 2. Produce raw representation + ts_subdir = flat_dir / "at_ts" + + ts_dfs = {} + for sp, subjects in tqdm(list(params["patient_shard_by_split"].items()), desc="Flattening Splits"): + ts_dfs[sp] = [] + sp_dir = ts_subdir / sp + + for i, subjects_list in enumerate(tqdm(subjects, desc="Subject chunks", leave=False)): + fp = sp_dir / f"{i}.parquet" + ts_dfs[sp].append(fp) + if fp.exists(): + if cfg.do_update: + continue + elif not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + df = _get_flat_ts_rep( + feature_columns=feature_columns, + include_only_subjects=subjects_list, + ) + + _write_df(df, fp, do_overwrite=cfg.do_overwrite) + + if cfg.window_sizes is None: + return + + # 3. Produce summarized history representations + history_subdir = flat_dir / "over_history" + + for window_size in tqdm(cfg.window_sizes, desc="History window sizes"): + for sp, df_fps in tqdm(list(ts_dfs.items()), desc="Windowing Splits", leave=False): + for i, df_fp in enumerate(tqdm(df_fps, desc="Subject chunks", leave=False)): + fp = history_subdir / sp / window_size / f"{i}.parquet" + if fp.exists(): + if cfg.do_update: + continue + elif not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + df = _summarize_over_window(df_fp, window_size) + _write_df(df, fp) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py new file mode 100644 index 0000000..f918796 --- /dev/null +++ b/tests/test_tabularize.py @@ -0,0 +1,136 @@ +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import json +import tempfile +from io import StringIO +from pathlib import Path + +import polars as pl +from loguru import logger +from omegaconf import OmegaConf + +from MEDS_tabular_automl.tabularize import cache_flat_representation + +SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 + +MEDS_TRAIN_0 = """ +patient_id,code,timestamp,numerical_value +239684,HEIGHT,,175.271115221764 +239684,EYE_COLOR//BROWN,, +239684,DOB,1980-12-28T00:00:00.000000, +239684,TEMP,2010-05-11T17:41:51.000000,96.0 +239684,ADMISSION//CARDIAC,2010-05-11T17:41:51.000000, +239684,HR,2010-05-11T17:41:51.000000,102.6 +239684,TEMP,2010-05-11T17:48:48.000000,96.2 +239684,HR,2010-05-11T17:48:48.000000,105.1 +239684,TEMP,2010-05-11T18:25:35.000000,95.8 +239684,HR,2010-05-11T18:25:35.000000,113.4 +239684,HR,2010-05-11T18:57:18.000000,112.6 +239684,TEMP,2010-05-11T18:57:18.000000,95.5 +239684,DISCHARGE,2010-05-11T19:27:19.000000, +1195293,HEIGHT,,164.6868838269085 +1195293,EYE_COLOR//BLUE,, +1195293,DOB,1978-06-20T00:00:00.000000, +1195293,TEMP,2010-06-20T19:23:52.000000,100.0 +1195293,ADMISSION//CARDIAC,2010-06-20T19:23:52.000000, 
+1195293,HR,2010-06-20T19:23:52.000000,109.0 +1195293,TEMP,2010-06-20T19:25:32.000000,100.0 +1195293,HR,2010-06-20T19:25:32.000000,114.1 +1195293,HR,2010-06-20T19:45:19.000000,119.8 +1195293,TEMP,2010-06-20T19:45:19.000000,99.9 +1195293,HR,2010-06-20T20:12:31.000000,112.5 +1195293,TEMP,2010-06-20T20:12:31.000000,99.8 +1195293,HR,2010-06-20T20:24:44.000000,107.7 +1195293,TEMP,2010-06-20T20:24:44.000000,100.0 +1195293,TEMP,2010-06-20T20:41:33.000000,100.4 +1195293,HR,2010-06-20T20:41:33.000000,107.5 +1195293,DISCHARGE,2010-06-20T20:50:04.000000, +""" +MEDS_TRAIN_1 = """ +patient_id,code,timestamp,numerical_value +68729,EYE_COLOR//HAZEL,, +68729,HEIGHT,,160.3953106166676 +68729,DOB,1978-03-09T00:00:00.000000, +68729,HR,2010-05-26T02:30:56.000000,86.0 +68729,ADMISSION//PULMONARY,2010-05-26T02:30:56.000000, +68729,TEMP,2010-05-26T02:30:56.000000,97.8 +68729,DISCHARGE,2010-05-26T04:51:52.000000, +814703,EYE_COLOR//HAZEL,, +814703,HEIGHT,,156.48559093209357 +814703,DOB,1976-03-28T00:00:00.000000, +814703,TEMP,2010-02-05T05:55:39.000000,100.1 +814703,HR,2010-02-05T05:55:39.000000,170.2 +814703,ADMISSION//ORTHOPEDIC,2010-02-05T05:55:39.000000, +814703,DISCHARGE,2010-02-05T07:02:30.000000, +""" +MEDS_HELD_OUT_0 = """ +patient_id,code,timestamp,numerical_value +1500733,HEIGHT,,158.60131573580904 +1500733,EYE_COLOR//BROWN,, +1500733,DOB,1986-07-20T00:00:00.000000, +1500733,TEMP,2010-06-03T14:54:38.000000,100.0 +1500733,HR,2010-06-03T14:54:38.000000,91.4 +1500733,ADMISSION//ORTHOPEDIC,2010-06-03T14:54:38.000000, +1500733,HR,2010-06-03T15:39:49.000000,84.4 +1500733,TEMP,2010-06-03T15:39:49.000000,100.3 +1500733,HR,2010-06-03T16:20:49.000000,90.1 +1500733,TEMP,2010-06-03T16:20:49.000000,100.1 +1500733,DISCHARGE,2010-06-03T16:44:26.000000, +""" +MEDS_TUNING_0 = """ +patient_id,code,timestamp,numerical_value +754281,EYE_COLOR//BROWN,, +754281,HEIGHT,,166.22261567137025 +754281,DOB,1988-12-19T00:00:00.000000, +754281,ADMISSION//PULMONARY,2010-01-03T06:27:59.000000, +754281,TEMP,2010-01-03T06:27:59.000000,99.8 +754281,HR,2010-01-03T06:27:59.000000,142.0 +754281,DISCHARGE,2010-01-03T08:22:13.000000, +""" + +MEDS_OUTPUTS = { + "train/0": MEDS_TRAIN_0, + "train/1": MEDS_TRAIN_1, + "held_out/0": MEDS_HELD_OUT_0, + "tuning/0": MEDS_TUNING_0, +} + + +def test_tabularize(): + with tempfile.TemporaryDirectory() as d: + MEDS_cohort_dir = Path(d) / "MEDS_cohort" + tabularized_data_dir = Path(d) / "cached_reps" + + # Create the directories + MEDS_cohort_dir.mkdir() + tabularized_data_dir.mkdir() + + # Store MEDS outputs + for split, data in MEDS_OUTPUTS.items(): + file_path = MEDS_cohort_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True) + pl.read_csv(StringIO(data)).write_parquet(file_path) + + split_json = json.load(StringIO(SPLITS_JSON)) + splits_fp = MEDS_cohort_dir / "splits.json" + json.dump(split_json, splits_fp.open("w")) + + tabularize_config_kwargs = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "tabularized_data_dir": str(tabularized_data_dir.resolve()), + "min_code_inclusion_frequency": 1, + "window_sizes": [30, 365, None], + "codes": None, + # "aggs": None, + "n_patients_per_sub_shard": 2, + "do_overwrite": False, + "do_update": True, + "seed": 1, + "hydra.verbose": True, + } + cfg = OmegaConf.create(tabularize_config_kwargs) + + logger.info("caching flat representation of MEDS data") + cache_flat_representation(cfg) From 4a486aafd836647bec786455484b82fb485a9cb4 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 26 May 2024 19:01:02 +0000 Subject: [PATCH 002/106] added 
static feature pivoting --- configs/tabularize.yaml | 3 + src/MEDS_tabular_automl/tabularize.py | 374 ++++++++++++++++---------- tests/test_tabularize.py | 10 +- 3 files changed, 237 insertions(+), 150 deletions(-) diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 86f8369..a92f707 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -22,6 +22,9 @@ aggs: - "value/intercept" - "value/residual/sum" - "value/residual/sum_sqd" +numeric_value_impute_strategy: "drop" +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 # Sharding n_patients_per_sub_shard: null diff --git a/src/MEDS_tabular_automl/tabularize.py b/src/MEDS_tabular_automl/tabularize.py index d2b8d79..ccb2a00 100644 --- a/src/MEDS_tabular_automl/tabularize.py +++ b/src/MEDS_tabular_automl/tabularize.py @@ -6,8 +6,9 @@ DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. """ -import copy +import enum import json +from collections import OrderedDict from collections.abc import Callable, Mapping, Sequence from pathlib import Path @@ -17,7 +18,18 @@ from omegaconf import DictConfig from tqdm.auto import tqdm + +class CodeType(enum.Enum): + """Enum for the type of code.""" + + STATIC_CATEGORICAL = "STATIC_CATEGORICAL" + DYNAMIC_CATEGORICAL = "DYNAMIC_CATEGORICAL" + STATIC_CONTINUOUS = "STATIC_CONTINUOUS" + DYNAMIC_CONTINUOUS = "DYNAMIC_CONTINUOUS" + + DF_T = pl.DataFrame +WRITE_USE_PYARROW = True def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: @@ -123,7 +135,7 @@ def store_params_json(params_fp: Path, cfg: DictConfig, sp_subjects: Mapping[str return params -def _write_df(cls, df: DF_T, fp: Path, **kwargs): +def _write_df(df: DF_T, fp: Path, **kwargs): """Write shard to disk.""" do_overwrite = kwargs.get("do_overwrite", False) @@ -133,76 +145,79 @@ def _write_df(cls, df: DF_T, fp: Path, **kwargs): fp.parent.mkdir(exist_ok=True, parents=True) if isinstance(df, pl.LazyFrame): - df.collect().write_parquet(fp, use_pyarrow=cls.WRITE_USE_PYARROW) + df.collect().write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) else: - df.write_parquet(fp, use_pyarrow=cls.WRITE_USE_PYARROW) + df.write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) -def _get_flat_col_dtype(self, col: str) -> pl.DataType: - """Gets the appropriate minimal dtype for the given flat representation column string.""" +def get_smallest_valid_uint_type(num: int | float | pl.Expr) -> pl.DataType: + """Returns the smallest valid unsigned integral type for an ID variable with `num` unique options. + + Args: + num: The number of IDs that must be uniquely expressed. + + Raises: + ValueError: If there is no unsigned int type big enough to express the passed number of ID + variables. + + Examples: + >>> import polars as pl + >>> Dataset.get_smallest_valid_uint_type(num=1) + UInt8 + >>> Dataset.get_smallest_valid_uint_type(num=2**8-1) + UInt16 + >>> Dataset.get_smallest_valid_uint_type(num=2**16-1) + UInt32 + >>> Dataset.get_smallest_valid_uint_type(num=2**32-1) + UInt64 + >>> Dataset.get_smallest_valid_uint_type(num=2**64-1) + Traceback (most recent call last): + ... + ValueError: Value is too large to be expressed as an int! + """ + if num >= (2**64) - 1: + raise ValueError("Value is too large to be expressed as an int!") + if num >= (2**32) - 1: + return pl.UInt64 + elif num >= (2**16) - 1: + return pl.UInt32 + elif num >= (2**8) - 1: + return pl.UInt16 + else: + return pl.UInt8 - parts = col.split("/") - if len(parts) < 4: - raise ValueError(f"Malformed column {col}. 
Should be temporal/measurement/feature/agg") - temp, meas = parts[0], parts[1] - agg = parts[-1] - feature = "/".join(parts[2:-1]) +def _get_flat_col_dtype(col: str) -> pl.DataType: + """Gets the appropriate minimal dtype for the given flat representation column string.""" - cfg = self.measurement_configs[meas] + code_type, code, agg = _parse_flat_feature_column(col) match agg: - case "sum" | "sum_sqd" | "min" | "max" | "value": + case "sum" | "sum_sqd" | "min" | "max" | "value" | "first": return pl.Float32 case "present": return pl.Boolean case "count" | "has_values_count": - # config.observation_rate_over_cases = total_observed / total_possible - # config.observation_rate_per_case = raw_total_observed / total_observed - - match temp: - case "static": - n_possible = len(self.subject_ids) - case str() | "dynamic": - n_possible = sum(self.n_events_per_subject.values()) - case _: - raise ValueError( - f"Column name {col} malformed: Temporality {temp} not in `static` or `dynamic" - ) - - if cfg.vocabulary is None: - observation_frequency = cfg.observation_rate_per_case * cfg.observation_rate_over_cases - else: - if feature not in cfg.vocabulary.idxmap: - raise ValueError(f"Column name {col} malformed: Feature {feature} not in {meas}!") - else: - observation_frequency = cfg.vocabulary.obs_frequencies[cfg.vocabulary[feature]] - - total_observations = int(np.ceil(observation_frequency * n_possible)) - - return self.get_smallest_valid_uint_type(total_observations) + return pl.UInt32 + # TODO: reduce the dtype to the smallest possible unsigned int type + # return get_smallest_valid_uint_type(total_observations) case _: raise ValueError(f"Column name {col} malformed!") def _normalize_flat_rep_df_cols( - flat_df: DF_T, feature_columns: list[str] | None = None, set_count_0_to_null: bool = False + flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False ) -> DF_T: - if feature_columns is None: - feature_columns = [x for x in flat_df.columns if x not in ("subject_id", "timestamp")] - cols_to_add = set() - cols_to_retype = set(feature_columns) - else: - cols_to_add = set(feature_columns) - set(flat_df.columns) - cols_to_retype = set(feature_columns).intersection(set(flat_df.columns)) + cols_to_add = set(feature_columns) - set(flat_df.columns) + cols_to_retype = set(feature_columns).intersection(set(flat_df.columns)) cols_to_add = [(c, _get_flat_col_dtype(c)) for c in cols_to_add] cols_to_retype = [(c, _get_flat_col_dtype(c)) for c in cols_to_retype] if "timestamp" in flat_df.columns: - key_cols = ["subject_id", "timestamp"] + key_cols = ["patient_id", "timestamp"] else: - key_cols = ["subject_id"] + key_cols = ["patient_id"] flat_df = flat_df.with_columns( *[pl.lit(None, dtype=dt).alias(c) for c, dt in cols_to_add], @@ -236,7 +251,7 @@ def _summarize_dynamic_measurements( valid_measures = {} for feat_col in feature_columns: - temp, meas, feat, _ = self._parse_flat_feature_column(feat_col) + temp, meas, feat = self._parse_flat_feature_column(feat_col) if temp != "dynamic": continue @@ -419,117 +434,182 @@ def f(c: str) -> str: def _get_flat_ts_rep( - self, feature_columns: list[str], **kwargs, ) -> pl.LazyFrame: """Produce raw representation for dynamic data.""" + return _normalize_flat_rep_df_cols( _summarize_dynamic_measurements(feature_columns, **kwargs) - .drop("event_id") .sort(by=["subject_id", "timestamp"]) .collect() .lazy(), - [c for c in feature_columns if not c.startswith("static/")], + [c for c in feature_columns if c.startswith("dynamic")], ) # The above .collect().lazy() 
shouldn't be necessary but it appears to be for some reason... +def _parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: + parts = c.split("/") + if len(parts) < 3: + raise ValueError(f"Column {c} is not a valid flat feature column!") + return (parts[0], "/".join(parts[1:-1]), parts[-1]) + + +def _summarize_static_measurements( + feature_columns: list[str], + df: DF_T, +) -> pl.LazyFrame: + static_present = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("present")] + static_first = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("first")] + + static_first_codes = [_parse_flat_feature_column(c)[1] for c in static_first] + code_subset = df.filter(pl.col("code").is_in(static_first_codes)) + first_code_subset = code_subset.groupby(pl.col("patient_id")).first().collect() + static_value_pivot_df = first_code_subset.pivot( + index=["patient_id"], columns=["code"], values=["numerical_value"], aggregate_function=None + ) + # rename code to feature name + remap_cols = { + input_name: output_name + for input_name, output_name in zip(static_first_codes, static_first) + if input_name in static_value_pivot_df.columns + } + static_value_pivot_df = static_value_pivot_df.select( + *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ) + # pivot can be faster: https://stackoverflow.com/questions/73522017/replacing-a-pivot-with-a-lazy-groupby-operation # noqa: E501 + # maybe cast with .cast(pl.Float32)) + + static_present_codes = [_parse_flat_feature_column(c)[1] for c in static_present] + static_present_pivot_df = ( + df.select(*["patient_id", "code"]) + .filter(pl.col("code").is_in(static_present_codes)) + .with_columns(pl.lit(True).alias("__indicator")) + .collect() + .pivot( + index=["patient_id"], + columns=["code"], + values="__indicator", + aggregate_function=None, + ) + ) + remap_cols = { + input_name: output_name + for input_name, output_name in zip(static_present_codes, static_present) + if input_name in static_present_pivot_df.columns + } + # rename columns to final feature names + static_present_pivot_df = static_present_pivot_df.select( + *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ) + return pl.concat([static_value_pivot_df, static_present_pivot_df], how="align") + + def _get_flat_static_rep( - self, feature_columns: list[str], - **kwargs, + shard_df: DF_T, ) -> pl.LazyFrame: """Produce raw representation for static data.""" - static_features = [c for c in feature_columns if c.startswith("static/")] - return self._normalize_flat_rep_df_cols( - self._summarize_static_measurements(static_features, **kwargs).collect().lazy(), + static_features = [c for c in feature_columns if c.startswith("STATIC_")] + static_measurements = _summarize_static_measurements(static_features, df=shard_df) + # fill up missing feature columns with nulls + normalized_measurements = _normalize_flat_rep_df_cols( + static_measurements, static_features, set_count_0_to_null=False, ) + return normalized_measurements -def _get_flat_rep_feature_cols( - self, - feature_inclusion_frequency: float | dict[str, float] | None = None, - window_sizes: list[str] | None = None, - include_only_measurements: set[str] | None = None, -) -> list[str]: - """ - process aggregations and select which columns get which aggregations - 1. static - 1. code & no numerical_values - 2. numerical_values - 2. dynamic - 1. codes -> aggs applied to all codes - 2. 
numerical_values -> continuous aggs +def evaluate_code_properties(df, cfg): + """Evaluates and categorizes each code in a dataframe based on its timestamp presence and numerical + values. + + This function categorizes codes as 'dynamic' or 'static' based on the presence + of timestamps, and as 'continuous' or 'categorical' based on the presence of + numerical values. A code is considered: + - Dynamic if the ratio of present timestamps to its total occurrences exceeds + the configured dynamic threshold. + - Continuous if the ratio of non-null numerical values to total occurrences + exceeds the numerical value threshold + and there is more than one unique numerical value. + + Parameters: + - df (DataFrame): The dataframe containing the codes and their attributes. + - cfg (dict): Configuration dictionary with keys 'dynamic_threshold', 'numerical_value_threshold', + and 'min_code_inclusion_frequency' to determine the thresholds for categorizing codes. + + Returns: + - dict: A dictionary with code as keys and their properties (e.g., 'dynamic_continuous') as values. + Codes with total occurrences less than 'min_code_inclusion_frequency' are excluded. + + Examples: + >>> import polars as pl + >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], + ... 'timestamp': [None, '2021-01-01', None, '2021-01-02', '2021-01-03', '2021-01-04', None], + ... 'numerical_value': [1, None, 2, 2, None, None, 3]} + >>> df = pl.DataFrame(data) + >>> cfg = {'dynamic_threshold': 0.5, 'numerical_value_threshold': 0.5, 'min_code_inclusion_frequency': 1} + >>> evaluate_code_properties(df, cfg) + {'A': 'static_categorical', 'B': 'dynamic_continuous', 'C': 'dynamic_categorical'} """ - feature_inclusion_frequency, include_only_measurements = self._resolve_flat_rep_cache_params( - feature_inclusion_frequency, include_only_measurements - ) - feature_columns = [] - for m, cfg in self.measurement_configs.items(): - if m not in include_only_measurements: + code_properties = OrderedDict() + for code in df.select(pl.col("code").unique()).collect().to_series(): + # Determine total count, timestamp count, and numerical count + code_data = df.filter(pl.col("code") == code) + total_count = code_data.select(pl.count("code")).collect().item() + if total_count < cfg["min_code_inclusion_frequency"]: continue - features = None - if cfg.vocabulary is not None: - vocab = copy.deepcopy(cfg.vocabulary) - if feature_inclusion_frequency is not None: - m_freq = feature_inclusion_frequency[m] - vocab.filter(total_observations=None, min_valid_element_freq=m_freq) - features = vocab.vocabulary - # elif cfg.modality == DataModality.UNIVARIATE_REGRESSION: - # features = [m] - else: - raise ValueError(f"Config with modality {cfg.modality} should have a Vocabulary!") - temps = [] - aggs = [] - - # match cfg.temporality: - # case TemporalityType.STATIC: - # temps = [str(cfg.temporality)] - # match cfg.modality: - # case DataModality.UNIVARIATE_REGRESSION: - # aggs = ["value"] - # case DataModality.SINGLE_LABEL_CLASSIFICATION: - # aggs = ["present"] - # case _: - # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") - # case TemporalityType.FUNCTIONAL_TIME_DEPENDENT if window_sizes is None: - # temps = [str(cfg.temporality)] - # match cfg.modality: - # case DataModality.UNIVARIATE_REGRESSION: - # aggs = ["value"] - # case DataModality.SINGLE_LABEL_CLASSIFICATION: - # aggs = ["present"] - # case _: - # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") - # case TemporalityType.FUNCTIONAL_TIME_DEPENDENT if window_sizes is 
not None: - # temps = window_sizes - # match cfg.modality: - # case DataModality.UNIVARIATE_REGRESSION: - # aggs = ["count", "has_values_count", "sum", "sum_sqd", "min", "max"] - # case DataModality.SINGLE_LABEL_CLASSIFICATION: - # aggs = ["count"] - # case _: - # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") - # case TemporalityType.DYNAMIC: - # temps = [str(cfg.temporality)] if window_sizes is None else window_sizes - # match cfg.modality: - # case DataModality.UNIVARIATE_REGRESSION | DataModality.MULTIVARIATE_REGRESSION: - # aggs = ["count", "has_values_count", "sum", "sum_sqd", "min", "max"] - # case DataModality.MULTI_LABEL_CLASSIFICATION: - # aggs = ["count"] - # case _: - # raise ValueError(f"{cfg.modality} invalid with {cfg.temporality}") - - for temp in temps: - for feature in features: - for agg in aggs: - feature_columns.append(f"{temp}/{m}/{feature}/{agg}") - - return sorted(feature_columns) + timestamp_count = code_data.select(pl.col("timestamp").count()).collect().item() + numerical_count = code_data.select(pl.col("numerical_value").count()).collect().item() + + # Determine dynamic vs static + is_dynamic = (timestamp_count / total_count) > cfg["dynamic_threshold"] + + # Determine categorical vs continuous + is_continuous = (numerical_count / total_count) > cfg[ + "numerical_value_threshold" + ] and code_data.select(pl.col("numerical_value").n_unique()).collect().item() > 1 + + match (is_dynamic, is_continuous): + case (False, False): + code_properties[code] = CodeType.STATIC_CATEGORICAL + case (False, True): + code_properties[code] = CodeType.STATIC_CONTINUOUS + case (True, False): + code_properties[code] = CodeType.DYNAMIC_CATEGORICAL + case (True, True): + code_properties[code] = CodeType.DYNAMIC_CONTINUOUS + + return code_properties + + +def get_code_column(code: str, code_type: CodeType, aggs: Sequence[str]): + """Get the column name for a given code and aggregation type.""" + prefix = f"{code_type.value}/{code}" + if code_type == CodeType.STATIC_CATEGORICAL: + return [f"{prefix}/present"] + elif code_type == CodeType.DYNAMIC_CATEGORICAL: + valid_aggs = [agg[4:] for agg in aggs if agg.startswith("code")] + return [f"{prefix}/{agg}" for agg in valid_aggs] + elif code_type == CodeType.STATIC_CONTINUOUS: + return [f"{prefix}/present", f"{prefix}/first"] + elif code_type == CodeType.DYNAMIC_CONTINUOUS: + valid_aggs = [agg[5:] for agg in aggs if agg.startswith("value")] + return [f"{prefix}/{agg}" for agg in valid_aggs] + else: + raise ValueError(f"Invalid code type: {code_type}") + + +def _get_flat_rep_feature_cols(cfg, split_to_shard_df) -> list[str]: + feature_columns = [] + all_train_data = pl.concat(split_to_shard_df["train"]) + code_properties = evaluate_code_properties(all_train_data, cfg) + for code, code_type in code_properties.items(): + feature_columns.extend(get_code_column(code, code_type, cfg.aggs)) + return feature_columns, code_properties def cache_flat_representation( @@ -586,6 +666,7 @@ def cache_flat_representation( # for every dataset split, create shards to output flat representations to sp_subjects = {} + sp_dfs = {} for split_name, split_df in split_to_df.items(): split_patient_ids = ( split_df.select(pl.col("patient_id").cast(pl.Int32).unique()).collect().to_series().to_list() @@ -593,33 +674,34 @@ def cache_flat_representation( print(len(split_patient_ids)) if cfg.n_patients_per_sub_shard is None: sp_subjects[split_name] = split_patient_ids + sp_dfs[split_name] = [split_df] else: shuffled_patient_ids = 
rng.permutation(split_patient_ids) num_shards = max(len(split_patient_ids) // cfg.n_patients_per_sub_shard, 1) # must be 1 or larger sharded_patient_ids = np.array_split(shuffled_patient_ids, num_shards) sp_subjects[split_name] = [shard.tolist() for shard in sharded_patient_ids] + sp_dfs[split_name] = [ + split_df.filter(pl.col("patient_id").is_in(set(shard))) for shard in sharded_patient_ids + ] # store params in json file params_fp = flat_dir / "params.json" - params = store_params_json(params_fp, cfg, sp_subjects) + store_params_json(params_fp, cfg, sp_subjects) # 0. Identify Output Columns # We set window_sizes to None here because we want to get the feature column names for the raw flat # representation, not the summarized one. - feature_columns = _get_flat_rep_feature_cols( - min_code_inclusion_frequency=cfg.min_code_inclusion_frequency, - window_sizes=None, - ) + feature_columns, code_properties = _get_flat_rep_feature_cols(cfg, sp_dfs) # 1. Produce static representation static_subdir = flat_dir / "static" static_dfs = {} - for sp, subjects in tqdm(list(params["patient_shard_by_split"].items()), desc="Flattening Splits"): + for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): static_dfs[sp] = [] sp_dir = static_subdir / sp - for i, subjects_list in enumerate(tqdm(subjects, desc="Subject chunks", leave=False)): + for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): fp = sp_dir / f"{i}.parquet" static_dfs[sp].append(fp) if fp.exists(): @@ -630,7 +712,7 @@ def cache_flat_representation( df = _get_flat_static_rep( feature_columns=feature_columns, - include_only_subjects=subjects_list, + shard_df=shard_df, ) _write_df(df, fp, do_overwrite=cfg.do_overwrite) @@ -639,11 +721,11 @@ def cache_flat_representation( ts_subdir = flat_dir / "at_ts" ts_dfs = {} - for sp, subjects in tqdm(list(params["patient_shard_by_split"].items()), desc="Flattening Splits"): + for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): ts_dfs[sp] = [] sp_dir = ts_subdir / sp - for i, subjects_list in enumerate(tqdm(subjects, desc="Subject chunks", leave=False)): + for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): fp = sp_dir / f"{i}.parquet" ts_dfs[sp].append(fp) if fp.exists(): @@ -654,7 +736,7 @@ def cache_flat_representation( df = _get_flat_ts_rep( feature_columns=feature_columns, - include_only_subjects=subjects_list, + shard_df=shard_df, ) _write_df(df, fp, do_overwrite=cfg.do_overwrite) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index f918796..0c72576 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -8,8 +8,8 @@ from pathlib import Path import polars as pl +from hydra import compose, initialize from loguru import logger -from omegaconf import OmegaConf from MEDS_tabular_automl.tabularize import cache_flat_representation @@ -130,7 +130,9 @@ def test_tabularize(): "seed": 1, "hydra.verbose": True, } - cfg = OmegaConf.create(tabularize_config_kwargs) - logger.info("caching flat representation of MEDS data") - cache_flat_representation(cfg) + with initialize(version_base=None, config_path="../configs/"): # path to config.yaml + overrides = [f"{k}={v}" for k, v in tabularize_config_kwargs.items()] + cfg = compose(config_name="tabularize", overrides=overrides) # config.yaml + logger.info("caching flat representation of MEDS data") + cache_flat_representation(cfg) From 19f0f4e49a9ad7e9396b6b3a9b394a8153c40219 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole 
Date: Sun, 26 May 2024 19:46:51 +0000 Subject: [PATCH 003/106] added docstrings and a smiple test case checking the number of subjects is correct when producing static representations --- configs/tabularize.yaml | 3 - .../generate_static_features.py | 112 ++++ .../generate_ts_features.py | 230 +++++++ src/MEDS_tabular_automl/tabularize.py | 595 ++---------------- src/MEDS_tabular_automl/utils.py | 269 ++++++++ 5 files changed, 667 insertions(+), 542 deletions(-) create mode 100644 src/MEDS_tabular_automl/generate_static_features.py create mode 100644 src/MEDS_tabular_automl/generate_ts_features.py create mode 100644 src/MEDS_tabular_automl/utils.py diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index a92f707..17b8ab5 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -20,9 +20,6 @@ aggs: - "value/last" - "value/slope" - "value/intercept" - - "value/residual/sum" - - "value/residual/sum_sqd" -numeric_value_impute_strategy: "drop" dynamic_threshold: 0.01 numerical_value_threshold: 0.1 diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py new file mode 100644 index 0000000..207d537 --- /dev/null +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -0,0 +1,112 @@ +"""This module provides functions for generating static representations of patient data for use in automated +machine learning models. It includes functionality to summarize measurements based on static features and then +transform them into a tabular format suitable for analysis. The module leverages the polars library for +efficient data manipulation. + +Functions: +- _summarize_static_measurements: Summarizes static measurements from a given DataFrame. +- get_flat_static_rep: Produces a tabular representation of static data features. +""" + +import polars as pl + +from MEDS_tabular_automl.utils import ( + DF_T, + _normalize_flat_rep_df_cols, + _parse_flat_feature_column, +) + + +def _summarize_static_measurements( + feature_columns: list[str], + df: DF_T, +) -> pl.LazyFrame: + """Aggregates static measurements for feature columns that are marked as 'present' or 'first'. + + Parameters: + - feature_columns (list[str]): List of feature column identifiers that are specifically marked + for staticanalysis. + - df (DF_T): Data frame from which features will be extracted and summarized. + + Returns: + - pl.LazyFrame: A LazyFrame containing the summarized data pivoted by 'patient_id' + for each static feature. + + This function first filters for features that need to be recorded as the first occurrence + or simply as present, then performs a pivot to reshape the data for each patient, providing + a tabular format where each row represents a patient and each column represents a static feature. 
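+
+    Example (illustrative sketch only, not a doctest; the feature names are hypothetical but follow
+    the ``{code_type}/{code}/{agg}`` naming used for flat feature columns, and the patient ID is
+    taken from the test fixtures):
+
+        feature_columns = [
+            "STATIC_CONTINUOUS/HEIGHT/first",
+            "STATIC_CATEGORICAL/EYE_COLOR//BROWN/present",
+        ]
+        # After the two pivots are aligned, the result has one row per ``patient_id`` and one
+        # column per requested static feature; note that in this version both the 'first' and
+        # 'present' pivots cast their values to ``pl.Boolean``:
+        #
+        #   patient_id | STATIC_CONTINUOUS/HEIGHT/first | STATIC_CATEGORICAL/EYE_COLOR//BROWN/present
+        #   239684     | true                           | true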
+ """ + static_present = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("present")] + static_first = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("first")] + + # Handling 'first' static values + static_first_codes = [_parse_flat_feature_column(c)[1] for c in static_first] + code_subset = df.filter(pl.col("code").is_in(static_first_codes)) + first_code_subset = code_subset.groupby(pl.col("patient_id")).first().collect() + static_value_pivot_df = first_code_subset.pivot( + index=["patient_id"], columns=["code"], values=["numerical_value"], aggregate_function=None + ) + # rename code to feature name + remap_cols = { + input_name: output_name + for input_name, output_name in zip(static_first_codes, static_first) + if input_name in static_value_pivot_df.columns + } + static_value_pivot_df = static_value_pivot_df.select( + *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ) + # pivot can be faster: https://stackoverflow.com/questions/73522017/replacing-a-pivot-with-a-lazy-groupby-operation # noqa: E501 + # TODO: consider casting with .cast(pl.Float32)) + + # Handling 'present' static indicators + static_present_codes = [_parse_flat_feature_column(c)[1] for c in static_present] + static_present_pivot_df = ( + df.select(*["patient_id", "code"]) + .filter(pl.col("code").is_in(static_present_codes)) + .with_columns(pl.lit(True).alias("__indicator")) + .collect() + .pivot( + index=["patient_id"], + columns=["code"], + values="__indicator", + aggregate_function=None, + ) + ) + remap_cols = { + input_name: output_name + for input_name, output_name in zip(static_present_codes, static_present) + if input_name in static_present_pivot_df.columns + } + # rename columns to final feature names + static_present_pivot_df = static_present_pivot_df.select( + *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ) + return pl.concat([static_value_pivot_df, static_present_pivot_df], how="align") + + +def get_flat_static_rep( + feature_columns: list[str], + shard_df: DF_T, +) -> pl.LazyFrame: + """Produces a raw representation for static data from a specified shard DataFrame. + + Parameters: + - feature_columns (list[str]): List of feature columns to include in the static representation. + - shard_df (DF_T): The shard DataFrame containing patient data. + + Returns: + - pl.LazyFrame: A LazyFrame that includes all static features for the data provided. + + This function selects the appropriate static features, summarizes them using + _summarize_static_measurements, and then normalizes the resulting data to ensure it is + suitable for further analysis or machine learning tasks. + """ + static_features = [c for c in feature_columns if c.startswith("STATIC_")] + static_measurements = _summarize_static_measurements(static_features, df=shard_df) + # fill up missing feature columns with nulls + normalized_measurements = _normalize_flat_rep_df_cols( + static_measurements, + static_features, + set_count_0_to_null=False, + ) + return normalized_measurements diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py new file mode 100644 index 0000000..be2c089 --- /dev/null +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -0,0 +1,230 @@ +"""WIP. + +This file will be used to generate time series features from the raw data. 
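+
+It currently carries over the dynamic-measurement summarization and windowed-aggregation helpers
+from the original monolithic ``tabularize.py``; ``_summarize_dynamic_measurements`` still takes
+``self`` and reads attributes such as ``self.dynamic_measurements_df`` and
+``self.measurement_configs``, so these helpers are not yet wired to the MEDS shard DataFrames
+used by the rest of this package.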
+""" +from collections.abc import Callable +from pathlib import Path + +import numpy as np +import polars as pl +import polars.selectors as cs + +from MEDS_tabular_automl.utils import ( + DF_T, + _normalize_flat_rep_df_cols, + _parse_flat_feature_column, +) + + +def _summarize_dynamic_measurements( + self, + feature_columns: list[str], + include_only_subjects: set[int] | None = None, +) -> pl.LazyFrame: + if include_only_subjects is None: + df = self.dynamic_measurements_df + else: + df = self.dynamic_measurements_df.join( + self.events_df.filter(pl.col("subject_id").is_in(list(include_only_subjects))).select("event_id"), + on="event_id", + how="inner", + ) + + valid_measures = {} + for feat_col in feature_columns: + temp, meas, feat = _parse_flat_feature_column(feat_col) + + if temp != "dynamic": + continue + + if meas not in valid_measures: + valid_measures[meas] = set() + valid_measures[meas].add(feat) + + out_dfs = {} + for m, allowed_vocab in valid_measures.items(): + cfg = self.measurement_configs[m] + + total_observations = int( + np.ceil( + cfg.observation_rate_per_case + * cfg.observation_rate_over_cases + * sum(self.n_events_per_subject.values()) + ) + ) + + count_type = self.get_smallest_valid_uint_type(total_observations) + + if cfg.modality == "univariate_regression" and cfg.vocabulary is None: + prefix = f"dynamic/{m}/{m}" + + key_col = pl.col(m) + val_col = pl.col(m).drop_nans().cast(pl.Float32) + + out_dfs[m] = ( + df.lazy() + .select("measurement_id", "event_id", m) + .filter(pl.col(m).is_not_null()) + .groupby("event_id") + .agg( + pl.col(m).is_not_null().sum().cast(count_type).alias(f"{prefix}/count"), + ( + (pl.col(m).is_not_nan() & pl.col(m).is_not_null()) + .sum() + .cast(count_type) + .alias(f"{prefix}/has_values_count") + ), + val_col.sum().alias(f"{prefix}/sum"), + (val_col**2).sum().alias(f"{prefix}/sum_sqd"), + val_col.min().alias(f"{prefix}/min"), + val_col.max().alias(f"{prefix}/max"), + ) + ) + continue + elif cfg.modality == "multivariate_regression": + column_cols = [m, m] + values_cols = [m, cfg.values_column] + key_prefix = f"{m}_{m}_" + val_prefix = f"{cfg.values_column}_{m}_" + + key_col = cs.starts_with(key_prefix) + val_col = cs.starts_with(val_prefix).drop_nans().cast(pl.Float32) + + aggs = [ + key_col.is_not_null() + .sum() + .cast(count_type) + .map_alias(lambda c: f"dynamic/{m}/{c.replace(key_prefix, '')}/count"), + ( + (cs.starts_with(val_prefix).is_not_null() & cs.starts_with(val_prefix).is_not_nan()) + .sum() + .map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/has_values_count") + ), + val_col.sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum"), + (val_col**2).sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum_sqd"), + val_col.min().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/min"), + val_col.max().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/max"), + ] + else: + column_cols = [m] + values_cols = [m] + aggs = [ + pl.all().is_not_null().sum().cast(count_type).map_alias(lambda c: f"dynamic/{m}/{c}/count") + ] + + ID_cols = ["measurement_id", "event_id"] + out_dfs[m] = ( + df.select(*ID_cols, *set(column_cols + values_cols)) + .filter(pl.col(m).is_in(allowed_vocab)) + .pivot( + index=ID_cols, + columns=column_cols, + values=values_cols, + aggregate_function=None, + ) + .lazy() + .drop("measurement_id") + .groupby("event_id") + .agg(*aggs) + ) + + return pl.concat(list(out_dfs.values()), how="align") + + +def _summarize_over_window(df: DF_T, window_size: str) 
-> pl.LazyFrame: + """Apply aggregations to the raw representation over a window size.""" + if isinstance(df, Path): + df = pl.scan_parquet(df) + + def time_aggd_col_alias_fntr(new_agg: str | None = None) -> Callable[[str], str]: + if new_agg is None: + + def f(c: str) -> str: + return "/".join([window_size] + c.split("/")[1:]) + + else: + + def f(c: str) -> str: + return "/".join([window_size] + c.split("/")[1:-1] + [new_agg]) + + return f + + # Columns to convert to counts: + present_indicator_cols = cs.ends_with("/present") + + # Columns to convert to value aggregations: + value_cols = cs.ends_with("/value") + + # Columns to aggregate via other operations + cnt_cols = (cs.ends_with("/count") | cs.ends_with("/has_values_count")).fill_null(0) + + cols_to_sum = cs.ends_with("/sum") | cs.ends_with("/sum_sqd") + cols_to_min = cs.ends_with("/min") + cols_to_max = cs.ends_with("/max") + + if window_size == "FULL": + df = df.groupby("subject_id").agg( + "timestamp", + # present to counts + present_indicator_cols.cumsum().map_alias(time_aggd_col_alias_fntr("count")), + # values to stats + value_cols.is_not_null().cumsum().map_alias(time_aggd_col_alias_fntr("count")), + ( + (value_cols.is_not_null() & value_cols.is_not_nan()) + .cumsum() + .map_alias(time_aggd_col_alias_fntr("has_values_count")) + ), + value_cols.cumsum().map_alias(time_aggd_col_alias_fntr("sum")), + (value_cols**2).cumsum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), + value_cols.cummin().map_alias(time_aggd_col_alias_fntr("min")), + value_cols.cummax().map_alias(time_aggd_col_alias_fntr("max")), + # Raw aggregations + cnt_cols.cumsum().map_alias(time_aggd_col_alias_fntr()), + cols_to_sum.cumsum().map_alias(time_aggd_col_alias_fntr()), + cols_to_min.cummin().map_alias(time_aggd_col_alias_fntr()), + cols_to_max.cummax().map_alias(time_aggd_col_alias_fntr()), + ) + df = df.explode(*[c for c in df.columns if c != "subject_id"]) + else: + df = df.groupby_rolling( + index_column="timestamp", + by="subject_id", + period=window_size, + ).agg( + # present to counts + present_indicator_cols.sum().map_alias(time_aggd_col_alias_fntr("count")), + # values to stats + value_cols.is_not_null().sum().map_alias(time_aggd_col_alias_fntr("count")), + ( + (value_cols.is_not_null() & value_cols.is_not_nan()) + .sum() + .map_alias(time_aggd_col_alias_fntr("has_values_count")) + ), + value_cols.sum().map_alias(time_aggd_col_alias_fntr("sum")), + (value_cols**2).sum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), + value_cols.min().map_alias(time_aggd_col_alias_fntr("min")), + value_cols.max().map_alias(time_aggd_col_alias_fntr("max")), + # Raw aggregations + cnt_cols.sum().map_alias(time_aggd_col_alias_fntr()), + cols_to_sum.sum().map_alias(time_aggd_col_alias_fntr()), + cols_to_min.min().map_alias(time_aggd_col_alias_fntr()), + cols_to_max.max().map_alias(time_aggd_col_alias_fntr()), + ) + + return _normalize_flat_rep_df_cols(df, set_count_0_to_null=True) + + +def get_flat_ts_rep( + feature_columns: list[str], + **kwargs, +) -> pl.LazyFrame: + """Produce raw representation for dynamic data.""" + + return _normalize_flat_rep_df_cols( + _summarize_dynamic_measurements(feature_columns, **kwargs) + .sort(by=["subject_id", "timestamp"]) + .collect() + .lazy(), + [c for c in feature_columns if c.startswith("dynamic")], + ) + # The above .collect().lazy() shouldn't be necessary but it appears to be for some reason... 
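A note on the naming convention used by _summarize_over_window above: the closures returned by time_aggd_col_alias_fntr re-prefix each raw flat column with the window size and, when a new aggregation name is supplied, swap out the trailing aggregation segment. Below is a minimal standalone sketch of that string logic; the helper name, codes, and window sizes are illustrative only and not part of the patch.

# Sketch (assuming the same "temporality/code/agg" column format as above):
# the leading segment (e.g. "dynamic") is replaced by the window size, and the
# trailing aggregation segment may optionally be swapped for a new one.
def window_col_alias(col: str, window_size: str, new_agg: str | None = None) -> str:
    parts = col.split("/")
    if new_agg is None:
        return "/".join([window_size] + parts[1:])
    return "/".join([window_size] + parts[1:-1] + [new_agg])


assert window_col_alias("dynamic/HR/value/sum", "7d") == "7d/HR/value/sum"
assert window_col_alias("dynamic/HR/value/sum_sqd", "30d") == "30d/HR/value/sum_sqd"
assert window_col_alias("dynamic/lab/HR/value", "7d", "count") == "7d/lab/HR/count"

In other words, every summarized column is re-rooted under the window size it was computed over, optionally gaining a new aggregation suffix (for example, the "/present" indicator columns become windowed counts).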
diff --git a/src/MEDS_tabular_automl/tabularize.py b/src/MEDS_tabular_automl/tabularize.py index ccb2a00..83a0d19 100644 --- a/src/MEDS_tabular_automl/tabularize.py +++ b/src/MEDS_tabular_automl/tabularize.py @@ -6,30 +6,17 @@ DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. """ -import enum import json -from collections import OrderedDict -from collections.abc import Callable, Mapping, Sequence +from collections.abc import Mapping, Sequence from pathlib import Path import numpy as np import polars as pl -import polars.selectors as cs from omegaconf import DictConfig from tqdm.auto import tqdm - -class CodeType(enum.Enum): - """Enum for the type of code.""" - - STATIC_CATEGORICAL = "STATIC_CATEGORICAL" - DYNAMIC_CATEGORICAL = "DYNAMIC_CATEGORICAL" - STATIC_CONTINUOUS = "STATIC_CONTINUOUS" - DYNAMIC_CONTINUOUS = "DYNAMIC_CONTINUOUS" - - -DF_T = pl.DataFrame -WRITE_USE_PYARROW = True +from MEDS_tabular_automl.generate_static_features import get_flat_static_rep +from MEDS_tabular_automl.utils import get_flat_rep_feature_cols, write_df def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: @@ -135,483 +122,6 @@ def store_params_json(params_fp: Path, cfg: DictConfig, sp_subjects: Mapping[str return params -def _write_df(df: DF_T, fp: Path, **kwargs): - """Write shard to disk.""" - do_overwrite = kwargs.get("do_overwrite", False) - - if not do_overwrite and fp.is_file(): - raise FileExistsError(f"{fp} exists and do_overwrite is {do_overwrite}!") - - fp.parent.mkdir(exist_ok=True, parents=True) - - if isinstance(df, pl.LazyFrame): - df.collect().write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) - else: - df.write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) - - -def get_smallest_valid_uint_type(num: int | float | pl.Expr) -> pl.DataType: - """Returns the smallest valid unsigned integral type for an ID variable with `num` unique options. - - Args: - num: The number of IDs that must be uniquely expressed. - - Raises: - ValueError: If there is no unsigned int type big enough to express the passed number of ID - variables. - - Examples: - >>> import polars as pl - >>> Dataset.get_smallest_valid_uint_type(num=1) - UInt8 - >>> Dataset.get_smallest_valid_uint_type(num=2**8-1) - UInt16 - >>> Dataset.get_smallest_valid_uint_type(num=2**16-1) - UInt32 - >>> Dataset.get_smallest_valid_uint_type(num=2**32-1) - UInt64 - >>> Dataset.get_smallest_valid_uint_type(num=2**64-1) - Traceback (most recent call last): - ... - ValueError: Value is too large to be expressed as an int! 
- """ - if num >= (2**64) - 1: - raise ValueError("Value is too large to be expressed as an int!") - if num >= (2**32) - 1: - return pl.UInt64 - elif num >= (2**16) - 1: - return pl.UInt32 - elif num >= (2**8) - 1: - return pl.UInt16 - else: - return pl.UInt8 - - -def _get_flat_col_dtype(col: str) -> pl.DataType: - """Gets the appropriate minimal dtype for the given flat representation column string.""" - - code_type, code, agg = _parse_flat_feature_column(col) - - match agg: - case "sum" | "sum_sqd" | "min" | "max" | "value" | "first": - return pl.Float32 - case "present": - return pl.Boolean - case "count" | "has_values_count": - return pl.UInt32 - # TODO: reduce the dtype to the smallest possible unsigned int type - # return get_smallest_valid_uint_type(total_observations) - case _: - raise ValueError(f"Column name {col} malformed!") - - -def _normalize_flat_rep_df_cols( - flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False -) -> DF_T: - cols_to_add = set(feature_columns) - set(flat_df.columns) - cols_to_retype = set(feature_columns).intersection(set(flat_df.columns)) - - cols_to_add = [(c, _get_flat_col_dtype(c)) for c in cols_to_add] - cols_to_retype = [(c, _get_flat_col_dtype(c)) for c in cols_to_retype] - - if "timestamp" in flat_df.columns: - key_cols = ["patient_id", "timestamp"] - else: - key_cols = ["patient_id"] - - flat_df = flat_df.with_columns( - *[pl.lit(None, dtype=dt).alias(c) for c, dt in cols_to_add], - *[pl.col(c).cast(dt).alias(c) for c, dt in cols_to_retype], - ).select(*key_cols, *feature_columns) - - if not set_count_0_to_null: - return flat_df - - flat_df = flat_df.collect() - - flat_df = flat_df.with_columns( - pl.when(cs.ends_with("count") != 0).then(cs.ends_with("count")).keep_name() - ).lazy() - return flat_df - - -def _summarize_dynamic_measurements( - self, - feature_columns: list[str], - include_only_subjects: set[int] | None = None, -) -> pl.LazyFrame: - if include_only_subjects is None: - df = self.dynamic_measurements_df - else: - df = self.dynamic_measurements_df.join( - self.events_df.filter(pl.col("subject_id").is_in(list(include_only_subjects))).select("event_id"), - on="event_id", - how="inner", - ) - - valid_measures = {} - for feat_col in feature_columns: - temp, meas, feat = self._parse_flat_feature_column(feat_col) - - if temp != "dynamic": - continue - - if meas not in valid_measures: - valid_measures[meas] = set() - valid_measures[meas].add(feat) - - out_dfs = {} - for m, allowed_vocab in valid_measures.items(): - cfg = self.measurement_configs[m] - - total_observations = int( - np.ceil( - cfg.observation_rate_per_case - * cfg.observation_rate_over_cases - * sum(self.n_events_per_subject.values()) - ) - ) - - count_type = self.get_smallest_valid_uint_type(total_observations) - - if cfg.modality == "univariate_regression" and cfg.vocabulary is None: - prefix = f"dynamic/{m}/{m}" - - key_col = pl.col(m) - val_col = pl.col(m).drop_nans().cast(pl.Float32) - - out_dfs[m] = ( - df.lazy() - .select("measurement_id", "event_id", m) - .filter(pl.col(m).is_not_null()) - .groupby("event_id") - .agg( - pl.col(m).is_not_null().sum().cast(count_type).alias(f"{prefix}/count"), - ( - (pl.col(m).is_not_nan() & pl.col(m).is_not_null()) - .sum() - .cast(count_type) - .alias(f"{prefix}/has_values_count") - ), - val_col.sum().alias(f"{prefix}/sum"), - (val_col**2).sum().alias(f"{prefix}/sum_sqd"), - val_col.min().alias(f"{prefix}/min"), - val_col.max().alias(f"{prefix}/max"), - ) - ) - continue - elif cfg.modality == 
"multivariate_regression": - column_cols = [m, m] - values_cols = [m, cfg.values_column] - key_prefix = f"{m}_{m}_" - val_prefix = f"{cfg.values_column}_{m}_" - - key_col = cs.starts_with(key_prefix) - val_col = cs.starts_with(val_prefix).drop_nans().cast(pl.Float32) - - aggs = [ - key_col.is_not_null() - .sum() - .cast(count_type) - .map_alias(lambda c: f"dynamic/{m}/{c.replace(key_prefix, '')}/count"), - ( - (cs.starts_with(val_prefix).is_not_null() & cs.starts_with(val_prefix).is_not_nan()) - .sum() - .map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/has_values_count") - ), - val_col.sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum"), - (val_col**2).sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum_sqd"), - val_col.min().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/min"), - val_col.max().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/max"), - ] - else: - column_cols = [m] - values_cols = [m] - aggs = [ - pl.all().is_not_null().sum().cast(count_type).map_alias(lambda c: f"dynamic/{m}/{c}/count") - ] - - ID_cols = ["measurement_id", "event_id"] - out_dfs[m] = ( - df.select(*ID_cols, *set(column_cols + values_cols)) - .filter(pl.col(m).is_in(allowed_vocab)) - .pivot( - index=ID_cols, - columns=column_cols, - values=values_cols, - aggregate_function=None, - ) - .lazy() - .drop("measurement_id") - .groupby("event_id") - .agg(*aggs) - ) - - return pl.concat(list(out_dfs.values()), how="align") - - -def _summarize_over_window(df: DF_T, window_size: str) -> pl.LazyFrame: - """Apply aggregations to the raw representation over a window size.""" - if isinstance(df, Path): - df = pl.scan_parquet(df) - - def time_aggd_col_alias_fntr(new_agg: str | None = None) -> Callable[[str], str]: - if new_agg is None: - - def f(c: str) -> str: - return "/".join([window_size] + c.split("/")[1:]) - - else: - - def f(c: str) -> str: - return "/".join([window_size] + c.split("/")[1:-1] + [new_agg]) - - return f - - # Columns to convert to counts: - present_indicator_cols = cs.ends_with("/present") - - # Columns to convert to value aggregations: - value_cols = cs.ends_with("/value") - - # Columns to aggregate via other operations - cnt_cols = (cs.ends_with("/count") | cs.ends_with("/has_values_count")).fill_null(0) - - cols_to_sum = cs.ends_with("/sum") | cs.ends_with("/sum_sqd") - cols_to_min = cs.ends_with("/min") - cols_to_max = cs.ends_with("/max") - - if window_size == "FULL": - df = df.groupby("subject_id").agg( - "timestamp", - # present to counts - present_indicator_cols.cumsum().map_alias(time_aggd_col_alias_fntr("count")), - # values to stats - value_cols.is_not_null().cumsum().map_alias(time_aggd_col_alias_fntr("count")), - ( - (value_cols.is_not_null() & value_cols.is_not_nan()) - .cumsum() - .map_alias(time_aggd_col_alias_fntr("has_values_count")) - ), - value_cols.cumsum().map_alias(time_aggd_col_alias_fntr("sum")), - (value_cols**2).cumsum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), - value_cols.cummin().map_alias(time_aggd_col_alias_fntr("min")), - value_cols.cummax().map_alias(time_aggd_col_alias_fntr("max")), - # Raw aggregations - cnt_cols.cumsum().map_alias(time_aggd_col_alias_fntr()), - cols_to_sum.cumsum().map_alias(time_aggd_col_alias_fntr()), - cols_to_min.cummin().map_alias(time_aggd_col_alias_fntr()), - cols_to_max.cummax().map_alias(time_aggd_col_alias_fntr()), - ) - df = df.explode(*[c for c in df.columns if c != "subject_id"]) - else: - df = df.groupby_rolling( - index_column="timestamp", - 
by="subject_id", - period=window_size, - ).agg( - # present to counts - present_indicator_cols.sum().map_alias(time_aggd_col_alias_fntr("count")), - # values to stats - value_cols.is_not_null().sum().map_alias(time_aggd_col_alias_fntr("count")), - ( - (value_cols.is_not_null() & value_cols.is_not_nan()) - .sum() - .map_alias(time_aggd_col_alias_fntr("has_values_count")) - ), - value_cols.sum().map_alias(time_aggd_col_alias_fntr("sum")), - (value_cols**2).sum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), - value_cols.min().map_alias(time_aggd_col_alias_fntr("min")), - value_cols.max().map_alias(time_aggd_col_alias_fntr("max")), - # Raw aggregations - cnt_cols.sum().map_alias(time_aggd_col_alias_fntr()), - cols_to_sum.sum().map_alias(time_aggd_col_alias_fntr()), - cols_to_min.min().map_alias(time_aggd_col_alias_fntr()), - cols_to_max.max().map_alias(time_aggd_col_alias_fntr()), - ) - - return _normalize_flat_rep_df_cols(df, set_count_0_to_null=True) - - -def _get_flat_ts_rep( - feature_columns: list[str], - **kwargs, -) -> pl.LazyFrame: - """Produce raw representation for dynamic data.""" - - return _normalize_flat_rep_df_cols( - _summarize_dynamic_measurements(feature_columns, **kwargs) - .sort(by=["subject_id", "timestamp"]) - .collect() - .lazy(), - [c for c in feature_columns if c.startswith("dynamic")], - ) - # The above .collect().lazy() shouldn't be necessary but it appears to be for some reason... - - -def _parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: - parts = c.split("/") - if len(parts) < 3: - raise ValueError(f"Column {c} is not a valid flat feature column!") - return (parts[0], "/".join(parts[1:-1]), parts[-1]) - - -def _summarize_static_measurements( - feature_columns: list[str], - df: DF_T, -) -> pl.LazyFrame: - static_present = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("present")] - static_first = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("first")] - - static_first_codes = [_parse_flat_feature_column(c)[1] for c in static_first] - code_subset = df.filter(pl.col("code").is_in(static_first_codes)) - first_code_subset = code_subset.groupby(pl.col("patient_id")).first().collect() - static_value_pivot_df = first_code_subset.pivot( - index=["patient_id"], columns=["code"], values=["numerical_value"], aggregate_function=None - ) - # rename code to feature name - remap_cols = { - input_name: output_name - for input_name, output_name in zip(static_first_codes, static_first) - if input_name in static_value_pivot_df.columns - } - static_value_pivot_df = static_value_pivot_df.select( - *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] - ) - # pivot can be faster: https://stackoverflow.com/questions/73522017/replacing-a-pivot-with-a-lazy-groupby-operation # noqa: E501 - # maybe cast with .cast(pl.Float32)) - - static_present_codes = [_parse_flat_feature_column(c)[1] for c in static_present] - static_present_pivot_df = ( - df.select(*["patient_id", "code"]) - .filter(pl.col("code").is_in(static_present_codes)) - .with_columns(pl.lit(True).alias("__indicator")) - .collect() - .pivot( - index=["patient_id"], - columns=["code"], - values="__indicator", - aggregate_function=None, - ) - ) - remap_cols = { - input_name: output_name - for input_name, output_name in zip(static_present_codes, static_present) - if input_name in static_present_pivot_df.columns - } - # rename columns to final feature names - static_present_pivot_df = static_present_pivot_df.select( - *["patient_id"], 
*[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] - ) - return pl.concat([static_value_pivot_df, static_present_pivot_df], how="align") - - -def _get_flat_static_rep( - feature_columns: list[str], - shard_df: DF_T, -) -> pl.LazyFrame: - """Produce raw representation for static data.""" - static_features = [c for c in feature_columns if c.startswith("STATIC_")] - static_measurements = _summarize_static_measurements(static_features, df=shard_df) - # fill up missing feature columns with nulls - normalized_measurements = _normalize_flat_rep_df_cols( - static_measurements, - static_features, - set_count_0_to_null=False, - ) - return normalized_measurements - - -def evaluate_code_properties(df, cfg): - """Evaluates and categorizes each code in a dataframe based on its timestamp presence and numerical - values. - - This function categorizes codes as 'dynamic' or 'static' based on the presence - of timestamps, and as 'continuous' or 'categorical' based on the presence of - numerical values. A code is considered: - - Dynamic if the ratio of present timestamps to its total occurrences exceeds - the configured dynamic threshold. - - Continuous if the ratio of non-null numerical values to total occurrences - exceeds the numerical value threshold - and there is more than one unique numerical value. - - Parameters: - - df (DataFrame): The dataframe containing the codes and their attributes. - - cfg (dict): Configuration dictionary with keys 'dynamic_threshold', 'numerical_value_threshold', - and 'min_code_inclusion_frequency' to determine the thresholds for categorizing codes. - - Returns: - - dict: A dictionary with code as keys and their properties (e.g., 'dynamic_continuous') as values. - Codes with total occurrences less than 'min_code_inclusion_frequency' are excluded. - - Examples: - >>> import polars as pl - >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], - ... 'timestamp': [None, '2021-01-01', None, '2021-01-02', '2021-01-03', '2021-01-04', None], - ... 
'numerical_value': [1, None, 2, 2, None, None, 3]} - >>> df = pl.DataFrame(data) - >>> cfg = {'dynamic_threshold': 0.5, 'numerical_value_threshold': 0.5, 'min_code_inclusion_frequency': 1} - >>> evaluate_code_properties(df, cfg) - {'A': 'static_categorical', 'B': 'dynamic_continuous', 'C': 'dynamic_categorical'} - """ - code_properties = OrderedDict() - for code in df.select(pl.col("code").unique()).collect().to_series(): - # Determine total count, timestamp count, and numerical count - code_data = df.filter(pl.col("code") == code) - total_count = code_data.select(pl.count("code")).collect().item() - if total_count < cfg["min_code_inclusion_frequency"]: - continue - - timestamp_count = code_data.select(pl.col("timestamp").count()).collect().item() - numerical_count = code_data.select(pl.col("numerical_value").count()).collect().item() - - # Determine dynamic vs static - is_dynamic = (timestamp_count / total_count) > cfg["dynamic_threshold"] - - # Determine categorical vs continuous - is_continuous = (numerical_count / total_count) > cfg[ - "numerical_value_threshold" - ] and code_data.select(pl.col("numerical_value").n_unique()).collect().item() > 1 - - match (is_dynamic, is_continuous): - case (False, False): - code_properties[code] = CodeType.STATIC_CATEGORICAL - case (False, True): - code_properties[code] = CodeType.STATIC_CONTINUOUS - case (True, False): - code_properties[code] = CodeType.DYNAMIC_CATEGORICAL - case (True, True): - code_properties[code] = CodeType.DYNAMIC_CONTINUOUS - - return code_properties - - -def get_code_column(code: str, code_type: CodeType, aggs: Sequence[str]): - """Get the column name for a given code and aggregation type.""" - prefix = f"{code_type.value}/{code}" - if code_type == CodeType.STATIC_CATEGORICAL: - return [f"{prefix}/present"] - elif code_type == CodeType.DYNAMIC_CATEGORICAL: - valid_aggs = [agg[4:] for agg in aggs if agg.startswith("code")] - return [f"{prefix}/{agg}" for agg in valid_aggs] - elif code_type == CodeType.STATIC_CONTINUOUS: - return [f"{prefix}/present", f"{prefix}/first"] - elif code_type == CodeType.DYNAMIC_CONTINUOUS: - valid_aggs = [agg[5:] for agg in aggs if agg.startswith("value")] - return [f"{prefix}/{agg}" for agg in valid_aggs] - else: - raise ValueError(f"Invalid code type: {code_type}") - - -def _get_flat_rep_feature_cols(cfg, split_to_shard_df) -> list[str]: - feature_columns = [] - all_train_data = pl.concat(split_to_shard_df["train"]) - code_properties = evaluate_code_properties(all_train_data, cfg) - for code, code_type in code_properties.items(): - feature_columns.extend(get_code_column(code, code_type, cfg.aggs)) - return feature_columns, code_properties - - def cache_flat_representation( cfg: DictConfig, ): @@ -691,12 +201,13 @@ def cache_flat_representation( # 0. Identify Output Columns # We set window_sizes to None here because we want to get the feature column names for the raw flat # representation, not the summarized one. - feature_columns, code_properties = _get_flat_rep_feature_cols(cfg, sp_dfs) + feature_columns, code_properties = get_flat_rep_feature_cols(cfg, sp_dfs) # 1. 
Produce static representation static_subdir = flat_dir / "static" static_dfs = {} + actual_num_patients = 0 for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): static_dfs[sp] = [] sp_dir = static_subdir / sp @@ -710,52 +221,58 @@ def cache_flat_representation( elif not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - df = _get_flat_static_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - _write_df(df, fp, do_overwrite=cfg.do_overwrite) - - # 2. Produce raw representation - ts_subdir = flat_dir / "at_ts" - - ts_dfs = {} - for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): - ts_dfs[sp] = [] - sp_dir = ts_subdir / sp - - for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): - fp = sp_dir / f"{i}.parquet" - ts_dfs[sp].append(fp) - if fp.exists(): - if cfg.do_update: - continue - elif not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - df = _get_flat_ts_rep( + df = get_flat_static_rep( feature_columns=feature_columns, shard_df=shard_df, ) - _write_df(df, fp, do_overwrite=cfg.do_overwrite) - - if cfg.window_sizes is None: - return - - # 3. Produce summarized history representations - history_subdir = flat_dir / "over_history" - - for window_size in tqdm(cfg.window_sizes, desc="History window sizes"): - for sp, df_fps in tqdm(list(ts_dfs.items()), desc="Windowing Splits", leave=False): - for i, df_fp in enumerate(tqdm(df_fps, desc="Subject chunks", leave=False)): - fp = history_subdir / sp / window_size / f"{i}.parquet" - if fp.exists(): - if cfg.do_update: - continue - elif not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - df = _summarize_over_window(df_fp, window_size) - _write_df(df, fp) + write_df(df, fp, do_overwrite=cfg.do_overwrite) + actual_num_patients += df.shape[0] + expected_num_patients = sum(len(ids) for split_ids in sp_subjects.values() for ids in split_ids) + assert ( + actual_num_patients == expected_num_patients + ), f"Expected {expected_num_patients} patients, got {actual_num_patients}." + + # # 2. Produce raw representation + # ts_subdir = flat_dir / "at_ts" + # import pdb; pdb.set_trace() + + # ts_dfs = {} + # for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): + # ts_dfs[sp] = [] + # sp_dir = ts_subdir / sp + + # for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): + # fp = sp_dir / f"{i}.parquet" + # ts_dfs[sp].append(fp) + # if fp.exists(): + # if cfg.do_update: + # continue + # elif not cfg.do_overwrite: + # raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + # df = get_flat_ts_rep( + # feature_columns=feature_columns, + # shard_df=shard_df, + # ) + + # write_df(df, fp, do_overwrite=cfg.do_overwrite) + + # if cfg.window_sizes is None: + # return + + # # 3. 
Produce summarized history representations + # history_subdir = flat_dir / "over_history" + + # for window_size in tqdm(cfg.window_sizes, desc="History window sizes"): + # for sp, df_fps in tqdm(list(ts_dfs.items()), desc="Windowing Splits", leave=False): + # for i, df_fp in enumerate(tqdm(df_fps, desc="Subject chunks", leave=False)): + # fp = history_subdir / sp / window_size / f"{i}.parquet" + # if fp.exists(): + # if cfg.do_update: + # continue + # elif not cfg.do_overwrite: + # raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + # df = _summarize_over_window(df_fp, window_size) + # write_df(df, fp) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py new file mode 100644 index 0000000..b2c9299 --- /dev/null +++ b/src/MEDS_tabular_automl/utils.py @@ -0,0 +1,269 @@ +"""The base class for core dataset processing logic. + +Attributes: + INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, + dataframes, etc. + DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. +""" + +import enum +from collections import OrderedDict +from collections.abc import Sequence +from pathlib import Path + +import polars as pl +import polars.selectors as cs + + +class CodeType(enum.Enum): + """Enum for the type of code.""" + + STATIC_CATEGORICAL = "STATIC_CATEGORICAL" + DYNAMIC_CATEGORICAL = "DYNAMIC_CATEGORICAL" + STATIC_CONTINUOUS = "STATIC_CONTINUOUS" + DYNAMIC_CONTINUOUS = "DYNAMIC_CONTINUOUS" + + +DF_T = pl.DataFrame +WRITE_USE_PYARROW = True + + +def _parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: + parts = c.split("/") + if len(parts) < 3: + raise ValueError(f"Column {c} is not a valid flat feature column!") + return (parts[0], "/".join(parts[1:-1]), parts[-1]) + + +def write_df(df: DF_T, fp: Path, **kwargs): + """Write shard to disk.""" + do_overwrite = kwargs.get("do_overwrite", False) + + if not do_overwrite and fp.is_file(): + raise FileExistsError(f"{fp} exists and do_overwrite is {do_overwrite}!") + + fp.parent.mkdir(exist_ok=True, parents=True) + + if isinstance(df, pl.LazyFrame): + df.collect().write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) + else: + df.write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) + + +def get_smallest_valid_uint_type(num: int | float | pl.Expr) -> pl.DataType: + """Returns the smallest valid unsigned integral type for an ID variable with `num` unique options. + + Args: + num: The number of IDs that must be uniquely expressed. + + Raises: + ValueError: If there is no unsigned int type big enough to express the passed number of ID + variables. + + Examples: + >>> import polars as pl + >>> Dataset.get_smallest_valid_uint_type(num=1) + UInt8 + >>> Dataset.get_smallest_valid_uint_type(num=2**8-1) + UInt16 + >>> Dataset.get_smallest_valid_uint_type(num=2**16-1) + UInt32 + >>> Dataset.get_smallest_valid_uint_type(num=2**32-1) + UInt64 + >>> Dataset.get_smallest_valid_uint_type(num=2**64-1) + Traceback (most recent call last): + ... + ValueError: Value is too large to be expressed as an int! 
+ """ + if num >= (2**64) - 1: + raise ValueError("Value is too large to be expressed as an int!") + if num >= (2**32) - 1: + return pl.UInt64 + elif num >= (2**16) - 1: + return pl.UInt32 + elif num >= (2**8) - 1: + return pl.UInt16 + else: + return pl.UInt8 + + +def get_flat_col_dtype(col: str) -> pl.DataType: + """Gets the appropriate minimal dtype for the given flat representation column string.""" + + code_type, code, agg = _parse_flat_feature_column(col) + + match agg: + case "sum" | "sum_sqd" | "min" | "max" | "value" | "first": + return pl.Float32 + case "present": + return pl.Boolean + case "count" | "has_values_count": + return pl.UInt32 + # TODO: reduce the dtype to the smallest possible unsigned int type + # return get_smallest_valid_uint_type(total_observations) + case _: + raise ValueError(f"Column name {col} malformed!") + + +def _normalize_flat_rep_df_cols( + flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False +) -> DF_T: + """Normalizes columns in a DataFrame so all expected columns are present and appropriately typed. + + Parameters: + - flat_df (DF_T): The DataFrame to be normalized. + - feature_columns (list[str]): A list of feature column names that should exist in the DataFrame. + - set_count_0_to_null (bool): A flag indicating whether counts of zero should be converted to nulls. + + Returns: + - DF_T: The normalized DataFrame with all columns set to the correct type and zero-counts handled + if specified. + + This function ensures that all necessary columns are added and typed correctly within + a DataFrame, potentially modifying zero counts to nulls based on the configuration. + """ + cols_to_add = set(feature_columns) - set(flat_df.columns) + cols_to_retype = set(feature_columns).intersection(set(flat_df.columns)) + + cols_to_add = [(c, get_flat_col_dtype(c)) for c in cols_to_add] + cols_to_retype = [(c, get_flat_col_dtype(c)) for c in cols_to_retype] + + if "timestamp" in flat_df.columns: + key_cols = ["patient_id", "timestamp"] + else: + key_cols = ["patient_id"] + + flat_df = flat_df.with_columns( + *[pl.lit(None, dtype=dt).alias(c) for c, dt in cols_to_add], + *[pl.col(c).cast(dt).alias(c) for c, dt in cols_to_retype], + ).select(*key_cols, *feature_columns) + + if not set_count_0_to_null: + return flat_df + + flat_df = flat_df.collect() + + flat_df = flat_df.with_columns( + pl.when(cs.ends_with("count") != 0).then(cs.ends_with("count")).keep_name() + ).lazy() + return flat_df + + +def evaluate_code_properties(df, cfg): + """Evaluates and categorizes each code in a dataframe based on its timestamp presence and numerical + values. + + This function categorizes codes as 'dynamic' or 'static' based on the presence + of timestamps, and as 'continuous' or 'categorical' based on the presence of + numerical values. A code is considered: + - Dynamic if the ratio of present timestamps to its total occurrences exceeds + the configured dynamic threshold. + - Continuous if the ratio of non-null numerical values to total occurrences + exceeds the numerical value threshold + and there is more than one unique numerical value. + + Parameters: + - df (DataFrame): The dataframe containing the codes and their attributes. + - cfg (dict): Configuration dictionary with keys 'dynamic_threshold', 'numerical_value_threshold', + and 'min_code_inclusion_frequency' to determine the thresholds for categorizing codes. + + Returns: + - dict: A dictionary with code as keys and their properties (e.g., 'dynamic_continuous') as values. 
+ Codes with total occurrences less than 'min_code_inclusion_frequency' are excluded. + + Examples: + >>> import polars as pl + >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], + ... 'timestamp': [None, '2021-01-01', None, '2021-01-02', '2021-01-03', '2021-01-04', None], + ... 'numerical_value': [1, None, 2, 2, None, None, 3]} + >>> df = pl.DataFrame(data) + >>> cfg = {'dynamic_threshold': 0.5, 'numerical_value_threshold': 0.5, 'min_code_inclusion_frequency': 1} + >>> evaluate_code_properties(df, cfg) + {'A': 'static_categorical', 'B': 'dynamic_continuous', 'C': 'dynamic_categorical'} + """ + code_properties = OrderedDict() + for code in df.select(pl.col("code").unique()).collect().to_series(): + # Determine total count, timestamp count, and numerical count + code_data = df.filter(pl.col("code") == code) + total_count = code_data.select(pl.count("code")).collect().item() + if total_count < cfg["min_code_inclusion_frequency"]: + continue + + timestamp_count = code_data.select(pl.col("timestamp").count()).collect().item() + numerical_count = code_data.select(pl.col("numerical_value").count()).collect().item() + + # Determine dynamic vs static + is_dynamic = (timestamp_count / total_count) > cfg["dynamic_threshold"] + + # Determine categorical vs continuous + is_continuous = (numerical_count / total_count) > cfg[ + "numerical_value_threshold" + ] and code_data.select(pl.col("numerical_value").n_unique()).collect().item() > 1 + + match (is_dynamic, is_continuous): + case (False, False): + code_properties[code] = CodeType.STATIC_CATEGORICAL + case (False, True): + code_properties[code] = CodeType.STATIC_CONTINUOUS + case (True, False): + code_properties[code] = CodeType.DYNAMIC_CATEGORICAL + case (True, True): + code_properties[code] = CodeType.DYNAMIC_CONTINUOUS + + return code_properties + + +def get_code_column(code: str, code_type: CodeType, aggs: Sequence[str]): + """Constructs feature column names based on a given code, its type, and specified aggregations. + + Parameters: + - code (str): The specific code identifier for which the feature columns are being generated. + - code_type (CodeType): The type of the code (e.g., STATIC_CATEGORICAL, DYNAMIC_CONTINUOUS) + that determines how the code is processed. + - aggs (Sequence[str]): A list of aggregation operations to apply to the code, e.g., + "count", "sum". + + Returns: + - list[str]: A list of fully qualified feature column names constructed based on the + code type and applicable aggregations. + + This function builds a list of feature column names using the code and its type to apply + the correct prefix and filters applicable aggregations based on whether they are relevant + to the code type. + """ + prefix = f"{code_type.value}/{code}" + if code_type == CodeType.STATIC_CATEGORICAL: + return [f"{prefix}/present"] + elif code_type == CodeType.DYNAMIC_CATEGORICAL: + valid_aggs = [agg[4:] for agg in aggs if agg.startswith("code")] + return [f"{prefix}/{agg}" for agg in valid_aggs] + elif code_type == CodeType.STATIC_CONTINUOUS: + return [f"{prefix}/present", f"{prefix}/first"] + elif code_type == CodeType.DYNAMIC_CONTINUOUS: + valid_aggs = [agg[5:] for agg in aggs if agg.startswith("value")] + return [f"{prefix}/{agg}" for agg in valid_aggs] + else: + raise ValueError(f"Invalid code type: {code_type}") + + +def get_flat_rep_feature_cols(cfg, split_to_shard_df) -> list[str]: + """Generates a list of feature column names from the data within each shard based on specified + configurations. 
+ + Parameters: + - cfg (dict): Configuration dictionary specifying how features should be evaluated and aggregated. + - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). + + Returns: + - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties + identified during the evaluation. + + This function evaluates the properties of codes within training data and applies configured + aggregations to generate a comprehensive list of feature columns for modeling purposes. + """ + feature_columns = [] + all_train_data = pl.concat(split_to_shard_df["train"]) + code_properties = evaluate_code_properties(all_train_data, cfg) + for code, code_type in code_properties.items(): + feature_columns.extend(get_code_column(code, code_type, cfg.aggs)) + return feature_columns, code_properties From fd1731f11ddbffb4dbde332ff5acb8e7ee2a9efd Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 27 May 2024 00:35:21 +0000 Subject: [PATCH 004/106] Refactor scripts into separate modules for improved clarity: - Create individual scripts for: 1. Retrieving column names 2. Generating static representations 3. Generating dynamic representations 4. Summarizing data over windows - Add doctests for retrieving column names and generating static representations - Confirm functionality of the above tests --- .pre-commit-config.yaml | 1 + scripts/identify_columns.py | 118 ++++++++ scripts/summarize_over_windows.py | 53 ++++ scripts/tabularize_static.py | 125 +++++++++ scripts/tabularize_ts.py | 51 ++++ .../generate_static_features.py | 12 +- .../generate_ts_features.py | 12 +- src/MEDS_tabular_automl/tabularize.py | 205 +++++--------- src/MEDS_tabular_automl/utils.py | 263 ++++++++---------- tests/test_tabularize.py | 12 +- 10 files changed, 558 insertions(+), 294 deletions(-) create mode 100644 scripts/identify_columns.py create mode 100644 scripts/summarize_over_windows.py create mode 100644 scripts/tabularize_static.py create mode 100644 scripts/tabularize_ts.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7540f52..1533f74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,6 +38,7 @@ repos: rev: v2.2.0 hooks: - id: autoflake + args: [--in-place, --remove-all-unused-imports] # python upgrading syntax to newer version - repo: https://github.com/asottile/pyupgrade diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py new file mode 100644 index 0000000..d74f811 --- /dev/null +++ b/scripts/identify_columns.py @@ -0,0 +1,118 @@ +"""This Python script, utilizing the Hydra and Polars libraries, automates the creation of flat +representations of medical datasets for machine learning modeling. + +It includes functions to store configuration parameters in a JSON file and write summarized dataset +representations to disk based on configurable parameters such as inclusion frequencies and historical window +sizes. The script ensures data integrity through conditional checks on overwriting and updating existing +files, and enhances traceability by recording configuration details and feature columns used in the output. +""" +import json +from pathlib import Path + +import hydra +from omegaconf import DictConfig, OmegaConf + +from MEDS_tabular_automl.utils import get_flat_rep_feature_cols, load_meds_data + + +def store_config_yaml(config_fp: Path, cfg: DictConfig): + """Stores configuration parameters into a JSON file. 
+ + This function writes a dictionary of parameters, which includes patient partitioning + information and configuration details, to a specified JSON file. + + Args: + - config_fp (Path): The file path for the JSON file where config should be stored. + - cfg (DictConfig): A configuration object containing settings like the number of patients + per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. + + Behavior: + - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a + FileExistsError is raised to prevent unintentional data loss. + + Raises: + - ValueError: If there are discrepancies between old and new parameters during an update. + - FileExistsError: If the file exists and neither updating nor overwriting is allowed. + + Example: + >>> cfg = DictConfig({ + ... "n_patients_per_sub_shard": 100, + ... "min_code_inclusion_frequency": 5, + ... "do_update": False, + ... "do_overwrite": True + ... }) + >>> import tempfile + >>> from pathlib import Path + >>> with tempfile.TemporaryDirectory() as d: + ... config_fp = Path(d) / "config.yaml" + ... store_config_yaml(config_fp, cfg) + ... assert config_fp.exists() + """ + if config_fp.exists(): + if not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") + OmegaConf.save(cfg, config_fp) + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def store_columns( + cfg: DictConfig, +): + """Writes a flat (historically summarized) representation of the dataset to disk. + + This file caches a set of files useful for building flat representations of the dataset to disk, + suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: + + * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: + * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a + set of parquet files containing flat (e.g., wide) representations of summarized events per subject, + broken out by split and subject chunk. + * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period + per subject per event, for all time periods in ``window_sizes``, if any. + + Args: + cfg: + MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + tabularized_data_dir: output directory of tabularized data. + min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + what features can be included in the flat representation. It can either be a float, in which + case it applies across all measurements, or `None`, in which case no filtering is applied, or + a dictionary from measurement type to a float dictating a per-measurement-type inclusion + cutoff. + window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has + the capability to summarize these flattened representations over the historical windows + specified in this argument. These are strings specifying time deltas, using this syntax: + `link`_. Each window size will be summarized to a separate directory, and will share the same + subject file split as is used in the raw representation files. + codes: A list of codes to include in the flat representation. If `None`, all codes will be included + in the flat representation. + aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. 
+ n_patients_per_sub_shard: The number of subjects that should be included in each output file. + Lowering this number increases the number of files written, making the process of creating and + leveraging these files slower but more memory efficient. + do_overwrite: If `True`, this function will overwrite the data already stored in the target save + directory. + do_update: bool = True + seed: The seed to use for random number generation. + + .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + """ + # create output dir + flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" + flat_dir.mkdir(exist_ok=True, parents=True) + + # load MEDS data + split_to_df = load_meds_data(cfg.MEDS_cohort_dir) + + # store params in json file + config_fp = flat_dir / "config.yaml" + store_config_yaml(config_fp, cfg) + + # 0. Identify Output Columns + # We set window_sizes to None here because we want to get the feature column names for the raw flat + # representation, not the summarized one. + feature_columns = set() + for shard_df in split_to_df["train"]: + feature_columns.update(get_flat_rep_feature_cols(cfg, shard_df)) + feature_columns = sorted(list(feature_columns)) + json.dump(feature_columns, open(flat_dir / "feature_columns.json", "w")) diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py new file mode 100644 index 0000000..acf1d51 --- /dev/null +++ b/scripts/summarize_over_windows.py @@ -0,0 +1,53 @@ +"""WIP.""" + + +import hydra +from omegaconf import DictConfig + +from MEDS_tabular_automl.utils import setup_environment + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def summarize_ts_data_over_windows( + cfg: DictConfig, +): + """Writes a flat (historically summarized) representation of the dataset to disk. + + This file caches a set of files useful for building flat representations of the dataset to disk, + suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: + + * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: + * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a + set of parquet files containing flat (e.g., wide) representations of summarized events per subject, + broken out by split and subject chunk. + * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period + per subject per event, for all time periods in ``window_sizes``, if any. + + Args: + cfg: + MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + tabularized_data_dir: output directory of tabularized data. + min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + what features can be included in the flat representation. It can either be a float, in which + case it applies across all measurements, or `None`, in which case no filtering is applied, or + a dictionary from measurement type to a float dictating a per-measurement-type inclusion + cutoff. + window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has + the capability to summarize these flattened representations over the historical windows + specified in this argument. These are strings specifying time deltas, using this syntax: + `link`_. 
Each window size will be summarized to a separate directory, and will share the same + subject file split as is used in the raw representation files. + codes: A list of codes to include in the flat representation. If `None`, all codes will be included + in the flat representation. + aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. + n_patients_per_sub_shard: The number of subjects that should be included in each output file. + Lowering this number increases the number of files written, making the process of creating and + leveraging these files slower but more memory efficient. + do_overwrite: If `True`, this function will overwrite the data already stored in the target save + directory. + do_update: bool = True + seed: The seed to use for random number generation. + + .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + """ + setup_environment(cfg) diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py new file mode 100644 index 0000000..d8fdd1b --- /dev/null +++ b/scripts/tabularize_static.py @@ -0,0 +1,125 @@ +"""The base class for core dataset processing logic. + +Attributes: + INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, + dataframes, etc. + DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. +""" + +from pathlib import Path + +import hydra +from omegaconf import DictConfig, OmegaConf +from tqdm.auto import tqdm + +from MEDS_tabular_automl.generate_static_features import get_flat_static_rep +from MEDS_tabular_automl.utils import setup_environment, write_df + + +def store_config_yaml(config_fp: Path, cfg: DictConfig): + """Stores configuration parameters into a JSON file. + + This function writes a dictionary of parameters, which includes patient partitioning + information and configuration details, to a specified JSON file. + + Args: + - config_fp (Path): The file path for the JSON file where config should be stored. + - cfg (DictConfig): A configuration object containing settings like the number of patients + per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. + + Behavior: + - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a + FileExistsError is raised to prevent unintentional data loss. + + Raises: + - ValueError: If there are discrepancies between old and new parameters during an update. + - FileExistsError: If the file exists and neither updating nor overwriting is allowed. + + Example: + >>> cfg = DictConfig({ + ... "n_patients_per_sub_shard": 100, + ... "min_code_inclusion_frequency": 5, + ... "do_update": False, + ... "do_overwrite": True + ... }) + >>> import tempfile + >>> from pathlib import Path + >>> with tempfile.TemporaryDirectory() as d: + ... config_fp = Path(d) / "config.yaml" + ... store_config_yaml(config_fp, cfg) + ... assert config_fp.exists() + """ + if config_fp.exists(): + if not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") + OmegaConf.save(cfg, config_fp) + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def tabularize_static_data( + cfg: DictConfig, +): + """Writes a flat (historically summarized) representation of the dataset to disk. 
+ + This file caches a set of files useful for building flat representations of the dataset to disk, + suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: + + * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: + * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a + set of parquet files containing flat (e.g., wide) representations of summarized events per subject, + broken out by split and subject chunk. + * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period + per subject per event, for all time periods in ``window_sizes``, if any. + + Args: + cfg: + MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + tabularized_data_dir: output directory of tabularized data. + min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + what features can be included in the flat representation. It can either be a float, in which + case it applies across all measurements, or `None`, in which case no filtering is applied, or + a dictionary from measurement type to a float dictating a per-measurement-type inclusion + cutoff. + window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has + the capability to summarize these flattened representations over the historical windows + specified in this argument. These are strings specifying time deltas, using this syntax: + `link`_. Each window size will be summarized to a separate directory, and will share the same + subject file split as is used in the raw representation files. + codes: A list of codes to include in the flat representation. If `None`, all codes will be included + in the flat representation. + aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. + n_patients_per_sub_shard: The number of subjects that should be included in each output file. + Lowering this number increases the number of files written, making the process of creating and + leveraging these files slower but more memory efficient. + do_overwrite: If `True`, this function will overwrite the data already stored in the target save + directory. + do_update: bool = True + seed: The seed to use for random number generation. + + .. 
_link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + """ + flat_dir, split_to_df, feature_columns = setup_environment(cfg) + + # Produce static representation + static_subdir = flat_dir / "static" + + static_dfs = {} + for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): + static_dfs[sp] = [] + sp_dir = static_subdir / sp + + for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): + fp = sp_dir / f"{i}.parquet" + static_dfs[sp].append(fp) + if fp.exists(): + if cfg.do_update: + continue + elif not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + df = get_flat_static_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) + + write_df(df, fp, do_overwrite=cfg.do_overwrite) diff --git a/scripts/tabularize_ts.py b/scripts/tabularize_ts.py new file mode 100644 index 0000000..2d9ac95 --- /dev/null +++ b/scripts/tabularize_ts.py @@ -0,0 +1,51 @@ +"""WIP.""" +import hydra +from omegaconf import DictConfig + +from MEDS_tabular_automl.utils import setup_environment + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def tabularize_ts_data( + cfg: DictConfig, +): + """Writes a flat (historically summarized) representation of the dataset to disk. + + This file caches a set of files useful for building flat representations of the dataset to disk, + suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: + + * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: + * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a + set of parquet files containing flat (e.g., wide) representations of summarized events per subject, + broken out by split and subject chunk. + * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period + per subject per event, for all time periods in ``window_sizes``, if any. + + Args: + cfg: + MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + tabularized_data_dir: output directory of tabularized data. + min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + what features can be included in the flat representation. It can either be a float, in which + case it applies across all measurements, or `None`, in which case no filtering is applied, or + a dictionary from measurement type to a float dictating a per-measurement-type inclusion + cutoff. + window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has + the capability to summarize these flattened representations over the historical windows + specified in this argument. These are strings specifying time deltas, using this syntax: + `link`_. Each window size will be summarized to a separate directory, and will share the same + subject file split as is used in the raw representation files. + codes: A list of codes to include in the flat representation. If `None`, all codes will be included + in the flat representation. + aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. + n_patients_per_sub_shard: The number of subjects that should be included in each output file. 
+ Lowering this number increases the number of files written, making the process of creating and + leveraging these files slower but more memory efficient. + do_overwrite: If `True`, this function will overwrite the data already stored in the target save + directory. + do_update: bool = True + seed: The seed to use for random number generation. + + .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + """ + setup_environment(cfg) diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 207d537..6da9610 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -10,11 +10,7 @@ import polars as pl -from MEDS_tabular_automl.utils import ( - DF_T, - _normalize_flat_rep_df_cols, - _parse_flat_feature_column, -) +from MEDS_tabular_automl.utils import DF_T, add_missing_cols, parse_flat_feature_column def _summarize_static_measurements( @@ -40,7 +36,7 @@ def _summarize_static_measurements( static_first = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("first")] # Handling 'first' static values - static_first_codes = [_parse_flat_feature_column(c)[1] for c in static_first] + static_first_codes = [parse_flat_feature_column(c)[1] for c in static_first] code_subset = df.filter(pl.col("code").is_in(static_first_codes)) first_code_subset = code_subset.groupby(pl.col("patient_id")).first().collect() static_value_pivot_df = first_code_subset.pivot( @@ -59,7 +55,7 @@ def _summarize_static_measurements( # TODO: consider casting with .cast(pl.Float32)) # Handling 'present' static indicators - static_present_codes = [_parse_flat_feature_column(c)[1] for c in static_present] + static_present_codes = [parse_flat_feature_column(c)[1] for c in static_present] static_present_pivot_df = ( df.select(*["patient_id", "code"]) .filter(pl.col("code").is_in(static_present_codes)) @@ -104,7 +100,7 @@ def get_flat_static_rep( static_features = [c for c in feature_columns if c.startswith("STATIC_")] static_measurements = _summarize_static_measurements(static_features, df=shard_df) # fill up missing feature columns with nulls - normalized_measurements = _normalize_flat_rep_df_cols( + normalized_measurements = add_missing_cols( static_measurements, static_features, set_count_0_to_null=False, diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index be2c089..9d5956f 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -9,11 +9,7 @@ import polars as pl import polars.selectors as cs -from MEDS_tabular_automl.utils import ( - DF_T, - _normalize_flat_rep_df_cols, - _parse_flat_feature_column, -) +from MEDS_tabular_automl.utils import DF_T, add_missing_cols, parse_flat_feature_column def _summarize_dynamic_measurements( @@ -32,7 +28,7 @@ def _summarize_dynamic_measurements( valid_measures = {} for feat_col in feature_columns: - temp, meas, feat = _parse_flat_feature_column(feat_col) + temp, meas, feat = parse_flat_feature_column(feat_col) if temp != "dynamic": continue @@ -211,7 +207,7 @@ def f(c: str) -> str: cols_to_max.max().map_alias(time_aggd_col_alias_fntr()), ) - return _normalize_flat_rep_df_cols(df, set_count_0_to_null=True) + return add_missing_cols(df, set_count_0_to_null=True) def get_flat_ts_rep( @@ -220,7 +216,7 @@ def get_flat_ts_rep( ) -> pl.LazyFrame: 
"""Produce raw representation for dynamic data.""" - return _normalize_flat_rep_df_cols( + return add_missing_cols( _summarize_dynamic_measurements(feature_columns, **kwargs) .sort(by=["subject_id", "timestamp"]) .collect() diff --git a/src/MEDS_tabular_automl/tabularize.py b/src/MEDS_tabular_automl/tabularize.py index 83a0d19..86948f5 100644 --- a/src/MEDS_tabular_automl/tabularize.py +++ b/src/MEDS_tabular_automl/tabularize.py @@ -5,17 +5,15 @@ dataframes, etc. DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. """ - -import json -from collections.abc import Mapping, Sequence +from collections.abc import Mapping from pathlib import Path -import numpy as np import polars as pl -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from tqdm.auto import tqdm from MEDS_tabular_automl.generate_static_features import get_flat_static_rep +from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.utils import get_flat_rep_feature_cols, write_df @@ -28,38 +26,44 @@ def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: Returns: Mapping[str, pl.DataFrame]: Mapping from split name to a polars DataFrame containing the MEDS dataset. + + Example: + >>> import tempfile + >>> from pathlib import Path + >>> MEDS_cohort_dir = Path(tempfile.mkdtemp()) + >>> for split in ["train", "val", "test"]: + ... split_dir = MEDS_cohort_dir / split + ... split_dir.mkdir() + ... pl.DataFrame({"patient_id": [1, 2, 3]}).write_parquet(split_dir / "data.parquet") + >>> split_to_df = load_meds_data(MEDS_cohort_dir) + >>> assert "train" in split_to_df + >>> assert len(split_to_df) == 3 + >>> assert len(split_to_df["train"]) == 1 + >>> assert isinstance(split_to_df["train"][0], pl.DataFrame) """ MEDS_cohort_dir = Path(MEDS_cohort_dir) meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) splits = {fp.parent.stem for fp in meds_fps} - assert "train" in splits, f"Expected 'train' split in {splits}." split_to_fps = {split: [fp for fp in meds_fps if fp.parent.stem == split] for split in splits} split_to_df = { - split: pl.concat([pl.scan_parquet(fp) for fp in split_fps]) - for split, split_fps in split_to_fps.items() + split: [pl.scan_parquet(fp) for fp in split_fps] for split, split_fps in split_to_fps.items() } return split_to_df -def store_params_json(params_fp: Path, cfg: DictConfig, sp_subjects: Mapping[str, Sequence[Sequence[int]]]): +def store_config_yaml(config_fp: Path, cfg: DictConfig): """Stores configuration parameters into a JSON file. This function writes a dictionary of parameters, which includes patient partitioning - information and configuration details, to a specified JSON file. If the file already exists, - the function can update it with new values depending on the configuration settings provided. + information and configuration details, to a specified JSON file. - Parameters: - - params_fp (Path): The file path for the JSON file where parameters should be stored. + Args: + - config_fp (Path): The file path for the JSON file where config should be stored. - cfg (DictConfig): A configuration object containing settings like the number of patients per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. - - sp_subjects (Mapping[str, Sequence[Sequence[int]]]): A mapping of split names to sequences - representing patient IDs, structured in sub-shards. 
Behavior: - - If params_fp exists and cfg.do_update is True, the function checks for differences - between existing and new parameters. If discrepancies are found, it will raise an error detailing - the differences. The number of patients per sub-shard will be standardized to match the existing record. - - If params_fp exists and cfg.do_overwrite is False (without do_update being True), a + - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a FileExistsError is raised to prevent unintentional data loss. Raises: @@ -68,58 +72,22 @@ def store_params_json(params_fp: Path, cfg: DictConfig, sp_subjects: Mapping[str Example: >>> cfg = DictConfig({ - >>> "n_patients_per_sub_shard": 100, - >>> "min_code_inclusion_frequency": 5, - >>> "do_update": False, - >>> "do_overwrite": True - >>> }) - >>> sp_subjects = {"train": [[1, 2, 3], [4, 5]], "test": [[6, 7]]} - >>> params = store_params_json(Path("/path/to/params.json"), cfg, sp_subjects) + ... "n_patients_per_sub_shard": 100, + ... "min_code_inclusion_frequency": 5, + ... "do_update": False, + ... "do_overwrite": True + ... }) + >>> import tempfile + >>> from pathlib import Path + >>> with tempfile.TemporaryDirectory() as d: + ... config_fp = Path(d) / "config.yaml" + ... store_config_yaml(config_fp, cfg) + ... assert config_fp.exists() """ - params = { - "n_patients_per_sub_shard": cfg.n_patients_per_sub_shard, - "min_code_inclusion_frequency": cfg.min_code_inclusion_frequency, - "patient_shard_by_split": sp_subjects, - } - if params_fp.exists(): - if cfg.do_update: - with open(params_fp) as f: - old_params = json.load(f) - - if old_params["n_patients_per_sub_shard"] != params["n_patients_per_sub_shard"]: - print( - "Standardizing chunk size to existing record " - f"({old_params['n_patients_per_sub_shard']})." - ) - params["n_patients_per_sub_shard"] = old_params["n_patients_per_sub_shard"] - params["patient_shard_by_split"] = old_params["patient_shard_by_split"] - - if old_params != params: - err_strings = ["Asked to update but parameters differ:"] - old = set(old_params.keys()) - new = set(params.keys()) - if old != new: - err_strings.append("Keys differ: ") - if old - new: - err_strings.append(f" old - new = {old - new}") - if new - old: - err_strings.append(f" new - old = {old - new}") - - for k in old & new: - old_val = old_params[k] - new_val = params[k] - - if old_val != new_val: - err_strings.append(f"Values differ for {k}:") - err_strings.append(f" Old: {old_val}") - err_strings.append(f" New: {new_val}") - - raise ValueError("\n".join(err_strings)) - elif not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {params_fp} exists!") - with open(params_fp, mode="w") as f: - json.dump(params, f) - return params + if config_fp.exists(): + if not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") + OmegaConf.save(cfg, config_fp) def cache_flat_representation( @@ -164,9 +132,6 @@ def cache_flat_representation( .. 
_link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ - # setup rng seed - rng = np.random.default_rng(cfg.seed) - # create output dir flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" flat_dir.mkdir(exist_ok=True, parents=True) @@ -174,41 +139,24 @@ def cache_flat_representation( # load MEDS data split_to_df = load_meds_data(cfg.MEDS_cohort_dir) - # for every dataset split, create shards to output flat representations to - sp_subjects = {} - sp_dfs = {} - for split_name, split_df in split_to_df.items(): - split_patient_ids = ( - split_df.select(pl.col("patient_id").cast(pl.Int32).unique()).collect().to_series().to_list() - ) - print(len(split_patient_ids)) - if cfg.n_patients_per_sub_shard is None: - sp_subjects[split_name] = split_patient_ids - sp_dfs[split_name] = [split_df] - else: - shuffled_patient_ids = rng.permutation(split_patient_ids) - num_shards = max(len(split_patient_ids) // cfg.n_patients_per_sub_shard, 1) # must be 1 or larger - sharded_patient_ids = np.array_split(shuffled_patient_ids, num_shards) - sp_subjects[split_name] = [shard.tolist() for shard in sharded_patient_ids] - sp_dfs[split_name] = [ - split_df.filter(pl.col("patient_id").is_in(set(shard))) for shard in sharded_patient_ids - ] - # store params in json file - params_fp = flat_dir / "params.json" - store_params_json(params_fp, cfg, sp_subjects) + config_fp = flat_dir / "config.json" + store_config_yaml(config_fp, cfg) # 0. Identify Output Columns # We set window_sizes to None here because we want to get the feature column names for the raw flat # representation, not the summarized one. - feature_columns, code_properties = get_flat_rep_feature_cols(cfg, sp_dfs) + feature_columns = set() + for shard_df in split_to_df["train"]: + feature_columns.update(get_flat_rep_feature_cols(cfg, shard_df)) + feature_columns = sorted(list(feature_columns)) # 1. Produce static representation static_subdir = flat_dir / "static" static_dfs = {} actual_num_patients = 0 - for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): + for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): static_dfs[sp] = [] sp_dir = static_subdir / sp @@ -228,38 +176,37 @@ def cache_flat_representation( write_df(df, fp, do_overwrite=cfg.do_overwrite) actual_num_patients += df.shape[0] - expected_num_patients = sum(len(ids) for split_ids in sp_subjects.values() for ids in split_ids) - assert ( - actual_num_patients == expected_num_patients - ), f"Expected {expected_num_patients} patients, got {actual_num_patients}." - - # # 2. 
Produce raw representation - # ts_subdir = flat_dir / "at_ts" - # import pdb; pdb.set_trace() - - # ts_dfs = {} - # for sp, subjects_dfs in tqdm(list(sp_dfs.items()), desc="Flattening Splits"): - # ts_dfs[sp] = [] - # sp_dir = ts_subdir / sp - - # for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): - # fp = sp_dir / f"{i}.parquet" - # ts_dfs[sp].append(fp) - # if fp.exists(): - # if cfg.do_update: - # continue - # elif not cfg.do_overwrite: - # raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - # df = get_flat_ts_rep( - # feature_columns=feature_columns, - # shard_df=shard_df, - # ) - - # write_df(df, fp, do_overwrite=cfg.do_overwrite) - - # if cfg.window_sizes is None: - # return + # expected_num_patients = sum(len(ids) for split_ids in sp_subjects.values() for ids in split_ids) + # assert ( + # actual_num_patients == expected_num_patients + # ), f"Expected {expected_num_patients} patients, got {actual_num_patients}." + + # 2. Produce raw representation + ts_subdir = flat_dir / "at_ts" + + ts_dfs = {} + for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): + ts_dfs[sp] = [] + sp_dir = ts_subdir / sp + + for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): + fp = sp_dir / f"{i}.parquet" + ts_dfs[sp].append(fp) + if fp.exists(): + if cfg.do_update: + continue + elif not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + + df = get_flat_ts_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) + + write_df(df, fp, do_overwrite=cfg.do_overwrite) + + if cfg.window_sizes is None: + return # # 3. Produce summarized history representations # history_subdir = flat_dir / "over_history" diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index b2c9299..0d703df 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -5,30 +5,20 @@ dataframes, etc. DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. """ - -import enum -from collections import OrderedDict -from collections.abc import Sequence +import json +from collections.abc import Mapping from pathlib import Path import polars as pl import polars.selectors as cs +import yaml +from omegaconf import DictConfig, OmegaConf - -class CodeType(enum.Enum): - """Enum for the type of code.""" - - STATIC_CATEGORICAL = "STATIC_CATEGORICAL" - DYNAMIC_CATEGORICAL = "DYNAMIC_CATEGORICAL" - STATIC_CONTINUOUS = "STATIC_CONTINUOUS" - DYNAMIC_CONTINUOUS = "DYNAMIC_CONTINUOUS" - - -DF_T = pl.DataFrame +DF_T = pl.LazyFrame WRITE_USE_PYARROW = True -def _parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: +def parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: parts = c.split("/") if len(parts) < 3: raise ValueError(f"Column {c} is not a valid flat feature column!") @@ -50,47 +40,10 @@ def write_df(df: DF_T, fp: Path, **kwargs): df.write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) -def get_smallest_valid_uint_type(num: int | float | pl.Expr) -> pl.DataType: - """Returns the smallest valid unsigned integral type for an ID variable with `num` unique options. - - Args: - num: The number of IDs that must be uniquely expressed. - - Raises: - ValueError: If there is no unsigned int type big enough to express the passed number of ID - variables. 
- - Examples: - >>> import polars as pl - >>> Dataset.get_smallest_valid_uint_type(num=1) - UInt8 - >>> Dataset.get_smallest_valid_uint_type(num=2**8-1) - UInt16 - >>> Dataset.get_smallest_valid_uint_type(num=2**16-1) - UInt32 - >>> Dataset.get_smallest_valid_uint_type(num=2**32-1) - UInt64 - >>> Dataset.get_smallest_valid_uint_type(num=2**64-1) - Traceback (most recent call last): - ... - ValueError: Value is too large to be expressed as an int! - """ - if num >= (2**64) - 1: - raise ValueError("Value is too large to be expressed as an int!") - if num >= (2**32) - 1: - return pl.UInt64 - elif num >= (2**16) - 1: - return pl.UInt32 - elif num >= (2**8) - 1: - return pl.UInt16 - else: - return pl.UInt8 - - def get_flat_col_dtype(col: str) -> pl.DataType: """Gets the appropriate minimal dtype for the given flat representation column string.""" - code_type, code, agg = _parse_flat_feature_column(col) + code_type, code, agg = parse_flat_feature_column(col) match agg: case "sum" | "sum_sqd" | "min" | "max" | "value" | "first": @@ -99,15 +52,11 @@ def get_flat_col_dtype(col: str) -> pl.DataType: return pl.Boolean case "count" | "has_values_count": return pl.UInt32 - # TODO: reduce the dtype to the smallest possible unsigned int type - # return get_smallest_valid_uint_type(total_observations) case _: raise ValueError(f"Column name {col} malformed!") -def _normalize_flat_rep_df_cols( - flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False -) -> DF_T: +def add_missing_cols(flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False) -> DF_T: """Normalizes columns in a DataFrame so all expected columns are present and appropriately typed. Parameters: @@ -149,121 +98,143 @@ def _normalize_flat_rep_df_cols( return flat_df -def evaluate_code_properties(df, cfg): - """Evaluates and categorizes each code in a dataframe based on its timestamp presence and numerical - values. - - This function categorizes codes as 'dynamic' or 'static' based on the presence - of timestamps, and as 'continuous' or 'categorical' based on the presence of - numerical values. A code is considered: - - Dynamic if the ratio of present timestamps to its total occurrences exceeds - the configured dynamic threshold. - - Continuous if the ratio of non-null numerical values to total occurrences - exceeds the numerical value threshold - and there is more than one unique numerical value. +def get_static_feature_cols(shard_df) -> list[str]: + """Generates a list of feature column names from the data within each shard based on specified + configurations. Parameters: - - df (DataFrame): The dataframe containing the codes and their attributes. - - cfg (dict): Configuration dictionary with keys 'dynamic_threshold', 'numerical_value_threshold', - and 'min_code_inclusion_frequency' to determine the thresholds for categorizing codes. + - cfg (dict): Configuration dictionary specifying how features should be evaluated and aggregated. + - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). Returns: - - dict: A dictionary with code as keys and their properties (e.g., 'dynamic_continuous') as values. - Codes with total occurrences less than 'min_code_inclusion_frequency' are excluded. + - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties + identified during the evaluation. 
+ This function evaluates the properties of codes within training data and applies configured + aggregations to generate a comprehensive list of feature columns for modeling purposes. Examples: >>> import polars as pl >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], - ... 'timestamp': [None, '2021-01-01', None, '2021-01-02', '2021-01-03', '2021-01-04', None], + ... 'timestamp': [None, '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', None], # noqa: E501 ... 'numerical_value': [1, None, 2, 2, None, None, 3]} - >>> df = pl.DataFrame(data) - >>> cfg = {'dynamic_threshold': 0.5, 'numerical_value_threshold': 0.5, 'min_code_inclusion_frequency': 1} - >>> evaluate_code_properties(df, cfg) - {'A': 'static_categorical', 'B': 'dynamic_continuous', 'C': 'dynamic_categorical'} + >>> df = pl.DataFrame(data).lazy() + >>> get_static_feature_cols(df) + ['static/A/first', 'static/A/present', 'static/C/first', 'static/C/present'] """ - code_properties = OrderedDict() - for code in df.select(pl.col("code").unique()).collect().to_series(): - # Determine total count, timestamp count, and numerical count - code_data = df.filter(pl.col("code") == code) - total_count = code_data.select(pl.count("code")).collect().item() - if total_count < cfg["min_code_inclusion_frequency"]: - continue - - timestamp_count = code_data.select(pl.col("timestamp").count()).collect().item() - numerical_count = code_data.select(pl.col("numerical_value").count()).collect().item() - - # Determine dynamic vs static - is_dynamic = (timestamp_count / total_count) > cfg["dynamic_threshold"] - - # Determine categorical vs continuous - is_continuous = (numerical_count / total_count) > cfg[ - "numerical_value_threshold" - ] and code_data.select(pl.col("numerical_value").n_unique()).collect().item() > 1 - - match (is_dynamic, is_continuous): - case (False, False): - code_properties[code] = CodeType.STATIC_CATEGORICAL - case (False, True): - code_properties[code] = CodeType.STATIC_CONTINUOUS - case (True, False): - code_properties[code] = CodeType.DYNAMIC_CATEGORICAL - case (True, True): - code_properties[code] = CodeType.DYNAMIC_CONTINUOUS - - return code_properties + feature_columns = [] + static_df = shard_df.filter(pl.col("timestamp").is_null()) + for code in static_df.select(pl.col("code").unique()).collect().to_series(): + static_aggregations = [f"static/{code}/present", f"static/{code}/first"] + feature_columns.extend(static_aggregations) + return sorted(feature_columns) -def get_code_column(code: str, code_type: CodeType, aggs: Sequence[str]): - """Constructs feature column names based on a given code, its type, and specified aggregations. +def get_ts_feature_cols(aggregations: list[str], shard_df: DF_T) -> list[str]: + """Generates a list of feature column names from the data within each shard based on specified + configurations. Parameters: - - code (str): The specific code identifier for which the feature columns are being generated. - - code_type (CodeType): The type of the code (e.g., STATIC_CATEGORICAL, DYNAMIC_CONTINUOUS) - that determines how the code is processed. - - aggs (Sequence[str]): A list of aggregation operations to apply to the code, e.g., - "count", "sum". + - cfg (dict): Configuration dictionary specifying how features should be evaluated and aggregated. + - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). 
Returns: - - list[str]: A list of fully qualified feature column names constructed based on the - code type and applicable aggregations. + - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties + identified during the evaluation. - This function builds a list of feature column names using the code and its type to apply - the correct prefix and filters applicable aggregations based on whether they are relevant - to the code type. + This function evaluates the properties of codes within training data and applies configured + aggregations to generate a comprehensive list of feature columns for modeling purposes. + Examples: + >>> import polars as pl + >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], + ... 'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], + ... 'numerical_value': [1, None, 2, 2, None, None, 3]} + >>> df = pl.DataFrame(data).lazy() + >>> aggs = ['sum', 'count'] + >>> get_ts_feature_cols(aggs, df) + ['A/count', 'A/sum', 'C/count', 'C/sum'] """ - prefix = f"{code_type.value}/{code}" - if code_type == CodeType.STATIC_CATEGORICAL: - return [f"{prefix}/present"] - elif code_type == CodeType.DYNAMIC_CATEGORICAL: - valid_aggs = [agg[4:] for agg in aggs if agg.startswith("code")] - return [f"{prefix}/{agg}" for agg in valid_aggs] - elif code_type == CodeType.STATIC_CONTINUOUS: - return [f"{prefix}/present", f"{prefix}/first"] - elif code_type == CodeType.DYNAMIC_CONTINUOUS: - valid_aggs = [agg[5:] for agg in aggs if agg.startswith("value")] - return [f"{prefix}/{agg}" for agg in valid_aggs] - else: - raise ValueError(f"Invalid code type: {code_type}") + feature_columns = [] + ts_df = shard_df.filter(pl.col("timestamp").is_not_null()) + for code in ts_df.select(pl.col("code").unique()).collect().to_series(): + ts_aggregations = [f"{code}/{agg}" for agg in aggregations] + feature_columns.extend(ts_aggregations) + return sorted(feature_columns) -def get_flat_rep_feature_cols(cfg, split_to_shard_df) -> list[str]: +def get_flat_rep_feature_cols(cfg: DictConfig, shard_df: DF_T) -> list[str]: """Generates a list of feature column names from the data within each shard based on specified configurations. Parameters: - cfg (dict): Configuration dictionary specifying how features should be evaluated and aggregated. - - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). + - shard_df (DF_T): MEDS format dataframe shard. Returns: - - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties - identified during the evaluation. + - list[str]: list of all feature columns. This function evaluates the properties of codes within training data and applies configured aggregations to generate a comprehensive list of feature columns for modeling purposes. + Example: + >>> data = {'code': ['A', 'A', 'B', 'B'], + ... 'timestamp': [None, '2021-01-01', None, None], + ... 
'numerical_value': [1, None, 2, 2]} + >>> df = pl.DataFrame(data).lazy() + >>> aggs = ['sum', 'count'] + >>> cfg = DictConfig({'aggs': aggs}) + >>> get_flat_rep_feature_cols(cfg, df) + ['static/A/first', 'static/A/present', 'static/B/first', 'static/B/present', 'A/count', 'A/sum'] """ - feature_columns = [] - all_train_data = pl.concat(split_to_shard_df["train"]) - code_properties = evaluate_code_properties(all_train_data, cfg) - for code, code_type in code_properties.items(): - feature_columns.extend(get_code_column(code, code_type, cfg.aggs)) - return feature_columns, code_properties + static_feature_columns = get_static_feature_cols(shard_df) + ts_feature_columns = get_ts_feature_cols(cfg.aggs, shard_df) + return static_feature_columns + ts_feature_columns + + +def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: + """Loads the MEDS dataset from disk. + + Args: + MEDS_cohort_dir: The directory containing the MEDS datasets split by subfolders. + We expect `train` to be a split so `MEDS_cohort_dir/train` should exist. + + Returns: + Mapping[str, pl.DataFrame]: Mapping from split name to a polars DataFrame containing the MEDS dataset. + + Example: + >>> import tempfile + >>> from pathlib import Path + >>> MEDS_cohort_dir = Path(tempfile.mkdtemp()) + >>> for split in ["train", "val", "test"]: + ... split_dir = MEDS_cohort_dir / split + ... split_dir.mkdir() + ... pl.DataFrame({"patient_id": [1, 2, 3]}).write_parquet(split_dir / "data.parquet") + >>> split_to_df = load_meds_data(MEDS_cohort_dir) + >>> assert "train" in split_to_df + >>> assert len(split_to_df) == 3 + >>> assert len(split_to_df["train"]) == 1 + >>> assert isinstance(split_to_df["train"][0], pl.DataFrame) + """ + MEDS_cohort_dir = Path(MEDS_cohort_dir) + meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) + splits = {fp.parent.stem for fp in meds_fps} + split_to_fps = {split: [fp for fp in meds_fps if fp.parent.stem == split] for split in splits} + split_to_df = { + split: [pl.scan_parquet(fp) for fp in split_fps] for split, split_fps in split_to_fps.items() + } + return split_to_df + + +def setup_environment(cfg: DictConfig): + # check output dir + flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" + assert flat_dir.exists() + + # load MEDS data + split_to_df = load_meds_data(cfg.MEDS_cohort_dir) + feature_columns = json.load(open(flat_dir / "feature_columns.json")) + + # Check that the stored config matches the current config + with open(flat_dir / "config.yaml") as file: + yaml_config = yaml.safe_load(file) + stored_config = OmegaConf.create(yaml_config) + assert stored_config == cfg, "Stored config does not match current config." 
+ return flat_dir, split_to_df, feature_columns diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 0c72576..730ee0b 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -11,7 +11,10 @@ from hydra import compose, initialize from loguru import logger -from MEDS_tabular_automl.tabularize import cache_flat_representation +from scripts.identify_columns import store_columns +from scripts.summarize_over_windows import summarize_ts_data_over_windows +from scripts.tabularize_static import tabularize_static_data +from scripts.tabularize_ts import tabularize_ts_data SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -134,5 +137,8 @@ def test_tabularize(): with initialize(version_base=None, config_path="../configs/"): # path to config.yaml overrides = [f"{k}={v}" for k, v in tabularize_config_kwargs.items()] cfg = compose(config_name="tabularize", overrides=overrides) # config.yaml - logger.info("caching flat representation of MEDS data") - cache_flat_representation(cfg) + logger.info("caching flat representation of MEDS data") + store_columns(cfg) + tabularize_static_data(cfg) + tabularize_ts_data(cfg) + summarize_ts_data_over_windows(cfg) From 63b9ba6c42696d6f97eddfb319169123a5626915 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 27 May 2024 00:54:26 +0000 Subject: [PATCH 005/106] fixed doctests and updated github workflow tests to use python 3.12 --- .github/workflows/tests.yaml | 4 ++-- src/MEDS_tabular_automl/tabularize.py | 2 +- src/MEDS_tabular_automl/utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index b32a1bd..4a0dbf0 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -19,10 +19,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.11 + - name: Set up Python 3.12 uses: actions/setup-python@v3 with: - python-version: "3.11" + python-version: "3.12" - name: Install packages run: | diff --git a/src/MEDS_tabular_automl/tabularize.py b/src/MEDS_tabular_automl/tabularize.py index 86948f5..a5ab4c8 100644 --- a/src/MEDS_tabular_automl/tabularize.py +++ b/src/MEDS_tabular_automl/tabularize.py @@ -39,7 +39,7 @@ def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: >>> assert "train" in split_to_df >>> assert len(split_to_df) == 3 >>> assert len(split_to_df["train"]) == 1 - >>> assert isinstance(split_to_df["train"][0], pl.DataFrame) + >>> assert isinstance(split_to_df["train"][0], pl.LazyFrame) """ MEDS_cohort_dir = Path(MEDS_cohort_dir) meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 0d703df..85ec597 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -211,7 +211,7 @@ def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: >>> assert "train" in split_to_df >>> assert len(split_to_df) == 3 >>> assert len(split_to_df["train"]) == 1 - >>> assert isinstance(split_to_df["train"][0], pl.DataFrame) + >>> assert isinstance(split_to_df["train"][0], pl.LazyFrame) """ MEDS_cohort_dir = Path(MEDS_cohort_dir) meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) From cd067f816d8b3a9372e42692ecce813cb9da2fc2 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 27 May 2024 04:59:24 +0000 Subject: [PATCH 006/106] Implement data processing for MEDS format to pivot tables 
into two indexed file types: - Code data: Files with a patient_id and timestamp index, containing columns for each code with binary presence indicators (1 and 0). - Value data: Files similar in structure but containing the numerical values observed for each code. --- scripts/identify_columns.py | 2 +- scripts/tabularize_ts.py | 52 ++- .../generate_ts_features.py | 341 +++++++----------- src/MEDS_tabular_automl/utils.py | 2 +- tests/test_tabularize.py | 22 +- 5 files changed, 183 insertions(+), 236 deletions(-) diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py index d74f811..4334df8 100644 --- a/scripts/identify_columns.py +++ b/scripts/identify_columns.py @@ -98,7 +98,7 @@ def store_columns( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ # create output dir - flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" + flat_dir = Path(cfg.tabularized_data_dir) flat_dir.mkdir(exist_ok=True, parents=True) # load MEDS data diff --git a/scripts/tabularize_ts.py b/scripts/tabularize_ts.py index 2d9ac95..33e9dec 100644 --- a/scripts/tabularize_ts.py +++ b/scripts/tabularize_ts.py @@ -1,25 +1,21 @@ -"""WIP.""" import hydra from omegaconf import DictConfig +from tqdm import tqdm -from MEDS_tabular_automl.utils import setup_environment +from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep +from MEDS_tabular_automl.utils import setup_environment, write_df @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") def tabularize_ts_data( cfg: DictConfig, ): - """Writes a flat (historically summarized) representation of the dataset to disk. + """Processes a medical dataset to generates and stores flat representatiosn of time-series data. - This file caches a set of files useful for building flat representations of the dataset to disk, - suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: - - * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: - * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a - set of parquet files containing flat (e.g., wide) representations of summarized events per subject, - broken out by split and subject chunk. - * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period - per subject per event, for all time periods in ``window_sizes``, if any. + This function handles MEDS format data and pivots tables to create two types of data files + with patient_id and timestamp indexes: + code data: containing a column for every code and 1 and 0 values indicating presence + value data: containing a column for every code which the numerical value observed. Args: cfg: @@ -35,8 +31,8 @@ def tabularize_ts_data( specified in this argument. These are strings specifying time deltas, using this syntax: `link`_. Each window size will be summarized to a separate directory, and will share the same subject file split as is used in the raw representation files. - codes: A list of codes to include in the flat representation. If `None`, all codes will be included - in the flat representation. + codes: A list of codes to include in the flat representation. If `None`, all codes will be + included in the flat representation. aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. 
n_patients_per_sub_shard: The number of subjects that should be included in each output file. Lowering this number increases the number of files written, making the process of creating and @@ -45,7 +41,29 @@ def tabularize_ts_data( directory. do_update: bool = True seed: The seed to use for random number generation. - - .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ - setup_environment(cfg) + flat_dir, split_to_df, feature_columns = setup_environment(cfg) + # Produce ts representation + ts_subdir = flat_dir / "ts" + + for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): + sp_dir = ts_subdir / sp + + for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): + code_fp = sp_dir / f"{i}_code.parquet" + value_fp = sp_dir / f"{i}_value.parquet" + if code_fp.exists() or value_fp.exists(): + if cfg.do_update: + continue + elif not cfg.do_overwrite: + raise FileExistsError( + f"do_overwrite is {cfg.do_overwrite} and {code_fp.exists()}" + f" or {value_fp.exists()} exists!" + ) + + code_df, value_df = get_flat_ts_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) + write_df(code_df, code_fp, do_overwrite=cfg.do_overwrite) + write_df(value_df, value_fp, do_overwrite=cfg.do_overwrite) diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 9d5956f..12ac571 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -1,226 +1,137 @@ -"""WIP. - -This file will be used to generate time series features from the raw data. -""" -from collections.abc import Callable -from pathlib import Path - -import numpy as np import polars as pl -import polars.selectors as cs - -from MEDS_tabular_automl.utils import DF_T, add_missing_cols, parse_flat_feature_column - - -def _summarize_dynamic_measurements( - self, - feature_columns: list[str], - include_only_subjects: set[int] | None = None, -) -> pl.LazyFrame: - if include_only_subjects is None: - df = self.dynamic_measurements_df - else: - df = self.dynamic_measurements_df.join( - self.events_df.filter(pl.col("subject_id").is_in(list(include_only_subjects))).select("event_id"), - on="event_id", - how="inner", - ) - - valid_measures = {} - for feat_col in feature_columns: - temp, meas, feat = parse_flat_feature_column(feat_col) - - if temp != "dynamic": - continue - if meas not in valid_measures: - valid_measures[meas] = set() - valid_measures[meas].add(feat) +from MEDS_tabular_automl.utils import DF_T - out_dfs = {} - for m, allowed_vocab in valid_measures.items(): - cfg = self.measurement_configs[m] - - total_observations = int( - np.ceil( - cfg.observation_rate_per_case - * cfg.observation_rate_over_cases - * sum(self.n_events_per_subject.values()) - ) - ) - - count_type = self.get_smallest_valid_uint_type(total_observations) - - if cfg.modality == "univariate_regression" and cfg.vocabulary is None: - prefix = f"dynamic/{m}/{m}" - - key_col = pl.col(m) - val_col = pl.col(m).drop_nans().cast(pl.Float32) - - out_dfs[m] = ( - df.lazy() - .select("measurement_id", "event_id", m) - .filter(pl.col(m).is_not_null()) - .groupby("event_id") - .agg( - pl.col(m).is_not_null().sum().cast(count_type).alias(f"{prefix}/count"), - ( - (pl.col(m).is_not_nan() & pl.col(m).is_not_null()) - .sum() - .cast(count_type) - .alias(f"{prefix}/has_values_count") - ), - val_col.sum().alias(f"{prefix}/sum"), - 
(val_col**2).sum().alias(f"{prefix}/sum_sqd"), - val_col.min().alias(f"{prefix}/min"), - val_col.max().alias(f"{prefix}/max"), - ) - ) - continue - elif cfg.modality == "multivariate_regression": - column_cols = [m, m] - values_cols = [m, cfg.values_column] - key_prefix = f"{m}_{m}_" - val_prefix = f"{cfg.values_column}_{m}_" - - key_col = cs.starts_with(key_prefix) - val_col = cs.starts_with(val_prefix).drop_nans().cast(pl.Float32) - - aggs = [ - key_col.is_not_null() - .sum() - .cast(count_type) - .map_alias(lambda c: f"dynamic/{m}/{c.replace(key_prefix, '')}/count"), - ( - (cs.starts_with(val_prefix).is_not_null() & cs.starts_with(val_prefix).is_not_nan()) - .sum() - .map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/has_values_count") - ), - val_col.sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum"), - (val_col**2).sum().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/sum_sqd"), - val_col.min().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/min"), - val_col.max().map_alias(lambda c: f"dynamic/{m}/{c.replace(val_prefix, '')}/max"), - ] - else: - column_cols = [m] - values_cols = [m] - aggs = [ - pl.all().is_not_null().sum().cast(count_type).map_alias(lambda c: f"dynamic/{m}/{c}/count") - ] - - ID_cols = ["measurement_id", "event_id"] - out_dfs[m] = ( - df.select(*ID_cols, *set(column_cols + values_cols)) - .filter(pl.col(m).is_in(allowed_vocab)) - .pivot( - index=ID_cols, - columns=column_cols, - values=values_cols, - aggregate_function=None, - ) - .lazy() - .drop("measurement_id") - .groupby("event_id") - .agg(*aggs) - ) +VALID_AGGREGATIONS = [ + "sum", + "sum_sqd", + "min", + "max", + "value", + "first", + "present", + "count", + "has_values_count", +] - return pl.concat(list(out_dfs.values()), how="align") - -def _summarize_over_window(df: DF_T, window_size: str) -> pl.LazyFrame: - """Apply aggregations to the raw representation over a window size.""" - if isinstance(df, Path): - df = pl.scan_parquet(df) - - def time_aggd_col_alias_fntr(new_agg: str | None = None) -> Callable[[str], str]: - if new_agg is None: - - def f(c: str) -> str: - return "/".join([window_size] + c.split("/")[1:]) - - else: - - def f(c: str) -> str: - return "/".join([window_size] + c.split("/")[1:-1] + [new_agg]) - - return f - - # Columns to convert to counts: - present_indicator_cols = cs.ends_with("/present") - - # Columns to convert to value aggregations: - value_cols = cs.ends_with("/value") - - # Columns to aggregate via other operations - cnt_cols = (cs.ends_with("/count") | cs.ends_with("/has_values_count")).fill_null(0) - - cols_to_sum = cs.ends_with("/sum") | cs.ends_with("/sum_sqd") - cols_to_min = cs.ends_with("/min") - cols_to_max = cs.ends_with("/max") - - if window_size == "FULL": - df = df.groupby("subject_id").agg( - "timestamp", - # present to counts - present_indicator_cols.cumsum().map_alias(time_aggd_col_alias_fntr("count")), - # values to stats - value_cols.is_not_null().cumsum().map_alias(time_aggd_col_alias_fntr("count")), - ( - (value_cols.is_not_null() & value_cols.is_not_nan()) - .cumsum() - .map_alias(time_aggd_col_alias_fntr("has_values_count")) - ), - value_cols.cumsum().map_alias(time_aggd_col_alias_fntr("sum")), - (value_cols**2).cumsum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), - value_cols.cummin().map_alias(time_aggd_col_alias_fntr("min")), - value_cols.cummax().map_alias(time_aggd_col_alias_fntr("max")), - # Raw aggregations - cnt_cols.cumsum().map_alias(time_aggd_col_alias_fntr()), - 
cols_to_sum.cumsum().map_alias(time_aggd_col_alias_fntr()), - cols_to_min.cummin().map_alias(time_aggd_col_alias_fntr()), - cols_to_max.cummax().map_alias(time_aggd_col_alias_fntr()), - ) - df = df.explode(*[c for c in df.columns if c != "subject_id"]) - else: - df = df.groupby_rolling( - index_column="timestamp", - by="subject_id", - period=window_size, - ).agg( - # present to counts - present_indicator_cols.sum().map_alias(time_aggd_col_alias_fntr("count")), - # values to stats - value_cols.is_not_null().sum().map_alias(time_aggd_col_alias_fntr("count")), - ( - (value_cols.is_not_null() & value_cols.is_not_nan()) - .sum() - .map_alias(time_aggd_col_alias_fntr("has_values_count")) - ), - value_cols.sum().map_alias(time_aggd_col_alias_fntr("sum")), - (value_cols**2).sum().map_alias(time_aggd_col_alias_fntr("sum_sqd")), - value_cols.min().map_alias(time_aggd_col_alias_fntr("min")), - value_cols.max().map_alias(time_aggd_col_alias_fntr("max")), - # Raw aggregations - cnt_cols.sum().map_alias(time_aggd_col_alias_fntr()), - cols_to_sum.sum().map_alias(time_aggd_col_alias_fntr()), - cols_to_min.min().map_alias(time_aggd_col_alias_fntr()), - cols_to_max.max().map_alias(time_aggd_col_alias_fntr()), +def summarize_dynamic_measurements( + ts_columns: list[str], + df: DF_T, +) -> pl.LazyFrame: + """Summarize dynamic measurements for feature columns that are marked as 'dynamic'. + + Args: + - ts_columns (list[str]): List of feature column identifiers that are specifically marked for dynamic + analysis. + - shard_df (DF_T): Data frame from which features will be extracted and summarized. + + Returns: + - pl.LazyFrame: A summarized data frame containing the dynamic features. + + Example: + >>> data = {'patient_id': [1, 1, 1, 2], + ... 'code': ['A', 'A', 'B', 'B'], + ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04'], + ... 
'numerical_value': [1, 2, 2, 2]} + >>> df = pl.DataFrame(data).lazy() + >>> ts_columns = ['A', 'B'] + >>> code_df, value_df = summarize_dynamic_measurements(ts_columns, df) + >>> code_df.collect() + shape: (4, 4) + ┌────────────┬────────┬────────┬────────────┐ + │ patient_id ┆ code/A ┆ code/B ┆ timestamp │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ u8 ┆ u8 ┆ str │ + ╞════════════╪════════╪════════╪════════════╡ + │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ + │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ + │ 1 ┆ 0 ┆ 1 ┆ 2020-01-01 │ + │ 2 ┆ 0 ┆ 1 ┆ 2021-01-04 │ + └────────────┴────────┴────────┴────────────┘ + >>> value_df.collect() + shape: (3, 4) + ┌────────────┬────────────┬─────────┬─────────┐ + │ patient_id ┆ timestamp ┆ value/A ┆ value/B │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 ┆ f64 │ + ╞════════════╪════════════╪═════════╪═════════╡ + │ 1 ┆ 2021-01-01 ┆ 1.5 ┆ null │ + │ 1 ┆ 2020-01-01 ┆ null ┆ 2.0 │ + │ 2 ┆ 2021-01-04 ┆ null ┆ 2.0 │ + └────────────┴────────────┴─────────┴─────────┘ + """ + + value_df = ( + df.select("patient_id", "timestamp", "code", "numerical_value") + .collect() + .pivot( + index=["patient_id", "timestamp"], + columns=["code"], + values=["numerical_value"], + aggregate_function="mean", # TODO round up counts so they are binary + separator="/", ) - - return add_missing_cols(df, set_count_0_to_null=True) + .lazy() + ) + value_df = value_df.rename(lambda c: f"value/{c}" if c not in ["patient_id", "timestamp"] else c) + code_df = df.drop("numerical_value").collect().to_dummies(columns=["code"], separator="/").lazy() + return code_df, value_df def get_flat_ts_rep( feature_columns: list[str], - **kwargs, + shard_df: DF_T, ) -> pl.LazyFrame: - """Produce raw representation for dynamic data.""" - - return add_missing_cols( - _summarize_dynamic_measurements(feature_columns, **kwargs) - .sort(by=["subject_id", "timestamp"]) - .collect() - .lazy(), - [c for c in feature_columns if c.startswith("dynamic")], - ) - # The above .collect().lazy() shouldn't be necessary but it appears to be for some reason... + """Produce a flat time series representation from a given data frame, focusing on non-static feature + columns. + + This function filters the given data frame for non-static features based on the 'feature_columns' + provided and generates a flat time series representation using these dynamic features. The resulting + data frame includes both codes and values transformed and aggregated appropriately. + + Args: + feature_columns (list[str]): A list of column identifiers that determine which features are considered + for dynamic analysis. + shard_df (DF_T): The data frame containing time-stamped data from which features will be extracted + and summarized. + + Returns: + pl.LazyFrame: A LazyFrame consisting of the processed time series data, combining both code and value + representations. + + Example: + >>> feature_columns = ['A', 'B', 'C', "static/A"] + >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], + ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], + ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], + ... 
'numerical_value': [1, 2, 2, 2, 3, 4]} + >>> df = pl.DataFrame(data).lazy() + >>> code_df, value_df = get_flat_ts_rep(feature_columns, df) + >>> code_df.collect() + shape: (4, 4) + ┌────────────┬────────┬────────┬────────────┐ + │ patient_id ┆ code/A ┆ code/B ┆ timestamp │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ u8 ┆ u8 ┆ str │ + ╞════════════╪════════╪════════╪════════════╡ + │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ + │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ + │ 1 ┆ 0 ┆ 1 ┆ 2020-01-01 │ + │ 2 ┆ 0 ┆ 1 ┆ 2021-01-04 │ + └────────────┴────────┴────────┴────────────┘ + >>> value_df.collect() + shape: (3, 4) + ┌────────────┬────────────┬─────────┬─────────┐ + │ patient_id ┆ timestamp ┆ value/A ┆ value/B │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 ┆ f64 │ + ╞════════════╪════════════╪═════════╪═════════╡ + │ 1 ┆ 2021-01-01 ┆ 1.5 ┆ null │ + │ 1 ┆ 2020-01-01 ┆ null ┆ 2.0 │ + │ 2 ┆ 2021-01-04 ┆ null ┆ 2.0 │ + └────────────┴────────────┴─────────┴─────────┘ + """ + ts_columns = [c for c in feature_columns if not c.startswith("static")] + ts_shard_df = shard_df.filter(pl.col("timestamp").is_not_null()) + return summarize_dynamic_measurements(ts_columns, ts_shard_df) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 85ec597..bb68cad 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -225,7 +225,7 @@ def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: def setup_environment(cfg: DictConfig): # check output dir - flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" + flat_dir = Path(cfg.tabularized_data_dir) assert flat_dir.exists() # load MEDS data diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 730ee0b..24c8f1c 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -104,11 +104,10 @@ def test_tabularize(): with tempfile.TemporaryDirectory() as d: MEDS_cohort_dir = Path(d) / "MEDS_cohort" - tabularized_data_dir = Path(d) / "cached_reps" + tabularized_data_dir = Path(d) / "flat_reps" # Create the directories MEDS_cohort_dir.mkdir() - tabularized_data_dir.mkdir() # Store MEDS outputs for split, data in MEDS_OUTPUTS.items(): @@ -140,5 +139,24 @@ def test_tabularize(): logger.info("caching flat representation of MEDS data") store_columns(cfg) tabularize_static_data(cfg) + actual_files = [ + (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("static/*/*.parquet")) + ] + expected_files = [("train", "1"), ("train", "0"), ("held_out", "0"), ("tuning", "0")] + assert set(actual_files) == set(expected_files) tabularize_ts_data(cfg) + # confirm the time series files exist: + actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.parquet"))] + expected_files = [ + ("train", "1_value"), + ("train", "0_code"), + ("train", "0_value"), + ("train", "1_code"), + ("held_out", "0_code"), + ("held_out", "0_value"), + ("tuning", "0_code"), + ("tuning", "0_value"), + ] + assert set(actual_files) == set(expected_files) + summarize_ts_data_over_windows(cfg) From c8ca3bb765fb417b6565be4fc6edde0c72d723fd Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 27 May 2024 08:51:43 +0000 Subject: [PATCH 007/106] Enhance data aggregation framework with dynamic window and aggregation handling - Introduce VALID_AGGREGATIONS to define permissible aggregations. - Implement to generate dynamic column aliases based on window size and aggregation. - Extend for dynamic expression creation based on aggregation type and window size, handling both cumulative and windowed aggregations. 
- Enhance to apply specified aggregations over defined window sizes, ensuring comprehensive data summarization. - Update to handle multiple dataframes, aggregate data using specified window sizes and aggregations, and ensure inclusion of all specified feature columns, adding missing ones with default values. - Add extensive doctests to ensure accuracy of the summarization functions, demonstrating usage with both code and value data types. --- configs/tabularize.yaml | 10 +- scripts/summarize_over_windows.py | 46 +++- .../generate_static_features.py | 4 +- .../generate_summarized_reps.py | 250 ++++++++++++++++++ .../generate_ts_features.py | 12 - 5 files changed, 299 insertions(+), 23 deletions(-) create mode 100644 src/MEDS_tabular_automl/generate_summarized_reps.py diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 17b8ab5..deb4caa 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -8,18 +8,14 @@ window_sizes: ??? codes: null aggs: - "code/count" - - "code/time_since_last" - - "code/time_since_first" + - "code/present" - "value/count" + - "value/present" - "value/sum" - "value/sum_sqd" - "value/min" - - "value/time_since_min" - "value/max" - - "value/time_since_max" - - "value/last" - - "value/slope" - - "value/intercept" + - "value/first" dynamic_threshold: 0.01 numerical_value_threshold: 0.1 diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index acf1d51..157bca4 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -1,10 +1,15 @@ """WIP.""" +from pathlib import Path + import hydra +import polars as pl +from loguru import logger from omegaconf import DictConfig -from MEDS_tabular_automl.utils import setup_environment +from MEDS_tabular_automl.generate_summarized_reps import generate_summary +from MEDS_tabular_automl.utils import setup_environment, write_df @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") @@ -50,4 +55,41 @@ def summarize_ts_data_over_windows( .. 
_link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ - setup_environment(cfg) + flat_dir, _, feature_columns = setup_environment(cfg) + + # Assuming MEDS_cohort_dir is correctly defined somewhere above this snippet + ts_dir = Path(cfg.tabularized_data_dir) / "ts" + ts_fps = list(ts_dir.glob("*/*.parquet")) + splits = {fp.parent.stem for fp in ts_fps} + + split_to_pair_fps = {} + for split in splits: + # Categorize files by identifier (base name without '_code' or '_value') using a list comprehension + categorized_files = { + file.stem.rsplit("_", 1)[0]: {"code": None, "value": None} + for file in ts_fps + if file.parent.stem == split + } + for file in ts_fps: + if file.parent.stem == split: + identifier = file.stem.rsplit("_", 1)[0] + suffix = file.stem.split("_")[-1] # 'code' or 'value' + categorized_files[identifier][suffix] = file + + # Process categorized files into pairs ensuring code is first and value is second + code_value_pairs = [ + (info["code"], info["value"]) + for info in categorized_files.values() + if info["code"] is not None and info["value"] is not None + ] + + split_to_pair_fps[split] = code_value_pairs + + # Example use of split_to_pair_fps + for split, pairs in split_to_pair_fps.items(): + logger.info(f"Processing {split}:") + for code_file, value_file in pairs: + logger.info(f" - Code file: {code_file}, Value file: {value_file}") + summary_df = generate_summary(pl.scan_parquet(code_file), pl.scan_parquet(value_file)) + shard_number = code_file.stem.rsplit("_", 1)[0] + write_df(summary_df, flat_dir / split / f"{shard_number}.parquet") diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 6da9610..ee28b77 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -13,7 +13,7 @@ from MEDS_tabular_automl.utils import DF_T, add_missing_cols, parse_flat_feature_column -def _summarize_static_measurements( +def summarize_static_measurements( feature_columns: list[str], df: DF_T, ) -> pl.LazyFrame: @@ -98,7 +98,7 @@ def get_flat_static_rep( suitable for further analysis or machine learning tasks. 
""" static_features = [c for c in feature_columns if c.startswith("STATIC_")] - static_measurements = _summarize_static_measurements(static_features, df=shard_df) + static_measurements = summarize_static_measurements(static_features, df=shard_df) # fill up missing feature columns with nulls normalized_measurements = add_missing_cols( static_measurements, diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py new file mode 100644 index 0000000..9e9202c --- /dev/null +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -0,0 +1,250 @@ +from collections.abc import Callable + +import polars as pl +import polars.selectors as cs + +from MEDS_tabular_automl.utils import DF_T + +VALID_AGGREGATIONS = [ + "code/count", + "value/count", + "value/has_values_count", + "value/sum", + "value/sum_sqd", + "value/min", + "value/max", + "value/first", +] + + +def time_aggd_col_alias_fntr(window_size: str, agg: str) -> Callable[[str], str]: + assert agg is not None, "agg must be provided" + + def f(c: str) -> str: + return "/".join([window_size] + c.split("/") + [agg]) + + return f + + +def get_agg_pl_expr(window_size: str, agg: str): + code_cols = cs.starts_with("code/") + value_cols = cs.starts_with("value/") + if window_size == "full": + match agg: + case "code/count": + return code_cols.cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "count")) + case "value/count": + return ( + value_cols.is_not_null() + .cumsum() + .map_alias(time_aggd_col_alias_fntr(window_size, "count")) + ) + case "value/has_values_count": + return ( + (value_cols.is_not_null() & value_cols.is_not_nan()) + .cumsum() + .map_alias(time_aggd_col_alias_fntr(window_size, "has_values_count")) + ) + case "value/sum": + return value_cols.cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "sum")) + case "value/sum_sqd": + return (value_cols**2).cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "sum_sqd")) + case "value/min": + value_cols.cummin().map_alias(time_aggd_col_alias_fntr(window_size, "min")) + case "value/max": + value_cols.cummax().map_alias(time_aggd_col_alias_fntr(window_size, "max")) + case _: + raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") + else: + match agg: + case "code/count": + return code_cols.sum().map_alias(time_aggd_col_alias_fntr(window_size, "count")) + case "value/count": + return ( + value_cols.is_not_null().sum().map_alias(time_aggd_col_alias_fntr(window_size, "count")) + ) + case "value/has_values_count": + return ( + (value_cols.is_not_null() & value_cols.is_not_nan()) + .sum() + .map_alias(time_aggd_col_alias_fntr(window_size, "has_values_count")) + ) + case "value/sum": + return value_cols.sum().map_alias(time_aggd_col_alias_fntr(window_size, "sum")) + case "value/sum_sqd": + return (value_cols**2).sum().map_alias(time_aggd_col_alias_fntr(window_size, "sum_sqd")) + case "value/min": + value_cols.min().map_alias(time_aggd_col_alias_fntr(window_size, "min")) + case "value/max": + value_cols.max().map_alias(time_aggd_col_alias_fntr(window_size, "max")) + case _: + raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") + + +def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: + """Generate a summary of the data frame for a given window size and aggregation. + + Args: + - df (DF_T): The data frame to summarize. + - window_size (str): The window size to use for the summary. + - agg (str): The aggregation to apply to the data frame. 
+ + Returns: + - pl.LazyFrame: The summarized data frame. + + Expect: + >>> from datetime import date + >>> code_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + ... "code/A": [1, 1, 0, 0], + ... "code/B": [0, 0, 1, 1], + ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2),date(2020, 1, 3), date(2021, 1, 4)], + ... }).lazy() + >>> _generate_summary(code_df.lazy(), "full", "code/count" + ... ).collect().sort(["patient_id", "timestamp"]) + shape: (4, 4) + ┌────────────┬────────────┬───────────────────┬───────────────────┐ + │ patient_id ┆ timestamp ┆ full/code/A/count ┆ full/code/B/count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ date ┆ i64 ┆ i64 │ + ╞════════════╪════════════╪═══════════════════╪═══════════════════╡ + │ 1 ┆ 2020-01-03 ┆ 2 ┆ 1 │ + │ 1 ┆ 2021-01-01 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-02 ┆ 2 ┆ 0 │ + │ 2 ┆ 2021-01-04 ┆ 0 ┆ 1 │ + └────────────┴────────────┴───────────────────┴───────────────────┘ + >>> value_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2), + ... date(2020, 1, 3), date(2021, 1, 4)], + ... "value/A": [1, 2, 3, None], + ... "value/B": [None, None, None, 4.0],}) + >>> _generate_summary(value_df.lazy(), "full", "value/sum").collect().sort( + ... ["patient_id", "timestamp"]) + shape: (4, 4) + ┌────────────┬────────────┬──────────────────┬──────────────────┐ + │ patient_id ┆ timestamp ┆ full/value/A/sum ┆ full/value/B/sum │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ date ┆ i64 ┆ f64 │ + ╞════════════╪════════════╪══════════════════╪══════════════════╡ + │ 1 ┆ 2020-01-03 ┆ 6 ┆ null │ + │ 1 ┆ 2021-01-01 ┆ 1 ┆ null │ + │ 1 ┆ 2021-01-02 ┆ 3 ┆ null │ + │ 2 ┆ 2021-01-04 ┆ null ┆ 4.0 │ + └────────────┴────────────┴──────────────────┴──────────────────┘ + >>> _generate_summary(value_df.lazy(), "1d", "value/count").collect().sort( + ... ["patient_id", "timestamp"]) + shape: (4, 4) + ┌────────────┬────────────┬──────────────────┬──────────────────┐ + │ patient_id ┆ timestamp ┆ 1d/value/A/count ┆ 1d/value/B/count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ date ┆ u32 ┆ u32 │ + ╞════════════╪════════════╪══════════════════╪══════════════════╡ + │ 1 ┆ 2020-01-03 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-01 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-02 ┆ 1 ┆ 0 │ + │ 2 ┆ 2021-01-04 ┆ 0 ┆ 1 │ + └────────────┴────────────┴──────────────────┴──────────────────┘ + """ + assert agg in VALID_AGGREGATIONS, f"Invalid aggregation: {agg}" + assert agg.split("/")[0] in [ + c.split("/")[0] for c in df.columns + ], f"df is invalid, no column with prefix: `{agg.split('/')[0]}`" + + if window_size == "full": + out_df = df.groupby("patient_id").agg( + "timestamp", + get_agg_pl_expr(window_size, agg), + ) + out_df = out_df.explode(*[c for c in out_df.columns if c != "patient_id"]) + else: + out_df = ( + df.sort(["patient_id", "timestamp"]) + .groupby_rolling( + index_column="timestamp", + by="patient_id", + period=window_size, + ) + .agg( + get_agg_pl_expr(window_size, agg), + ) + ) + + return out_df + + +def generate_summary( + feature_columns: list[str], dfs: list[pl.LazyFrame], window_sizes: list[str], aggregations: list[str] +) -> pl.LazyFrame: + """Generate a summary of the data frame for given window sizes and aggregations. + + This function processes a dataframe to apply specified aggregations over defined window sizes. + It then joins the resulting frames on 'patient_id' and 'timestamp', and ensures all specified + feature columns exist in the final output, adding missing ones with default values. 
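Since the fixed-window branch above hinges on `groupby_rolling`, here is a small self-contained illustration of what that call produces on a toy frame. This is only a sketch, assuming the same polars API (`groupby_rolling` with `by` and `period`) that this patch targets:

```python
from datetime import date

import polars as pl

df = pl.DataFrame(
    {
        "patient_id": [1, 1, 1, 2],
        "timestamp": [date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 5), date(2021, 1, 4)],
        "value/A": [1.0, 2.0, 3.0, 4.0],
    }
)

# For each row, aggregate over the trailing 2-day window ending at that row's timestamp,
# computed separately per patient.
rolled = (
    df.sort(["patient_id", "timestamp"])
    .groupby_rolling(index_column="timestamp", by="patient_id", period="2d")
    .agg(pl.col("value/A").sum().alias("2d/value/A/sum"))
)
print(rolled)
```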
+ + Args: + feature_columns (list[str]): List of all feature columns that must exist in the final output. + df (list[pl.LazyFrame]): The input dataframes to process, expected to be length 2 list with code_df + (pivoted shard with binary presence of codes) and value_df (pivoted shard with numerical values + for each code). + window_sizes (list[str]): List of window sizes to apply for summarization. + aggregations (list[str]): List of aggregations to perform within each window size. + + Returns: + pl.LazyFrame: A LazyFrame containing the summarized data with all required features present. + + Expect: + >>> from datetime import date + >>> value_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2),date(2020, 1, 3), date(2021, 1, 4)], + ... "value/A": [1, 2, 3, None], + ... "value/B": [None, None, None, 4.0],}) + >>> code_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + ... "code/A": [1, 1, 0, 0], + ... "code/B": [0, 0, 1, 1], + ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2),date(2020, 1, 3), date(2021, 1, 5)], + ... }).lazy() + >>> feature_columns = ["code/A", "code/B", "value/A", "value/B"] + >>> aggregations = ["code/count", "value/sum"] + >>> window_sizes = ["full", "1d"] + >>> out_df = generate_summary(feature_columns, [value_df.lazy(), code_df.lazy()], + ... window_sizes, aggregations).collect().sort(["patient_id", "timestamp"]) + >>> print(out_df.shape) + (5, 10) + >>> for c in out_df.columns: print(c) + patient_id + timestamp + 1d/code/A/count + 1d/code/B/count + 1d/value/A/sum + 1d/value/B/sum + full/code/A/count + full/code/B/count + full/value/A/sum + full/value/B/sum + """ + final_columns = [] + out_dfs = [] + # Generate summaries for each window size and aggregation + for window_size in window_sizes: + for agg in aggregations: + code_type, agg_name = agg.split("/") + final_columns.extend( + [f"{window_size}/{c}/{agg_name}" for c in feature_columns if c.startswith(code_type)] + ) + for df in dfs: + if agg.split("/")[0] in [c.split("/")[0] for c in df.columns]: + out_df = _generate_summary(df, window_size, agg) + out_dfs.append(out_df) + + final_columns = sorted(final_columns) + # Combine all dataframes using successive joins + result_df = out_dfs[0] + for df in out_dfs[1:]: + result_df = result_df.join(df, on=["patient_id", "timestamp"], how="outer", coalesce=True) + + # Add in missing feature columns with default values + existing_columns = result_df.columns + for column in final_columns: + if column not in existing_columns: + result_df = result_df.with_columns(pl.lit(None).alias(column)) + result_df = result_df.select(pl.col(*["patient_id", "timestamp"], *final_columns)) + return result_df diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 12ac571..c768db1 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -2,18 +2,6 @@ from MEDS_tabular_automl.utils import DF_T -VALID_AGGREGATIONS = [ - "sum", - "sum_sqd", - "min", - "max", - "value", - "first", - "present", - "count", - "has_values_count", -] - def summarize_dynamic_measurements( ts_columns: list[str], From 7fdc37d44cf13f5231aeb2512431f7316f404b08 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 09:01:02 -0400 Subject: [PATCH 008/106] Update src/MEDS_tabular_automl/generate_summarized_reps.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- 
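Stepping back to the tail of `generate_summary` shown earlier in this patch: after the successive joins, any feature column that never appeared in a given shard is back-filled with nulls so every shard ends up sharing one schema. A minimal sketch of that padding step on a toy frame (the column names are illustrative):

```python
import polars as pl

result_df = pl.DataFrame(
    {"patient_id": [1], "timestamp": ["2021-01-01"], "1d/A/code/count": [2]}
)
final_columns = sorted(["1d/A/code/count", "1d/B/code/count"])  # the full expected schema

# Add any expected column that is missing, filled with nulls, then fix the column order.
missing = [c for c in final_columns if c not in result_df.columns]
result_df = result_df.with_columns([pl.lit(None).alias(c) for c in missing])
result_df = result_df.select(["patient_id", "timestamp", *final_columns])
print(result_df)
```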
src/MEDS_tabular_automl/generate_summarized_reps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 9e9202c..9888aa3 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -18,7 +18,8 @@ def time_aggd_col_alias_fntr(window_size: str, agg: str) -> Callable[[str], str]: - assert agg is not None, "agg must be provided" + if agg is None: + raise ValueError("Aggregation type 'agg' must be provided") def f(c: str) -> str: return "/".join([window_size] + c.split("/") + [agg]) From 4dd3cadefb0d081f732b7a6999ceb8dfc8f4120f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 09:02:32 -0400 Subject: [PATCH 009/106] Update src/MEDS_tabular_automl/generate_summarized_reps.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/MEDS_tabular_automl/generate_summarized_reps.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 9888aa3..23db3dc 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -145,10 +145,10 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: │ 2 ┆ 2021-01-04 ┆ 0 ┆ 1 │ └────────────┴────────────┴──────────────────┴──────────────────┘ """ - assert agg in VALID_AGGREGATIONS, f"Invalid aggregation: {agg}" - assert agg.split("/")[0] in [ - c.split("/")[0] for c in df.columns - ], f"df is invalid, no column with prefix: `{agg.split('/')[0]}`" + if agg not in VALID_AGGREGATIONS: + raise ValueError(f"Invalid aggregation: {agg}. Valid options are: {VALID_AGGREGATIONS}") + if agg.split("/")[0] not in [c.split("/")[0] for c in df.columns]: + raise ValueError(f"DataFrame is invalid, no column with prefix: `{agg.split('/')[0]}`") if window_size == "full": out_df = df.groupby("patient_id").agg( From 720a5330739739ecbcb455a727a6cefb079a6fb9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 09:03:06 -0400 Subject: [PATCH 010/106] Update src/MEDS_tabular_automl/generate_summarized_reps.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/MEDS_tabular_automl/generate_summarized_reps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 23db3dc..e0f8920 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -55,7 +55,7 @@ def get_agg_pl_expr(window_size: str, agg: str): case "value/max": value_cols.cummax().map_alias(time_aggd_col_alias_fntr(window_size, "max")) case _: - raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") + raise ValueError(f"Invalid aggregation '{agg}' provided for window_size '{window_size}'. Please choose from the valid options: {VALID_AGGREGATIONS}") else: match agg: case "code/count": From 548e29a1c15145fed7e627b2cfa5ac3ef9ff3c7c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 27 May 2024 16:03:29 +0000 Subject: [PATCH 011/106] Added doctest and updated docstrings in identiy_columns.py. 
[WIP] addressing comments in other files --- README.md | 82 +++++++++++++------ configs/tabularize.yaml | 3 - scripts/identify_columns.py | 64 ++++----------- scripts/summarize_over_windows.py | 13 ++- .../generate_summarized_reps.py | 6 +- .../generate_ts_features.py | 2 +- tests/test_tabularize.py | 23 +++++- 7 files changed, 108 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 792fec7..63b9f70 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ # Scalable tabularization and tabular feature usage utilities over generic MEDS datasets + This repository provides utilities and scripts to run limited automatic tabular ML pipelines for generic MEDS datasets. #### Q1: What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular? + This is a common misconception. _Tabular_ data refers to data that can be organized in a consistent, logical set of rows/columns such that the entirety of a "sample" or "instance" for modeling or analysis is contained in a single row, and the set of columns possibly observed (there can be missingness) is consistent across all @@ -15,28 +17,33 @@ or future windows in time to produce a single row per patient with a consistent, (though there may still be missingness). #### Q2: Why not other systems? - - [TemporAI](https://github.com/vanderschaarlab/temporai) is the most natural competitor, and already - supports AutoML capabilities. However, TemporAI (as of now) does not support generic MEDS datasets, and it - is not clear if their AutoML systems will scale to the size of datasets we need to support. But, further - investigation is needed, and it may be the case that the best solution here is simply to write a custom - data source for MEDS data within TemporAI and leverage their tools. + +- [TemporAI](https://github.com/vanderschaarlab/temporai) is the most natural competitor, and already + supports AutoML capabilities. However, TemporAI (as of now) does not support generic MEDS datasets, and it + is not clear if their AutoML systems will scale to the size of datasets we need to support. But, further + investigation is needed, and it may be the case that the best solution here is simply to write a custom + data source for MEDS data within TemporAI and leverage their tools. # Installation + Clone this repository and install the requirements by running `pip install .` in the root directory. # Usage + This repository consists of two key pieces: - 1. Construction of and efficient loading of tabular (flat, non-longitudinal) summary features describing - patient records in MEDS over arbitrary time-windows (e.g. 1 year, 6 months, etc.) either backwards or - forwards in time from a given index date. Naturally, only "look-back" windows should be used for - future-event prediction tasks; however, the capability to summarize "look-ahead" windows is also useful - for characterizing and describing the differences between patient populations statistically. - 2. Running basic AutoML pipelines over these tabular features to predict arbitrary binary classification - downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- - what is more advanced is the efficient construction, storage, and loading of tabular features for the - candidate AutoML models, enabling a far more extensive search over different featurization strategies. + +1. 
Construction of and efficient loading of tabular (flat, non-longitudinal) summary features describing + patient records in MEDS over arbitrary time-windows (e.g. 1 year, 6 months, etc.) either backwards or + forwards in time from a given index date. Naturally, only "look-back" windows should be used for + future-event prediction tasks; however, the capability to summarize "look-ahead" windows is also useful + for characterizing and describing the differences between patient populations statistically. +2. Running basic AutoML pipelines over these tabular features to predict arbitrary binary classification + downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- + what is more advanced is the efficient construction, storage, and loading of tabular features for the + candidate AutoML models, enabling a far more extensive search over different featurization strategies. ## Feature Construction, Storage, and Loading + Tabularization of a (raw) MEDS dataset is done by running the `scripts/data/tabularize.py` script. This script must inherently do a base level of preprocessing over the MEDS data, then will construct a sharded tabular representation that respects the overall sharding of the raw data. This script uses [Hydra](https://hydra.cc/) @@ -45,14 +52,39 @@ to manage configuration, and the configuration file is located at `configs/tabul ## AutoML Pipelines # TODOs - 1. Leverage the "event bound aggregation" capabilities of [ESGPT Task - Select](https://github.com/justin13601/ESGPTTaskQuerying/) to construct tabular summary features for - event-bound historical windows (e.g., until the prior admission, until the last diagnosis of some type, - etc.). - 2. Support more feature aggregation functions. - 3. Probably rename this repository, as the focus is really more on the tabularization and feature usage - utilities than on the AutoML pipelines themselves. - 4. Import, rather than reimplement, the mapper utilities from the MEDS preprocessing repository. - 5. Investigate the feasibility of using TemporAI for this task. - 6. Consider splitting the feature construction and AutoML pipeline parts of this repository into separate - repositories. + +1. Leverage the "event bound aggregation" capabilities of [ESGPT Task + Select](https://github.com/justin13601/ESGPTTaskQuerying/) to construct tabular summary features for + event-bound historical windows (e.g., until the prior admission, until the last diagnosis of some type, + etc.). +2. Support more feature aggregation functions. +3. Probably rename this repository, as the focus is really more on the tabularization and feature usage + utilities than on the AutoML pipelines themselves. +4. Import, rather than reimplement, the mapper utilities from the MEDS preprocessing repository. +5. Investigate the feasibility of using TemporAI for this task. +6. Consider splitting the feature construction and AutoML pipeline parts of this repository into separate + repositories. + +# Config Args Description + +- MEDS_cohort_dir: directory of MEDS format dataset that is ingested. +- tabularized_data_dir: output directory of tabularized data. +- min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + what features can be included in the flat representation. 
It can either be a float, in which + case it applies across all measurements, or `None`, in which case no filtering is applied, or + a dictionary from measurement type to a float dictating a per-measurement-type inclusion + cutoff. +- window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has + the capability to summarize these flattened representations over the historical windows + specified in this argument. These are strings specifying time deltas, using this syntax: + `link`\_. Each window size will be summarized to a separate directory, and will share the same + subject file split as is used in the raw representation files. +- codes: A list of codes to include in the flat representation. If `None`, all codes will be included + in the flat representation. +- aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. +- n_patients_per_sub_shard: The number of subjects that should be included in each output file. + Lowering this number increases the number of files written, making the process of creating and + leveraging these files slower but more memory efficient. +- do_overwrite: If `True`, this function will overwrite the data already stored in the target save + directory. +- seed: The seed to use for random number generation. diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index deb4caa..aa1ecba 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -8,14 +8,11 @@ window_sizes: ??? codes: null aggs: - "code/count" - - "code/present" - "value/count" - - "value/present" - "value/sum" - "value/sum_sqd" - "value/min" - "value/max" - - "value/first" dynamic_threshold: 0.01 numerical_value_threshold: 0.1 diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py index 4334df8..1e85b58 100644 --- a/scripts/identify_columns.py +++ b/scripts/identify_columns.py @@ -1,11 +1,5 @@ -"""This Python script, utilizing the Hydra and Polars libraries, automates the creation of flat -representations of medical datasets for machine learning modeling. - -It includes functions to store configuration parameters in a JSON file and write summarized dataset -representations to disk based on configurable parameters such as inclusion frequencies and historical window -sizes. The script ensures data integrity through conditional checks on overwriting and updating existing -files, and enhances traceability by recording configuration details and feature columns used in the output. -""" +#!/usr/bin/env python +"""This Python script, stores the configuration parameters and feature columns used in the output.""" import json from pathlib import Path @@ -32,21 +26,27 @@ def store_config_yaml(config_fp: Path, cfg: DictConfig): Raises: - ValueError: If there are discrepancies between old and new parameters during an update. - - FileExistsError: If the file exists and neither updating nor overwriting is allowed. + - FileExistsError: If the file exists and overwriting is not allowed. Example: >>> cfg = DictConfig({ ... "n_patients_per_sub_shard": 100, ... "min_code_inclusion_frequency": 5, - ... "do_update": False, - ... "do_overwrite": True + ... "do_overwrite": True, ... }) >>> import tempfile >>> from pathlib import Path - >>> with tempfile.TemporaryDirectory() as d: - ... config_fp = Path(d) / "config.yaml" + >>> with tempfile.NamedTemporaryFile() as temp_f: + ... config_fp = Path(temp_f.name) ... store_config_yaml(config_fp, cfg) ... assert config_fp.exists() + ... 
store_config_yaml(config_fp, cfg) + ... cfg.do_overwrite = False + ... try: + ... store_config_yaml(config_fp, cfg) + ... except FileExistsError as e: + ... print("FileExistsError Error Triggered") + FileExistsError Error Triggered """ if config_fp.exists(): if not cfg.do_overwrite: @@ -58,44 +58,10 @@ def store_config_yaml(config_fp: Path, cfg: DictConfig): def store_columns( cfg: DictConfig, ): - """Writes a flat (historically summarized) representation of the dataset to disk. - - This file caches a set of files useful for building flat representations of the dataset to disk, - suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: - - * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: - * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a - set of parquet files containing flat (e.g., wide) representations of summarized events per subject, - broken out by split and subject chunk. - * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period - per subject per event, for all time periods in ``window_sizes``, if any. + """Stores the configuration parameters and feature columns tabularized data we will be generated for. Args: - cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. - tabularized_data_dir: output directory of tabularized data. - min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate - what features can be included in the flat representation. It can either be a float, in which - case it applies across all measurements, or `None`, in which case no filtering is applied, or - a dictionary from measurement type to a float dictating a per-measurement-type inclusion - cutoff. - window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has - the capability to summarize these flattened representations over the historical windows - specified in this argument. These are strings specifying time deltas, using this syntax: - `link`_. Each window size will be summarized to a separate directory, and will share the same - subject file split as is used in the raw representation files. - codes: A list of codes to include in the flat representation. If `None`, all codes will be included - in the flat representation. - aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. - n_patients_per_sub_shard: The number of subjects that should be included in each output file. - Lowering this number increases the number of files written, making the process of creating and - leveraging these files slower but more memory efficient. - do_overwrite: If `True`, this function will overwrite the data already stored in the target save - directory. - do_update: bool = True - seed: The seed to use for random number generation. - - .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + cfg: The configuration object for the tabularization process. 
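The save step of `store_config_yaml` is not shown in this hunk (only the `do_overwrite` guard is). A minimal body consistent with the docstring above might look like the following sketch, assuming `OmegaConf.save` is what ultimately writes the YAML; the `_sketch` suffix marks it as illustrative rather than the project's actual implementation:

```python
from pathlib import Path

from omegaconf import DictConfig, OmegaConf


def store_config_yaml_sketch(config_fp: Path, cfg: DictConfig) -> None:
    # Refuse to clobber an existing file unless the config explicitly allows it.
    if config_fp.exists() and not cfg.do_overwrite:
        raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!")
    OmegaConf.save(cfg, config_fp)
```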
""" # create output dir flat_dir = Path(cfg.tabularized_data_dir) diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index 157bca4..425489e 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -85,11 +85,18 @@ def summarize_ts_data_over_windows( split_to_pair_fps[split] = code_value_pairs - # Example use of split_to_pair_fps + # Summarize data and store + summary_dir = flat_dir / "summary" for split, pairs in split_to_pair_fps.items(): logger.info(f"Processing {split}:") for code_file, value_file in pairs: logger.info(f" - Code file: {code_file}, Value file: {value_file}") - summary_df = generate_summary(pl.scan_parquet(code_file), pl.scan_parquet(value_file)) + summary_df = generate_summary( + feature_columns, + [pl.scan_parquet(code_file), pl.scan_parquet(value_file)], + cfg.window_sizes, + cfg.aggs, + ) + shard_number = code_file.stem.rsplit("_", 1)[0] - write_df(summary_df, flat_dir / split / f"{shard_number}.parquet") + write_df(summary_df, summary_dir / split / f"{shard_number}.parquet") diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 9e9202c..837c3c2 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -13,7 +13,6 @@ "value/sum_sqd", "value/min", "value/max", - "value/first", ] @@ -232,6 +231,11 @@ def generate_summary( ) for df in dfs: if agg.split("/")[0] in [c.split("/")[0] for c in df.columns]: + timestamp_dtype = df.dtypes[df.columns.index("timestamp")] + assert timestamp_dtype in [ + pl.Datetime, + pl.Date, + ], f"timestamp must be of type Date, but is {timestamp_dtype}" out_df = _generate_summary(df, window_size, agg) out_dfs.append(out_df) diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index c768db1..5589ce6 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -54,7 +54,7 @@ def summarize_dynamic_measurements( df.select("patient_id", "timestamp", "code", "numerical_value") .collect() .pivot( - index=["patient_id", "timestamp"], + index=["patient_id", "timestamp"], # add row index and set agg to None columns=["code"], values=["numerical_value"], aggregate_function="mean", # TODO round up counts so they are binary diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 24c8f1c..fa913f5 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -113,7 +113,10 @@ def test_tabularize(): for split, data in MEDS_OUTPUTS.items(): file_path = MEDS_cohort_dir / f"{split}.parquet" file_path.parent.mkdir(exist_ok=True) - pl.read_csv(StringIO(data)).write_parquet(file_path) + df = pl.read_csv(StringIO(data)) + df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S.%f")).write_parquet( + file_path + ) split_json = json.load(StringIO(SPLITS_JSON)) splits_fp = MEDS_cohort_dir / "splits.json" @@ -123,9 +126,8 @@ def test_tabularize(): "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), "tabularized_data_dir": str(tabularized_data_dir.resolve()), "min_code_inclusion_frequency": 1, - "window_sizes": [30, 365, None], + "window_sizes": ["30d", "365d", "full"], "codes": None, - # "aggs": None, "n_patients_per_sub_shard": 2, "do_overwrite": False, "do_update": True, @@ -160,3 +162,18 @@ def test_tabularize(): assert set(actual_files) == set(expected_files) summarize_ts_data_over_windows(cfg) + # confirm summary 
files exist: + actual_files = [ + (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("summary/*/*.parquet")) + ] + expected_files = [ + ("train", "1"), + ("train", "0"), + ("held_out", "0"), + ("tuning", "0"), + ] + assert set(actual_files) == set(expected_files) + for f in list(tabularized_data_dir.glob("summary/*/*.parquet")): + df = pl.read_parquet(f) + assert df.shape[0] > 0 + assert df.columns == ["hi"] From f0b1cbb236f9b3ae380d471672318502ad4bc460 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Tue, 28 May 2024 15:55:40 +0000 Subject: [PATCH 012/106] working on xgboost --- configs/tabularize_sweep.yaml | 67 ++++++++++ xgboost_sweep.py | 225 ++++++++++++++++++++++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 configs/tabularize_sweep.yaml create mode 100644 xgboost_sweep.py diff --git a/configs/tabularize_sweep.yaml b/configs/tabularize_sweep.yaml new file mode 100644 index 0000000..e939041 --- /dev/null +++ b/configs/tabularize_sweep.yaml @@ -0,0 +1,67 @@ +# Raw data +MEDS_cohort_dir: +tabularized_data_dir: /storage/teya/meds_automl/test_data/test + +# Pre-processing +min_code_inclusion_frequency: 1 +window_sizes: [1d, 7d, full] +codes: null +aggs: + - "code/count" + - "code/time_since_last" + - "code/time_since_first" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "count" + - "sum" + - "sum_sqd" + + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +seed: 1 + + +model: + booster: gbtree + device: gpu + nthread: 4 + max_depth: 6 + eta: 0.3 + gamma: 0 + subsample: 1 + lambda: 1 + alpha: 0 + tree_method: hist + objective: binary:logistic + +iterator: + keep_static_data_in_memory: True + +# Hydra settings for sweep +defaults: + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + +hydra: + sweep: + dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} + run: + dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} + + # Optuna Sweeper + sweeper: + sampler: + seed: 1 + storage: null + study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} + direction: minimize + n_trials: 10 + + # Define search space for Optuna + params: + window_sizes: choice([1d], [1d, 7d], [7d, full]) diff --git a/xgboost_sweep.py b/xgboost_sweep.py new file mode 100644 index 0000000..ad57e1e --- /dev/null +++ b/xgboost_sweep.py @@ -0,0 +1,225 @@ +import hydra +from omegaconf import DictConfig, OmegaConf +from pathlib import Path +import xgboost as xgb +import polars as pl +import numpy as np +import pyarrow as pa +import polars.selectors as cs +from sklearn.metrics import mean_absolute_error + +import os +from typing import List, Callable + +class Iterator(xgb.DataIter): + def __init__(self, cfg: DictConfig, split: str = "train"): + """ + Initialize the Iterator with the provided configuration and split. + + Args: + - cfg (DictConfig): Configuration dictionary. + - split (str): The data split to use ("train", "tuning", or "held_out"). + + """ + + self.cfg = cfg + self.data_path = Path(cfg.tabularized_data_dir) + self.dynamic_data_path = self.data_path / "summarize" / split + self.static_data_path = self.data_path / "static" / split + self._data_shards = [ + x.stem + for x in self.static_data_path.iterdir() + if x.is_file() and x.suffix == ".parquet" + ] + + if cfg.iterator.keep_static_data_in_memory: + self._static_shards = self._get_static_shards() # do we want to cache this differently to share across workers or iterators? 
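For readers unfamiliar with the external-memory interface being wired up in this `Iterator`: XGBoost expects a `DataIter` subclass whose `next` callback hands over one shard at a time and whose `reset` rewinds it, and a `DMatrix` can be built directly from such an iterator. A small self-contained sketch with synthetic arrays standing in for the parquet shards:

```python
import os

import numpy as np
import xgboost as xgb


class ToyIter(xgb.DataIter):
    """Feed pre-chunked (X, y) arrays to XGBoost one shard at a time."""

    def __init__(self, shards):
        self._shards = shards  # list of (X, y) tuples
        self._it = 0
        super().__init__(cache_prefix=os.path.join(".", "cache"))

    def next(self, input_data):
        if self._it == len(self._shards):
            return 0  # signal end of iteration
        X, y = self._shards[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self):
        self._it = 0


rng = np.random.default_rng(0)
shards = [(rng.random((50, 4)), rng.integers(0, 2, 50)) for _ in range(3)]
dtrain = xgb.DMatrix(ToyIter(shards))
booster = xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=5)
```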
+ + self._it = 0 + # XGBoost will generate some cache files under current directory with the prefix + # "cache" + super().__init__(cache_prefix=os.path.join(".", "cache")) + + def _get_static_shards(self) -> dict: + """ + Load static shards into memory. + + Returns: + - dict: Dictionary with shard names as keys and data frames as values. + + """ + static_shards = {} + for iter in self._data_shards: + static_shards[iter] = pl.scan_parquet(self.static_data_path / f"{iter}.parquet") + return static_shards + + def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: + """ + Load a specific shard of data from disk and concatenate with static data. + + Args: + - idx (int): Index of the shard to load. + + Returns: + - X (pl.DataFrame): Feature data frame. + - y (pl.Series): Labels. + + """ + # concatinate with static data + if self.cfg.iterator.keep_static_data_in_memory: + df = self._static_shards[self._data_shards[idx]] + else: + df = pl.scan_parquet(self.static_data_path / f"{self._data_shards[idx]}.parquet") + + + ### TODO: Add in min_code_inclusion_frequency? + + codes_set = set(self.cfg.codes) if self.cfg.codes else None + aggs_set = set(self.cfg.aggs) if self.cfg.aggs else None + + for window in self.cfg.window_sizes: + dynamic_df = pl.scan_parquet( + self.dynamic_data_path / window / f"{self._data_shards[idx]}.parquet" + ) + + ### TODO: Update this for the correct order of column names from Nassim + columns = dynamic_df.schema.keys() # should I use df.columns instead? + selected_columns = [ + col for col in columns + if (parts := col.split('/')) and len(parts) > 2 + and (codes_set is None or parts[0] in codes_set) + and (aggs_set is None or parts[-1] in aggs_set) + ] + selected_columns.extend(['patient_id', 'timestamp']) + dynamic_df = dynamic_df.select(selected_columns) + + + df = pl.concat([df, dynamic_df], how='align') + + ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks + + y = df.select("label") + X = df.select([col for col in df.schema.keys() if col != "label"]) + + ### TODO: Figure out best way to export this to dmatrix --> can we use scipy sparse matrix? + ### TODO: fill nones/nulls with zero if this is needed for xgboost + return X.collect().to_numpy(), y.collect().to_numpy() # convert to sparse matrix instead + + def next(self, input_data: Callable): + """ + Advance the iterator by 1 step and pass the data to XGBoost. This function is + called by XGBoost during the construction of ``DMatrix`` + + Args: + - input_data (Callable): A function passed by XGBoost with the same signature as `DMatrix`. + + Returns: + - int: 0 if end of iteration, 1 otherwise. + """ + if self._it == len(self._data_shards): + # return 0 to let XGBoost know this is the end of iteration + return 0 + + # input_data is a function passed in by XGBoost who has the exact same signature of + # ``DMatrix`` + X, y = self._load_shard(self._it) # self._data_shards[self._it]) + input_data(data=X, label=y) + self._it += 1 + # Return 1 to let XGBoost know we haven't seen all the files yet. + return 1 + + def reset(self): + """ + Reset the iterator to its beginning. + + Example: + >>> cfg_dict = { + ... "tabularize": { + ... "tabularized_data_dir": "/path/to/tabularized/data", + ... }, + ... "iterator": { + ... "keep_static_data_in_memory": True + ... } + ... 
} + >>> cfg = OmegaConf.create(cfg_dict) + >>> it = Iterator(cfg, split='train') + >>> it._it = 1 + >>> it.reset() + >>> it._it + 0 + """ + self._it = 0 + +class XGBoostClassifier: + def __init__(self, cfg: DictConfig): + """ + Initialize the XGBoostClassifier with the provided configuration. + + Args: + - cfg (DictConfig): Configuration dictionary. + """ + + self.cfg = cfg + + self.itrain = Iterator(cfg) + self.ival = Iterator(cfg, split="tuning") + self.itest = Iterator(cfg, split="held_out") + + self.dtrain = xgb.DMatrix(self.ival) + self.dval = xgb.DMatrix(self.itest) + self.dtest = xgb.DMatrix(self.itest) + + self.model = xgb.train(OmegaConf.to_container(self.cfg.model), self.dtrain) + + def evaluate(self) -> float: + """ + Evaluate the model on the test set. + + Returns: + - float: Evaluation metric (mae). + + Example: + >>> cfg_dict = { + ... "model": { + ... "booster": "gbtree", + ... "objective": "reg:squarederror", + ... } + ... } + >>> cfg = OmegaConf.create(cfg_dict) + >>> classifier = XGBoostClassifier(cfg=cfg) + + >>> n_samples = 1000 + >>> n_features = 10 + >>> X_test = np.random.rand(n_samples, n_features) + >>> y_test = np.random.rand(n_samples) + + >>> mae = classifier.evaluate(X_test, y_test) + >>> isinstance(mae, float) + True + """ + ### TODO: Figure out exactly what we want to do here + + y_pred = self.model.predict(self.dtest) + y_true = self.dtest.get_label() + return mean_absolute_error(y_true, y_pred) + + +@hydra.main(version_base=None, config_path="configs", config_name="tabularize_sweep") +def optimize(cfg: DictConfig) -> float: + """ + Optimize the model based on the provided configuration. + + Args: + - cfg (DictConfig): Configuration dictionary. + + Returns: + - float: Evaluation result. + + """ + + model = XGBoostClassifier(cfg) + return model.evaluate() + + +if __name__ == "__main__": + optimize() From ba954efd0717b48df70e48421902facd3989967d Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 28 May 2024 17:16:35 +0000 Subject: [PATCH 013/106] current state --- README.md | 36 ++- configs/tabularize.yaml | 2 + scripts/identify_columns.py | 4 + scripts/summarize_over_windows.py | 66 +++-- scripts/tabularize_static.py | 20 +- scripts/tabularize_ts.py | 48 +--- .../generate_summarized_reps.py | 161 ++++++------- .../generate_ts_features.py | 115 +++++---- src/MEDS_tabular_automl/mapper.py | 207 ++++++++++++++++ src/MEDS_tabular_automl/tabularize.py | 225 ------------------ src/MEDS_tabular_automl/utils.py | 13 +- tests/test_tabularize.py | 39 ++- 12 files changed, 435 insertions(+), 501 deletions(-) create mode 100644 src/MEDS_tabular_automl/mapper.py delete mode 100644 src/MEDS_tabular_automl/tabularize.py diff --git a/README.md b/README.md index 63b9f70..1e4d634 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,22 @@ This repository consists of two key pieces: what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over different featurization strategies. +### Scripts and Examples + +See `tests/test_tabularize_integration.py` for an example of the end-to-end pipeline being run on synthetic data. This +script is a functional test that is also run with `pytest` to verify correctness of the algorithm. + +#### Core Scripts: + +1. `scripts/tabularize/identify_columns.py` loads all training shard to identify which feature columns + to generate tabular data for. +2. 
`scripts/tabularize/tabularize_static.py` Iterates through shards and generates tabular vectors for + each patient. There is a single row per patient for each shard. +3. `scripts/tabularize/tabularize_ts.py` Iterates through shards and pivots time series data such + that we have a column for every feature column and binary presence for codes and numerical values filled in for columns with numeirical measurements. There is a row for every timeseries input. +4. `scripts/tabularize/summarize_over_windows.py` For each shard, iterates through window sizes and aggregations to + and horizontally concatenates the outputs to generate the final tabular representations at every event time for every patient. + ## Feature Construction, Storage, and Loading Tabularization of a (raw) MEDS dataset is done by running the `scripts/data/tabularize.py` script. This script @@ -65,26 +81,26 @@ to manage configuration, and the configuration file is located at `configs/tabul 6. Consider splitting the feature construction and AutoML pipeline parts of this repository into separate repositories. -# Config Args Description +# YAML Configuration File -- MEDS_cohort_dir: directory of MEDS format dataset that is ingested. -- tabularized_data_dir: output directory of tabularized data. -- min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate +- `MEDS_cohort_dir`: directory of MEDS format dataset that is ingested. +- `tabularized_data_dir`: output directory of tabularized data. +- `min_code_inclusion_frequency`: The base feature inclusion frequency that should be used to dictate what features can be included in the flat representation. It can either be a float, in which case it applies across all measurements, or `None`, in which case no filtering is applied, or a dictionary from measurement type to a float dictating a per-measurement-type inclusion cutoff. -- window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has +- `window_sizes`: Beyond writing out a raw, per-event flattened representation, the dataset also has the capability to summarize these flattened representations over the historical windows specified in this argument. These are strings specifying time deltas, using this syntax: `link`\_. Each window size will be summarized to a separate directory, and will share the same subject file split as is used in the raw representation files. -- codes: A list of codes to include in the flat representation. If `None`, all codes will be included +- `codes`: A list of codes to include in the flat representation. If `None`, all codes will be included in the flat representation. -- aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. -- n_patients_per_sub_shard: The number of subjects that should be included in each output file. +- `aggs`: A list of aggregations to apply to the raw representation. Must have length greater than 0. +- `n_patients_per_sub_shard`: The number of subjects that should be included in each output file. Lowering this number increases the number of files written, making the process of creating and leveraging these files slower but more memory efficient. -- do_overwrite: If `True`, this function will overwrite the data already stored in the target save +- `do_overwrite`: If `True`, this function will overwrite the data already stored in the target save directory. -- seed: The seed to use for random number generation. +- `seed`: The seed to use for random number generation. 
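To make the configuration keys above concrete, here is a small sketch that builds an equivalent config with OmegaConf and prints it back as YAML. The literal values are illustrative (they mirror the test configuration used elsewhere in this patch series), not package defaults:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "MEDS_cohort_dir": "/path/to/meds",          # illustrative path
        "tabularized_data_dir": "/path/to/tabular",  # illustrative path
        "min_code_inclusion_frequency": 1,
        "window_sizes": ["30d", "365d", "full"],
        "codes": None,
        "aggs": ["code/count", "value/sum"],
        "n_patients_per_sub_shard": 2,
        "do_overwrite": False,
        "seed": 1,
    }
)

assert len(cfg.aggs) > 0, "aggs must have length greater than 0"
print(OmegaConf.to_yaml(cfg))
```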
diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index aa1ecba..72894ff 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -7,6 +7,8 @@ min_code_inclusion_frequency: ??? window_sizes: ??? codes: null aggs: + - "static/present" + - "static/first" - "code/count" - "value/count" - "value/sum" diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py index 1e85b58..48f871b 100644 --- a/scripts/identify_columns.py +++ b/scripts/identify_columns.py @@ -82,3 +82,7 @@ def store_columns( feature_columns.update(get_flat_rep_feature_cols(cfg, shard_df)) feature_columns = sorted(list(feature_columns)) json.dump(feature_columns, open(flat_dir / "feature_columns.json", "w")) + + +if __name__ == "__main__": + store_columns() diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index 425489e..5efeb17 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -1,4 +1,6 @@ -"""WIP.""" +#!/usr/bin/env python + +"""Aggregates time-series data for feature columns across different window sizes.""" from pathlib import Path @@ -16,49 +18,37 @@ def summarize_ts_data_over_windows( cfg: DictConfig, ): - """Writes a flat (historically summarized) representation of the dataset to disk. - - This file caches a set of files useful for building flat representations of the dataset to disk, - suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: + """Processes time-series data by summarizing it across different windows, creating a flat, summarized + representation of the data for analysis. - * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: - * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a - set of parquet files containing flat (e.g., wide) representations of summarized events per subject, - broken out by split and subject chunk. - * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period - per subject per event, for all time periods in ``window_sizes``, if any. + This function orchestrates the data processing pipeline for summarizing time-series data. It loads + data from the tabularize_ts stage, iterates through the pivoted wide dataframes for each split and + shards and then applies a range aggregations across different window sizes defined in the config + The summarized data is then written to disk in a structured directory format. Args: - cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. - tabularized_data_dir: output directory of tabularized data. - min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate - what features can be included in the flat representation. It can either be a float, in which - case it applies across all measurements, or `None`, in which case no filtering is applied, or - a dictionary from measurement type to a float dictating a per-measurement-type inclusion - cutoff. - window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has - the capability to summarize these flattened representations over the historical windows - specified in this argument. These are strings specifying time deltas, using this syntax: - `link`_. Each window size will be summarized to a separate directory, and will share the same - subject file split as is used in the raw representation files. 
- codes: A list of codes to include in the flat representation. If `None`, all codes will be included - in the flat representation. - aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. - n_patients_per_sub_shard: The number of subjects that should be included in each output file. - Lowering this number increases the number of files written, making the process of creating and - leveraging these files slower but more memory efficient. - do_overwrite: If `True`, this function will overwrite the data already stored in the target save - directory. - do_update: bool = True - seed: The seed to use for random number generation. - - .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 + cfg: A configuration dictionary derived from Hydra, containing parameters such as the input data + directory, output directory, and specifics regarding the summarization process (like window + sizes and aggregation functions). + + Workflow: + 1. Set up the environment based on configuration settings. + 2. Load and categorize time-series file paths by their data splits. + 3. Pair code and value files for each split. + 4. For each pair of files in each split: + - Load the dataframes in a lazy manner. + - Summarize the dataframes based on predefined window sizes and aggregation methods. + - Write the summarized dataframe to disk. + + Raises: + FileNotFoundError: If specified directories or files in the configuration are not found. + ValueError: If required columns like 'code' or 'value' are missing in the data files. """ flat_dir, _, feature_columns = setup_environment(cfg) # Assuming MEDS_cohort_dir is correctly defined somewhere above this snippet ts_dir = Path(cfg.tabularized_data_dir) / "ts" + # TODO: Use patient splits here instead ts_fps = list(ts_dir.glob("*/*.parquet")) splits = {fp.parent.stem for fp in ts_fps} @@ -100,3 +90,7 @@ def summarize_ts_data_over_windows( shard_number = code_file.stem.rsplit("_", 1)[0] write_df(summary_df, summary_dir / split / f"{shard_number}.parquet") + + +if __name__ == "__main__": + summarize_ts_data_over_windows() diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index d8fdd1b..e5cf9c5 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -1,10 +1,5 @@ -"""The base class for core dataset processing logic. - -Attributes: - INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, - dataframes, etc. - DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. 
-""" +#!/usr/bin/env python +"""Tabularizes static data in MEDS format into tabular representations.""" from pathlib import Path @@ -111,11 +106,8 @@ def tabularize_static_data( for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): fp = sp_dir / f"{i}.parquet" static_dfs[sp].append(fp) - if fp.exists(): - if cfg.do_update: - continue - elif not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") + if fp.exists() and not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") df = get_flat_static_rep( feature_columns=feature_columns, @@ -123,3 +115,7 @@ def tabularize_static_data( ) write_df(df, fp, do_overwrite=cfg.do_overwrite) + + +if __name__ == "__main__": + tabularize_static_data() diff --git a/scripts/tabularize_ts.py b/scripts/tabularize_ts.py index 33e9dec..20d4022 100644 --- a/scripts/tabularize_ts.py +++ b/scripts/tabularize_ts.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python +"""Tabularizes time-series data in MEDS format into tabular representations.""" import hydra from omegaconf import DictConfig from tqdm import tqdm @@ -18,29 +20,7 @@ def tabularize_ts_data( value data: containing a column for every code which the numerical value observed. Args: - cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. - tabularized_data_dir: output directory of tabularized data. - min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate - what features can be included in the flat representation. It can either be a float, in which - case it applies across all measurements, or `None`, in which case no filtering is applied, or - a dictionary from measurement type to a float dictating a per-measurement-type inclusion - cutoff. - window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has - the capability to summarize these flattened representations over the historical windows - specified in this argument. These are strings specifying time deltas, using this syntax: - `link`_. Each window size will be summarized to a separate directory, and will share the same - subject file split as is used in the raw representation files. - codes: A list of codes to include in the flat representation. If `None`, all codes will be - included in the flat representation. - aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. - n_patients_per_sub_shard: The number of subjects that should be included in each output file. - Lowering this number increases the number of files written, making the process of creating and - leveraging these files slower but more memory efficient. - do_overwrite: If `True`, this function will overwrite the data already stored in the target save - directory. - do_update: bool = True - seed: The seed to use for random number generation. + cfg: configuration dictionary containing the necessary parameters for tabularizing the data. 
""" flat_dir, split_to_df, feature_columns = setup_environment(cfg) # Produce ts representation @@ -50,20 +30,16 @@ def tabularize_ts_data( sp_dir = ts_subdir / sp for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): - code_fp = sp_dir / f"{i}_code.parquet" - value_fp = sp_dir / f"{i}_value.parquet" - if code_fp.exists() or value_fp.exists(): - if cfg.do_update: - continue - elif not cfg.do_overwrite: - raise FileExistsError( - f"do_overwrite is {cfg.do_overwrite} and {code_fp.exists()}" - f" or {value_fp.exists()} exists!" - ) + pivot_fp = sp_dir / f"{i}.parquet" + if pivot_fp.exists() and not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!") - code_df, value_df = get_flat_ts_rep( + pivot_df = get_flat_ts_rep( feature_columns=feature_columns, shard_df=shard_df, ) - write_df(code_df, code_fp, do_overwrite=cfg.do_overwrite) - write_df(value_df, value_fp, do_overwrite=cfg.do_overwrite) + write_df(pivot_df, pivot_fp, do_overwrite=cfg.do_overwrite) + + +if __name__ == "__main__": + tabularize_ts_data() diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 7637922..0036dc5 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -3,10 +3,13 @@ import polars as pl import polars.selectors as cs -from MEDS_tabular_automl.utils import DF_T +from MEDS_tabular_automl.utils import DF_T, ROW_IDX_NAME -VALID_AGGREGATIONS = [ +CODE_AGGREGATIONS = [ "code/count", +] + +VALUE_AGGREGATIONS = [ "value/count", "value/has_values_count", "value/sum", @@ -15,6 +18,8 @@ "value/max", ] +VALID_AGGREGATIONS = CODE_AGGREGATIONS + VALUE_AGGREGATIONS + def time_aggd_col_alias_fntr(window_size: str, agg: str) -> Callable[[str], str]: if agg is None: @@ -27,8 +32,8 @@ def f(c: str) -> str: def get_agg_pl_expr(window_size: str, agg: str): - code_cols = cs.starts_with("code/") - value_cols = cs.starts_with("value/") + code_cols = cs.ends_with("code") + value_cols = cs.ends_with("value") if window_size == "full": match agg: case "code/count": @@ -39,12 +44,6 @@ def get_agg_pl_expr(window_size: str, agg: str): .cumsum() .map_alias(time_aggd_col_alias_fntr(window_size, "count")) ) - case "value/has_values_count": - return ( - (value_cols.is_not_null() & value_cols.is_not_nan()) - .cumsum() - .map_alias(time_aggd_col_alias_fntr(window_size, "has_values_count")) - ) case "value/sum": return value_cols.cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "sum")) case "value/sum_sqd": @@ -54,7 +53,10 @@ def get_agg_pl_expr(window_size: str, agg: str): case "value/max": value_cols.cummax().map_alias(time_aggd_col_alias_fntr(window_size, "max")) case _: - raise ValueError(f"Invalid aggregation '{agg}' provided for window_size '{window_size}'. Please choose from the valid options: {VALID_AGGREGATIONS}") + raise ValueError( + f"Invalid aggregation '{agg}' provided for window_size '{window_size}'." + f" Please choose from the valid options: {VALID_AGGREGATIONS}" + ) else: match agg: case "code/count": @@ -94,16 +96,18 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: Expect: >>> from datetime import date - >>> code_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], - ... "code/A": [1, 1, 0, 0], - ... "code/B": [0, 0, 1, 1], - ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2),date(2020, 1, 3), date(2021, 1, 4)], - ... 
}).lazy() - >>> _generate_summary(code_df.lazy(), "full", "code/count" - ... ).collect().sort(["patient_id", "timestamp"]) + >>> pivot_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + ... "A/code": [True, True, None, None], + ... "B/code": [None, None, True, True], + ... "A/value": [1, 2, 3, None], + ... "B/value": [None, None, None, 4.0], + ... "timestamp": [date(2020, 1, 1), date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)], + ... }).lazy().with_row_index(ROW_IDX_NAME) + >>> _generate_summary(pivot_df.lazy(), "2d", "code/count" + ... ).collect().drop(ROW_IDX_NAME).sort(["patient_id", "timestamp"]) shape: (4, 4) ┌────────────┬────────────┬───────────────────┬───────────────────┐ - │ patient_id ┆ timestamp ┆ full/code/A/count ┆ full/code/B/count │ + │ patient_id ┆ timestamp ┆ full/A/code/count ┆ full/B/code/count │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ date ┆ i64 ┆ i64 │ ╞════════════╪════════════╪═══════════════════╪═══════════════════╡ @@ -112,29 +116,11 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: │ 1 ┆ 2021-01-02 ┆ 2 ┆ 0 │ │ 2 ┆ 2021-01-04 ┆ 0 ┆ 1 │ └────────────┴────────────┴───────────────────┴───────────────────┘ - >>> value_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], - ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2), - ... date(2020, 1, 3), date(2021, 1, 4)], - ... "value/A": [1, 2, 3, None], - ... "value/B": [None, None, None, 4.0],}) - >>> _generate_summary(value_df.lazy(), "full", "value/sum").collect().sort( + >>> _generate_summary(pivot_df.lazy(), "full", "value/sum").collect().drop(ROW_IDX_NAME).sort( ... ["patient_id", "timestamp"]) shape: (4, 4) ┌────────────┬────────────┬──────────────────┬──────────────────┐ - │ patient_id ┆ timestamp ┆ full/value/A/sum ┆ full/value/B/sum │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ date ┆ i64 ┆ f64 │ - ╞════════════╪════════════╪══════════════════╪══════════════════╡ - │ 1 ┆ 2020-01-03 ┆ 6 ┆ null │ - │ 1 ┆ 2021-01-01 ┆ 1 ┆ null │ - │ 1 ┆ 2021-01-02 ┆ 3 ┆ null │ - │ 2 ┆ 2021-01-04 ┆ null ┆ 4.0 │ - └────────────┴────────────┴──────────────────┴──────────────────┘ - >>> _generate_summary(value_df.lazy(), "1d", "value/count").collect().sort( - ... ["patient_id", "timestamp"]) - shape: (4, 4) - ┌────────────┬────────────┬──────────────────┬──────────────────┐ - │ patient_id ┆ timestamp ┆ 1d/value/A/count ┆ 1d/value/B/count │ + │ patient_id ┆ timestamp ┆ 1d/A/value/sum ┆ 1d/B/value/sum │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ date ┆ u32 ┆ u32 │ ╞════════════╪════════════╪══════════════════╪══════════════════╡ @@ -146,33 +132,26 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: """ if agg not in VALID_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}. 
Valid options are: {VALID_AGGREGATIONS}") - if agg.split("/")[0] not in [c.split("/")[0] for c in df.columns]: - raise ValueError(f"DataFrame is invalid, no column with prefix: `{agg.split('/')[0]}`") - + id_cols = [ROW_IDX_NAME, "patient_id"] if window_size == "full": - out_df = df.groupby("patient_id").agg( + out_df = df.groupby(id_cols).agg( "timestamp", get_agg_pl_expr(window_size, agg), ) - out_df = out_df.explode(*[c for c in out_df.columns if c != "patient_id"]) + out_df = out_df.explode(*[c for c in out_df.columns if c not in id_cols]) else: - out_df = ( - df.sort(["patient_id", "timestamp"]) - .groupby_rolling( - index_column="timestamp", - by="patient_id", - period=window_size, - ) - .agg( - get_agg_pl_expr(window_size, agg), - ) + out_df = df.groupby_rolling( + index_column="timestamp", + by=id_cols, + period=window_size, + ).agg( + get_agg_pl_expr(window_size, agg), ) - return out_df def generate_summary( - feature_columns: list[str], dfs: list[pl.LazyFrame], window_sizes: list[str], aggregations: list[str] + feature_columns: list[str], df: pl.LazyFrame, window_sizes: list[str], aggregations: list[str] ) -> pl.LazyFrame: """Generate a summary of the data frame for given window sizes and aggregations. @@ -193,34 +172,33 @@ def generate_summary( Expect: >>> from datetime import date - >>> value_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], - ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2),date(2020, 1, 3), date(2021, 1, 4)], - ... "value/A": [1, 2, 3, None], - ... "value/B": [None, None, None, 4.0],}) - >>> code_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], - ... "code/A": [1, 1, 0, 0], - ... "code/B": [0, 0, 1, 1], - ... "timestamp": [date(2021, 1, 1), date(2021, 1, 2),date(2020, 1, 3), date(2021, 1, 5)], + >>> pivot_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + ... "A/code": [1, 1, 0, 0], + ... "B/code": [0, 0, 1, 1], + ... "A/value": [1, 2, 3, None], + ... "B/value": [None, None, None, 4.0], + ... "timestamp": [date(2021, 1, 1), date(2021, 1, 1),date(2020, 1, 3), date(2021, 1, 4)], ... }).lazy() - >>> feature_columns = ["code/A", "code/B", "value/A", "value/B"] + >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] >>> aggregations = ["code/count", "value/sum"] >>> window_sizes = ["full", "1d"] - >>> out_df = generate_summary(feature_columns, [value_df.lazy(), code_df.lazy()], + >>> out_df = generate_summary(feature_columns, pivot_df.lazy(), ... 
window_sizes, aggregations).collect().sort(["patient_id", "timestamp"]) >>> print(out_df.shape) - (5, 10) - >>> for c in out_df.columns: print(c) + (4, 10) + >>> for c in sorted(out_df.columns): print(c) + 1d/A/code/count + 1d/A/value/sum + 1d/B/code/count + 1d/B/value/sum + full/A/code/count + full/A/value/sum + full/B/code/count + full/B/value/sum patient_id timestamp - 1d/code/A/count - 1d/code/B/count - 1d/value/A/sum - 1d/value/B/sum - full/code/A/count - full/code/B/count - full/value/A/sum - full/value/B/sum """ + df = df.sort(["patient_id", "timestamp"]).with_row_index(ROW_IDX_NAME) final_columns = [] out_dfs = [] # Generate summaries for each window size and aggregation @@ -228,28 +206,23 @@ def generate_summary( for agg in aggregations: code_type, agg_name = agg.split("/") final_columns.extend( - [f"{window_size}/{c}/{agg_name}" for c in feature_columns if c.startswith(code_type)] + [f"{window_size}/{c}/{agg_name}" for c in feature_columns if c.endswith(code_type)] ) - for df in dfs: - if agg.split("/")[0] in [c.split("/")[0] for c in df.columns]: - timestamp_dtype = df.dtypes[df.columns.index("timestamp")] - assert timestamp_dtype in [ - pl.Datetime, - pl.Date, - ], f"timestamp must be of type Date, but is {timestamp_dtype}" - out_df = _generate_summary(df, window_size, agg) - out_dfs.append(out_df) + # only iterate through code_types that exist in the dataframe columns + if any([c.endswith(code_type) for c in df.columns]): + timestamp_dtype = df.dtypes[df.columns.index("timestamp")] + assert timestamp_dtype in [ + pl.Datetime, + pl.Date, + ], f"timestamp must be of type Date, but is {timestamp_dtype}" + out_df = _generate_summary(df, window_size, agg) + out_dfs.append(out_df) final_columns = sorted(final_columns) # Combine all dataframes using successive joins - result_df = out_dfs[0] - for df in out_dfs[1:]: - result_df = result_df.join(df, on=["patient_id", "timestamp"], how="outer", coalesce=True) - + result_df = pl.concat(out_dfs, how="align").drop(ROW_IDX_NAME) # Add in missing feature columns with default values - existing_columns = result_df.columns - for column in final_columns: - if column not in existing_columns: - result_df = result_df.with_columns(pl.lit(None).alias(column)) + missing_columns = [col for col in final_columns if col not in result_df.columns] + result_df = result_df.with_columns([pl.lit(None).alias(col) for col in missing_columns]) result_df = result_df.select(pl.col(*["patient_id", "timestamp"], *final_columns)) return result_df diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 5589ce6..1e6ac71 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -1,6 +1,6 @@ import polars as pl -from MEDS_tabular_automl.utils import DF_T +from MEDS_tabular_automl.utils import DF_T, ROW_IDX_NAME def summarize_dynamic_measurements( @@ -24,47 +24,49 @@ def summarize_dynamic_measurements( ... 
'numerical_value': [1, 2, 2, 2]} >>> df = pl.DataFrame(data).lazy() >>> ts_columns = ['A', 'B'] - >>> code_df, value_df = summarize_dynamic_measurements(ts_columns, df) - >>> code_df.collect() - shape: (4, 4) - ┌────────────┬────────┬────────┬────────────┐ - │ patient_id ┆ code/A ┆ code/B ┆ timestamp │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ u8 ┆ u8 ┆ str │ - ╞════════════╪════════╪════════╪════════════╡ - │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ - │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ - │ 1 ┆ 0 ┆ 1 ┆ 2020-01-01 │ - │ 2 ┆ 0 ┆ 1 ┆ 2021-01-04 │ - └────────────┴────────┴────────┴────────────┘ - >>> value_df.collect() - shape: (3, 4) - ┌────────────┬────────────┬─────────┬─────────┐ - │ patient_id ┆ timestamp ┆ value/A ┆ value/B │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 ┆ f64 │ - ╞════════════╪════════════╪═════════╪═════════╡ - │ 1 ┆ 2021-01-01 ┆ 1.5 ┆ null │ - │ 1 ┆ 2020-01-01 ┆ null ┆ 2.0 │ - │ 2 ┆ 2021-01-04 ┆ null ┆ 2.0 │ - └────────────┴────────────┴─────────┴─────────┘ + >>> pivot_df = summarize_dynamic_measurements(ts_columns, df) + >>> pivot_df.collect() + shape: (4, 7) + ┌───────────┬────────────┬────────────┬─────────┬─────────┬────────┬────────┐ + │ __row_idx ┆ patient_id ┆ timestamp ┆ A/value ┆ B/value ┆ A/code ┆ B/code │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ bool ┆ bool │ + ╞═══════════╪════════════╪════════════╪═════════╪═════════╪════════╪════════╡ + │ 0 ┆ 1 ┆ 2021-01-01 ┆ 1 ┆ null ┆ true ┆ null │ + │ 1 ┆ 1 ┆ 2021-01-01 ┆ 2 ┆ null ┆ true ┆ null │ + │ 2 ┆ 1 ┆ 2020-01-01 ┆ null ┆ 2 ┆ null ┆ true │ + │ 3 ┆ 2 ┆ 2021-01-04 ┆ null ┆ 2 ┆ null ┆ true │ + └───────────┴────────────┴────────────┴─────────┴─────────┴────────┴────────┘ """ - - value_df = ( - df.select("patient_id", "timestamp", "code", "numerical_value") + df = df.with_row_index(ROW_IDX_NAME) + id_cols = [ROW_IDX_NAME, "patient_id", "timestamp"] + pivot_df = ( + df.select(*id_cols, "code", "numerical_value") + .with_columns(pl.lit(True).alias("__indicator")) .collect() .pivot( - index=["patient_id", "timestamp"], # add row index and set agg to None + index=id_cols, # add row index and set agg to None columns=["code"], - values=["numerical_value"], - aggregate_function="mean", # TODO round up counts so they are binary + values=["numerical_value", "__indicator"], + aggregate_function=None, # TODO round up counts so they are binary separator="/", ) .lazy() ) - value_df = value_df.rename(lambda c: f"value/{c}" if c not in ["patient_id", "timestamp"] else c) - code_df = df.drop("numerical_value").collect().to_dummies(columns=["code"], separator="/").lazy() - return code_df, value_df + + def rename(c): + """Remove value and column prefix.""" + numerical_val_col_name = "numerical_value" + indicator_col_name = "__indicator" + if c.startswith(numerical_val_col_name): + return f"{c[len(numerical_val_col_name)+6:]}/value" + elif c.startswith(indicator_col_name): + return f"{c[len(indicator_col_name)+6:]}/code" + else: + return c + + pivot_df = pivot_df.rename(rename) + return pivot_df def get_flat_ts_rep( @@ -89,37 +91,30 @@ def get_flat_ts_rep( representations. Example: - >>> feature_columns = ['A', 'B', 'C', "static/A"] + >>> feature_columns = ['A', 'B', 'C', "A/static/present"] >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], ... 
'numerical_value': [1, 2, 2, 2, 3, 4]} >>> df = pl.DataFrame(data).lazy() - >>> code_df, value_df = get_flat_ts_rep(feature_columns, df) - >>> code_df.collect() - shape: (4, 4) - ┌────────────┬────────┬────────┬────────────┐ - │ patient_id ┆ code/A ┆ code/B ┆ timestamp │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ u8 ┆ u8 ┆ str │ - ╞════════════╪════════╪════════╪════════════╡ - │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ - │ 1 ┆ 1 ┆ 0 ┆ 2021-01-01 │ - │ 1 ┆ 0 ┆ 1 ┆ 2020-01-01 │ - │ 2 ┆ 0 ┆ 1 ┆ 2021-01-04 │ - └────────────┴────────┴────────┴────────────┘ - >>> value_df.collect() - shape: (3, 4) - ┌────────────┬────────────┬─────────┬─────────┐ - │ patient_id ┆ timestamp ┆ value/A ┆ value/B │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 ┆ f64 │ - ╞════════════╪════════════╪═════════╪═════════╡ - │ 1 ┆ 2021-01-01 ┆ 1.5 ┆ null │ - │ 1 ┆ 2020-01-01 ┆ null ┆ 2.0 │ - │ 2 ┆ 2021-01-04 ┆ null ┆ 2.0 │ - └────────────┴────────────┴─────────┴─────────┘ + >>> pivot_df = get_flat_ts_rep(feature_columns, df) + >>> pivot_df.collect() + shape: (4, 7) + ┌───────────┬────────────┬────────────┬─────────┬─────────┬────────┬────────┐ + │ __row_idx ┆ patient_id ┆ timestamp ┆ A/value ┆ B/value ┆ A/code ┆ B/code │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ bool ┆ bool │ + ╞═══════════╪════════════╪════════════╪═════════╪═════════╪════════╪════════╡ + │ 0 ┆ 1 ┆ 2021-01-01 ┆ 1 ┆ null ┆ true ┆ null │ + │ 1 ┆ 1 ┆ 2021-01-01 ┆ 2 ┆ null ┆ true ┆ null │ + │ 2 ┆ 1 ┆ 2020-01-01 ┆ null ┆ 2 ┆ null ┆ true │ + │ 3 ┆ 2 ┆ 2021-01-04 ┆ null ┆ 2 ┆ null ┆ true │ + └───────────┴────────────┴────────────┴─────────┴─────────┴────────┴────────┘ """ - ts_columns = [c for c in feature_columns if not c.startswith("static")] + + def is_static(c): + return len(c.split("/")) > 2 and c.split("/")[-2] == "static" + + ts_columns = [c for c in feature_columns if not is_static(c)] ts_shard_df = shard_df.filter(pl.col("timestamp").is_not_null()) return summarize_dynamic_measurements(ts_columns, ts_shard_df) diff --git a/src/MEDS_tabular_automl/mapper.py b/src/MEDS_tabular_automl/mapper.py new file mode 100644 index 0000000..deefd0d --- /dev/null +++ b/src/MEDS_tabular_automl/mapper.py @@ -0,0 +1,207 @@ +"""Basic utilities for parallelizable map operations on sharded MEDS datasets with caching and locking.""" + +import json +import shutil +from collections.abc import Callable +from datetime import datetime +from pathlib import Path + +from loguru import logger + + +def wrap[ + DF_T +]( + in_fp: Path, + out_fp: Path, + read_fn: Callable[[Path], DF_T], + write_fn: Callable[[DF_T, Path], None], + *transform_fns: Callable[[DF_T], DF_T], + cache_intermediate: bool = True, + clear_cache_on_completion: bool = True, + do_overwrite: bool = False, + do_return: bool = False, +) -> tuple[bool, DF_T | None]: + """Wrap a series of file-in file-out map transformations on a dataframe with caching and locking. + + Args: + in_fp: The file path of the input dataframe. Must exist and be readable via `read_fn`. + out_fp: Output file path. The parent directory will be created if it does not exist. If this file + already exists, it will be deleted before any computations are done if `do_overwrite=True`, which + can result in data loss if the transformation functions do not complete successfully on + intermediate steps. If `do_overwrite` is `False` and this file exists, the function will use the + `read_fn` to read the file and return the dataframe directly. + read_fn: Function that reads the dataframe from a file. 
This must take as input a Path object and + return a dataframe of (generic) type DF_T. Ideally, this read function can make use of lazy + loading to further accelerate unnecessary reads when resuming from intermediate cached steps. + write_fn: Function that writes the dataframe to a file. This must take as input a dataframe of + (generic) type DF_T and a Path object, and will write the dataframe to that file. + transform_fns: A series of functions that transform the dataframe. Each function must take as input + a dataframe of (generic) type DF_T and return a dataframe of (generic) type DF_T. The functions + will be applied in the passed order. + cache_intermediate: If True, intermediate outputs of the transformations will be cached in a hidden + directory in the same parent directory as `out_fp` of the form + `{out_fp.parent}/.{out_fp.stem}_cache`. This can be useful for debugging and resuming from + intermediate steps when nontrivial transformations are composed. Cached files will be named + `step_{i}.output` where `i` is the index of the transformation function in `transform_fns`. **Note + that if you change the order of the transformations, the cache will be no longer valid but the + system will _not_ automatically delete the cache!**. This is `True` by default. + If `do_overwrite=True`, any prior individual cache files that are detected during the run will be + deleted before their corresponding step is run. If `do_overwrite=False` and a cache file exists, + that step of the transformation will be skipped and the cache file will be read directly. + clear_cache_on_completion: If True, the cache directory will be deleted after the final output is + written. This is `True` by default. + do_overwrite: If True, the output file will be overwritten if it already exists. This is `False` by + default. + do_return: If True, the final dataframe will be returned. This is `False` by default. + + Returns: + The dataframe resulting from the transformations applied in sequence to the dataframe stored in + `in_fp`. + + Examples: + >>> import polars as pl + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> # For this example we'll use a simple CSV file, but in practice we *strongly* recommend using + >>> # Parquet files for performance reasons. + >>> in_fp = root / "input.csv" + >>> out_fp = root / "output.csv" + >>> in_df = pl.DataFrame({"a": [1, 3, 3], "b": [2, 4, 5], "c": [3, -1, 6]}) + >>> in_df.write_csv(in_fp) + >>> read_fn = pl.read_csv + >>> write_fn = pl.DataFrame.write_csv + >>> transform_fns = [ + ... lambda df: df.with_columns(pl.col("c") * 2), + ... lambda df: df.filter(pl.col("c") > 4) + ... ] + >>> result_computed = wrap(in_fp, out_fp, read_fn, write_fn, *transform_fns, do_return=False) + >>> assert result_computed + >>> print(out_fp.read_text()) + a,b,c + 1,2,6 + 3,5,12 + + >>> out_fp.unlink() + >>> cache_directory = root / f".output_cache" + >>> assert not cache_directory.is_dir() + >>> transform_fns = [ + ... lambda df: df.with_columns(pl.col("c") * 2), + ... lambda df: df.filter(pl.col("d") > 4) + ... ] + >>> wrap(in_fp, out_fp, read_fn, write_fn, *transform_fns) + Traceback (most recent call last): + ... 
+ polars.exceptions.ColumnNotFoundError: unable to find column "d"; valid columns: ["a", "b", "c"] + >>> assert cache_directory.is_dir() + >>> cache_fp = cache_directory / "step_0.output" + >>> pl.read_csv(cache_fp) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 6 │ + │ 3 ┆ 4 ┆ -2 │ + │ 3 ┆ 5 ┆ 12 │ + └─────┴─────┴─────┘ + >>> shutil.rmtree(cache_directory) + >>> lock_fp = cache_directory / "lock.json" + >>> assert not lock_fp.is_file() + >>> def lock_fp_checker_fn(df: pl.DataFrame) -> pl.DataFrame: + ... print(f"Lock fp exists? {lock_fp.is_file()}") + ... return df + >>> result_computed, out_df = wrap( + ... in_fp, out_fp, read_fn, write_fn, lock_fp_checker_fn, do_return=True + ... ) + Lock fp exists? True + >>> assert result_computed + >>> out_df + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 3 ┆ 4 ┆ -1 │ + │ 3 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + >>> directory.cleanup() + """ + + if out_fp.is_file(): + if do_overwrite: + logger.info(f"Deleting existing {out_fp} as do_overwrite={do_overwrite}.") + out_fp.unlink() + else: + logger.info(f"{out_fp} exists; reading directly and returning.") + if do_return: + return True, read_fn(out_fp) + else: + return True + + cache_directory = out_fp.parent / f".{out_fp.stem}_cache" + cache_directory.mkdir(exist_ok=True, parents=True) + + st_time = datetime.now() + runtime_info = {"start": str(st_time)} + + lock_fp = cache_directory / "lock.json" + if lock_fp.is_file(): + started_at = json.loads(lock_fp.read_text())["start"] + logger.info( + f"{out_fp} is under construction as of {started_at} as {lock_fp} exists. " "Returning None." + ) + if do_return: + return False, None + else: + return False + + lock_fp.write_text(json.dumps(runtime_info)) + + logger.info(f"Reading input dataframe from {in_fp}") + df = read_fn(in_fp) + logger.info("Read dataset") + + try: + for i, transform_fn in enumerate(transform_fns): + cache_fp = cache_directory / f"step_{i}.output" + + st_time_step = datetime.now() + if cache_fp.is_file(): + if do_overwrite: + logger.info( + f"Deleting existing cached output for step {i} " f"as do_overwrite={do_overwrite}" + ) + cache_fp.unlink() + else: + logger.info(f"Reading cached output for step {i}") + df = read_fn(cache_fp) + else: + df = transform_fn(df) + + if cache_intermediate and i < len(transform_fns) - 1: + logger.info(f"Writing intermediate output for step {i} to {cache_fp}") + write_fn(df, cache_fp) + logger.info(f"Completed step {i} in {datetime.now() - st_time_step}") + + logger.info(f"Writing final output to {out_fp}") + write_fn(df, out_fp) + logger.info(f"Succeeded in {datetime.now() - st_time}") + if clear_cache_on_completion: + logger.info(f"Clearing cache directory {cache_directory}") + shutil.rmtree(cache_directory) + else: + logger.info(f"Leaving cache directory {cache_directory}, but clearing lock at {lock_fp}") + lock_fp.unlink() + if do_return: + return True, df + else: + return True + except Exception as e: + logger.warning(f"Clearing lock due to Exception {e} at {lock_fp} after {datetime.now() - st_time}") + lock_fp.unlink() + raise e diff --git a/src/MEDS_tabular_automl/tabularize.py b/src/MEDS_tabular_automl/tabularize.py deleted file mode 100644 index a5ab4c8..0000000 --- a/src/MEDS_tabular_automl/tabularize.py +++ /dev/null @@ -1,225 +0,0 @@ -"""The base class for core dataset processing logic. 
- -Attributes: - INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, - dataframes, etc. - DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. -""" -from collections.abc import Mapping -from pathlib import Path - -import polars as pl -from omegaconf import DictConfig, OmegaConf -from tqdm.auto import tqdm - -from MEDS_tabular_automl.generate_static_features import get_flat_static_rep -from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep -from MEDS_tabular_automl.utils import get_flat_rep_feature_cols, write_df - - -def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: - """Loads the MEDS dataset from disk. - - Args: - MEDS_cohort_dir: The directory containing the MEDS datasets split by subfolders. - We expect `train` to be a split so `MEDS_cohort_dir/train` should exist. - - Returns: - Mapping[str, pl.DataFrame]: Mapping from split name to a polars DataFrame containing the MEDS dataset. - - Example: - >>> import tempfile - >>> from pathlib import Path - >>> MEDS_cohort_dir = Path(tempfile.mkdtemp()) - >>> for split in ["train", "val", "test"]: - ... split_dir = MEDS_cohort_dir / split - ... split_dir.mkdir() - ... pl.DataFrame({"patient_id": [1, 2, 3]}).write_parquet(split_dir / "data.parquet") - >>> split_to_df = load_meds_data(MEDS_cohort_dir) - >>> assert "train" in split_to_df - >>> assert len(split_to_df) == 3 - >>> assert len(split_to_df["train"]) == 1 - >>> assert isinstance(split_to_df["train"][0], pl.LazyFrame) - """ - MEDS_cohort_dir = Path(MEDS_cohort_dir) - meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) - splits = {fp.parent.stem for fp in meds_fps} - split_to_fps = {split: [fp for fp in meds_fps if fp.parent.stem == split] for split in splits} - split_to_df = { - split: [pl.scan_parquet(fp) for fp in split_fps] for split, split_fps in split_to_fps.items() - } - return split_to_df - - -def store_config_yaml(config_fp: Path, cfg: DictConfig): - """Stores configuration parameters into a JSON file. - - This function writes a dictionary of parameters, which includes patient partitioning - information and configuration details, to a specified JSON file. - - Args: - - config_fp (Path): The file path for the JSON file where config should be stored. - - cfg (DictConfig): A configuration object containing settings like the number of patients - per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. - - Behavior: - - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a - FileExistsError is raised to prevent unintentional data loss. - - Raises: - - ValueError: If there are discrepancies between old and new parameters during an update. - - FileExistsError: If the file exists and neither updating nor overwriting is allowed. - - Example: - >>> cfg = DictConfig({ - ... "n_patients_per_sub_shard": 100, - ... "min_code_inclusion_frequency": 5, - ... "do_update": False, - ... "do_overwrite": True - ... }) - >>> import tempfile - >>> from pathlib import Path - >>> with tempfile.TemporaryDirectory() as d: - ... config_fp = Path(d) / "config.yaml" - ... store_config_yaml(config_fp, cfg) - ... 
assert config_fp.exists() - """ - if config_fp.exists(): - if not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") - OmegaConf.save(cfg, config_fp) - - -def cache_flat_representation( - cfg: DictConfig, -): - """Writes a flat (historically summarized) representation of the dataset to disk. - - This file caches a set of files useful for building flat representations of the dataset to disk, - suitable for, e.g., sklearn style modeling for downstream tasks. It will produce a few sets of files: - - * A new directory ``self.config.save_dir / "flat_reps"`` which contains the following: - * A subdirectory ``raw`` which contains: (1) a json file with the configuration arguments and (2) a - set of parquet files containing flat (e.g., wide) representations of summarized events per subject, - broken out by split and subject chunk. - * A set of subdirectories ``past/*`` which contains summarized views over the past ``*`` time period - per subject per event, for all time periods in ``window_sizes``, if any. - - Args: - cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. - tabularized_data_dir: output directory of tabularized data. - min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate - what features can be included in the flat representation. It can either be a float, in which - case it applies across all measurements, or `None`, in which case no filtering is applied, or - a dictionary from measurement type to a float dictating a per-measurement-type inclusion - cutoff. - window_sizes: Beyond writing out a raw, per-event flattened representation, the dataset also has - the capability to summarize these flattened representations over the historical windows - specified in this argument. These are strings specifying time deltas, using this syntax: - `link`_. Each window size will be summarized to a separate directory, and will share the same - subject file split as is used in the raw representation files. - codes: A list of codes to include in the flat representation. If `None`, all codes will be included - in the flat representation. - aggs: A list of aggregations to apply to the raw representation. Must have length greater than 0. - n_patients_per_sub_shard: The number of subjects that should be included in each output file. - Lowering this number increases the number of files written, making the process of creating and - leveraging these files slower but more memory efficient. - do_overwrite: If `True`, this function will overwrite the data already stored in the target save - directory. - do_update: bool = True - seed: The seed to use for random number generation. - - .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 - """ - # create output dir - flat_dir = Path(cfg.tabularized_data_dir) / "flat_reps" - flat_dir.mkdir(exist_ok=True, parents=True) - - # load MEDS data - split_to_df = load_meds_data(cfg.MEDS_cohort_dir) - - # store params in json file - config_fp = flat_dir / "config.json" - store_config_yaml(config_fp, cfg) - - # 0. Identify Output Columns - # We set window_sizes to None here because we want to get the feature column names for the raw flat - # representation, not the summarized one. - feature_columns = set() - for shard_df in split_to_df["train"]: - feature_columns.update(get_flat_rep_feature_cols(cfg, shard_df)) - feature_columns = sorted(list(feature_columns)) - - # 1. 
Produce static representation - static_subdir = flat_dir / "static" - - static_dfs = {} - actual_num_patients = 0 - for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): - static_dfs[sp] = [] - sp_dir = static_subdir / sp - - for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): - fp = sp_dir / f"{i}.parquet" - static_dfs[sp].append(fp) - if fp.exists(): - if cfg.do_update: - continue - elif not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - df = get_flat_static_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - write_df(df, fp, do_overwrite=cfg.do_overwrite) - actual_num_patients += df.shape[0] - # expected_num_patients = sum(len(ids) for split_ids in sp_subjects.values() for ids in split_ids) - # assert ( - # actual_num_patients == expected_num_patients - # ), f"Expected {expected_num_patients} patients, got {actual_num_patients}." - - # 2. Produce raw representation - ts_subdir = flat_dir / "at_ts" - - ts_dfs = {} - for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): - ts_dfs[sp] = [] - sp_dir = ts_subdir / sp - - for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): - fp = sp_dir / f"{i}.parquet" - ts_dfs[sp].append(fp) - if fp.exists(): - if cfg.do_update: - continue - elif not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - df = get_flat_ts_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - write_df(df, fp, do_overwrite=cfg.do_overwrite) - - if cfg.window_sizes is None: - return - - # # 3. Produce summarized history representations - # history_subdir = flat_dir / "over_history" - - # for window_size in tqdm(cfg.window_sizes, desc="History window sizes"): - # for sp, df_fps in tqdm(list(ts_dfs.items()), desc="Windowing Splits", leave=False): - # for i, df_fp in enumerate(tqdm(df_fps, desc="Subject chunks", leave=False)): - # fp = history_subdir / sp / window_size / f"{i}.parquet" - # if fp.exists(): - # if cfg.do_update: - # continue - # elif not cfg.do_overwrite: - # raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - # df = _summarize_over_window(df_fp, window_size) - # write_df(df, fp) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index bb68cad..22e2dd5 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -16,6 +16,7 @@ DF_T = pl.LazyFrame WRITE_USE_PYARROW = True +ROW_IDX_NAME = "__row_idx" def parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: @@ -119,12 +120,12 @@ def get_static_feature_cols(shard_df) -> list[str]: ... 'numerical_value': [1, None, 2, 2, None, None, 3]} >>> df = pl.DataFrame(data).lazy() >>> get_static_feature_cols(df) - ['static/A/first', 'static/A/present', 'static/C/first', 'static/C/present'] + ['A/static/first', 'A/static/present', 'C/static/first', 'C/static/present'] """ feature_columns = [] static_df = shard_df.filter(pl.col("timestamp").is_null()) for code in static_df.select(pl.col("code").unique()).collect().to_series(): - static_aggregations = [f"static/{code}/present", f"static/{code}/first"] + static_aggregations = [f"{code}/static/present", f"{code}/static/first"] feature_columns.extend(static_aggregations) return sorted(feature_columns) @@ -149,9 +150,9 @@ def get_ts_feature_cols(aggregations: list[str], shard_df: DF_T) -> list[str]: ... 
'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], ... 'numerical_value': [1, None, 2, 2, None, None, 3]} >>> df = pl.DataFrame(data).lazy() - >>> aggs = ['sum', 'count'] + >>> aggs = ['value/sum', 'code/count'] >>> get_ts_feature_cols(aggs, df) - ['A/count', 'A/sum', 'C/count', 'C/sum'] + ['A/code/count', 'A/value/sum', 'C/code/count', 'C/value/sum'] """ feature_columns = [] ts_df = shard_df.filter(pl.col("timestamp").is_not_null()) @@ -179,10 +180,10 @@ def get_flat_rep_feature_cols(cfg: DictConfig, shard_df: DF_T) -> list[str]: ... 'timestamp': [None, '2021-01-01', None, None], ... 'numerical_value': [1, None, 2, 2]} >>> df = pl.DataFrame(data).lazy() - >>> aggs = ['sum', 'count'] + >>> aggs = ['value/sum', 'code/count'] >>> cfg = DictConfig({'aggs': aggs}) >>> get_flat_rep_feature_cols(cfg, df) - ['static/A/first', 'static/A/present', 'static/B/first', 'static/B/present', 'A/count', 'A/sum'] + ['A/static/first', 'A/static/present', 'B/static/first', 'B/static/present', 'A/code/count', 'A/value/sum'] # noqa: 501 """ static_feature_columns = get_static_feature_cols(shard_df) ts_feature_columns = get_ts_feature_cols(cfg.aggs, shard_df) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index fa913f5..92c54da 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -12,7 +12,6 @@ from loguru import logger from scripts.identify_columns import store_columns -from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data from scripts.tabularize_ts import tabularize_ts_data @@ -149,23 +148,6 @@ def test_tabularize(): tabularize_ts_data(cfg) # confirm the time series files exist: actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.parquet"))] - expected_files = [ - ("train", "1_value"), - ("train", "0_code"), - ("train", "0_value"), - ("train", "1_code"), - ("held_out", "0_code"), - ("held_out", "0_value"), - ("tuning", "0_code"), - ("tuning", "0_value"), - ] - assert set(actual_files) == set(expected_files) - - summarize_ts_data_over_windows(cfg) - # confirm summary files exist: - actual_files = [ - (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("summary/*/*.parquet")) - ] expected_files = [ ("train", "1"), ("train", "0"), @@ -173,7 +155,20 @@ def test_tabularize(): ("tuning", "0"), ] assert set(actual_files) == set(expected_files) - for f in list(tabularized_data_dir.glob("summary/*/*.parquet")): - df = pl.read_parquet(f) - assert df.shape[0] > 0 - assert df.columns == ["hi"] + + # summarize_ts_data_over_windows(cfg) + # # confirm summary files exist: + # actual_files = [ + # (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("summary/*/*.parquet")) + # ] + # expected_files = [ + # ("train", "1"), + # ("train", "0"), + # ("held_out", "0"), + # ("tuning", "0"), + # ] + # assert set(actual_files) == set(expected_files) + # for f in list(tabularized_data_dir.glob("summary/*/*.parquet")): + # df = pl.read_parquet(f) + # assert df.shape[0] > 0 + # assert df.columns == ["hi"] From df2750a77ac4f4162808879c5ce4aa0c292d188a Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 28 May 2024 13:48:26 -0400 Subject: [PATCH 014/106] Removed tqdm, fixed deprecated groupbys, fixed doctest long-line issue. 
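The deprecated-groupby fixes in this patch replace polars' old `groupby` spelling with `group_by` and `groupby_rolling` with `rolling`, matching the renamed DataFrame API. A minimal sketch of the new spellings on toy data (the column and window names here are illustrative only, not taken from the repo):

    import polars as pl
    from datetime import datetime

    df = pl.DataFrame({
        "patient_id": [1, 1, 2],
        "timestamp": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 5)],
        "value": [1.0, 2.0, 3.0],
    })

    # per-patient aggregation: df.groupby(...) becomes df.group_by(...)
    per_patient = df.group_by("patient_id", maintain_order=True).agg(pl.col("value").sum())

    # rolling-window aggregation: df.groupby_rolling(...) becomes df.rolling(...)
    windowed = (
        df.sort(["patient_id", "timestamp"])
        .rolling(index_column="timestamp", by="patient_id", period="2d")
        .agg(pl.col("value").sum().alias("2d/value/sum"))
    )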
--- scripts/tabularize_static.py | 5 ++--- scripts/tabularize_ts.py | 5 ++--- src/MEDS_tabular_automl/generate_static_features.py | 2 +- src/MEDS_tabular_automl/generate_summarized_reps.py | 4 ++-- src/MEDS_tabular_automl/utils.py | 9 ++++++--- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index e5cf9c5..27fb0c0 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -5,7 +5,6 @@ import hydra from omegaconf import DictConfig, OmegaConf -from tqdm.auto import tqdm from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.utils import setup_environment, write_df @@ -99,11 +98,11 @@ def tabularize_static_data( static_subdir = flat_dir / "static" static_dfs = {} - for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): + for sp, subjects_dfs in split_to_df.items(): static_dfs[sp] = [] sp_dir = static_subdir / sp - for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): + for i, shard_df in enumerate(subjects_dfs): fp = sp_dir / f"{i}.parquet" static_dfs[sp].append(fp) if fp.exists() and not cfg.do_overwrite: diff --git a/scripts/tabularize_ts.py b/scripts/tabularize_ts.py index 20d4022..5be3233 100644 --- a/scripts/tabularize_ts.py +++ b/scripts/tabularize_ts.py @@ -2,7 +2,6 @@ """Tabularizes time-series data in MEDS format into tabular representations.""" import hydra from omegaconf import DictConfig -from tqdm import tqdm from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.utils import setup_environment, write_df @@ -26,10 +25,10 @@ def tabularize_ts_data( # Produce ts representation ts_subdir = flat_dir / "ts" - for sp, subjects_dfs in tqdm(list(split_to_df.items()), desc="Flattening Splits"): + for sp, subjects_dfs in split_to_df.items(): sp_dir = ts_subdir / sp - for i, shard_df in enumerate(tqdm(subjects_dfs, desc="Subject chunks", leave=False)): + for i, shard_df in enumerate(subjects_dfs): pivot_fp = sp_dir / f"{i}.parquet" if pivot_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!") diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index ee28b77..b32c2b0 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -38,7 +38,7 @@ def summarize_static_measurements( # Handling 'first' static values static_first_codes = [parse_flat_feature_column(c)[1] for c in static_first] code_subset = df.filter(pl.col("code").is_in(static_first_codes)) - first_code_subset = code_subset.groupby(pl.col("patient_id")).first().collect() + first_code_subset = code_subset.group_by(pl.col("patient_id")).first().collect() static_value_pivot_df = first_code_subset.pivot( index=["patient_id"], columns=["code"], values=["numerical_value"], aggregate_function=None ) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 0036dc5..6e7255e 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -134,13 +134,13 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: raise ValueError(f"Invalid aggregation: {agg}. 
Valid options are: {VALID_AGGREGATIONS}") id_cols = [ROW_IDX_NAME, "patient_id"] if window_size == "full": - out_df = df.groupby(id_cols).agg( + out_df = df.group_by(id_cols).agg( "timestamp", get_agg_pl_expr(window_size, agg), ) out_df = out_df.explode(*[c for c in out_df.columns if c not in id_cols]) else: - out_df = df.groupby_rolling( + out_df = df.rolling( index_column="timestamp", by=id_cols, period=window_size, diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 22e2dd5..4478ee4 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -116,7 +116,9 @@ def get_static_feature_cols(shard_df) -> list[str]: Examples: >>> import polars as pl >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], - ... 'timestamp': [None, '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', None], # noqa: E501 + ... 'timestamp': [ + ... None, '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', None + ... ], ... 'numerical_value': [1, None, 2, 2, None, None, 3]} >>> df = pl.DataFrame(data).lazy() >>> get_static_feature_cols(df) @@ -182,8 +184,9 @@ def get_flat_rep_feature_cols(cfg: DictConfig, shard_df: DF_T) -> list[str]: >>> df = pl.DataFrame(data).lazy() >>> aggs = ['value/sum', 'code/count'] >>> cfg = DictConfig({'aggs': aggs}) - >>> get_flat_rep_feature_cols(cfg, df) - ['A/static/first', 'A/static/present', 'B/static/first', 'B/static/present', 'A/code/count', 'A/value/sum'] # noqa: 501 + >>> get_flat_rep_feature_cols(cfg, df) # doctest: +NORMALIZE_WHITESPACE + ['A/static/first', 'A/static/present', 'B/static/first', 'B/static/present', 'A/code/count', + 'A/value/sum'] """ static_feature_columns = get_static_feature_cols(shard_df) ts_feature_columns = get_ts_feature_cols(cfg.aggs, shard_df) From 4bbbc20f810b79fdbc2c1150d40093d8ed59c040 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 28 May 2024 14:18:10 -0400 Subject: [PATCH 015/106] Fixed one of the summary doctests. --- .../generate_summarized_reps.py | 111 +++++++++--------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 6e7255e..9172780 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -3,7 +3,7 @@ import polars as pl import polars.selectors as cs -from MEDS_tabular_automl.utils import DF_T, ROW_IDX_NAME +from MEDS_tabular_automl.utils import DF_T CODE_AGGREGATIONS = [ "code/count", @@ -95,54 +95,69 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: - pl.LazyFrame: The summarized data frame. Expect: - >>> from datetime import date - >>> pivot_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], - ... "A/code": [True, True, None, None], - ... "B/code": [None, None, True, True], + >>> from datetime import datetime + >>> wide_df = pl.DataFrame({ + ... "patient_id": [1, 1, 1, 2], + ... "A/code": [True, True, False, False], + ... "B/code": [False, False, True, True], ... "A/value": [1, 2, 3, None], ... "B/value": [None, None, None, 4.0], - ... "timestamp": [date(2020, 1, 1), date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)], - ... }).lazy().with_row_index(ROW_IDX_NAME) - >>> _generate_summary(pivot_df.lazy(), "2d", "code/count" - ... 
).collect().drop(ROW_IDX_NAME).sort(["patient_id", "timestamp"]) - shape: (4, 4) - ┌────────────┬────────────┬───────────────────┬───────────────────┐ - │ patient_id ┆ timestamp ┆ full/A/code/count ┆ full/B/code/count │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ date ┆ i64 ┆ i64 │ - ╞════════════╪════════════╪═══════════════════╪═══════════════════╡ - │ 1 ┆ 2020-01-03 ┆ 2 ┆ 1 │ - │ 1 ┆ 2021-01-01 ┆ 1 ┆ 0 │ - │ 1 ┆ 2021-01-02 ┆ 2 ┆ 0 │ - │ 2 ┆ 2021-01-04 ┆ 0 ┆ 1 │ - └────────────┴────────────┴───────────────────┴───────────────────┘ - >>> _generate_summary(pivot_df.lazy(), "full", "value/sum").collect().drop(ROW_IDX_NAME).sort( - ... ["patient_id", "timestamp"]) - shape: (4, 4) - ┌────────────┬────────────┬──────────────────┬──────────────────┐ - │ patient_id ┆ timestamp ┆ 1d/A/value/sum ┆ 1d/B/value/sum │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ date ┆ u32 ┆ u32 │ - ╞════════════╪════════════╪══════════════════╪══════════════════╡ - │ 1 ┆ 2020-01-03 ┆ 1 ┆ 0 │ - │ 1 ┆ 2021-01-01 ┆ 1 ┆ 0 │ - │ 1 ┆ 2021-01-02 ┆ 1 ┆ 0 │ - │ 2 ┆ 2021-01-04 ┆ 0 ┆ 1 │ - └────────────┴────────────┴──────────────────┴──────────────────┘ + ... "timestamp": [ + ... datetime(2020, 1, 1), + ... datetime(2021, 1, 1), + ... datetime(2021, 1, 2), + ... datetime(2011, 1, 3), + ... ], + ... }) + >>> wide_df # Just so we can see the data we're working with: + shape: (4, 6) + ┌────────────┬────────┬────────┬─────────┬─────────┬─────────────────────┐ + │ patient_id ┆ A/code ┆ B/code ┆ A/value ┆ B/value ┆ timestamp │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ bool ┆ bool ┆ i64 ┆ f64 ┆ datetime[μs] │ + ╞════════════╪════════╪════════╪═════════╪═════════╪═════════════════════╡ + │ 1 ┆ true ┆ false ┆ 1 ┆ null ┆ 2020-01-01 00:00:00 │ + │ 1 ┆ true ┆ false ┆ 2 ┆ null ┆ 2021-01-01 00:00:00 │ + │ 1 ┆ false ┆ true ┆ 3 ┆ null ┆ 2021-01-02 00:00:00 │ + │ 2 ┆ false ┆ true ┆ null ┆ 4.0 ┆ 2011-01-03 00:00:00 │ + └────────────┴────────┴────────┴─────────┴─────────┴─────────────────────┘ + >>> _generate_summary(wide_df.lazy(), "2d", "code/count").collect() + shape: (4, 5) + ┌────────────┬─────────────────────┬─────────────────┬─────────────────┐ + │ patient_id ┆ timestamp ┆ 2d/A/code/count ┆ 2d/B/code/count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ u32 ┆ u32 │ + ╞════════════╪═════════════════════╪═════════════════╪═════════════════╡ + │ 1 ┆ 2020-01-01 00:00:00 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 1 │ + │ 2 ┆ 2011-01-03 00:00:00 ┆ 0 ┆ 1 │ + └────────────┴─────────────────────┴─────────────────┴─────────────────┘ + >>> _generate_summary(wide_df.lazy(), "full", "value/sum").collect() + shape: (4, 5) + ┌────────────┬─────────────────────┬──────────────────┬──────────────────┐ + │ patient_id ┆ timestamp ┆ full/A/value/sum ┆ full/B/value/sum │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ i64 ┆ f64 │ + ╞════════════╪═════════════════════╪══════════════════╪══════════════════╡ + │ 1 ┆ 2020-01-01 00:00:00 ┆ 1 ┆ null │ + │ 1 ┆ 2021-01-01 00:00:00 ┆ 3 ┆ null │ + │ 1 ┆ 2021-01-02 00:00:00 ┆ 6 ┆ null │ + │ 2 ┆ 2011-01-03 00:00:00 ┆ null ┆ 4.0 │ + └────────────┴─────────────────────┴──────────────────┴──────────────────┘ """ if agg not in VALID_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}. 
Valid options are: {VALID_AGGREGATIONS}") - id_cols = [ROW_IDX_NAME, "patient_id"] if window_size == "full": - out_df = df.group_by(id_cols).agg( + out_df = df.group_by("patient_id", maintain_order=True).agg( "timestamp", get_agg_pl_expr(window_size, agg), ) - out_df = out_df.explode(*[c for c in out_df.columns if c not in id_cols]) + out_df = out_df.explode(*[c for c in out_df.columns if c != "patient_id"]) else: out_df = df.rolling( index_column="timestamp", - by=id_cols, + by="patient_id", period=window_size, ).agg( get_agg_pl_expr(window_size, agg), @@ -172,7 +187,7 @@ def generate_summary( Expect: >>> from datetime import date - >>> pivot_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], + >>> wide_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], ... "A/code": [1, 1, 0, 0], ... "B/code": [0, 0, 1, 1], ... "A/value": [1, 2, 3, None], @@ -182,23 +197,9 @@ def generate_summary( >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] >>> aggregations = ["code/count", "value/sum"] >>> window_sizes = ["full", "1d"] - >>> out_df = generate_summary(feature_columns, pivot_df.lazy(), - ... window_sizes, aggregations).collect().sort(["patient_id", "timestamp"]) - >>> print(out_df.shape) - (4, 10) - >>> for c in sorted(out_df.columns): print(c) - 1d/A/code/count - 1d/A/value/sum - 1d/B/code/count - 1d/B/value/sum - full/A/code/count - full/A/value/sum - full/B/code/count - full/B/value/sum - patient_id - timestamp + >>> generate_summary(feature_columns, wide_df.lazy(), window_sizes, aggregations).collect() """ - df = df.sort(["patient_id", "timestamp"]).with_row_index(ROW_IDX_NAME) + df = df.sort(["patient_id", "timestamp"]) final_columns = [] out_dfs = [] # Generate summaries for each window size and aggregation @@ -220,7 +221,7 @@ def generate_summary( final_columns = sorted(final_columns) # Combine all dataframes using successive joins - result_df = pl.concat(out_dfs, how="align").drop(ROW_IDX_NAME) + result_df = pl.concat(out_dfs, how="align") # Add in missing feature columns with default values missing_columns = [col for col in final_columns if col not in result_df.columns] result_df = result_df.with_columns([pl.lit(None).alias(col) for col in missing_columns]) From d39bf1a25ed1934beddc44134ef0e4575613c063 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Tue, 28 May 2024 21:05:01 +0000 Subject: [PATCH 016/106] updates based on formats... still many to dos --- xgboost_sweep.py | 246 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 175 insertions(+), 71 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index ad57e1e..1ed43b5 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -11,6 +11,7 @@ import os from typing import List, Callable + class Iterator(xgb.DataIter): def __init__(self, cfg: DictConfig, split: str = "train"): """ @@ -21,38 +22,75 @@ def __init__(self, cfg: DictConfig, split: str = "train"): - split (str): The data split to use ("train", "tuning", or "held_out"). """ - self.cfg = cfg self.data_path = Path(cfg.tabularized_data_dir) self.dynamic_data_path = self.data_path / "summarize" / split self.static_data_path = self.data_path / "static" / split + self._data_shards = [ x.stem for x in self.static_data_path.iterdir() if x.is_file() and x.suffix == ".parquet" ] - if cfg.iterator.keep_static_data_in_memory: - self._static_shards = self._get_static_shards() # do we want to cache this differently to share across workers or iterators? 
+ self._static_shards = ( + self._get_static_shards() + ) # do we want to cache this differently to share across workers or iterators? + + self.codes_set, self.aggs_set, self.min_frequency_set = self._get_inclusion_sets() self._it = 0 + # XGBoost will generate some cache files under current directory with the prefix # "cache" super().__init__(cache_prefix=os.path.join(".", "cache")) - + + def _get_inclusion_sets(self) -> tuple[set, set, set]: + """ + Get the inclusion sets for codes and aggregations. + + Returns: + - tuple[set, set, set]: Tuple of sets for codes, aggregations, and minimum code frequency. + """ + codes_set = None + aggs_set = None + min_frequency_set = None + if self.cfg.codes is not None: + codes_set = set(self.cfg.codes) + if self.cfg.aggs is not None: + aggs_set = set(self.cfg.aggs) + if self.cfg.min_code_inclusion_frequency is not None: + # given parquet file with code frequencies for overall dataset, find which codes have high enough frequency to be included and make a set of them + dataset_freuqency = pl.scan_parquet( + self.data_path / "code_frequencies.parquet" # TODO: make sure this is the right path + ) + min_frequency_set = set( + dataset_freuqency.filter( + cs.col("frequency") >= self.cfg.min_code_inclusion_frequency + ) + .select("code") + .collect() + .to_numpy() + .flatten() + ) + + return codes_set, aggs_set, min_frequency_set + def _get_static_shards(self) -> dict: """ Load static shards into memory. Returns: - dict: Dictionary with shard names as keys and data frames as values. - + """ static_shards = {} for iter in self._data_shards: - static_shards[iter] = pl.scan_parquet(self.static_data_path / f"{iter}.parquet") + static_shards[iter] = pl.scan_parquet( + self.static_data_path / f"{iter}.parquet" + ) return static_shards - + def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: """ Load a specific shard of data from disk and concatenate with static data. @@ -61,49 +99,73 @@ def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: - idx (int): Index of the shard to load. Returns: - - X (pl.DataFrame): Feature data frame. - - y (pl.Series): Labels. - + - X (np.ndarray): Feature data frame. + - y (np.ndarray): Labels. + """ # concatinate with static data if self.cfg.iterator.keep_static_data_in_memory: df = self._static_shards[self._data_shards[idx]] else: - df = pl.scan_parquet(self.static_data_path / f"{self._data_shards[idx]}.parquet") - - - ### TODO: Add in min_code_inclusion_frequency? - - codes_set = set(self.cfg.codes) if self.cfg.codes else None - aggs_set = set(self.cfg.aggs) if self.cfg.aggs else None + df = pl.scan_parquet( + self.static_data_path / f"{self._data_shards[idx]}.parquet" + ) for window in self.cfg.window_sizes: dynamic_df = pl.scan_parquet( self.dynamic_data_path / window / f"{self._data_shards[idx]}.parquet" ) - ### TODO: Update this for the correct order of column names from Nassim - columns = dynamic_df.schema.keys() # should I use df.columns instead? 
+ columns = dynamic_df.schema.names selected_columns = [ - col for col in columns - if (parts := col.split('/')) and len(parts) > 2 - and (codes_set is None or parts[0] in codes_set) - and (aggs_set is None or parts[-1] in aggs_set) + col + for col in columns + if (parts := col.split("/")) + and len(parts) > 3 + and (self.codes_set is None or "/".join(parts[1:-2]) in self.codes_set) + and (self.min_frequency_set is None or "/".join(parts[1:-2]) in self.min_frequency_set) + and (self.aggs_set is None or "/".join(parts[-2:]) in self.aggs_set) ] - selected_columns.extend(['patient_id', 'timestamp']) + selected_columns.extend(["patient_id", "timestamp"]) dynamic_df = dynamic_df.select(selected_columns) + # Task data + task_df = pl.scan_parquet(self.data_path / "tasks.parquet") # need to know if this should be done every time or if it is pulled once for all the data... also need to know if this is the right path + task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.names}) # TODO: filtering of the tasks?? + df = task_df.join_asof( + df, + by="subject_id", + on="timestamp", + strategy="forward" if "-" in window else "backward", + ) + df = pl.concat([df, dynamic_df], how="align") - df = pl.concat([df, dynamic_df], how='align') + ### TODO: add in some type checking etc for safety - ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks + ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks - y = df.select("label") - X = df.select([col for col in df.schema.keys() if col != "label"]) - ### TODO: Figure out best way to export this to dmatrix --> can we use scipy sparse matrix? - ### TODO: fill nones/nulls with zero if this is needed for xgboost - return X.collect().to_numpy(), y.collect().to_numpy() # convert to sparse matrix instead + y = df.select( + [ + col + for col in df.schema.names + if col.endswith("/task") + ] + ) + X = df.select( + [ + col + for col in df.schema.names + if col not in ["label", "patient_id", "timestamp"] + and not col.endswith("/task") + ] + ) + + ### TODO: Figure out best way to export this to dmatrix --> can we use scipy sparse matrix/array? --> likely we will not be able to collect in memory + return ( + X.collect().to_numpy(), + y.collect().to_numpy(), + ) # convert to sparse matrix instead def next(self, input_data: Callable): """ @@ -132,25 +194,29 @@ def reset(self): """ Reset the iterator to its beginning. - Example: - >>> cfg_dict = { - ... "tabularize": { - ... "tabularized_data_dir": "/path/to/tabularized/data", - ... }, - ... "iterator": { - ... "keep_static_data_in_memory": True - ... } - ... } - >>> cfg = OmegaConf.create(cfg_dict) - >>> it = Iterator(cfg, split='train') - >>> it._it = 1 - >>> it.reset() - >>> it._it - 0 """ self._it = 0 -class XGBoostClassifier: + def collect_in_memory(self) -> tuple[np.ndarray, np.ndarray]: + """ + Collect the data in memory. + + Returns: + - tuple[np.ndarray, np.ndarray]: Tuple of feature data and labels. + + """ + X = [] + y = [] + for i in range(len(self._data_shards)): + X_, y_ = self._load_shard(i) + X.append(X_) + y.append(y_) + X = np.concatenate(X, axis=0) + y = np.concatenate(y, axis=0) + return X, y + + +class XGBoostModel: def __init__(self, cfg: DictConfig): """ Initialize the XGBoostClassifier with the provided configuration. 
@@ -160,16 +226,71 @@ def __init__(self, cfg: DictConfig): """ self.cfg = cfg + self.keep_data_in_memory = getattr( + getattr(cfg, "model", {}), "keep_data_in_memory", True + ) + + self.itrain = None + self.ival = None + self.itest = None + + self.dtrain = None + self.dval = None + self.dtest = None - self.itrain = Iterator(cfg) - self.ival = Iterator(cfg, split="tuning") - self.itest = Iterator(cfg, split="held_out") + self.model = None + + def train(self): + """ + Train the model. + + """ + self._build() + self.model = xgb.train( + OmegaConf.to_container(self.cfg.model), self.dtrain + ) # do we want eval and things? + + def _build(self): + """ + Build necessary data structures for training. + """ + if self.keep_data_in_memory: + self._build_iterators() + self._build_dmatrix_in_memory() + else: + self._build_iterators() + self._build_dmatrix_from_iterators() + + def _build_dmatrix_in_memory(self): + """ + Build the DMatrix from the data in memory. + + """ + X_train, y_train = self.itrain.collect_in_memory() + X_val, y_val = self.ival.collect_in_memory() + X_test, y_test = self.itest.collect_in_memory() + self.dtrain = xgb.DMatrix(X_train, label=y_train) + self.dval = xgb.DMatrix(X_val, label=y_val) + self.dtest = xgb.DMatrix(X_test, label=y_test) + + def _build_dmatrix_from_iterators(self): + """ + Build the DMatrix from the iterators. + + """ self.dtrain = xgb.DMatrix(self.ival) self.dval = xgb.DMatrix(self.itest) self.dtest = xgb.DMatrix(self.itest) - self.model = xgb.train(OmegaConf.to_container(self.cfg.model), self.dtrain) + def _build_iterators(self): + """ + Build the iterators for training, validation, and testing. + + """ + self.itrain = Iterator(self.cfg, split="train") + self.ival = Iterator(self.cfg, split="tuning") + self.itest = Iterator(self.cfg, split="held_out") def evaluate(self) -> float: """ @@ -178,29 +299,11 @@ def evaluate(self) -> float: Returns: - float: Evaluation metric (mae). - Example: - >>> cfg_dict = { - ... "model": { - ... "booster": "gbtree", - ... "objective": "reg:squarederror", - ... } - ... 
} - >>> cfg = OmegaConf.create(cfg_dict) - >>> classifier = XGBoostClassifier(cfg=cfg) - - >>> n_samples = 1000 - >>> n_features = 10 - >>> X_test = np.random.rand(n_samples, n_features) - >>> y_test = np.random.rand(n_samples) - - >>> mae = classifier.evaluate(X_test, y_test) - >>> isinstance(mae, float) - True """ ### TODO: Figure out exactly what we want to do here y_pred = self.model.predict(self.dtest) - y_true = self.dtest.get_label() + y_true = self.dtest.get_label() return mean_absolute_error(y_true, y_pred) @@ -217,7 +320,8 @@ def optimize(cfg: DictConfig) -> float: """ - model = XGBoostClassifier(cfg) + model = XGBoostModel(cfg) + model.train() return model.evaluate() From 41fe4b41a1318ca08352c718767364785452f5ee Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 29 May 2024 03:07:40 +0000 Subject: [PATCH 017/106] using sparse matrices for generating time series representations --- pyproject.toml | 2 +- scripts/tabularize_static.py | 5 +- .../generate_ts_features.py | 151 +++++++++++------- src/MEDS_tabular_automl/utils.py | 16 +- 4 files changed, 110 insertions(+), 64 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1aa7f41..53155e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy"] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas"] [project.optional-dependencies] dev = ["pre-commit"] diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index 27fb0c0..daf4ea6 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -4,11 +4,14 @@ from pathlib import Path import hydra +import polars as pl from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.utils import setup_environment, write_df +pl.enable_string_cache() + def store_config_yaml(config_fp: Path, cfg: DictConfig): """Stores configuration parameters into a JSON file. 
@@ -113,7 +116,7 @@ def tabularize_static_data( shard_df=shard_df, ) - write_df(df, fp, do_overwrite=cfg.do_overwrite) + write_df(df, fp, do_overwrite=cfg.do_overwrite, pandas=True) if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 1e6ac71..db354ec 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -1,12 +1,59 @@ +import warnings + +import numpy as np +import pandas as pd import polars as pl +from scipy.sparse import csc_array + +from MEDS_tabular_automl.utils import DF_T + +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def get_ts_columns(feature_columns): + def get_code_type(c): + return c.split("/")[-2] == "code" + + def get_code_name(c): + return "/".join(c.split("/")[0:-2]) + + ts_columns = sorted(list({get_code_name(c) for c in feature_columns if not get_code_type(c) == "static"})) + return ts_columns + -from MEDS_tabular_automl.utils import DF_T, ROW_IDX_NAME +def fill_missing_entries_with_nan(sparse_df, type): + # Fill missing entries with NaN + for col in sparse_df.columns: + sparse_df[col] = sparse_df[col].astype(pd.SparseDtype(type, fill_value=np.nan)) + return sparse_df + + +def get_long_code_df(df, ts_columns): + column_to_int = {col: i for i, col in enumerate(ts_columns)} + rows = range(len(df)) + cols = df["code"].map(column_to_int) + data = np.ones(len(df), dtype=np.bool_) + sparse_matrix = csc_array((data, (rows, cols)), shape=(len(df), len(ts_columns))) + long_code_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=ts_columns) + # long_code_df = fill_missing_entries_with_nan(long_code_df, bool) + return long_code_df + + +def get_long_value_df(df, ts_columns): + column_to_int = {col: i for i, col in enumerate(ts_columns)} + value_rows = range(len(df)) + value_cols = df["code"].map(column_to_int) + value_data = df["numerical_value"] + value_sparse_matrix = csc_array((value_data, (value_rows, value_cols)), shape=(len(df), len(ts_columns))) + long_value_df = pd.DataFrame.sparse.from_spmatrix(value_sparse_matrix, columns=ts_columns) + # long_value_df = fill_missing_entries_with_nan(long_value_df, np.float64) + return long_value_df def summarize_dynamic_measurements( ts_columns: list[str], df: DF_T, -) -> pl.LazyFrame: +) -> pd.DataFrame: """Summarize dynamic measurements for feature columns that are marked as 'dynamic'. Args: @@ -24,49 +71,37 @@ def summarize_dynamic_measurements( ... 
'numerical_value': [1, 2, 2, 2]} >>> df = pl.DataFrame(data).lazy() >>> ts_columns = ['A', 'B'] - >>> pivot_df = summarize_dynamic_measurements(ts_columns, df) - >>> pivot_df.collect() - shape: (4, 7) - ┌───────────┬────────────┬────────────┬─────────┬─────────┬────────┬────────┐ - │ __row_idx ┆ patient_id ┆ timestamp ┆ A/value ┆ B/value ┆ A/code ┆ B/code │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ bool ┆ bool │ - ╞═══════════╪════════════╪════════════╪═════════╪═════════╪════════╪════════╡ - │ 0 ┆ 1 ┆ 2021-01-01 ┆ 1 ┆ null ┆ true ┆ null │ - │ 1 ┆ 1 ┆ 2021-01-01 ┆ 2 ┆ null ┆ true ┆ null │ - │ 2 ┆ 1 ┆ 2020-01-01 ┆ null ┆ 2 ┆ null ┆ true │ - │ 3 ┆ 2 ┆ 2021-01-04 ┆ null ┆ 2 ┆ null ┆ true │ - └───────────┴────────────┴────────────┴─────────┴─────────┴────────┴────────┘ + >>> long_df = summarize_dynamic_measurements(ts_columns, df) + >>> long_df.head() + patient_id timestamp A/value B/value A/code B/code + 0 1 2021-01-01 1.0 NaN True NaN + 1 1 2021-01-01 2.0 NaN True NaN + 2 1 2020-01-01 NaN 2.0 NaN True + 3 2 2021-01-04 NaN 2.0 NaN True + >>> long_df.shape + (4, 6) + >>> long_df = summarize_dynamic_measurements(ts_columns, df.filter(pl.col("code") == "A")) + >>> long_df + patient_id timestamp A/value B/value A/code B/code + 0 1 2021-01-01 1.0 NaN True NaN + 1 1 2021-01-01 2.0 NaN True NaN """ - df = df.with_row_index(ROW_IDX_NAME) - id_cols = [ROW_IDX_NAME, "patient_id", "timestamp"] - pivot_df = ( - df.select(*id_cols, "code", "numerical_value") - .with_columns(pl.lit(True).alias("__indicator")) - .collect() - .pivot( - index=id_cols, # add row index and set agg to None - columns=["code"], - values=["numerical_value", "__indicator"], - aggregate_function=None, # TODO round up counts so they are binary - separator="/", - ) - .lazy() + df = df.collect().to_pandas() + id_cols = ["patient_id", "timestamp"] + code_df = df.drop(columns=id_cols + ["numerical_value"]) + long_code_df = get_long_code_df(code_df, ts_columns) + + value_df = df.drop(columns=id_cols) + long_value_df = get_long_value_df(value_df, ts_columns) + long_df = pd.concat( + [ + df[["patient_id", "timestamp"]], + long_value_df.rename(columns=lambda c: f"{c}/value"), + long_code_df.rename(columns=lambda c: f"{c}/code"), + ], + axis=1, ) - - def rename(c): - """Remove value and column prefix.""" - numerical_val_col_name = "numerical_value" - indicator_col_name = "__indicator" - if c.startswith(numerical_val_col_name): - return f"{c[len(numerical_val_col_name)+6:]}/value" - elif c.startswith(indicator_col_name): - return f"{c[len(indicator_col_name)+6:]}/code" - else: - return c - - pivot_df = pivot_df.rename(rename) - return pivot_df + return long_df def get_flat_ts_rep( @@ -91,30 +126,24 @@ def get_flat_ts_rep( representations. Example: - >>> feature_columns = ['A', 'B', 'C', "A/static/present"] + >>> feature_columns = ['A/value/sum', 'A/code/count', 'B/value/sum', 'B/code/count', + ... "C/value/sum", "C/code/count", "A/static/present"] >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], ... 
'numerical_value': [1, 2, 2, 2, 3, 4]} >>> df = pl.DataFrame(data).lazy() >>> pivot_df = get_flat_ts_rep(feature_columns, df) - >>> pivot_df.collect() - shape: (4, 7) - ┌───────────┬────────────┬────────────┬─────────┬─────────┬────────┬────────┐ - │ __row_idx ┆ patient_id ┆ timestamp ┆ A/value ┆ B/value ┆ A/code ┆ B/code │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ bool ┆ bool │ - ╞═══════════╪════════════╪════════════╪═════════╪═════════╪════════╪════════╡ - │ 0 ┆ 1 ┆ 2021-01-01 ┆ 1 ┆ null ┆ true ┆ null │ - │ 1 ┆ 1 ┆ 2021-01-01 ┆ 2 ┆ null ┆ true ┆ null │ - │ 2 ┆ 1 ┆ 2020-01-01 ┆ null ┆ 2 ┆ null ┆ true │ - │ 3 ┆ 2 ┆ 2021-01-04 ┆ null ┆ 2 ┆ null ┆ true │ - └───────────┴────────────┴────────────┴─────────┴─────────┴────────┴────────┘ + >>> pivot_df + patient_id timestamp A/value B/value C/value A/code B/code C/code + 0 1 2021-01-01 1.0 NaN NaN True NaN NaN + 1 1 2021-01-01 2.0 NaN NaN True NaN NaN + 2 1 2020-01-01 NaN 2.0 NaN NaN True NaN + 3 2 2021-01-04 NaN 2.0 NaN NaN True NaN """ - def is_static(c): - return len(c.split("/")) > 2 and c.split("/")[-2] == "static" - - ts_columns = [c for c in feature_columns if not is_static(c)] - ts_shard_df = shard_df.filter(pl.col("timestamp").is_not_null()) + ts_columns = get_ts_columns(feature_columns) + ts_shard_df = shard_df.drop_nulls( + subset=["timestamp", "code"] + ) # filter(pl.col("timestamp", "").is_not_null()) return summarize_dynamic_measurements(ts_columns, ts_shard_df) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 4478ee4..abba1c9 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -9,6 +9,7 @@ from collections.abc import Mapping from pathlib import Path +import pandas as pd import polars as pl import polars.selectors as cs import yaml @@ -37,8 +38,21 @@ def write_df(df: DF_T, fp: Path, **kwargs): if isinstance(df, pl.LazyFrame): df.collect().write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) - else: + elif isinstance(df, pl.DataFrame): df.write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) + elif isinstance(df, pd.DataFrame): + if not all(df.columns[:2] == ["patient_id", "timestamp"]): + raise ValueError( + f"Expected DataFrame to have columns ['patient_id', 'timestamp'], got {df.columns[:2]}" + ) + coo_matrix = df[df.columns[2:]].sparse.to_coo() + rows = coo_matrix.row + cols = coo_matrix.col + data = coo_matrix.data + df = pd.DataFrame(dict(row=rows, col=cols, data=data)) + df.to_parquet(fp, engine="pyarrow") + else: + raise ValueError(f"Unsupported type for df: {type(df)}") def get_flat_col_dtype(col: str) -> pl.DataType: From cb5f689830d05c682555081d12036fd737d446d1 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Wed, 29 May 2024 12:59:36 +0000 Subject: [PATCH 018/106] still working on sparse matrix to external memory xgboost --- xgboost_sweep.py | 96 ++++++++++++++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index 1ed43b5..5751fd4 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -4,10 +4,9 @@ import xgboost as xgb import polars as pl import numpy as np -import pyarrow as pa import polars.selectors as cs from sklearn.metrics import mean_absolute_error - +import scipy.sparse as sp import os from typing import List, Callable @@ -91,7 +90,54 @@ def _get_static_shards(self) -> dict: ) return static_shards - def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: + def _sparsify_shard(self, df: pl.DataFrame) -> 
tuple[sp.csc_matrix, sp.csc_matrix]: + """ + Make X and y as scipy sparse arrays for XGBoost. + + Args: + - df (pl.DataFrame): Data frame to sparsify. + + Returns: + - tuple[sp.csc_matrix, sp.csc_matrix]: Tuple of feature data and labels. + + """ + ### TODO: make sure we are handling nulls and 0s correctly + + # labels = df.select( + # [ + # col + # for col in df.schema.keys() + # if col.endswith("/task") + # ] + # ) + labels = df.select( + [ + col + for col in df.schema.keys() + if col in [ "patient_id"] + ] + ) + data = df.select( + [ + col + for col in df.schema.keys() + if col not in ["label", "patient_id", "timestamp"] + and not col.endswith("/task") + ] + ) + X, y = None, None + ### TODO: This could be optimized so that we are collecting the largest shards possible at once and then sparsifying them + X = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith("static/")]).collect().to_numpy()) ### check if this is true!, else just doesnt start with window + for window in self.cfg.window_sizes: + col_csc = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith(f"{window}/")]).collect().to_numpy()) + X = sp.hstack([X, col_csc]) + + y = sp.csc_matrix(labels.collect().to_numpy()) + + ### TODO: fix the need to convert to array here!!! + return X.toarray(), y.toarray() + + def _load_shard(self, idx: int) -> tuple[sp.csc_matrix, sp.csc_matrix]: """ Load a specific shard of data from disk and concatenate with static data. @@ -99,8 +145,8 @@ def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: - idx (int): Index of the shard to load. Returns: - - X (np.ndarray): Feature data frame. - - y (np.ndarray): Labels. + - X (scipy.sparse.csc_matrix): Feature data frame. + - y (scipy.sparse.csc_matrix): Labels. """ # concatinate with static data @@ -116,7 +162,7 @@ def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: self.dynamic_data_path / window / f"{self._data_shards[idx]}.parquet" ) - columns = dynamic_df.schema.names + columns = dynamic_df.schema.keys() selected_columns = [ col for col in columns @@ -128,15 +174,6 @@ def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: ] selected_columns.extend(["patient_id", "timestamp"]) dynamic_df = dynamic_df.select(selected_columns) - # Task data - task_df = pl.scan_parquet(self.data_path / "tasks.parquet") # need to know if this should be done every time or if it is pulled once for all the data... also need to know if this is the right path - task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.names}) # TODO: filtering of the tasks?? - df = task_df.join_asof( - df, - by="subject_id", - on="timestamp", - strategy="forward" if "-" in window else "backward", - ) df = pl.concat([df, dynamic_df], how="align") @@ -144,28 +181,15 @@ def _load_shard(self, idx: int) -> tuple[np.ndarray, np.ndarray]: ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks + # task_df = pl.scan_parquet(self.data_path / "tasks.parquet") + # task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.keys()}) # TODO: filtering of the tasks?? 
--> need to know more about tasks + # ### TODO: Change to join_on with left merge orig df on left, labels on right join on subject_id and timestamp + # df = df.join(task_df, on=["subject_id", "timestamp"], how="left") - y = df.select( - [ - col - for col in df.schema.names - if col.endswith("/task") - ] - ) - X = df.select( - [ - col - for col in df.schema.names - if col not in ["label", "patient_id", "timestamp"] - and not col.endswith("/task") - ] - ) - ### TODO: Figure out best way to export this to dmatrix --> can we use scipy sparse matrix/array? --> likely we will not be able to collect in memory - return ( - X.collect().to_numpy(), - y.collect().to_numpy(), - ) # convert to sparse matrix instead + ### TODO: Figure out best way to export this to dmatrix + # --> can we use scipy sparse matrix/array? --> likely we will not be able to collect in memory + return self._sparsify_shard(df) def next(self, input_data: Callable): """ @@ -227,7 +251,7 @@ def __init__(self, cfg: DictConfig): self.cfg = cfg self.keep_data_in_memory = getattr( - getattr(cfg, "model", {}), "keep_data_in_memory", True + getattr(cfg, "iterator", {}), "keep_data_in_memory", True ) self.itrain = None From 8bc9a16266806cd482fd33e6340239c5ed1a9308 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Wed, 29 May 2024 13:18:59 +0000 Subject: [PATCH 019/106] same problem --- xgboost_sweep.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index 5751fd4..8c2e3e6 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -9,6 +9,7 @@ import scipy.sparse as sp import os from typing import List, Callable +import sys class Iterator(xgb.DataIter): @@ -127,7 +128,7 @@ def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, sp.csc_matri ) X, y = None, None ### TODO: This could be optimized so that we are collecting the largest shards possible at once and then sparsifying them - X = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith("static/")]).collect().to_numpy()) ### check if this is true!, else just doesnt start with window + X = sp.csc_matrix(data.select([col for col in data.schema.keys() if not col.startswith(tuple(self.cfg.window_sizes))]).collect().to_numpy()) ### check if this is true!, else just doesnt start with window for window in self.cfg.window_sizes: col_csc = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith(f"{window}/")]).collect().to_numpy()) X = sp.hstack([X, col_csc]) From 1e275265cadd2109c5083bb2056bddd7900b6486 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Wed, 29 May 2024 13:22:21 +0000 Subject: [PATCH 020/106] cleaned some testing --- xgboost_sweep.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index 8c2e3e6..9c0bca8 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -35,7 +35,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): if cfg.iterator.keep_static_data_in_memory: self._static_shards = ( self._get_static_shards() - ) # do we want to cache this differently to share across workers or iterators? 
+ ) self.codes_set, self.aggs_set, self.min_frequency_set = self._get_inclusion_sets() @@ -60,7 +60,6 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: if self.cfg.aggs is not None: aggs_set = set(self.cfg.aggs) if self.cfg.min_code_inclusion_frequency is not None: - # given parquet file with code frequencies for overall dataset, find which codes have high enough frequency to be included and make a set of them dataset_freuqency = pl.scan_parquet( self.data_path / "code_frequencies.parquet" # TODO: make sure this is the right path ) @@ -102,20 +101,11 @@ def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, sp.csc_matri - tuple[sp.csc_matrix, sp.csc_matrix]: Tuple of feature data and labels. """ - ### TODO: make sure we are handling nulls and 0s correctly - - # labels = df.select( - # [ - # col - # for col in df.schema.keys() - # if col.endswith("/task") - # ] - # ) labels = df.select( [ col for col in df.schema.keys() - if col in [ "patient_id"] + if col.endswith("/task") ] ) data = df.select( @@ -128,7 +118,7 @@ def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, sp.csc_matri ) X, y = None, None ### TODO: This could be optimized so that we are collecting the largest shards possible at once and then sparsifying them - X = sp.csc_matrix(data.select([col for col in data.schema.keys() if not col.startswith(tuple(self.cfg.window_sizes))]).collect().to_numpy()) ### check if this is true!, else just doesnt start with window + X = sp.csc_matrix(data.select([col for col in data.schema.keys() if not col.startswith(tuple(self.cfg.window_sizes))]).collect().to_numpy()) for window in self.cfg.window_sizes: col_csc = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith(f"{window}/")]).collect().to_numpy()) X = sp.hstack([X, col_csc]) @@ -150,7 +140,7 @@ def _load_shard(self, idx: int) -> tuple[sp.csc_matrix, sp.csc_matrix]: - y (scipy.sparse.csc_matrix): Labels. """ - # concatinate with static data + if self.cfg.iterator.keep_static_data_in_memory: df = self._static_shards[self._data_shards[idx]] else: @@ -182,10 +172,10 @@ def _load_shard(self, idx: int) -> tuple[sp.csc_matrix, sp.csc_matrix]: ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks - # task_df = pl.scan_parquet(self.data_path / "tasks.parquet") - # task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.keys()}) # TODO: filtering of the tasks?? --> need to know more about tasks - # ### TODO: Change to join_on with left merge orig df on left, labels on right join on subject_id and timestamp - # df = df.join(task_df, on=["subject_id", "timestamp"], how="left") + task_df = pl.scan_parquet(self.data_path / "tasks.parquet") + task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.keys()}) # TODO: filtering of the tasks?? --> need to know more about tasks + ### TODO: Change to join_on with left merge orig df on left, labels on right join on subject_id and timestamp + df = df.join(task_df, on=["subject_id", "timestamp"], how="left") ### TODO: Figure out best way to export this to dmatrix From 97938a8b3ff3128cd3eab1f5825e0c5e167134ea Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 29 May 2024 14:34:36 +0000 Subject: [PATCH 021/106] sped up the tabularize_ts script by about 30% by concatenating the sparse matrices instead of the sparse dataframes for values and codes. Also started modifying the summarization script to work with the sparse dataframes. 
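The gist of the speed-up described above: rather than building one pandas sparse
DataFrame for the value columns and another for the code-indicator columns and
concatenating them, the two sets of COO triplets (data, row, column) are concatenated
as plain numpy arrays, materialized as a single scipy sparse matrix, and wrapped in a
pandas sparse DataFrame exactly once. A minimal sketch of that idea follows; the toy
codes, values, and column names are illustrative, not the project's real shard schema.

import numpy as np
import pandas as pd
from scipy.sparse import csc_array

ts_columns = ["A", "B"]                      # dynamic feature names (toy example)
codes = pd.Series(["A", "A", "B"])           # one measurement code per row
values = pd.Series([1.0, 2.0, 2.0])          # numerical value per row
col_map = {c: i for i, c in enumerate(ts_columns)}

rows = np.arange(len(codes))
value_cols = codes.map(col_map).to_numpy()   # value columns occupy the first block
code_cols = value_cols + len(ts_columns)     # code indicators are offset past them

# Concatenate the raw COO triplets instead of concatenating sparse DataFrames.
data = np.concatenate([values.to_numpy(), np.ones(len(codes))])
row_idx = np.concatenate([rows, rows])
col_idx = np.concatenate([value_cols, code_cols])

matrix = csc_array((data, (row_idx, col_idx)), shape=(len(codes), 2 * len(ts_columns)))
columns = [f"{c}/value" for c in ts_columns] + [f"{c}/code" for c in ts_columns]
long_df = pd.DataFrame.sparse.from_spmatrix(matrix, columns=columns)  # wrapped once, at the end
print(long_df)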
--- scripts/summarize_over_windows.py | 70 +++++------- scripts/tabularize_ts.py | 10 +- .../generate_summarized_reps.py | 100 ++++++------------ .../generate_ts_features.py | 90 ++++++++-------- 4 files changed, 114 insertions(+), 156 deletions(-) diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index 5efeb17..42f7b7c 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -1,16 +1,13 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" - - -from pathlib import Path - import hydra import polars as pl from loguru import logger from omegaconf import DictConfig from MEDS_tabular_automl.generate_summarized_reps import generate_summary +from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.utils import setup_environment, write_df @@ -44,52 +41,39 @@ def summarize_ts_data_over_windows( FileNotFoundError: If specified directories or files in the configuration are not found. ValueError: If required columns like 'code' or 'value' are missing in the data files. """ - flat_dir, _, feature_columns = setup_environment(cfg) - - # Assuming MEDS_cohort_dir is correctly defined somewhere above this snippet - ts_dir = Path(cfg.tabularized_data_dir) / "ts" - # TODO: Use patient splits here instead - ts_fps = list(ts_dir.glob("*/*.parquet")) - splits = {fp.parent.stem for fp in ts_fps} - - split_to_pair_fps = {} - for split in splits: - # Categorize files by identifier (base name without '_code' or '_value') using a list comprehension - categorized_files = { - file.stem.rsplit("_", 1)[0]: {"code": None, "value": None} - for file in ts_fps - if file.parent.stem == split - } - for file in ts_fps: - if file.parent.stem == split: - identifier = file.stem.rsplit("_", 1)[0] - suffix = file.stem.split("_")[-1] # 'code' or 'value' - categorized_files[identifier][suffix] = file - - # Process categorized files into pairs ensuring code is first and value is second - code_value_pairs = [ - (info["code"], info["value"]) - for info in categorized_files.values() - if info["code"] is not None and info["value"] is not None - ] - - split_to_pair_fps[split] = code_value_pairs + flat_dir, split_to_df, feature_columns = setup_environment(cfg) + # Produce ts representation + ts_subdir = flat_dir / "ts" + + for sp, subjects_dfs in split_to_df.items(): + sp_dir = ts_subdir / sp + if sp != "train": + continue + + for i, shard_df in enumerate(subjects_dfs): + pivot_fp = sp_dir / f"{i}.parquet" + if pivot_fp.exists() and not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!") + if sp != "train": + # remove codes not in training set + shard_df = shard_df.filter(pl.col("code").is_in(feature_columns)) + + # Load Sparse DataFrame + pivot_df = get_flat_ts_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) - # Summarize data and store - summary_dir = flat_dir / "summary" - for split, pairs in split_to_pair_fps.items(): - logger.info(f"Processing {split}:") - for code_file, value_file in pairs: - logger.info(f" - Code file: {code_file}, Value file: {value_file}") + # Summarize data -- applying aggregations on various window sizes summary_df = generate_summary( feature_columns, - [pl.scan_parquet(code_file), pl.scan_parquet(value_file)], + pivot_df, cfg.window_sizes, cfg.aggs, ) - shard_number = code_file.stem.rsplit("_", 1)[0] - write_df(summary_df, summary_dir / split / f"{shard_number}.parquet") + 
logger.info("Writing pivot file") + write_df(summary_df, pivot_fp, do_overwrite=cfg.do_overwrite) if __name__ == "__main__": diff --git a/scripts/tabularize_ts.py b/scripts/tabularize_ts.py index 5be3233..09f79e5 100644 --- a/scripts/tabularize_ts.py +++ b/scripts/tabularize_ts.py @@ -1,7 +1,11 @@ #!/usr/bin/env python """Tabularizes time-series data in MEDS format into tabular representations.""" + import hydra +import polars as pl +from loguru import logger from omegaconf import DictConfig +from tqdm import tqdm from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.utils import setup_environment, write_df @@ -28,15 +32,19 @@ def tabularize_ts_data( for sp, subjects_dfs in split_to_df.items(): sp_dir = ts_subdir / sp - for i, shard_df in enumerate(subjects_dfs): + for i, shard_df in enumerate(tqdm(subjects_dfs)): pivot_fp = sp_dir / f"{i}.parquet" if pivot_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!") + if sp != "train": + # remove codes not in training set + shard_df = shard_df.filter(pl.col("code").is_in(feature_columns)) pivot_df = get_flat_ts_rep( feature_columns=feature_columns, shard_df=shard_df, ) + logger.info("Writing pivot file") write_df(pivot_df, pivot_fp, do_overwrite=cfg.do_overwrite) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 9172780..7f2f661 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,10 +1,9 @@ from collections.abc import Callable +import pandas as pd import polars as pl import polars.selectors as cs -from MEDS_tabular_automl.utils import DF_T - CODE_AGGREGATIONS = [ "code/count", ] @@ -83,7 +82,7 @@ def get_agg_pl_expr(window_size: str, agg: str): raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") -def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: +def _generate_summary(df: pd.DataFrame, window_size: str, agg: str) -> pl.LazyFrame: """Generate a summary of the data frame for a given window size and aggregation. Args: @@ -95,61 +94,28 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: - pl.LazyFrame: The summarized data frame. Expect: - >>> from datetime import datetime - >>> wide_df = pl.DataFrame({ - ... "patient_id": [1, 1, 1, 2], - ... "A/code": [True, True, False, False], - ... "B/code": [False, False, True, True], - ... "A/value": [1, 2, 3, None], - ... "B/value": [None, None, None, 4.0], - ... "timestamp": [ - ... datetime(2020, 1, 1), - ... datetime(2021, 1, 1), - ... datetime(2021, 1, 2), - ... datetime(2011, 1, 3), - ... ], - ... 
}) - >>> wide_df # Just so we can see the data we're working with: - shape: (4, 6) - ┌────────────┬────────┬────────┬─────────┬─────────┬─────────────────────┐ - │ patient_id ┆ A/code ┆ B/code ┆ A/value ┆ B/value ┆ timestamp │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ bool ┆ bool ┆ i64 ┆ f64 ┆ datetime[μs] │ - ╞════════════╪════════╪════════╪═════════╪═════════╪═════════════════════╡ - │ 1 ┆ true ┆ false ┆ 1 ┆ null ┆ 2020-01-01 00:00:00 │ - │ 1 ┆ true ┆ false ┆ 2 ┆ null ┆ 2021-01-01 00:00:00 │ - │ 1 ┆ false ┆ true ┆ 3 ┆ null ┆ 2021-01-02 00:00:00 │ - │ 2 ┆ false ┆ true ┆ null ┆ 4.0 ┆ 2011-01-03 00:00:00 │ - └────────────┴────────┴────────┴─────────┴─────────┴─────────────────────┘ - >>> _generate_summary(wide_df.lazy(), "2d", "code/count").collect() - shape: (4, 5) - ┌────────────┬─────────────────────┬─────────────────┬─────────────────┐ - │ patient_id ┆ timestamp ┆ 2d/A/code/count ┆ 2d/B/code/count │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ u32 ┆ u32 │ - ╞════════════╪═════════════════════╪═════════════════╪═════════════════╡ - │ 1 ┆ 2020-01-01 00:00:00 ┆ 1 ┆ 0 │ - │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 │ - │ 1 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 1 │ - │ 2 ┆ 2011-01-03 00:00:00 ┆ 0 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────┴─────────────────┘ - >>> _generate_summary(wide_df.lazy(), "full", "value/sum").collect() - shape: (4, 5) - ┌────────────┬─────────────────────┬──────────────────┬──────────────────┐ - │ patient_id ┆ timestamp ┆ full/A/value/sum ┆ full/B/value/sum │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ i64 ┆ f64 │ - ╞════════════╪═════════════════════╪══════════════════╪══════════════════╡ - │ 1 ┆ 2020-01-01 00:00:00 ┆ 1 ┆ null │ - │ 1 ┆ 2021-01-01 00:00:00 ┆ 3 ┆ null │ - │ 1 ┆ 2021-01-02 00:00:00 ┆ 6 ┆ null │ - │ 2 ┆ 2011-01-03 00:00:00 ┆ null ┆ 4.0 │ - └────────────┴─────────────────────┴──────────────────┴──────────────────┘ + >>> from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep + >>> feature_columns = ['A/value/sum', 'A/code/count', 'B/value/sum', 'B/code/count', + ... "C/value/sum", "C/code/count", "A/static/present"] + >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], + ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], + ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], + ... 'numerical_value': [1, 2, 2, 2, 3, 4]} + >>> df = pl.DataFrame(data).lazy() + >>> pivot_df = get_flat_ts_rep(feature_columns, df) + >>> pivot_df + patient_id timestamp A/value B/value C/value A/code B/code C/code + 0 1 2021-01-01 1 0 0 1 0 0 + 1 1 2021-01-01 2 0 0 1 0 0 + 2 1 2020-01-01 0 2 0 0 1 0 + 3 2 2021-01-04 0 2 0 0 1 0 + >>> _generate_summary(pivot_df, "full", "value/sum") + patient_id timestamp A/value/sum B/value/sum C/value/sum """ if agg not in VALID_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}. Valid options are: {VALID_AGGREGATIONS}") if window_size == "full": - out_df = df.group_by("patient_id", maintain_order=True).agg( + out_df = df.groupby("patient_id").agg( "timestamp", get_agg_pl_expr(window_size, agg), ) @@ -166,7 +132,7 @@ def _generate_summary(df: DF_T, window_size: str, agg: str) -> pl.LazyFrame: def generate_summary( - feature_columns: list[str], df: pl.LazyFrame, window_sizes: list[str], aggregations: list[str] + feature_columns: list[str], df: pd.DataFrame, window_sizes: list[str], aggregations: list[str] ) -> pl.LazyFrame: """Generate a summary of the data frame for given window sizes and aggregations. 
@@ -186,18 +152,18 @@ def generate_summary( pl.LazyFrame: A LazyFrame containing the summarized data with all required features present. Expect: - >>> from datetime import date - >>> wide_df = pl.DataFrame({"patient_id": [1, 1, 1, 2], - ... "A/code": [1, 1, 0, 0], - ... "B/code": [0, 0, 1, 1], - ... "A/value": [1, 2, 3, None], - ... "B/value": [None, None, None, 4.0], - ... "timestamp": [date(2021, 1, 1), date(2021, 1, 1),date(2020, 1, 3), date(2021, 1, 4)], - ... }).lazy() - >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] - >>> aggregations = ["code/count", "value/sum"] - >>> window_sizes = ["full", "1d"] - >>> generate_summary(feature_columns, wide_df.lazy(), window_sizes, aggregations).collect() + # >>> from datetime import date + # >>> wide_df = pd.DataFrame({"patient_id": [1, 1, 1, 2], + # ... "A/code": [1, 1, 0, 0], + # ... "B/code": [0, 0, 1, 1], + # ... "A/value": [1, 2, 3, None], + # ... "B/value": [None, None, None, 4.0], + # ... "timestamp": [date(2021, 1, 1), date(2021, 1, 1),date(2020, 1, 3), date(2021, 1, 4)], + # ... }).lazy() + # >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] + # >>> aggregations = ["code/count", "value/sum"] + # >>> window_sizes = ["full", "1d"] + # >>> generate_summary(feature_columns, wide_df.lazy(), window_sizes, aggregations).collect() """ df = df.sort(["patient_id", "timestamp"]) final_columns = [] diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index db354ec..1756b8e 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import polars as pl +from loguru import logger from scipy.sparse import csc_array from MEDS_tabular_automl.utils import DF_T @@ -21,38 +22,32 @@ def get_code_name(c): return ts_columns -def fill_missing_entries_with_nan(sparse_df, type): +def fill_missing_entries_with_nan(sparse_df, type, columns): # Fill missing entries with NaN - for col in sparse_df.columns: + for col in columns: sparse_df[col] = sparse_df[col].astype(pd.SparseDtype(type, fill_value=np.nan)) return sparse_df -def get_long_code_df(df, ts_columns): - column_to_int = {col: i for i, col in enumerate(ts_columns)} +def get_long_code_df(df, ts_columns, col_offset): + column_to_int = {col: i + col_offset for i, col in enumerate(ts_columns)} rows = range(len(df)) cols = df["code"].map(column_to_int) data = np.ones(len(df), dtype=np.bool_) - sparse_matrix = csc_array((data, (rows, cols)), shape=(len(df), len(ts_columns))) - long_code_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=ts_columns) - # long_code_df = fill_missing_entries_with_nan(long_code_df, bool) - return long_code_df + return data, (rows, cols) def get_long_value_df(df, ts_columns): column_to_int = {col: i for i, col in enumerate(ts_columns)} - value_rows = range(len(df)) - value_cols = df["code"].map(column_to_int) - value_data = df["numerical_value"] - value_sparse_matrix = csc_array((value_data, (value_rows, value_cols)), shape=(len(df), len(ts_columns))) - long_value_df = pd.DataFrame.sparse.from_spmatrix(value_sparse_matrix, columns=ts_columns) - # long_value_df = fill_missing_entries_with_nan(long_value_df, np.float64) - return long_value_df + rows = range(0, len(df)) + cols = df["code"].map(column_to_int) + data = df["numerical_value"] + return data, (rows, cols) def summarize_dynamic_measurements( ts_columns: list[str], - df: DF_T, + df: pd.DataFrame, ) -> pd.DataFrame: 
"""Summarize dynamic measurements for feature columns that are marked as 'dynamic'. @@ -69,38 +64,44 @@ def summarize_dynamic_measurements( ... 'code': ['A', 'A', 'B', 'B'], ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04'], ... 'numerical_value': [1, 2, 2, 2]} - >>> df = pl.DataFrame(data).lazy() + >>> df = pd.DataFrame(data) >>> ts_columns = ['A', 'B'] >>> long_df = summarize_dynamic_measurements(ts_columns, df) >>> long_df.head() patient_id timestamp A/value B/value A/code B/code - 0 1 2021-01-01 1.0 NaN True NaN - 1 1 2021-01-01 2.0 NaN True NaN - 2 1 2020-01-01 NaN 2.0 NaN True - 3 2 2021-01-04 NaN 2.0 NaN True + 0 1 2021-01-01 1 0 1 0 + 1 1 2021-01-01 2 0 1 0 + 2 1 2020-01-01 0 2 0 1 + 3 2 2021-01-04 0 2 0 1 >>> long_df.shape (4, 6) - >>> long_df = summarize_dynamic_measurements(ts_columns, df.filter(pl.col("code") == "A")) + >>> long_df = summarize_dynamic_measurements(ts_columns, df[df.code == "A"]) >>> long_df patient_id timestamp A/value B/value A/code B/code - 0 1 2021-01-01 1.0 NaN True NaN - 1 1 2021-01-01 2.0 NaN True NaN + 0 1 2021-01-01 1 0 1 0 + 1 1 2021-01-01 2 0 1 0 """ - df = df.collect().to_pandas() + logger.info("create code and value") id_cols = ["patient_id", "timestamp"] - code_df = df.drop(columns=id_cols + ["numerical_value"]) - long_code_df = get_long_code_df(code_df, ts_columns) - value_df = df.drop(columns=id_cols) - long_value_df = get_long_value_df(value_df, ts_columns) - long_df = pd.concat( - [ - df[["patient_id", "timestamp"]], - long_value_df.rename(columns=lambda c: f"{c}/value"), - long_code_df.rename(columns=lambda c: f"{c}/code"), - ], - axis=1, + value_data, (value_rows, value_cols) = get_long_value_df(value_df, ts_columns) + + code_df = df.drop(columns=id_cols + ["numerical_value"]) + code_data, (code_rows, code_cols) = get_long_code_df(code_df, ts_columns, col_offset=len(ts_columns)) + + logger.info("merge") + merge_data = np.concatenate([value_data, code_data]) + merge_rows = np.concatenate([value_rows, code_rows]) + merge_cols = np.concatenate([value_cols, code_cols]) + merge_columns = [f"{c}/value" for c in ts_columns] + [f"{c}/code" for c in ts_columns] + long_df = pd.DataFrame.sparse.from_spmatrix( + csc_array((merge_data, (merge_rows, merge_cols)), shape=(len(value_df), len(merge_columns))), + columns=merge_columns, ) + logger.info("add id columns") + long_df["timestamp"] = df["timestamp"] + long_df["patient_id"] = df["patient_id"] + long_df = long_df[id_cols + merge_columns] return long_df @@ -136,14 +137,13 @@ def get_flat_ts_rep( >>> pivot_df = get_flat_ts_rep(feature_columns, df) >>> pivot_df patient_id timestamp A/value B/value C/value A/code B/code C/code - 0 1 2021-01-01 1.0 NaN NaN True NaN NaN - 1 1 2021-01-01 2.0 NaN NaN True NaN NaN - 2 1 2020-01-01 NaN 2.0 NaN NaN True NaN - 3 2 2021-01-04 NaN 2.0 NaN NaN True NaN + 0 1 2021-01-01 1 0 0 1 0 0 + 1 1 2021-01-01 2 0 0 1 0 0 + 2 1 2020-01-01 0 2 0 0 1 0 + 3 2 2021-01-04 0 2 0 0 1 0 """ - + logger.info("load") ts_columns = get_ts_columns(feature_columns) - ts_shard_df = shard_df.drop_nulls( - subset=["timestamp", "code"] - ) # filter(pl.col("timestamp", "").is_not_null()) - return summarize_dynamic_measurements(ts_columns, ts_shard_df) + ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) + pd_df = ts_shard_df.collect().to_pandas() + return summarize_dynamic_measurements(ts_columns, pd_df) From c28e6b2c05c2899e1b06c3776a2eed74695144a0 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Wed, 29 May 2024 14:45:39 +0000 Subject: [PATCH 022/106] got iterator 
working with csr_matrices for X and numpy arrays for y --- xgboost_sweep.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index 9c0bca8..b69a49e 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -61,7 +61,7 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: aggs_set = set(self.cfg.aggs) if self.cfg.min_code_inclusion_frequency is not None: dataset_freuqency = pl.scan_parquet( - self.data_path / "code_frequencies.parquet" # TODO: make sure this is the right path + self.data_path / "code_frequencies.json" # TODO: make sure this is the right path ) min_frequency_set = set( dataset_freuqency.filter( @@ -90,7 +90,7 @@ def _get_static_shards(self) -> dict: ) return static_shards - def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, sp.csc_matrix]: + def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: """ Make X and y as scipy sparse arrays for XGBoost. @@ -98,7 +98,7 @@ def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, sp.csc_matri - df (pl.DataFrame): Data frame to sparsify. Returns: - - tuple[sp.csc_matrix, sp.csc_matrix]: Tuple of feature data and labels. + - tuple[scipy.sparse.csr_matrix, numpy.ndarray]: Tuple of feature data and labels. """ labels = df.select( @@ -123,12 +123,12 @@ def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, sp.csc_matri col_csc = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith(f"{window}/")]).collect().to_numpy()) X = sp.hstack([X, col_csc]) - y = sp.csc_matrix(labels.collect().to_numpy()) + y = labels.collect().to_numpy() ### TODO: fix the need to convert to array here!!! - return X.toarray(), y.toarray() + return X.tocsr(), y - def _load_shard(self, idx: int) -> tuple[sp.csc_matrix, sp.csc_matrix]: + def _load_shard(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """ Load a specific shard of data from disk and concatenate with static data. @@ -136,8 +136,8 @@ def _load_shard(self, idx: int) -> tuple[sp.csc_matrix, sp.csc_matrix]: - idx (int): Index of the shard to load. Returns: - - X (scipy.sparse.csc_matrix): Feature data frame. - - y (scipy.sparse.csc_matrix): Labels. + - X (scipy.sparse.csr_matrix): Feature data frame. + - y (numpy.ndarray): Labels. """ @@ -173,9 +173,9 @@ def _load_shard(self, idx: int) -> tuple[sp.csc_matrix, sp.csc_matrix]: ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks task_df = pl.scan_parquet(self.data_path / "tasks.parquet") - task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.keys()}) # TODO: filtering of the tasks?? --> need to know more about tasks + task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.keys() if col not in ["patient_id", "timestamp"]}) # TODO: filtering of the tasks?? --> need to know more about tasks ### TODO: Change to join_on with left merge orig df on left, labels on right join on subject_id and timestamp - df = df.join(task_df, on=["subject_id", "timestamp"], how="left") + df = df.join(task_df, on=["patient_id", "timestamp"], how="left") ### TODO: Figure out best way to export this to dmatrix @@ -212,7 +212,7 @@ def reset(self): """ self._it = 0 - def collect_in_memory(self) -> tuple[np.ndarray, np.ndarray]: + def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: """ Collect the data in memory. 
@@ -226,7 +226,8 @@ def collect_in_memory(self) -> tuple[np.ndarray, np.ndarray]: X_, y_ = self._load_shard(i) X.append(X_) y.append(y_) - X = np.concatenate(X, axis=0) + + X = sp.vstack(X) y = np.concatenate(y, axis=0) return X, y From 6f3b1ec7c317572fd4b420ddf350d756ff6ae1e0 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 29 May 2024 18:28:33 +0000 Subject: [PATCH 023/106] added support for sparse aggregations --- .../generate_summarized_reps.py | 281 ++++++++++++------ 1 file changed, 191 insertions(+), 90 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 7f2f661..94849ed 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,8 +1,10 @@ from collections.abc import Callable import pandas as pd + +# pd.set_option("compute.use_numba", True) import polars as pl -import polars.selectors as cs +from scipy.sparse import coo_matrix CODE_AGGREGATIONS = [ "code/count", @@ -25,61 +27,136 @@ def time_aggd_col_alias_fntr(window_size: str, agg: str) -> Callable[[str], str] raise ValueError("Aggregation type 'agg' must be provided") def f(c: str) -> str: - return "/".join([window_size] + c.split("/") + [agg]) + if c in ["patient_id", "timestamp"]: + return c + else: + return "/".join([window_size] + c.split("/") + [agg]) return f -def get_agg_pl_expr(window_size: str, agg: str): - code_cols = cs.ends_with("code") - value_cols = cs.ends_with("value") +# def sparse_groupby_sum(df): +# id_cols = ["patient_id", "timestamp"] +# ohe = OneHotEncoder(sparse_output=True) +# # Get all other columns we are not grouping by +# other_columns = [col for col in df.columns if col not in id_cols] +# # Get a 607875 x nDistinctIDs matrix in sparse row format with exactly +# # 1 nonzero entry per row +# onehot = ohe.fit_transform(df[id_cols].values.reshape(-1, 1)) +# # Transpose it. then convert from sparse column back to sparse row, as +# # dot products of two sparse row matrices are faster than sparse col with +# # sparse row +# onehot = onehot.T.tocsr() +# # Dot the transposed matrix with the other columns of the df, converted to sparse row +# # format, then convert the resulting matrix back into a sparse +# # dataframe with the same column names +# out = pd.DataFrame.sparse.from_spmatrix( +# onehot.dot(df[other_columns].sparse.to_coo().tocsr()), +# columns=other_columns) +# # Add in the groupby column to this resulting dataframe with the proper class labels +# out[groupby] = ohe.categories_[0] +# # This final groupby sum simply ensures the result is in the format you would expect +# # for a regular pandas groupby and sum, but you can just return out if this is going to be +# # a performance penalty. Note in that case that the groupby column may have changed index +# return out.groupby(groupby).sum() + + +def sparse_rolling(df, timedelta, agg): + """Iterates through rolling windows while maintaining sparsity. + + Example: + + >>> df = pd.DataFrame({'patient_id': {0: 1, 1: 1, 2: 1}, + ... 'timestamp': {0: pd.Timestamp('2021-01-01 00:00:00'), + ... 1: pd.Timestamp('2021-01-01 00:00:00'), 2: pd.Timestamp('2020-01-01 00:00:00')}, + ... 
'A/code': {0: 1, 1: 1, 2: 0}, 'B/code': {0: 0, 1: 0, 2: 1}, 'C/code': {0: 0, 1: 0, 2: 0}}) + >>> for col in ["A/code", "B/code", "C/code"]: df[col] = pd.arrays.SparseArray(df[col]) + >>> sparse_rolling(df, pd.Timedelta("1d"), "sum").dtypes + A/code Sparse[int64, 0] + B/code Sparse[int64, 0] + C/code Sparse[int64, 0] + timestamp datetime64[ns] + dtype: object + """ + df = df.drop(columns="patient_id") + out_dfs = [] + timestamps = [] + for each in df.rolling(on="timestamp", window=timedelta): + timestamps.append(each.index[0]) + out_dfs.append(each.agg(agg)) + df = pd.concat(out_dfs, axis=1).T + df["timestamp"] = timestamps + return df + + +def compute_agg(df, window_size: str, agg: str): + """Applies aggreagtion to dataframe. + + Dataframe is expected to only have the relevant columns for aggregating + It should have the patient_id and timestamp columns, and then only code columns + if agg is a code aggregation or only value columns if it is a value aggreagation. + + Example: + >>> from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep + >>> feature_columns = ['A/value/sum', 'A/code/count', 'B/value/sum', 'B/code/count', + ... "C/value/sum", "C/code/count", "A/static/present"] + >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], + ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], + ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], + ... 'numerical_value': [1, 2, 2, 2, 3, 4]} + >>> df = pl.DataFrame(data).lazy() + >>> df = get_flat_ts_rep(feature_columns, df) + >>> df + patient_id timestamp A/value B/value C/value A/code B/code C/code + 0 1 2021-01-01 1 0 0 1 0 0 + 1 1 2021-01-01 2 0 0 1 0 0 + 2 1 2020-01-01 0 2 0 0 1 0 + 3 2 2021-01-04 0 2 0 0 1 0 + >>> df['timestamp'] = pd.to_datetime(df['timestamp']) + >>> df.dtypes + patient_id int64 + timestamp datetime64[ns] + A/value Sparse[int64, 0] + B/value Sparse[int64, 0] + C/value Sparse[int64, 0] + A/code Sparse[int64, 0] + B/code Sparse[int64, 0] + C/code Sparse[int64, 0] + dtype: object + >>> output = compute_agg(df[['patient_id', 'timestamp', 'A/code', 'B/code', 'C/code']], + ... "1d", "code/count") + >>> output + 1d/A/code/count 1d/B/code/count 1d/C/code/count timestamp patient_id + 0 1 0 0 2021-01-01 1 + 1 2 0 0 2021-01-01 1 + 2 0 1 0 2020-01-01 1 + 0 0 1 0 2021-01-04 2 + >>> output.dtypes + 1d/A/code/count Sparse[int64, 0] + 1d/B/code/count Sparse[int64, 0] + 1d/C/code/count Sparse[int64, 0] + timestamp datetime64[ns] + patient_id int64 + dtype: object + """ if window_size == "full": - match agg: - case "code/count": - return code_cols.cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "count")) - case "value/count": - return ( - value_cols.is_not_null() - .cumsum() - .map_alias(time_aggd_col_alias_fntr(window_size, "count")) - ) - case "value/sum": - return value_cols.cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "sum")) - case "value/sum_sqd": - return (value_cols**2).cumsum().map_alias(time_aggd_col_alias_fntr(window_size, "sum_sqd")) - case "value/min": - value_cols.cummin().map_alias(time_aggd_col_alias_fntr(window_size, "min")) - case "value/max": - value_cols.cummax().map_alias(time_aggd_col_alias_fntr(window_size, "max")) - case _: - raise ValueError( - f"Invalid aggregation '{agg}' provided for window_size '{window_size}'." 
- f" Please choose from the valid options: {VALID_AGGREGATIONS}" - ) + timedelta = df["timestamp"].max() - df["timestamp"].min() + pd.Timedelta(days=1) else: - match agg: - case "code/count": - return code_cols.sum().map_alias(time_aggd_col_alias_fntr(window_size, "count")) - case "value/count": - return ( - value_cols.is_not_null().sum().map_alias(time_aggd_col_alias_fntr(window_size, "count")) - ) - case "value/has_values_count": - return ( - (value_cols.is_not_null() & value_cols.is_not_nan()) - .sum() - .map_alias(time_aggd_col_alias_fntr(window_size, "has_values_count")) - ) - case "value/sum": - return value_cols.sum().map_alias(time_aggd_col_alias_fntr(window_size, "sum")) - case "value/sum_sqd": - return (value_cols**2).sum().map_alias(time_aggd_col_alias_fntr(window_size, "sum_sqd")) - case "value/min": - value_cols.min().map_alias(time_aggd_col_alias_fntr(window_size, "min")) - case "value/max": - value_cols.max().map_alias(time_aggd_col_alias_fntr(window_size, "max")) - case _: - raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") + timedelta = pd.Timedelta(window_size) + group = df.groupby("patient_id") + match agg: + case "code/count" | "value/sum": + agg = "sum" + out_dfs = [] + for patient_id, subset_df in group: + df = sparse_rolling(subset_df, timedelta, agg) + df["patient_id"] = patient_id + out_dfs.append(df) + out_df = pd.concat(out_dfs, axis=0) + return out_df.rename(columns=time_aggd_col_alias_fntr(window_size, "count")) + + case _: + raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") def _generate_summary(df: pd.DataFrame, window_size: str, agg: str) -> pl.LazyFrame: @@ -103,31 +180,31 @@ def _generate_summary(df: pd.DataFrame, window_size: str, agg: str) -> pl.LazyFr ... 'numerical_value': [1, 2, 2, 2, 3, 4]} >>> df = pl.DataFrame(data).lazy() >>> pivot_df = get_flat_ts_rep(feature_columns, df) + >>> pivot_df['timestamp'] = pd.to_datetime(pivot_df['timestamp']) >>> pivot_df - patient_id timestamp A/value B/value C/value A/code B/code C/code - 0 1 2021-01-01 1 0 0 1 0 0 - 1 1 2021-01-01 2 0 0 1 0 0 - 2 1 2020-01-01 0 2 0 0 1 0 - 3 2 2021-01-04 0 2 0 0 1 0 + patient_id timestamp A/value B/value C/value A/code B/code C/code + 0 1 2021-01-01 1 0 0 1 0 0 + 1 1 2021-01-01 2 0 0 1 0 0 + 2 1 2020-01-01 0 2 0 0 1 0 + 3 2 2021-01-04 0 2 0 0 1 0 >>> _generate_summary(pivot_df, "full", "value/sum") - patient_id timestamp A/value/sum B/value/sum C/value/sum + full/A/value/count full/B/value/count full/C/value/count timestamp patient_id + 0 1 0 0 2021-01-01 1 + 1 3 0 0 2021-01-01 1 + 2 3 2 0 2021-01-01 1 + 0 0 2 0 2021-01-04 2 """ if agg not in VALID_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}. 
Valid options are: {VALID_AGGREGATIONS}") - if window_size == "full": - out_df = df.groupby("patient_id").agg( - "timestamp", - get_agg_pl_expr(window_size, agg), - ) - out_df = out_df.explode(*[c for c in out_df.columns if c != "patient_id"]) + code_cols = [c for c in df.columns if c.endswith("code")] + value_cols = [c for c in df.columns if c.endswith("value")] + if agg in CODE_AGGREGATIONS: + cols = code_cols else: - out_df = df.rolling( - index_column="timestamp", - by="patient_id", - period=window_size, - ).agg( - get_agg_pl_expr(window_size, agg), - ) + cols = value_cols + id_cols = ["patient_id", "timestamp"] + df = df.loc[:, id_cols + cols] + out_df = compute_agg(df, window_size, agg) return out_df @@ -152,20 +229,41 @@ def generate_summary( pl.LazyFrame: A LazyFrame containing the summarized data with all required features present. Expect: - # >>> from datetime import date - # >>> wide_df = pd.DataFrame({"patient_id": [1, 1, 1, 2], - # ... "A/code": [1, 1, 0, 0], - # ... "B/code": [0, 0, 1, 1], - # ... "A/value": [1, 2, 3, None], - # ... "B/value": [None, None, None, 4.0], - # ... "timestamp": [date(2021, 1, 1), date(2021, 1, 1),date(2020, 1, 3), date(2021, 1, 4)], - # ... }).lazy() - # >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] - # >>> aggregations = ["code/count", "value/sum"] - # >>> window_sizes = ["full", "1d"] - # >>> generate_summary(feature_columns, wide_df.lazy(), window_sizes, aggregations).collect() + >>> from datetime import date + >>> wide_df = pd.DataFrame({"patient_id": [1, 1, 1, 2], + ... "A/code": [1, 1, 0, 0], + ... "B/code": [0, 0, 1, 1], + ... "A/value": [1, 2, 3, None], + ... "B/value": [None, None, None, 4.0], + ... "timestamp": [date(2021, 1, 1), date(2021, 1, 1),date(2020, 1, 3), date(2021, 1, 4)], + ... }) + >>> wide_df['timestamp'] = pd.to_datetime(wide_df['timestamp']) + >>> for col in ["A/code", "B/code", "A/value", "B/value"]: + ... wide_df[col] = pd.arrays.SparseArray(wide_df[col]) + >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] + >>> aggregations = ["code/count", "value/sum"] + >>> window_sizes = ["full", "1d"] + >>> generate_summary(feature_columns, wide_df, window_sizes, aggregations)[ + ... 
["1d/A/code/count", "full/B/code/count", "full/B/value/sum"]] + 1d/A/code/count full/B/code/count full/B/value/sum + 0 NaN 1.0 0 + 1 NaN 1.0 0 + 2 NaN 1.0 0 + 0 NaN 1.0 0 + 0 NaN NaN 0 + 1 NaN NaN 0 + 2 NaN NaN 0 + 0 NaN NaN 0 + 0 0 NaN 0 + 1 1.0 NaN 0 + 2 2.0 NaN 0 + 0 0 NaN 0 + 0 NaN NaN 0 + 1 NaN NaN 0 + 2 NaN NaN 0 + 0 NaN NaN 0 """ - df = df.sort(["patient_id", "timestamp"]) + df = df.sort_values(["patient_id", "timestamp"]) final_columns = [] out_dfs = [] # Generate summaries for each window size and aggregation @@ -177,19 +275,22 @@ def generate_summary( ) # only iterate through code_types that exist in the dataframe columns if any([c.endswith(code_type) for c in df.columns]): - timestamp_dtype = df.dtypes[df.columns.index("timestamp")] - assert timestamp_dtype in [ - pl.Datetime, - pl.Date, - ], f"timestamp must be of type Date, but is {timestamp_dtype}" + # timestamp_dtype = df.dtypes[df.columns.index("timestamp")] + # assert timestamp_dtype in [ + # pl.Datetime, + # pl.Date, + # ], f"timestamp must be of type Date, but is {timestamp_dtype}" out_df = _generate_summary(df, window_size, agg) out_dfs.append(out_df) final_columns = sorted(final_columns) # Combine all dataframes using successive joins - result_df = pl.concat(out_dfs, how="align") + result_df = pd.concat(out_dfs) # Add in missing feature columns with default values missing_columns = [col for col in final_columns if col not in result_df.columns] - result_df = result_df.with_columns([pl.lit(None).alias(col) for col in missing_columns]) - result_df = result_df.select(pl.col(*["patient_id", "timestamp"], *final_columns)) + + result_df[missing_columns] = pd.DataFrame.sparse.from_spmatrix( + coo_matrix((result_df.shape[0], len(missing_columns))) + ) + result_df = result_df[["patient_id", "timestamp"] + final_columns] return result_df From 675360983948a6ba60897b70d3b3c9a0f2da4b25 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 29 May 2024 18:52:39 +0000 Subject: [PATCH 024/106] passing unit tests for sparse aggregations (only code/count and value/sum implemented at the moment) --- scripts/summarize_over_windows.py | 1 + .../generate_summarized_reps.py | 9 +++-- tests/test_tabularize.py | 34 +++++++++---------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index 42f7b7c..ab55f94 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -71,6 +71,7 @@ def summarize_ts_data_over_windows( cfg.window_sizes, cfg.aggs, ) + assert summary_df.shape[1] > 2, "No data found in the summarized dataframe" logger.info("Writing pivot file") write_df(summary_df, pivot_fp, do_overwrite=cfg.do_overwrite) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 94849ed..c6feff1 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -6,6 +6,8 @@ import polars as pl from scipy.sparse import coo_matrix +from MEDS_tabular_automl.generate_ts_features import get_ts_columns + CODE_AGGREGATIONS = [ "code/count", ] @@ -240,7 +242,7 @@ def generate_summary( >>> wide_df['timestamp'] = pd.to_datetime(wide_df['timestamp']) >>> for col in ["A/code", "B/code", "A/value", "B/value"]: ... 
wide_df[col] = pd.arrays.SparseArray(wide_df[col]) - >>> feature_columns = ["A/code", "B/code", "A/value", "B/value"] + >>> feature_columns = ["A/code/count", "B/code/count", "A/value/sum", "B/value/sum"] >>> aggregations = ["code/count", "value/sum"] >>> window_sizes = ["full", "1d"] >>> generate_summary(feature_columns, wide_df, window_sizes, aggregations)[ @@ -264,6 +266,9 @@ def generate_summary( 0 NaN NaN 0 """ df = df.sort_values(["patient_id", "timestamp"]) + assert len(feature_columns), "feature_columns must be a non-empty list" + ts_columns = get_ts_columns(feature_columns) + code_value_ts_columns = [f"{c}/code" for c in ts_columns] + [f"{c}/value" for c in ts_columns] final_columns = [] out_dfs = [] # Generate summaries for each window size and aggregation @@ -271,7 +276,7 @@ def generate_summary( for agg in aggregations: code_type, agg_name = agg.split("/") final_columns.extend( - [f"{window_size}/{c}/{agg_name}" for c in feature_columns if c.endswith(code_type)] + [f"{window_size}/{c}/{agg_name}" for c in code_value_ts_columns if c.endswith(code_type)] ) # only iterate through code_types that exist in the dataframe columns if any([c.endswith(code_type) for c in df.columns]): diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 92c54da..1045ea6 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -12,6 +12,7 @@ from loguru import logger from scripts.identify_columns import store_columns +from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data from scripts.tabularize_ts import tabularize_ts_data @@ -126,9 +127,10 @@ def test_tabularize(): "tabularized_data_dir": str(tabularized_data_dir.resolve()), "min_code_inclusion_frequency": 1, "window_sizes": ["30d", "365d", "full"], + "aggs": ["code/count", "value/sum"], "codes": None, "n_patients_per_sub_shard": 2, - "do_overwrite": False, + "do_overwrite": True, "do_update": True, "seed": 1, "hydra.verbose": True, @@ -156,19 +158,17 @@ def test_tabularize(): ] assert set(actual_files) == set(expected_files) - # summarize_ts_data_over_windows(cfg) - # # confirm summary files exist: - # actual_files = [ - # (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("summary/*/*.parquet")) - # ] - # expected_files = [ - # ("train", "1"), - # ("train", "0"), - # ("held_out", "0"), - # ("tuning", "0"), - # ] - # assert set(actual_files) == set(expected_files) - # for f in list(tabularized_data_dir.glob("summary/*/*.parquet")): - # df = pl.read_parquet(f) - # assert df.shape[0] > 0 - # assert df.columns == ["hi"] + summarize_ts_data_over_windows(cfg) + # confirm summary files exist: + actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.parquet"))] + expected_files = [ + ("train", "1"), + ("train", "0"), + ("held_out", "0"), + ("tuning", "0"), + ] + assert set(actual_files) == set(expected_files) + for f in list(tabularized_data_dir.glob("summary/*/*.parquet")): + df = pl.read_parquet(f) + assert df.shape[0] > 0 + assert df.columns == ["hi"] From f1256009f5f23d2f0f9cc77bf04034dc1df44b3c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 29 May 2024 20:56:07 +0000 Subject: [PATCH 025/106] added significant speed improvements for rolling window aggregations --- .../generate_summarized_reps.py | 61 +++++++++++++------ 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py 
index c6feff1..0bd4a82 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,10 +1,13 @@ from collections.abc import Callable import pandas as pd +from scipy.sparse import vstack -# pd.set_option("compute.use_numba", True) +pd.set_option("compute.use_numba", True) import polars as pl -from scipy.sparse import coo_matrix +from loguru import logger +from scipy.sparse import coo_matrix, csr_matrix +from tqdm import tqdm from MEDS_tabular_automl.generate_ts_features import get_ts_columns @@ -80,15 +83,22 @@ def sparse_rolling(df, timedelta, agg): timestamp datetime64[ns] dtype: object """ - df = df.drop(columns="patient_id") - out_dfs = [] + df = df.drop(columns="patient_id").reset_index(drop=True).reset_index() timestamps = [] - for each in df.rolling(on="timestamp", window=timedelta): - timestamps.append(each.index[0]) - out_dfs.append(each.agg(agg)) - df = pd.concat(out_dfs, axis=1).T - df["timestamp"] = timestamps - return df + logger.info("rolling for patient_id") + sparse_matrix = csr_matrix(df[df.columns[2:]].sparse.to_coo()) + out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) + for each in tqdm(df[["index", "timestamp"]].rolling(on="timestamp", window=timedelta), total=len(df)): + subset_matrix = sparse_matrix[each["index"]] + + # TODO this is where we would apply the aggregation + timestamps.append(each.index.max()) + agg_subset_matrix = subset_matrix.sum(axis=0) + out_sparse_matrix = vstack([out_sparse_matrix, agg_subset_matrix]) + out_df = pd.DataFrame({"timestamp": timestamps}) + out_df = pd.concat([out_df, pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1) + out_df.columns = df.columns[1:] + return out_df def compute_agg(df, window_size: str, agg: str): @@ -145,21 +155,36 @@ def compute_agg(df, window_size: str, agg: str): timedelta = df["timestamp"].max() - df["timestamp"].min() + pd.Timedelta(days=1) else: timedelta = pd.Timedelta(window_size) - group = df.groupby("patient_id") + logger.info("grouping by patient_id") + group = dict(list(df[["patient_id", "timestamp"]].groupby("patient_id"))) + sparse_matrix = df[df.columns[2:]].sparse.to_coo() + sparse_matrix = csr_matrix(sparse_matrix) + logger.info("done grouping") match agg: case "code/count" | "value/sum": agg = "sum" out_dfs = [] - for patient_id, subset_df in group: - df = sparse_rolling(subset_df, timedelta, agg) - df["patient_id"] = patient_id - out_dfs.append(df) + for patient_id, subset_df in group.items(): + logger.info(f"rolling for patient_id {patient_id}") + subset_sparse_matrix = sparse_matrix[subset_df.index] + sparse_df = pd.DataFrame.sparse.from_spmatrix(subset_sparse_matrix) + sparse_df.index = subset_df.index + patient_df = pd.concat([subset_df[["patient_id", "timestamp"]], sparse_df], axis=1) + patient_df.columns = df.columns + assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" + patient_df = sparse_rolling(patient_df, timedelta, agg) + patient_df["patient_id"] = patient_id + out_dfs.append(patient_df) out_df = pd.concat(out_dfs, axis=0) - return out_df.rename(columns=time_aggd_col_alias_fntr(window_size, "count")) + out_df.rename(columns=time_aggd_col_alias_fntr(window_size, "count")) case _: raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") + id_cols = ["patient_id", "timestamp"] + out_df = out_df.loc[:, id_cols + list(df.columns[2:])] + return out_df + def _generate_summary(df: pd.DataFrame, window_size: str, agg: str) -> 
pl.LazyFrame: """Generate a summary of the data frame for a given window size and aggregation. @@ -265,7 +290,8 @@ def generate_summary( 2 NaN NaN 0 0 NaN NaN 0 """ - df = df.sort_values(["patient_id", "timestamp"]) + logger.info("Sorting sparse dataframe by patient_id and timestamp") + df = df.sort_values(["patient_id", "timestamp"]).reset_index(drop=True) assert len(feature_columns), "feature_columns must be a non-empty list" ts_columns = get_ts_columns(feature_columns) code_value_ts_columns = [f"{c}/code" for c in ts_columns] + [f"{c}/value" for c in ts_columns] @@ -280,6 +306,7 @@ def generate_summary( ) # only iterate through code_types that exist in the dataframe columns if any([c.endswith(code_type) for c in df.columns]): + logger.info(f"Generating aggregation {agg} for window_size {window_size}") # timestamp_dtype = df.dtypes[df.columns.index("timestamp")] # assert timestamp_dtype in [ # pl.Datetime, From 2acc3bcffdd2fcc98cc76a84981007d8f8e3eae7 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 29 May 2024 21:22:22 +0000 Subject: [PATCH 026/106] improved speed, by removing conversion from sparse scipy matrix to sparse pandas array for each patient, now we just use sparse scipy matrices --- .../generate_summarized_reps.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 0bd4a82..e8d3e42 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -66,7 +66,7 @@ def f(c: str) -> str: # return out.groupby(groupby).sum() -def sparse_rolling(df, timedelta, agg): +def sparse_rolling(df, sparse_matrix, timedelta, agg): """Iterates through rolling windows while maintaining sparsity. 
Example: @@ -83,22 +83,22 @@ def sparse_rolling(df, timedelta, agg): timestamp datetime64[ns] dtype: object """ + patient_id = df.iloc[0].patient_id df = df.drop(columns="patient_id").reset_index(drop=True).reset_index() timestamps = [] logger.info("rolling for patient_id") - sparse_matrix = csr_matrix(df[df.columns[2:]].sparse.to_coo()) out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) - for each in tqdm(df[["index", "timestamp"]].rolling(on="timestamp", window=timedelta), total=len(df)): + for each in df[["index", "timestamp"]].rolling(on="timestamp", window=timedelta): subset_matrix = sparse_matrix[each["index"]] # TODO this is where we would apply the aggregation timestamps.append(each.index.max()) agg_subset_matrix = subset_matrix.sum(axis=0) out_sparse_matrix = vstack([out_sparse_matrix, agg_subset_matrix]) - out_df = pd.DataFrame({"timestamp": timestamps}) - out_df = pd.concat([out_df, pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1) - out_df.columns = df.columns[1:] - return out_df + out_df = pd.DataFrame({"patient_id": [patient_id] * len(timestamps), "timestamp": timestamps}) + # out_df = pd.concat([out_df, pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1) + # out_df.columns = df.columns[1:] + return out_df, out_sparse_matrix def compute_agg(df, window_size: str, agg: str): @@ -160,22 +160,29 @@ def compute_agg(df, window_size: str, agg: str): sparse_matrix = df[df.columns[2:]].sparse.to_coo() sparse_matrix = csr_matrix(sparse_matrix) logger.info("done grouping") + out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) match agg: case "code/count" | "value/sum": agg = "sum" out_dfs = [] - for patient_id, subset_df in group.items(): - logger.info(f"rolling for patient_id {patient_id}") + for patient_id, subset_df in tqdm(group.items(), total=len(group)): + logger.info("sparse rolling setup") subset_sparse_matrix = sparse_matrix[subset_df.index] - sparse_df = pd.DataFrame.sparse.from_spmatrix(subset_sparse_matrix) - sparse_df.index = subset_df.index - patient_df = pd.concat([subset_df[["patient_id", "timestamp"]], sparse_df], axis=1) - patient_df.columns = df.columns + patient_df = subset_df[ + ["patient_id", "timestamp"] + ] # pd.concat([subset_df[["patient_id", "timestamp"]], sparse_df], axis=1) assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" - patient_df = sparse_rolling(patient_df, timedelta, agg) - patient_df["patient_id"] = patient_id + logger.info("sparse rolling start") + patient_df, out_sparse = sparse_rolling(patient_df, subset_sparse_matrix, timedelta, agg) + logger.info("sparse rolling complete") + # patient_df["patient_id"] = patient_id out_dfs.append(patient_df) + out_sparse_matrix = vstack([out_sparse_matrix, out_sparse]) out_df = pd.concat(out_dfs, axis=0) + out_df = pd.concat( + [out_df.reset_index(drop=True), pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1 + ) + out_df.columns = df.columns out_df.rename(columns=time_aggd_col_alias_fntr(window_size, "count")) case _: From eec05e2cd93b338ce25b3eac078c8a1b5ed10607 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Thu, 30 May 2024 06:23:58 +0000 Subject: [PATCH 027/106] takes about an hour to run through a shard. The speed gain is from merging rows that occur at the same time based on the current aggregation strategy. For example if the aggregation is sum, we sum up all rows on the same date, or if the aggregation is count we count up all rows on the same date. 
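A minimal sketch of the same-timestamp merge described above, assuming only pandas and scipy; `collapse_same_timestamp` is a hypothetical helper name for illustration, not the function added in this patch:

```python
# Hypothetical illustration: collapse rows that share a timestamp before any
# rolling-window work, so each patient's rolling loop sees one row per date.
import pandas as pd
from scipy.sparse import csr_matrix, vstack


def collapse_same_timestamp(index_df: pd.DataFrame, matrix: csr_matrix, agg: str = "sum"):
    """`index_df` holds ['patient_id', 'timestamp'] aligned row-for-row with `matrix`.

    Assumes all rows belong to a single patient, as in a per-patient loop.
    """
    rows_per_ts = index_df.groupby("timestamp").indices  # timestamp -> row positions
    merged, timestamps = [], []
    for ts, rows in rows_per_ts.items():
        block = matrix[rows]
        if agg == "sum":  # value/sum and code/count both reduce to a column-wise sum of merged rows
            merged.append(csr_matrix(block.sum(axis=0)))
        elif agg == "count":  # number of non-zero entries per column
            merged.append(csr_matrix(block.getnnz(axis=0)))
        else:
            raise ValueError(f"Unsupported aggregation: {agg}")
        timestamps.append(ts)
    out_index = pd.DataFrame(
        {"patient_id": index_df["patient_id"].iloc[0], "timestamp": timestamps}
    )
    return out_index, vstack(merged)
```

Collapsing duplicates first shrinks the work from one rolling window per raw event to one per distinct date, which is where the reported speed-up comes from.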
--- .../generate_summarized_reps.py | 91 ++++++++++++------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index e8d3e42..092ef6d 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -40,30 +40,59 @@ def f(c: str) -> str: return f -# def sparse_groupby_sum(df): -# id_cols = ["patient_id", "timestamp"] -# ohe = OneHotEncoder(sparse_output=True) -# # Get all other columns we are not grouping by -# other_columns = [col for col in df.columns if col not in id_cols] -# # Get a 607875 x nDistinctIDs matrix in sparse row format with exactly -# # 1 nonzero entry per row -# onehot = ohe.fit_transform(df[id_cols].values.reshape(-1, 1)) -# # Transpose it. then convert from sparse column back to sparse row, as -# # dot products of two sparse row matrices are faster than sparse col with -# # sparse row -# onehot = onehot.T.tocsr() -# # Dot the transposed matrix with the other columns of the df, converted to sparse row -# # format, then convert the resulting matrix back into a sparse -# # dataframe with the same column names -# out = pd.DataFrame.sparse.from_spmatrix( -# onehot.dot(df[other_columns].sparse.to_coo().tocsr()), -# columns=other_columns) -# # Add in the groupby column to this resulting dataframe with the proper class labels -# out[groupby] = ohe.categories_[0] -# # This final groupby sum simply ensures the result is in the format you would expect -# # for a regular pandas groupby and sum, but you can just return out if this is going to be -# # a performance penalty. Note in that case that the groupby column may have changed index -# return out.groupby(groupby).sum() +def sparse_aggregate(sparse_matrix, agg): + if agg == "sum": + merged_matrix = sparse_matrix.sum(axis=0) + elif agg == "min": + merged_matrix = sparse_matrix.min(axis=0) + elif agg == "max": + merged_matrix = sparse_matrix.max(axis=0) + elif agg == "sum_sqd": + merged_matrix = sparse_matrix.power(2).sum(axis=0) + elif agg == "count": + merged_matrix = sparse_matrix.getnnz(axis=0) + else: + raise ValueError(f"Aggregation method '{agg}' not implemented.") + return csr_matrix(merged_matrix) + + +def sum_merge_timestamps(df, sparse_matrix, agg): + """Groups by timestamp and combines rows that are on the same date. + + The combining is done by summing the rows in the sparse matrix that correspond to the same date. + + Args: + df (DataFrame): The DataFrame with 'timestamp' and 'patient_id'. + sparse_matrix (csr_matrix): The corresponding sparse matrix with data. + agg (str): Aggregation method, currently only 'sum' is implemented. + + Returns: + DataFrame, csr_matrix: Tuple containing the DataFrame with aggregated timestamps and the corresponding + sparse matrix. 
+ """ + # Assuming 'timestamp' is already sorted; if not, uncomment the next line: + # df = df.sort_values(by='timestamp') + + # Group by timestamp and sum the data + grouped = df.groupby("timestamp") + indices = grouped.indices + + # Create a new sparse matrix with summed rows per unique timestamp + patient_id = df["patient_id"].iloc[0] + timestamps = [] + output_matrix = csr_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) + + # Loop through each group and sum + for timestamp, rows in indices.items(): + # Combine the rows in the sparse matrix for the current group (respecting the aggregation being used) + merged_matrix = sparse_aggregate(sparse_matrix[rows], agg) + # Save the non-zero elements + output_matrix = vstack([output_matrix, merged_matrix]) + timestamps.extend([timestamp]) + + # Create output DataFrame + out_df = pd.DataFrame({"patient_id": [patient_id] * len(timestamps), "timestamp": timestamps}) + return out_df, output_matrix def sparse_rolling(df, sparse_matrix, timedelta, agg): @@ -86,18 +115,12 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): patient_id = df.iloc[0].patient_id df = df.drop(columns="patient_id").reset_index(drop=True).reset_index() timestamps = [] - logger.info("rolling for patient_id") out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) for each in df[["index", "timestamp"]].rolling(on="timestamp", window=timedelta): - subset_matrix = sparse_matrix[each["index"]] - - # TODO this is where we would apply the aggregation timestamps.append(each.index.max()) - agg_subset_matrix = subset_matrix.sum(axis=0) + agg_subset_matrix = sparse_aggregate(sparse_matrix[each["index"]], agg) out_sparse_matrix = vstack([out_sparse_matrix, agg_subset_matrix]) out_df = pd.DataFrame({"patient_id": [patient_id] * len(timestamps), "timestamp": timestamps}) - # out_df = pd.concat([out_df, pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1) - # out_df.columns = df.columns[1:] return out_df, out_sparse_matrix @@ -168,14 +191,12 @@ def compute_agg(df, window_size: str, agg: str): for patient_id, subset_df in tqdm(group.items(), total=len(group)): logger.info("sparse rolling setup") subset_sparse_matrix = sparse_matrix[subset_df.index] - patient_df = subset_df[ - ["patient_id", "timestamp"] - ] # pd.concat([subset_df[["patient_id", "timestamp"]], sparse_df], axis=1) + patient_df = subset_df[["patient_id", "timestamp"]] assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" logger.info("sparse rolling start") + patient_df, subset_sparse_matrix = sum_merge_timestamps(patient_df, subset_sparse_matrix, agg) patient_df, out_sparse = sparse_rolling(patient_df, subset_sparse_matrix, timedelta, agg) logger.info("sparse rolling complete") - # patient_df["patient_id"] = patient_id out_dfs.append(patient_df) out_sparse_matrix = vstack([out_sparse_matrix, out_sparse]) out_df = pd.concat(out_dfs, axis=0) From bd9bdae6f657f8ea4fb3adb70d9ecc115aa85795 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Thu, 30 May 2024 07:00:25 +0000 Subject: [PATCH 028/106] added scripts to the readme --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1e4d634..033c5c4 100644 --- a/README.md +++ b/README.md @@ -51,13 +51,25 @@ script is a functional test that is also run with `pytest` to verify correctness 1. `scripts/tabularize/identify_columns.py` loads all training shard to identify which feature columns to generate tabular data for. 
+ +```bash +POLARS_MAX_THREADS=32 python scripts/identify_columns.py MEDS_cohort_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort tabularized_data_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, full]" do_overwrite=True +``` + 2. `scripts/tabularize/tabularize_static.py` Iterates through shards and generates tabular vectors for each patient. There is a single row per patient for each shard. -3. `scripts/tabularize/tabularize_ts.py` Iterates through shards and pivots time series data such - that we have a column for every feature column and binary presence for codes and numerical values filled in for columns with numeirical measurements. There is a row for every timeseries input. + +```bash +POLARS_MAX_THREADS=32 python scripts/tabularize_static.py MEDS_cohort_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort tabularized_data_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, full]" do_overwrite=True +``` + 4. `scripts/tabularize/summarize_over_windows.py` For each shard, iterates through window sizes and aggregations to and horizontally concatenates the outputs to generate the final tabular representations at every event time for every patient. +```bash +POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py MEDS_cohort_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort tabularized_data_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, full]" do_overwrite=True +``` + ## Feature Construction, Storage, and Loading Tabularization of a (raw) MEDS dataset is done by running the `scripts/data/tabularize.py` script. 
This script From 29c8c5f46e2709c53a476543623e381970b1d7a7 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Thu, 30 May 2024 19:35:51 +0000 Subject: [PATCH 029/106] save before breaking it --- xgboost_sweep.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index b69a49e..1ab8c89 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -10,7 +10,8 @@ import os from typing import List, Callable import sys - +import pandas as pd +import glob class Iterator(xgb.DataIter): def __init__(self, cfg: DictConfig, split: str = "train"): @@ -27,11 +28,12 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.dynamic_data_path = self.data_path / "summarize" / split self.static_data_path = self.data_path / "static" / split - self._data_shards = [ - x.stem - for x in self.static_data_path.iterdir() - if x.is_file() and x.suffix == ".parquet" - ] + self._data_shards = list(self.static_data_path.glob("*.parquet")) + # [ + # x.stem + # for x in self.static_data_path.iterdir() + # if x.is_file() and x.suffix == ".parquet" + # ] if cfg.iterator.keep_static_data_in_memory: self._static_shards = ( self._get_static_shards() @@ -60,8 +62,8 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: if self.cfg.aggs is not None: aggs_set = set(self.cfg.aggs) if self.cfg.min_code_inclusion_frequency is not None: - dataset_freuqency = pl.scan_parquet( - self.data_path / "code_frequencies.json" # TODO: make sure this is the right path + dataset_freuqency = pd.read_json( + self.data_path / "feature_freqs.json" # TODO: make sure this is the right path ) min_frequency_set = set( dataset_freuqency.filter( From 4c7d3e77f8d336f65d20cc09a170794af790d09f Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Thu, 30 May 2024 23:23:25 +0000 Subject: [PATCH 030/106] added support for parallelism using mapper warp function. We cache feature frequencies at the beginning of the identify columns script as well. 
summarize over windows can be parallelized over aggregations x window_size combinations --- configs/tabularize.yaml | 1 + scripts/identify_columns.py | 78 ++++++++-- scripts/summarize_over_windows.py | 77 ++++++---- scripts/tabularize_static.py | 32 +++-- scripts/tabularize_ts.py | 47 ++++--- .../generate_static_features.py | 23 ++- .../generate_summarized_reps.py | 83 +++++------ .../generate_ts_features.py | 26 ++-- src/MEDS_tabular_automl/utils.py | 133 +++++++++++++++--- tests/test_tabularize.py | 52 +++++-- 10 files changed, 394 insertions(+), 158 deletions(-) diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 72894ff..10a834c 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -25,6 +25,7 @@ n_patients_per_sub_shard: null do_overwrite: False do_update: True seed: 1 +tqdm: True # Hydra hydra: diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py index 48f871b..186b6c5 100644 --- a/scripts/identify_columns.py +++ b/scripts/identify_columns.py @@ -1,12 +1,20 @@ #!/usr/bin/env python """This Python script, stores the configuration parameters and feature columns used in the output.""" import json +from collections import defaultdict from pathlib import Path import hydra +import polars as pl +from loguru import logger from omegaconf import DictConfig, OmegaConf -from MEDS_tabular_automl.utils import get_flat_rep_feature_cols, load_meds_data +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap +from MEDS_tabular_automl.utils import ( + compute_feature_frequencies, + load_meds_data, + load_tqdm, +) def store_config_yaml(config_fp: Path, cfg: DictConfig): @@ -63,25 +71,75 @@ def store_columns( Args: cfg: The configuration object for the tabularization process. """ + iter_wrapper = load_tqdm(cfg.tqdm) # create output dir flat_dir = Path(cfg.tabularized_data_dir) flat_dir.mkdir(exist_ok=True, parents=True) # load MEDS data - split_to_df = load_meds_data(cfg.MEDS_cohort_dir) + split_to_fps = load_meds_data(cfg.MEDS_cohort_dir, load_data=False) # store params in json file config_fp = flat_dir / "config.yaml" store_config_yaml(config_fp, cfg) - # 0. Identify Output Columns - # We set window_sizes to None here because we want to get the feature column names for the raw flat - # representation, not the summarized one. - feature_columns = set() - for shard_df in split_to_df["train"]: - feature_columns.update(get_flat_rep_feature_cols(cfg, shard_df)) - feature_columns = sorted(list(feature_columns)) - json.dump(feature_columns, open(flat_dir / "feature_columns.json", "w")) + # 0. 
Identify Output Columns and Frequencies + logger.info("Iterating through shards and caching feature frequencies.") + + def compute_fn(shard_df): + return compute_feature_frequencies(cfg, shard_df) + + def write_fn(data, out_fp): + json.dump(data, open(out_fp, "w")) + + def read_fn(in_fp): + return pl.scan_parquet(in_fp) + + # Map: Iterates through shards and caches feature frequencies + feature_freq_fp = flat_dir / "feature_freqs" + feature_freq_fp.mkdir(exist_ok=True) + for shard_fp in iter_wrapper(split_to_fps["train"]): + name = shard_fp.stem + out_fp = feature_freq_fp / f"{name}.json" + rwlock_wrap( + shard_fp, + out_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + + logger.info("Summing frequency computations.") + # Reduce: sum the frequency computations + + def compute_fn(feature_freq_list): + feature_freqs = defaultdict(int) + for shard_feature_freq in feature_freq_list: + for feature, freq in shard_feature_freq.items(): + feature_freqs[feature] += freq + return feature_freqs, sorted(list(feature_freqs.keys())) + + def write_fn(data, out_fp): + feature_freqs, feature_columns = data + json.dump(feature_columns, open(out_fp / "feature_columns.json", "w")) + json.dump(feature_freqs, open(flat_dir / "feature_freqs.json", "w")) + + def read_fn(in_fp): + files = list(in_fp.glob("*.json")) + return [json.load(open(fp)) for fp in files] + + rwlock_wrap( + feature_freq_fp, + flat_dir, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + logger.info("Stored feature columns and frequencies.") if __name__ == "__main__": diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index ab55f94..c67a9f3 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -8,6 +8,7 @@ from MEDS_tabular_automl.generate_summarized_reps import generate_summary from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import setup_environment, write_df @@ -41,40 +42,56 @@ def summarize_ts_data_over_windows( FileNotFoundError: If specified directories or files in the configuration are not found. ValueError: If required columns like 'code' or 'value' are missing in the data files. 
""" - flat_dir, split_to_df, feature_columns = setup_environment(cfg) + flat_dir, split_to_fps, feature_columns = setup_environment(cfg, load_data=False) # Produce ts representation ts_subdir = flat_dir / "ts" - for sp, subjects_dfs in split_to_df.items(): + for sp, shard_fps in split_to_fps.items(): sp_dir = ts_subdir / sp - if sp != "train": - continue - - for i, shard_df in enumerate(subjects_dfs): - pivot_fp = sp_dir / f"{i}.parquet" - if pivot_fp.exists() and not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!") - if sp != "train": - # remove codes not in training set - shard_df = shard_df.filter(pl.col("code").is_in(feature_columns)) - - # Load Sparse DataFrame - pivot_df = get_flat_ts_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - # Summarize data -- applying aggregations on various window sizes - summary_df = generate_summary( - feature_columns, - pivot_df, - cfg.window_sizes, - cfg.aggs, - ) - assert summary_df.shape[1] > 2, "No data found in the summarized dataframe" - - logger.info("Writing pivot file") - write_df(summary_df, pivot_fp, do_overwrite=cfg.do_overwrite) + + for i, shard_fp in enumerate(shard_fps): + for window_size in cfg.window_sizes: + for agg in cfg.aggs: + pivot_fp = sp_dir / window_size / agg / f"{i}.pkl" + if pivot_fp.exists() and not cfg.do_overwrite: + raise FileExistsError( + f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!" + ) + + def read_fn(fp): + return pl.scan_parquet(fp) + + def compute_fn(shard_df): + # Load Sparse DataFrame + pivot_df = get_flat_ts_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) + + # Summarize data -- applying aggregations on various window sizes + summary_df = generate_summary( + feature_columns, + pivot_df, + window_size, + agg, + ) + assert summary_df.shape[1] > 2, "No data found in the summarized dataframe" + + logger.info("Writing pivot file") + return summary_df + + def write_fn(out_df, out_fp): + write_df(out_df, out_fp, do_overwrite=cfg.do_overwrite) + + rwlock_wrap( + shard_fp, + pivot_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) if __name__ == "__main__": diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index daf4ea6..8f19ae6 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -8,6 +8,7 @@ from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.generate_static_features import get_flat_static_rep +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import setup_environment, write_df pl.enable_string_cache() @@ -95,29 +96,44 @@ def tabularize_static_data( .. 
_link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ - flat_dir, split_to_df, feature_columns = setup_environment(cfg) + flat_dir, split_to_fp, feature_columns = setup_environment(cfg, load_data=False) # Produce static representation static_subdir = flat_dir / "static" static_dfs = {} - for sp, subjects_dfs in split_to_df.items(): + for sp, shard_fps in split_to_fp.items(): static_dfs[sp] = [] sp_dir = static_subdir / sp - for i, shard_df in enumerate(subjects_dfs): + for i, shard_fp in enumerate(shard_fps): fp = sp_dir / f"{i}.parquet" static_dfs[sp].append(fp) if fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - df = get_flat_static_rep( - feature_columns=feature_columns, - shard_df=shard_df, + def read_fn(in_fp): + return pl.scan_parquet(in_fp) + + def compute_fn(shard_df): + return get_flat_static_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) + + def write_fn(data, out_df): + write_df(data, out_df, do_overwrite=cfg.do_overwrite) + + rwlock_wrap( + shard_fp, + fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, ) - write_df(df, fp, do_overwrite=cfg.do_overwrite, pandas=True) - if __name__ == "__main__": tabularize_static_data() diff --git a/scripts/tabularize_ts.py b/scripts/tabularize_ts.py index 09f79e5..ae39595 100644 --- a/scripts/tabularize_ts.py +++ b/scripts/tabularize_ts.py @@ -5,10 +5,10 @@ import polars as pl from loguru import logger from omegaconf import DictConfig -from tqdm import tqdm from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep -from MEDS_tabular_automl.utils import setup_environment, write_df +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap +from MEDS_tabular_automl.utils import load_tqdm, setup_environment, write_df @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") @@ -25,27 +25,40 @@ def tabularize_ts_data( Args: cfg: configuration dictionary containing the necessary parameters for tabularizing the data. 
""" - flat_dir, split_to_df, feature_columns = setup_environment(cfg) + iter_wrapper = load_tqdm(cfg.tqdm) + flat_dir, split_to_fp, feature_columns = setup_environment(cfg, load_data=False) + # Produce ts representation ts_subdir = flat_dir / "ts" - for sp, subjects_dfs in split_to_df.items(): + for sp, shard_fps in split_to_fp.items(): sp_dir = ts_subdir / sp - for i, shard_df in enumerate(tqdm(subjects_dfs)): - pivot_fp = sp_dir / f"{i}.parquet" - if pivot_fp.exists() and not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!") - if sp != "train": - # remove codes not in training set - shard_df = shard_df.filter(pl.col("code").is_in(feature_columns)) - - pivot_df = get_flat_ts_rep( - feature_columns=feature_columns, - shard_df=shard_df, + for i, shard_fp in enumerate(iter_wrapper(shard_fps)): + out_fp = sp_dir / f"{i}.pkl" + + def read_fn(in_fp): + return pl.scan_parquet(in_fp) + + def compute_fn(shard_df): + return get_flat_ts_rep( + feature_columns=feature_columns, + shard_df=shard_df, + ) + + def write_fn(data, out_df): + write_df(data, out_df, do_overwrite=cfg.do_overwrite) + + rwlock_wrap( + shard_fp, + out_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, ) - logger.info("Writing pivot file") - write_df(pivot_df, pivot_fp, do_overwrite=cfg.do_overwrite) + logger.info("Generated TS flat representations.") if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index b32c2b0..3786ffa 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -10,7 +10,14 @@ import polars as pl -from MEDS_tabular_automl.utils import DF_T, add_missing_cols, parse_flat_feature_column +from MEDS_tabular_automl.utils import ( + DF_T, + add_static_missing_cols, + parse_static_feature_column, +) + +STATIC_CODE_COL = "/static/present" +STATIC_VALUE_COL = "/static/first" def summarize_static_measurements( @@ -32,11 +39,11 @@ def summarize_static_measurements( or simply as present, then performs a pivot to reshape the data for each patient, providing a tabular format where each row represents a patient and each column represents a static feature. 
""" - static_present = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("present")] - static_first = [c for c in feature_columns if c.startswith("STATIC_") and c.endswith("first")] + static_present = [c for c in feature_columns if c.endswith(STATIC_CODE_COL)] + static_first = [c for c in feature_columns if c.endswith(STATIC_VALUE_COL)] # Handling 'first' static values - static_first_codes = [parse_flat_feature_column(c)[1] for c in static_first] + static_first_codes = [parse_static_feature_column(c)[0] for c in static_first] code_subset = df.filter(pl.col("code").is_in(static_first_codes)) first_code_subset = code_subset.group_by(pl.col("patient_id")).first().collect() static_value_pivot_df = first_code_subset.pivot( @@ -55,7 +62,7 @@ def summarize_static_measurements( # TODO: consider casting with .cast(pl.Float32)) # Handling 'present' static indicators - static_present_codes = [parse_flat_feature_column(c)[1] for c in static_present] + static_present_codes = [parse_static_feature_column(c)[0] for c in static_present] static_present_pivot_df = ( df.select(*["patient_id", "code"]) .filter(pl.col("code").is_in(static_present_codes)) @@ -97,10 +104,12 @@ def get_flat_static_rep( _summarize_static_measurements, and then normalizes the resulting data to ensure it is suitable for further analysis or machine learning tasks. """ - static_features = [c for c in feature_columns if c.startswith("STATIC_")] + static_features = [ + c for c in feature_columns if c.endswith(STATIC_CODE_COL) or c.endswith(STATIC_VALUE_COL) + ] static_measurements = summarize_static_measurements(static_features, df=shard_df) # fill up missing feature columns with nulls - normalized_measurements = add_missing_cols( + normalized_measurements = add_static_missing_cols( static_measurements, static_features, set_count_0_to_null=False, diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 092ef6d..94158a8 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -7,9 +7,9 @@ import polars as pl from loguru import logger from scipy.sparse import coo_matrix, csr_matrix -from tqdm import tqdm from MEDS_tabular_automl.generate_ts_features import get_ts_columns +from MEDS_tabular_automl.utils import load_tqdm CODE_AGGREGATIONS = [ "code/count", @@ -124,7 +124,7 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): return out_df, out_sparse_matrix -def compute_agg(df, window_size: str, agg: str): +def compute_agg(df, window_size: str, agg: str, use_tqdm=False): """Applies aggreagtion to dataframe. 
Dataframe is expected to only have the relevant columns for aggregating @@ -184,37 +184,34 @@ def compute_agg(df, window_size: str, agg: str): sparse_matrix = csr_matrix(sparse_matrix) logger.info("done grouping") out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) - match agg: - case "code/count" | "value/sum": - agg = "sum" - out_dfs = [] - for patient_id, subset_df in tqdm(group.items(), total=len(group)): - logger.info("sparse rolling setup") - subset_sparse_matrix = sparse_matrix[subset_df.index] - patient_df = subset_df[["patient_id", "timestamp"]] - assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" - logger.info("sparse rolling start") - patient_df, subset_sparse_matrix = sum_merge_timestamps(patient_df, subset_sparse_matrix, agg) - patient_df, out_sparse = sparse_rolling(patient_df, subset_sparse_matrix, timedelta, agg) - logger.info("sparse rolling complete") - out_dfs.append(patient_df) - out_sparse_matrix = vstack([out_sparse_matrix, out_sparse]) - out_df = pd.concat(out_dfs, axis=0) - out_df = pd.concat( - [out_df.reset_index(drop=True), pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1 - ) - out_df.columns = df.columns - out_df.rename(columns=time_aggd_col_alias_fntr(window_size, "count")) - - case _: - raise ValueError(f"Invalid aggregation `{agg}` for window_size `{window_size}`") + + out_dfs = [] + iter_wrapper = load_tqdm(use_tqdm) + agg = agg.split("/")[1] + for patient_id, subset_df in iter_wrapper(group.items(), total=len(group)): + logger.info("sparse rolling setup") + subset_sparse_matrix = sparse_matrix[subset_df.index] + patient_df = subset_df[["patient_id", "timestamp"]] + assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" + logger.info("sparse rolling start") + patient_df, subset_sparse_matrix = sum_merge_timestamps(patient_df, subset_sparse_matrix, agg) + patient_df, out_sparse = sparse_rolling(patient_df, subset_sparse_matrix, timedelta, agg) + logger.info("sparse rolling complete") + out_dfs.append(patient_df) + out_sparse_matrix = vstack([out_sparse_matrix, out_sparse]) + out_df = pd.concat(out_dfs, axis=0) + out_df = pd.concat( + [out_df.reset_index(drop=True), pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1 + ) + out_df.columns = df.columns + out_df.rename(columns=time_aggd_col_alias_fntr(window_size, agg)) id_cols = ["patient_id", "timestamp"] out_df = out_df.loc[:, id_cols + list(df.columns[2:])] return out_df -def _generate_summary(df: pd.DataFrame, window_size: str, agg: str) -> pl.LazyFrame: +def _generate_summary(df: pd.DataFrame, window_size: str, agg: str, use_tqdm=False) -> pl.LazyFrame: """Generate a summary of the data frame for a given window size and aggregation. Args: @@ -259,12 +256,12 @@ def _generate_summary(df: pd.DataFrame, window_size: str, agg: str) -> pl.LazyFr cols = value_cols id_cols = ["patient_id", "timestamp"] df = df.loc[:, id_cols + cols] - out_df = compute_agg(df, window_size, agg) + out_df = compute_agg(df, window_size, agg, use_tqdm=use_tqdm) return out_df def generate_summary( - feature_columns: list[str], df: pd.DataFrame, window_sizes: list[str], aggregations: list[str] + feature_columns: list[str], df: pd.DataFrame, window_size, agg: str, use_tqdm=False ) -> pl.LazyFrame: """Generate a summary of the data frame for given window sizes and aggregations. 
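For reference, a schematic driver showing how the per-`(window_size, agg)` signature above is meant to be called (it mirrors `scripts/summarize_over_windows.py`; the output path layout, directory creation, and reuse of a single pivoted shard are illustrative assumptions, not code from this patch):

```python
from pathlib import Path

import polars as pl

from MEDS_tabular_automl.generate_summarized_reps import generate_summary
from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep
from MEDS_tabular_automl.utils import write_df


def summarize_shard(shard_fp: Path, out_dir: Path, feature_columns, window_sizes, aggs):
    """Illustrative only: write one summarized file per (window_size, agg) pair."""
    pivot_df = get_flat_ts_rep(feature_columns=feature_columns, shard_df=pl.scan_parquet(shard_fp))
    for window_size in window_sizes:  # e.g. ["30d", "365d", "full"]
        for agg in aggs:              # e.g. ["code/count", "value/sum"]
            summary_df = generate_summary(feature_columns, pivot_df, window_size, agg)
            out_fp = out_dir / window_size / agg / f"{shard_fp.stem}.pkl"
            out_fp.parent.mkdir(parents=True, exist_ok=True)  # assumes caller creates dirs
            write_df(summary_df, out_fp, do_overwrite=True)
```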
@@ -326,22 +323,18 @@ def generate_summary( final_columns = [] out_dfs = [] # Generate summaries for each window size and aggregation - for window_size in window_sizes: - for agg in aggregations: - code_type, agg_name = agg.split("/") - final_columns.extend( - [f"{window_size}/{c}/{agg_name}" for c in code_value_ts_columns if c.endswith(code_type)] - ) - # only iterate through code_types that exist in the dataframe columns - if any([c.endswith(code_type) for c in df.columns]): - logger.info(f"Generating aggregation {agg} for window_size {window_size}") - # timestamp_dtype = df.dtypes[df.columns.index("timestamp")] - # assert timestamp_dtype in [ - # pl.Datetime, - # pl.Date, - # ], f"timestamp must be of type Date, but is {timestamp_dtype}" - out_df = _generate_summary(df, window_size, agg) - out_dfs.append(out_df) + code_type, agg_name = agg.split("/") + final_columns = [f"{window_size}/{c}/{agg_name}" for c in code_value_ts_columns if c.endswith(code_type)] + # only iterate through code_types that exist in the dataframe columns + if any([c.endswith(code_type) for c in df.columns]): + logger.info(f"Generating aggregation {agg} for window_size {window_size}") + # timestamp_dtype = df.dtypes[df.columns.index("timestamp")] + # assert timestamp_dtype in [ + # pl.Datetime, + # pl.Date, + # ], f"timestamp must be of type Date, but is {timestamp_dtype}" + out_df = _generate_summary(df, window_size, agg, use_tqdm=use_tqdm) + out_dfs.append(out_df) final_columns = sorted(final_columns) # Combine all dataframes using successive joins diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 1756b8e..7ff413f 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -4,21 +4,25 @@ import pandas as pd import polars as pl from loguru import logger -from scipy.sparse import csc_array +from scipy.sparse import coo_matrix +from MEDS_tabular_automl.generate_static_features import ( + STATIC_CODE_COL, + STATIC_VALUE_COL, +) from MEDS_tabular_automl.utils import DF_T warnings.simplefilter(action="ignore", category=FutureWarning) def get_ts_columns(feature_columns): - def get_code_type(c): - return c.split("/")[-2] == "code" + def is_static(c): + return c.endswith(STATIC_CODE_COL) or c.endswith(STATIC_VALUE_COL) def get_code_name(c): - return "/".join(c.split("/")[0:-2]) + return "/".join(c.split("/")[0:-1]) - ts_columns = sorted(list({get_code_name(c) for c in feature_columns if not get_code_type(c) == "static"})) + ts_columns = sorted(list({get_code_name(c) for c in feature_columns if not is_static(c)})) return ts_columns @@ -95,10 +99,9 @@ def summarize_dynamic_measurements( merge_cols = np.concatenate([value_cols, code_cols]) merge_columns = [f"{c}/value" for c in ts_columns] + [f"{c}/code" for c in ts_columns] long_df = pd.DataFrame.sparse.from_spmatrix( - csc_array((merge_data, (merge_rows, merge_cols)), shape=(len(value_df), len(merge_columns))), + coo_matrix((merge_data, (merge_rows, merge_cols)), shape=(len(value_df), len(merge_columns))), columns=merge_columns, ) - logger.info("add id columns") long_df["timestamp"] = df["timestamp"] long_df["patient_id"] = df["patient_id"] long_df = long_df[id_cols + merge_columns] @@ -127,8 +130,8 @@ def get_flat_ts_rep( representations. Example: - >>> feature_columns = ['A/value/sum', 'A/code/count', 'B/value/sum', 'B/code/count', - ... 
"C/value/sum", "C/code/count", "A/static/present"] + >>> feature_columns = ['A/value', 'A/code', 'B/value', 'B/code', + ... "C/value", "C/code", "A/static/present"] >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], @@ -142,7 +145,10 @@ def get_flat_ts_rep( 2 1 2020-01-01 0 2 0 0 1 0 3 2 2021-01-04 0 2 0 0 1 0 """ - logger.info("load") + # Remove codes not in training set + raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] + shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) + ts_columns = get_ts_columns(feature_columns) ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) pd_df = ts_shard_df.collect().to_pandas() diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index abba1c9..04bbc0d 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -20,11 +20,24 @@ ROW_IDX_NAME = "__row_idx" -def parse_flat_feature_column(c: str) -> tuple[str, str, str, str]: +def load_tqdm(use_tqdm): + if use_tqdm: + from tqdm import tqdm + + return tqdm + else: + + def noop(x, **kwargs): + return x + + return noop + + +def parse_static_feature_column(c: str) -> tuple[str, str, str, str]: parts = c.split("/") if len(parts) < 3: raise ValueError(f"Column {c} is not a valid flat feature column!") - return (parts[0], "/".join(parts[1:-1]), parts[-1]) + return ("/".join(parts[:-2]), parts[-2], parts[-1]) def write_df(df: DF_T, fp: Path, **kwargs): @@ -45,20 +58,15 @@ def write_df(df: DF_T, fp: Path, **kwargs): raise ValueError( f"Expected DataFrame to have columns ['patient_id', 'timestamp'], got {df.columns[:2]}" ) - coo_matrix = df[df.columns[2:]].sparse.to_coo() - rows = coo_matrix.row - cols = coo_matrix.col - data = coo_matrix.data - df = pd.DataFrame(dict(row=rows, col=cols, data=data)) - df.to_parquet(fp, engine="pyarrow") + df.to_pickle(fp) else: raise ValueError(f"Unsupported type for df: {type(df)}") -def get_flat_col_dtype(col: str) -> pl.DataType: +def get_static_col_dtype(col: str) -> pl.DataType: """Gets the appropriate minimal dtype for the given flat representation column string.""" - code_type, code, agg = parse_flat_feature_column(col) + code, code_type, agg = parse_static_feature_column(col) match agg: case "sum" | "sum_sqd" | "min" | "max" | "value" | "first": @@ -71,7 +79,9 @@ def get_flat_col_dtype(col: str) -> pl.DataType: raise ValueError(f"Column name {col} malformed!") -def add_missing_cols(flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False) -> DF_T: +def add_static_missing_cols( + flat_df: DF_T, feature_columns: list[str], set_count_0_to_null: bool = False +) -> DF_T: """Normalizes columns in a DataFrame so all expected columns are present and appropriately typed. 
Parameters: @@ -89,8 +99,8 @@ def add_missing_cols(flat_df: DF_T, feature_columns: list[str], set_count_0_to_n cols_to_add = set(feature_columns) - set(flat_df.columns) cols_to_retype = set(feature_columns).intersection(set(flat_df.columns)) - cols_to_add = [(c, get_flat_col_dtype(c)) for c in cols_to_add] - cols_to_retype = [(c, get_flat_col_dtype(c)) for c in cols_to_retype] + cols_to_add = [(c, get_static_col_dtype(c)) for c in cols_to_add] + cols_to_retype = [(c, get_static_col_dtype(c)) for c in cols_to_retype] if "timestamp" in flat_df.columns: key_cols = ["patient_id", "timestamp"] @@ -146,7 +156,7 @@ def get_static_feature_cols(shard_df) -> list[str]: return sorted(feature_columns) -def get_ts_feature_cols(aggregations: list[str], shard_df: DF_T) -> list[str]: +def get_ts_feature_cols(shard_df: DF_T) -> list[str]: """Generates a list of feature column names from the data within each shard based on specified configurations. @@ -168,16 +178,93 @@ def get_ts_feature_cols(aggregations: list[str], shard_df: DF_T) -> list[str]: >>> df = pl.DataFrame(data).lazy() >>> aggs = ['value/sum', 'code/count'] >>> get_ts_feature_cols(aggs, df) - ['A/code/count', 'A/value/sum', 'C/code/count', 'C/value/sum'] + ['A/code', 'A/value', 'C/code', 'C/value'] """ - feature_columns = [] ts_df = shard_df.filter(pl.col("timestamp").is_not_null()) - for code in ts_df.select(pl.col("code").unique()).collect().to_series(): - ts_aggregations = [f"{code}/{agg}" for agg in aggregations] - feature_columns.extend(ts_aggregations) + feature_columns = list(ts_df.select(pl.col("code").unique()).collect().to_series()) + feature_columns = [f"{code}/code" for code in feature_columns] + [ + f"{code}/value" for code in feature_columns + ] return sorted(feature_columns) +def compute_feature_frequencies(cfg: DictConfig, shard_df: DF_T) -> list[str]: + """Generates a list of feature column names from the data within each shard based on specified + configurations. + + Parameters: + - cfg (DictConfig): Configuration dictionary specifying how features should be evaluated and aggregated. + - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). + + Returns: + - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties + identified during the evaluation. + + This function evaluates the properties of codes within training data and applies configured + aggregations to generate a comprehensive list of feature columns for modeling purposes. + Examples: + >>> import polars as pl + >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], + ... 'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], + ... 
'numerical_value': [1, None, 2, 2, None, None, 3]} + >>> df = pl.DataFrame(data).lazy() + >>> aggs = ['value/sum', 'code/count'] + >>> get_ts_feature_cols(aggs, df) + ['A/code', 'A/value', 'C/code', 'C/value'] + """ + static_df = shard_df.filter( + pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_null() + ) + static_code_freqs_df = static_df.groupby("code").agg(pl.count("code").alias("count")).collect() + static_code_freqs = { + row["code"] + "/static/present": row["count"] for row in static_code_freqs_df.iter_rows(named=True) + } + + static_value_df = static_df.filter(pl.col("numerical_value").is_not_null()) + static_value_freqs_df = ( + static_value_df.groupby("code").agg(pl.count("numerical_value").alias("count")).collect() + ) + static_value_freqs = { + row["code"] + "/static/first": row["count"] for row in static_value_freqs_df.iter_rows(named=True) + } + + ts_df = shard_df.filter( + pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_not_null() + ) + code_freqs_df = ts_df.groupby("code").agg(pl.count("code").alias("count")).collect() + code_freqs = {row["code"] + "/code": row["count"] for row in code_freqs_df.iter_rows(named=True)} + + value_df = ts_df.filter(pl.col("numerical_value").is_not_null()) + value_freqs_df = value_df.groupby("code").agg(pl.count("numerical_value").alias("count")).collect() + value_freqs = {row["code"] + "/value": row["count"] for row in value_freqs_df.iter_rows(named=True)} + + combined_freqs = {**static_code_freqs, **static_value_freqs, **code_freqs, **value_freqs} + return combined_freqs + + +def get_prediction_ts_cols( + aggregations: list[str], ts_feature_cols: DF_T, window_sizes: list[str] | None = None +) -> list[str]: + """Generates a list of feature column names that will be used for downstream task + Examples: + >>> feature_cols = ['A/code', 'A/value', 'C/code', 'C/value'] + >>> window_sizes = None + >>> aggs = ['value/sum', 'code/count'] + >>> get_prediction_ts_cols(aggs, feature_cols, window_sizes) + error + >>> window_sizes = ["1d"] + >>> get_prediction_ts_cols(aggs, feature_cols, window_sizes) + error + """ + agg_feature_columns = [] + for code in ts_feature_cols: + ts_aggregations = [f"{code}/{agg}" for agg in aggregations] + agg_feature_columns.extend(ts_aggregations) + if window_sizes: + ts_aggregations = [f"{window_size}/{code}" for window_size in window_sizes] + return sorted(ts_aggregations) + + def get_flat_rep_feature_cols(cfg: DictConfig, shard_df: DF_T) -> list[str]: """Generates a list of feature column names from the data within each shard based on specified configurations. @@ -207,7 +294,7 @@ def get_flat_rep_feature_cols(cfg: DictConfig, shard_df: DF_T) -> list[str]: return static_feature_columns + ts_feature_columns -def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: +def load_meds_data(MEDS_cohort_dir: str, load_data: bool = True) -> Mapping[str, pl.DataFrame]: """Loads the MEDS dataset from disk. 
Args: @@ -235,19 +322,21 @@ def load_meds_data(MEDS_cohort_dir: str) -> Mapping[str, pl.DataFrame]: meds_fps = list(MEDS_cohort_dir.glob("*/*.parquet")) splits = {fp.parent.stem for fp in meds_fps} split_to_fps = {split: [fp for fp in meds_fps if fp.parent.stem == split] for split in splits} + if not load_data: + return split_to_fps split_to_df = { split: [pl.scan_parquet(fp) for fp in split_fps] for split, split_fps in split_to_fps.items() } return split_to_df -def setup_environment(cfg: DictConfig): +def setup_environment(cfg: DictConfig, load_data: bool = True): # check output dir flat_dir = Path(cfg.tabularized_data_dir) assert flat_dir.exists() # load MEDS data - split_to_df = load_meds_data(cfg.MEDS_cohort_dir) + split_to_df = load_meds_data(cfg.MEDS_cohort_dir, load_data) feature_columns = json.load(open(flat_dir / "feature_columns.json")) # Check that the stored config matches the current config diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 1045ea6..1e763fb 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -3,10 +3,12 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import json +import shutil import tempfile from io import StringIO from pathlib import Path +import pandas as pd import polars as pl from hydra import compose, initialize from loguru import logger @@ -134,6 +136,7 @@ def test_tabularize(): "do_update": True, "seed": 1, "hydra.verbose": True, + "tqdm": False, } with initialize(version_base=None, config_path="../configs/"): # path to config.yaml @@ -141,15 +144,23 @@ def test_tabularize(): cfg = compose(config_name="tabularize", overrides=overrides) # config.yaml logger.info("caching flat representation of MEDS data") store_columns(cfg) + assert (tabularized_data_dir / "config.yaml").is_file() + assert (tabularized_data_dir / "feature_columns.json").is_file() + assert (tabularized_data_dir / "feature_freqs.json").is_file() tabularize_static_data(cfg) actual_files = [ (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("static/*/*.parquet")) ] expected_files = [("train", "1"), ("train", "0"), ("held_out", "0"), ("tuning", "0")] assert set(actual_files) == set(expected_files) + + # Check the files are not empty + for f in list(tabularized_data_dir.glob("static/*/*.parquet")): + assert pl.read_parquet(f).shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + tabularize_ts_data(cfg) # confirm the time series files exist: - actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.parquet"))] + actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.pkl"))] expected_files = [ ("train", "1"), ("train", "0"), @@ -157,18 +168,41 @@ def test_tabularize(): ("tuning", "0"), ] assert set(actual_files) == set(expected_files) + for f in list(tabularized_data_dir.glob("ts/*/*.pkl")): + assert pd.read_pickle(f).shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" 
+ shutil.rmtree(tabularized_data_dir / "ts") summarize_ts_data_over_windows(cfg) # confirm summary files exist: - actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.parquet"))] + output_files = list(tabularized_data_dir.glob("ts/*/*/*/*/*.pkl")) + actual_files = [str(Path(*f.parts[-5:])) for f in output_files] expected_files = [ - ("train", "1"), - ("train", "0"), - ("held_out", "0"), - ("tuning", "0"), + "train/365d/value/sum/0.pkl", + "train/365d/value/sum/1.pkl", + "train/365d/code/count/0.pkl", + "train/365d/code/count/1.pkl", + "train/full/value/sum/0.pkl", + "train/full/value/sum/1.pkl", + "train/full/code/count/0.pkl", + "train/full/code/count/1.pkl", + "train/30d/value/sum/0.pkl", + "train/30d/value/sum/1.pkl", + "train/30d/code/count/0.pkl", + "train/30d/code/count/1.pkl", + "held_out/365d/value/sum/0.pkl", + "held_out/365d/code/count/0.pkl", + "held_out/full/value/sum/0.pkl", + "held_out/full/code/count/0.pkl", + "held_out/30d/value/sum/0.pkl", + "held_out/30d/code/count/0.pkl", + "tuning/365d/value/sum/0.pkl", + "tuning/365d/code/count/0.pkl", + "tuning/full/value/sum/0.pkl", + "tuning/full/code/count/0.pkl", + "tuning/30d/value/sum/0.pkl", + "tuning/30d/code/count/0.pkl", ] assert set(actual_files) == set(expected_files) - for f in list(tabularized_data_dir.glob("summary/*/*.parquet")): - df = pl.read_parquet(f) + for f in output_files: + df = pd.read_pickle(f) assert df.shape[0] > 0 - assert df.columns == ["hi"] From ba796e5177cb29e751d8a0160979e7cb65201dae Mon Sep 17 00:00:00 2001 From: teyaberg Date: Thu, 30 May 2024 23:26:18 +0000 Subject: [PATCH 031/106] wip --- xgboost_sweep.py | 160 ++++++++++++++++++++++++----------------------- 1 file changed, 82 insertions(+), 78 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index 1ab8c89..d54ffd5 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -9,9 +9,12 @@ import scipy.sparse as sp import os from typing import List, Callable -import sys +import sys import pandas as pd import glob +import json +from scipy.sparse import csr_matrix + class Iterator(xgb.DataIter): def __init__(self, cfg: DictConfig, split: str = "train"): @@ -27,19 +30,13 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.data_path = Path(cfg.tabularized_data_dir) self.dynamic_data_path = self.data_path / "summarize" / split self.static_data_path = self.data_path / "static" / split - - self._data_shards = list(self.static_data_path.glob("*.parquet")) - # [ - # x.stem - # for x in self.static_data_path.iterdir() - # if x.is_file() and x.suffix == ".parquet" - # ] + self._data_shards = [shard.stem for shard in list(self.static_data_path.glob("*.parquet"))] if cfg.iterator.keep_static_data_in_memory: - self._static_shards = ( - self._get_static_shards() - ) + self._static_shards = self._get_static_shards() - self.codes_set, self.aggs_set, self.min_frequency_set = self._get_inclusion_sets() + self.codes_set, self.aggs_set, self.min_frequency_set = ( + self._get_inclusion_sets() + ) self._it = 0 @@ -62,17 +59,14 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: if self.cfg.aggs is not None: aggs_set = set(self.cfg.aggs) if self.cfg.min_code_inclusion_frequency is not None: - dataset_freuqency = pd.read_json( - self.data_path / "feature_freqs.json" # TODO: make sure this is the right path + feature_freqs = json.load( + self.data_path + / "feature_freqs.json" # TODO: make sure this is the right path ) min_frequency_set = set( - dataset_freuqency.filter( - cs.col("frequency") >= 
self.cfg.min_code_inclusion_frequency - ) - .select("code") - .collect() - .to_numpy() - .flatten() + key + for key, value in feature_freqs.items() + if value >= self.cfg.min_code_inclusion_frequency ) return codes_set, aggs_set, min_frequency_set @@ -87,50 +81,70 @@ def _get_static_shards(self) -> dict: """ static_shards = {} for iter in self._data_shards: - static_shards[iter] = pl.scan_parquet( + static_shards[iter] = self._get_sparse_dynamic_shard_from_file( self.static_data_path / f"{iter}.parquet" ) return static_shards - def _sparsify_shard(self, df: pl.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: + def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: """ Make X and y as scipy sparse arrays for XGBoost. Args: - - df (pl.DataFrame): Data frame to sparsify. + - df (pandas.DataFrame): Data frame to sparsify. Returns: - tuple[scipy.sparse.csr_matrix, numpy.ndarray]: Tuple of feature data and labels. """ - labels = df.select( - [ - col - for col in df.schema.keys() - if col.endswith("/task") - ] + labels = df.loc[:,[col for col in df.columns if col.endswith("/task")]] + data = df.drop(columns=labels.columns) + return csr_matrix(data), labels.values + + def _validate_shard_file_inclusion(self, file:Path) -> bool: + parts = file.relative_to(self.dynamic_data_path).parts + if not parts: + return False + + codes_part = "/".join(parts[2:-2]) + aggs_part = "/".join(parts[-2:]) + + return ( + (self.codes_set is None or codes_part in self.codes_set) and + (self.min_frequency_set is None or codes_part in self.min_frequency_set) and + (self.aggs_set is None or aggs_part in self.aggs_set) ) - data = df.select( - [ - col - for col in df.schema.keys() - if col not in ["label", "patient_id", "timestamp"] - and not col.endswith("/task") - ] + def _assert_correct_sorting(self, shard: pd.DataFrame): + """ + Assert that the shard is sorted correctly. + """ + if "timestamp" in shard.columns: + sort_columns = ["patient_id", "timestamp"] + else: + sort_columns = ["patient_id"] + assert shard[sort_columns].equals(shard[sort_columns].sort_values(by=sort_columns)), ( + f"Shard is not sorted on correctly. " + "Please ensure that the data is sorted on patient_id and timestamp, if applicable." ) - X, y = None, None - ### TODO: This could be optimized so that we are collecting the largest shards possible at once and then sparsifying them - X = sp.csc_matrix(data.select([col for col in data.schema.keys() if not col.startswith(tuple(self.cfg.window_sizes))]).collect().to_numpy()) - for window in self.cfg.window_sizes: - col_csc = sp.csc_matrix(data.select([col for col in data.schema.keys() if col.startswith(f"{window}/")]).collect().to_numpy()) - X = sp.hstack([X, col_csc]) - y = labels.collect().to_numpy() - - ### TODO: fix the need to convert to array here!!! - return X.tocsr(), y + def _get_sparse_dynamic_shard_from_file(self, path: Path) -> pd.DataFrame: + """ + Load a sparse shard into memory. This returns a shard as a pandas dataframe, + asserted that it is sorted on patient id and timestamp, if included. + + Args: + - path (Path): Path to the sparse shard. + + Returns: + - pd.DataFrame: Data frame with the sparse shard. + + """ + shard = pd.read_parquet(path) + self._assert_correct_sorting(shard) + return shard.drop(columns=["patient_id", "timestamp"]) + - def _load_shard(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: + def _load_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """ Load a specific shard of data from disk and concatenate with static data. 
@@ -144,43 +158,33 @@ def _load_shard(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """ if self.cfg.iterator.keep_static_data_in_memory: - df = self._static_shards[self._data_shards[idx]] + static_df = self._static_shards[self._data_shards[idx]] else: - df = pl.scan_parquet( + static_df = self._get_sparse_dynamic_shard_from_file( self.static_data_path / f"{self._data_shards[idx]}.parquet" ) - for window in self.cfg.window_sizes: - dynamic_df = pl.scan_parquet( - self.dynamic_data_path / window / f"{self._data_shards[idx]}.parquet" - ) + files = list( + self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}*.parquet") + ) - columns = dynamic_df.schema.keys() - selected_columns = [ - col - for col in columns - if (parts := col.split("/")) - and len(parts) > 3 - and (self.codes_set is None or "/".join(parts[1:-2]) in self.codes_set) - and (self.min_frequency_set is None or "/".join(parts[1:-2]) in self.min_frequency_set) - and (self.aggs_set is None or "/".join(parts[-2:]) in self.aggs_set) - ] - selected_columns.extend(["patient_id", "timestamp"]) - dynamic_df = dynamic_df.select(selected_columns) - - df = pl.concat([df, dynamic_df], how="align") + files = [file for file in files if self._validate_shard_file_inclusion(file)] - ### TODO: add in some type checking etc for safety + dynamic_dfs = [self._get_sparse_dynamic_shard_from_file(file) for file in files] + dynamic_df = pd.concat(dynamic_dfs, axis=1) - ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks - - task_df = pl.scan_parquet(self.data_path / "tasks.parquet") - task_df = task_df.rename({col: f"{col}/task" for col in task_df.schema.keys() if col not in ["patient_id", "timestamp"]}) # TODO: filtering of the tasks?? --> need to know more about tasks - ### TODO: Change to join_on with left merge orig df on left, labels on right join on subject_id and timestamp - df = df.join(task_df, on=["patient_id", "timestamp"], how="left") + ### TODO: add in some type checking etc for safety + ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks --> nassim told me to do something else + task_df = pd.read_parquet(self.data_path / "tasks.parquet") + df = task_df.join(static_df, on=["patient_id"], how="left") + self._assert_correct_sorting(df) + df = df.drop(columns=["patient_id", "timestamp"]) + df = df.rename({col: f"{col}/task" for col in df.columns}) + df = task_df.join(static_df, on=["patient_id"], how="left") + df = pd.concat([df, dynamic_df], axis=1) - ### TODO: Figure out best way to export this to dmatrix + ### TODO: Figure out best way to export this to dmatrix # --> can we use scipy sparse matrix/array? --> likely we will not be able to collect in memory return self._sparsify_shard(df) @@ -201,7 +205,7 @@ def next(self, input_data: Callable): # input_data is a function passed in by XGBoost who has the exact same signature of # ``DMatrix`` - X, y = self._load_shard(self._it) # self._data_shards[self._it]) + X, y = self._load_shard_by_index(self._it) # self._data_shards[self._it]) input_data(data=X, label=y) self._it += 1 # Return 1 to let XGBoost know we haven't seen all the files yet. 
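A hedged usage sketch of how an `xgboost.DataIter` like the `Iterator` above is typically consumed for external-memory training; `cfg` is the Hydra config this script already receives, and the training parameters are placeholders rather than the sweep's actual settings:

```python
import xgboost as xgb

# Build external-memory DMatrices from the iterators; XGBoost pulls shards
# through next() and caches them on disk under the iterator's cache_prefix.
it_train = Iterator(cfg, split="train")
it_tuning = Iterator(cfg, split="tuning")
dtrain = xgb.DMatrix(it_train)
dtuning = xgb.DMatrix(it_tuning)

booster = xgb.train(
    {"objective": "binary:logistic", "tree_method": "hist"},  # placeholder params
    dtrain,
    num_boost_round=100,
    evals=[(dtuning, "tuning")],
)
```

Building the `DMatrix` directly from the iterator avoids materializing all shards in memory at once, which is the point of the `next()` / `reset()` protocol implemented above; `collect_in_memory()` remains available when the data is small enough to stack.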
@@ -225,7 +229,7 @@ def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: X = [] y = [] for i in range(len(self._data_shards)): - X_, y_ = self._load_shard(i) + X_, y_ = self._load_shard_by_index(i) X.append(X_) y.append(y_) From 3678d30150e2109887dd9f41753695a9561b64e9 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Thu, 30 May 2024 23:37:14 +0000 Subject: [PATCH 032/106] automl --- xgboost_sweep.py | 155 ++++++++++++++++------------------------------- 1 file changed, 53 insertions(+), 102 deletions(-) diff --git a/xgboost_sweep.py b/xgboost_sweep.py index d54ffd5..c1e4c29 100644 --- a/xgboost_sweep.py +++ b/xgboost_sweep.py @@ -1,30 +1,25 @@ -import hydra -from omegaconf import DictConfig, OmegaConf +import json +import os +from collections.abc import Callable from pathlib import Path -import xgboost as xgb -import polars as pl + +import hydra import numpy as np -import polars.selectors as cs -from sklearn.metrics import mean_absolute_error -import scipy.sparse as sp -import os -from typing import List, Callable -import sys import pandas as pd -import glob -import json +import scipy.sparse as sp +import xgboost as xgb +from omegaconf import DictConfig, OmegaConf from scipy.sparse import csr_matrix +from sklearn.metrics import mean_absolute_error class Iterator(xgb.DataIter): def __init__(self, cfg: DictConfig, split: str = "train"): - """ - Initialize the Iterator with the provided configuration and split. + """Initialize the Iterator with the provided configuration and split. Args: - cfg (DictConfig): Configuration dictionary. - split (str): The data split to use ("train", "tuning", or "held_out"). - """ self.cfg = cfg self.data_path = Path(cfg.tabularized_data_dir) @@ -34,9 +29,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): if cfg.iterator.keep_static_data_in_memory: self._static_shards = self._get_static_shards() - self.codes_set, self.aggs_set, self.min_frequency_set = ( - self._get_inclusion_sets() - ) + self.codes_set, self.aggs_set, self.min_frequency_set = self._get_inclusion_sets() self._it = 0 @@ -45,8 +38,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): super().__init__(cache_prefix=os.path.join(".", "cache")) def _get_inclusion_sets(self) -> tuple[set, set, set]: - """ - Get the inclusion sets for codes and aggregations. + """Get the inclusion sets for codes and aggregations. Returns: - tuple[set, set, set]: Tuple of sets for codes, aggregations, and minimum code frequency. @@ -60,24 +52,19 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: aggs_set = set(self.cfg.aggs) if self.cfg.min_code_inclusion_frequency is not None: feature_freqs = json.load( - self.data_path - / "feature_freqs.json" # TODO: make sure this is the right path - ) - min_frequency_set = set( - key - for key, value in feature_freqs.items() - if value >= self.cfg.min_code_inclusion_frequency + self.data_path / "feature_freqs.json" # TODO: make sure this is the right path ) + min_frequency_set = { + key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency + } return codes_set, aggs_set, min_frequency_set def _get_static_shards(self) -> dict: - """ - Load static shards into memory. + """Load static shards into memory. Returns: - dict: Dictionary with shard names as keys and data frames as values. 
- """ static_shards = {} for iter in self._data_shards: @@ -87,66 +74,59 @@ def _get_static_shards(self) -> dict: return static_shards def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: - """ - Make X and y as scipy sparse arrays for XGBoost. + """Make X and y as scipy sparse arrays for XGBoost. Args: - df (pandas.DataFrame): Data frame to sparsify. Returns: - tuple[scipy.sparse.csr_matrix, numpy.ndarray]: Tuple of feature data and labels. - """ - labels = df.loc[:,[col for col in df.columns if col.endswith("/task")]] + labels = df.loc[:, [col for col in df.columns if col.endswith("/task")]] data = df.drop(columns=labels.columns) return csr_matrix(data), labels.values - - def _validate_shard_file_inclusion(self, file:Path) -> bool: + + def _validate_shard_file_inclusion(self, file: Path) -> bool: parts = file.relative_to(self.dynamic_data_path).parts if not parts: return False - + codes_part = "/".join(parts[2:-2]) aggs_part = "/".join(parts[-2:]) - + return ( - (self.codes_set is None or codes_part in self.codes_set) and - (self.min_frequency_set is None or codes_part in self.min_frequency_set) and - (self.aggs_set is None or aggs_part in self.aggs_set) + (self.codes_set is None or codes_part in self.codes_set) + and (self.min_frequency_set is None or codes_part in self.min_frequency_set) + and (self.aggs_set is None or aggs_part in self.aggs_set) ) + def _assert_correct_sorting(self, shard: pd.DataFrame): - """ - Assert that the shard is sorted correctly. - """ + """Assert that the shard is sorted correctly.""" if "timestamp" in shard.columns: sort_columns = ["patient_id", "timestamp"] else: sort_columns = ["patient_id"] assert shard[sort_columns].equals(shard[sort_columns].sort_values(by=sort_columns)), ( - f"Shard is not sorted on correctly. " + "Shard is not sorted on correctly. " "Please ensure that the data is sorted on patient_id and timestamp, if applicable." ) def _get_sparse_dynamic_shard_from_file(self, path: Path) -> pd.DataFrame: - """ - Load a sparse shard into memory. This returns a shard as a pandas dataframe, - asserted that it is sorted on patient id and timestamp, if included. + """Load a sparse shard into memory. This returns a shard as a pandas dataframe, asserted that it is + sorted on patient id and timestamp, if included. Args: - path (Path): Path to the sparse shard. Returns: - pd.DataFrame: Data frame with the sparse shard. - """ shard = pd.read_parquet(path) self._assert_correct_sorting(shard) return shard.drop(columns=["patient_id", "timestamp"]) - def _load_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: - """ - Load a specific shard of data from disk and concatenate with static data. + """Load a specific shard of data from disk and concatenate with static data. Args: - idx (int): Index of the shard to load. @@ -154,7 +134,6 @@ def _load_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: Returns: - X (scipy.sparse.csr_matrix): Feature data frame. - y (numpy.ndarray): Labels. 
- """ if self.cfg.iterator.keep_static_data_in_memory: @@ -164,34 +143,33 @@ def _load_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: self.static_data_path / f"{self._data_shards[idx]}.parquet" ) - files = list( - self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}*.parquet") - ) + files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}*.parquet")) files = [file for file in files if self._validate_shard_file_inclusion(file)] dynamic_dfs = [self._get_sparse_dynamic_shard_from_file(file) for file in files] dynamic_df = pd.concat(dynamic_dfs, axis=1) - ### TODO: add in some type checking etc for safety + # TODO: add in some type checking etc for safety - ### TODO: Figure out features vs labels --> look at esgpt_baseline for loading in labels based on tasks --> nassim told me to do something else + # TODO: Figure out features vs labels + # --> look at esgpt_baseline for loading in labels based on tasks + # --> nassim told me to do something else task_df = pd.read_parquet(self.data_path / "tasks.parquet") df = task_df.join(static_df, on=["patient_id"], how="left") self._assert_correct_sorting(df) df = df.drop(columns=["patient_id", "timestamp"]) - df = df.rename({col: f"{col}/task" for col in df.columns}) + df = df.rename({col: f"{col}/task" for col in df.columns}) df = task_df.join(static_df, on=["patient_id"], how="left") df = pd.concat([df, dynamic_df], axis=1) - ### TODO: Figure out best way to export this to dmatrix + # TODO: Figure out best way to export this to dmatrix # --> can we use scipy sparse matrix/array? --> likely we will not be able to collect in memory return self._sparsify_shard(df) def next(self, input_data: Callable): - """ - Advance the iterator by 1 step and pass the data to XGBoost. This function is - called by XGBoost during the construction of ``DMatrix`` + """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost + during the construction of ``DMatrix`` Args: - input_data (Callable): A function passed by XGBoost with the same signature as `DMatrix`. @@ -212,19 +190,14 @@ def next(self, input_data: Callable): return 1 def reset(self): - """ - Reset the iterator to its beginning. - - """ + """Reset the iterator to its beginning.""" self._it = 0 def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: - """ - Collect the data in memory. + """Collect the data in memory. Returns: - tuple[np.ndarray, np.ndarray]: Tuple of feature data and labels. - """ X = [] y = [] @@ -240,17 +213,14 @@ def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: class XGBoostModel: def __init__(self, cfg: DictConfig): - """ - Initialize the XGBoostClassifier with the provided configuration. + """Initialize the XGBoostClassifier with the provided configuration. Args: - cfg (DictConfig): Configuration dictionary. """ self.cfg = cfg - self.keep_data_in_memory = getattr( - getattr(cfg, "iterator", {}), "keep_data_in_memory", True - ) + self.keep_data_in_memory = getattr(getattr(cfg, "iterator", {}), "keep_data_in_memory", True) self.itrain = None self.ival = None @@ -263,20 +233,14 @@ def __init__(self, cfg: DictConfig): self.model = None def train(self): - """ - Train the model. - - """ + """Train the model.""" self._build() self.model = xgb.train( OmegaConf.to_container(self.cfg.model), self.dtrain ) # do we want eval and things? def _build(self): - """ - Build necessary data structures for training. 
- - """ + """Build necessary data structures for training.""" if self.keep_data_in_memory: self._build_iterators() self._build_dmatrix_in_memory() @@ -285,10 +249,7 @@ def _build(self): self._build_dmatrix_from_iterators() def _build_dmatrix_in_memory(self): - """ - Build the DMatrix from the data in memory. - - """ + """Build the DMatrix from the data in memory.""" X_train, y_train = self.itrain.collect_in_memory() X_val, y_val = self.ival.collect_in_memory() X_test, y_test = self.itest.collect_in_memory() @@ -297,32 +258,24 @@ def _build_dmatrix_in_memory(self): self.dtest = xgb.DMatrix(X_test, label=y_test) def _build_dmatrix_from_iterators(self): - """ - Build the DMatrix from the iterators. - - """ + """Build the DMatrix from the iterators.""" self.dtrain = xgb.DMatrix(self.ival) self.dval = xgb.DMatrix(self.itest) self.dtest = xgb.DMatrix(self.itest) def _build_iterators(self): - """ - Build the iterators for training, validation, and testing. - - """ + """Build the iterators for training, validation, and testing.""" self.itrain = Iterator(self.cfg, split="train") self.ival = Iterator(self.cfg, split="tuning") self.itest = Iterator(self.cfg, split="held_out") def evaluate(self) -> float: - """ - Evaluate the model on the test set. + """Evaluate the model on the test set. Returns: - float: Evaluation metric (mae). - """ - ### TODO: Figure out exactly what we want to do here + # TODO: Figure out exactly what we want to do here y_pred = self.model.predict(self.dtest) y_true = self.dtest.get_label() @@ -331,15 +284,13 @@ def evaluate(self) -> float: @hydra.main(version_base=None, config_path="configs", config_name="tabularize_sweep") def optimize(cfg: DictConfig) -> float: - """ - Optimize the model based on the provided configuration. + """Optimize the model based on the provided configuration. Args: - cfg (DictConfig): Configuration dictionary. Returns: - float: Evaluation result. 
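(Aside: the evaluation path above reduces to predicting on the held-out DMatrix and scoring with mean absolute error. A self-contained sketch with toy data; the returned scalar is what the Hydra sweeper would consume.)

import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

X = np.random.rand(200, 10)
y = np.random.randint(0, 2, 200)
dtrain = xgb.DMatrix(X[:150], label=y[:150])
dtest = xgb.DMatrix(X[150:], label=y[150:])

booster = xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=10)
y_pred = booster.predict(dtest)  # predicted probabilities under binary:logistic
score = mean_absolute_error(dtest.get_label(), y_pred)
# returning this scalar from the hydra.main-decorated objective lets the sweeper compare trials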
- """ model = XGBoostModel(cfg) From ffa0f3c9c65e287f1806548c182553894f464641 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 31 May 2024 04:12:38 +0000 Subject: [PATCH 033/106] working on collect_in_memory --- ...bularize_sweep.yaml => xgboost_sweep.yaml} | 34 ++-- xgboost_sweep.py => scripts/xgboost_sweep.py | 145 +++++++++++++----- tests/test_tabularize.py | 16 +- 3 files changed, 135 insertions(+), 60 deletions(-) rename configs/{tabularize_sweep.yaml => xgboost_sweep.yaml} (60%) rename xgboost_sweep.py => scripts/xgboost_sweep.py (66%) diff --git a/configs/tabularize_sweep.yaml b/configs/xgboost_sweep.yaml similarity index 60% rename from configs/tabularize_sweep.yaml rename to configs/xgboost_sweep.yaml index e939041..72ccac0 100644 --- a/configs/tabularize_sweep.yaml +++ b/configs/xgboost_sweep.yaml @@ -1,34 +1,29 @@ # Raw data -MEDS_cohort_dir: -tabularized_data_dir: /storage/teya/meds_automl/test_data/test +base_dir: /storage/teya/fake +MEDS_cohort_dir: ${base_dir}/MEDS_cohort +tabularized_data_dir: ${base_dir}/flat_reps +model_dir: ${base_dir}/models # Pre-processing min_code_inclusion_frequency: 1 -window_sizes: [1d, 7d, full] +window_sizes: [30d] codes: null aggs: - "code/count" - - "code/time_since_last" - - "code/time_since_first" - - "value/count" - "value/sum" - - "value/sum_sqd" - - "count" - - "sum" - - "sum_sqd" - # Sharding -n_patients_per_sub_shard: null +n_patients_per_sub_shard: 2 # Misc -do_overwrite: False +do_overwrite: True +do_update: True seed: 1 - +tqdm: False model: booster: gbtree - device: gpu + device: cpu nthread: 4 max_depth: 6 eta: 0.3 @@ -37,9 +32,10 @@ model: lambda: 1 alpha: 0 tree_method: hist - objective: binary:logistic + objective: reg:squaredlogerror iterator: + keep_data_in_memory: True keep_static_data_in_memory: True # Hydra settings for sweep @@ -48,6 +44,8 @@ defaults: - override hydra/sweeper/sampler: tpe hydra: + mode: MULTIRUN + verbose: False sweep: dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} run: @@ -64,4 +62,6 @@ hydra: # Define search space for Optuna params: - window_sizes: choice([1d], [1d, 7d], [7d, full]) + window_sizes: choice([30d, 365d, full], [30d, full], [30d]) + # iterator.keep_static_data_in_memory: choice([True], [False]) + # iterator.keep_data_in_memory: choice([True], [False]) diff --git a/xgboost_sweep.py b/scripts/xgboost_sweep.py similarity index 66% rename from xgboost_sweep.py rename to scripts/xgboost_sweep.py index c1e4c29..b01eee5 100644 --- a/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -23,13 +23,13 @@ def __init__(self, cfg: DictConfig, split: str = "train"): """ self.cfg = cfg self.data_path = Path(cfg.tabularized_data_dir) - self.dynamic_data_path = self.data_path / "summarize" / split + self.dynamic_data_path = self.data_path / "ts" / split self.static_data_path = self.data_path / "static" / split self._data_shards = [shard.stem for shard in list(self.static_data_path.glob("*.parquet"))] if cfg.iterator.keep_static_data_in_memory: self._static_shards = self._get_static_shards() - self.codes_set, self.aggs_set, self.min_frequency_set = self._get_inclusion_sets() + self.codes_set, self.aggs_set, self.min_frequency_set, self.window_set = self._get_inclusion_sets() self._it = 0 @@ -51,14 +51,25 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: if self.cfg.aggs is not None: aggs_set = set(self.cfg.aggs) if self.cfg.min_code_inclusion_frequency is not None: - feature_freqs = json.load( - self.data_path / "feature_freqs.json" # TODO: make sure this is the right path - ) + with 
open(self.data_path / "feature_freqs.json") as f: + feature_freqs = json.load(f) min_frequency_set = { key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency } + window_set = set(self.cfg.window_sizes) - return codes_set, aggs_set, min_frequency_set + return codes_set, aggs_set, min_frequency_set, window_set + + def _load_static_shard_by_index(self, idx: int) -> sp.csc_matrix: + """Load a static shard into memory. + + Args: + - idx (int): Index of the shard to load. + + Returns: + - sp.csc_matrix: Sparse matrix with the static shard. + """ + return pd.read_parquet(self.static_data_path / f"{self._data_shards[int(idx)]}.parquet") def _get_static_shards(self) -> dict: """Load static shards into memory. @@ -68,9 +79,7 @@ def _get_static_shards(self) -> dict: """ static_shards = {} for iter in self._data_shards: - static_shards[iter] = self._get_sparse_dynamic_shard_from_file( - self.static_data_path / f"{iter}.parquet" - ) + static_shards[iter] = self._load_static_shard_by_index(iter) return static_shards def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: @@ -84,20 +93,22 @@ def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: """ labels = df.loc[:, [col for col in df.columns if col.endswith("/task")]] data = df.drop(columns=labels.columns) - return csr_matrix(data), labels.values + for col in data.columns: + if not isinstance(data[col].dtype, pd.SparseDtype): + data[col] = pd.arrays.SparseArray(data[col]) + sparse_matrix = data.sparse.to_coo() + return csr_matrix(sparse_matrix), labels.values def _validate_shard_file_inclusion(self, file: Path) -> bool: parts = file.relative_to(self.dynamic_data_path).parts if not parts: return False - codes_part = "/".join(parts[2:-2]) - aggs_part = "/".join(parts[-2:]) + windows_part = parts[0] + aggs_part = "/".join(parts[1:-1]) - return ( - (self.codes_set is None or codes_part in self.codes_set) - and (self.min_frequency_set is None or codes_part in self.min_frequency_set) - and (self.aggs_set is None or aggs_part in self.aggs_set) + return (self.window_set is None or windows_part in self.window_set) and ( + self.aggs_set is None or aggs_part in self.aggs_set ) def _assert_correct_sorting(self, shard: pd.DataFrame): @@ -121,11 +132,62 @@ def _get_sparse_dynamic_shard_from_file(self, path: Path) -> pd.DataFrame: Returns: - pd.DataFrame: Data frame with the sparse shard. """ - shard = pd.read_parquet(path) + shard = pd.read_pickle(path) self._assert_correct_sorting(shard) return shard.drop(columns=["patient_id", "timestamp"]) - def _load_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: + def _get_static_shard_by_index(self, idx: int) -> pd.DataFrame: + """Get the static shard from memory or disk. + + Args: + - shard_name (str): Name of the shard. + + Returns: + - pd.DataFrame: Data frame with the static shard. + """ + if self.cfg.iterator.keep_static_data_in_memory: + return self._static_shards[self._data_shards[idx]] + else: + return self._load_static_shard_by_index(self._data_shards[idx]) + + def _get_task(self, idx: int) -> pd.DataFrame: + """Get the task data for a specific shard. + + Args: + - idx (int): Index of the shard. + + Returns: + - pd.DataFrame: Data frame with the task data. 
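(Aside: the SparseDtype-based conversion introduced above can be exercised on its own. A small sketch with made-up column names; the real shards carry the feature names produced by the tabularization step.)

import pandas as pd
from scipy.sparse import csr_matrix

df = pd.DataFrame(
    {
        "1d/code/count": pd.arrays.SparseArray([0, 2, 0, 1]),
        "1d/value/sum": [0.0, 3.5, 0.0, 0.0],
        "label/task": [0, 1, 0, 1],
    }
)

labels = df.loc[:, [c for c in df.columns if c.endswith("/task")]]
data = df.drop(columns=labels.columns)
for col in data.columns:
    if not isinstance(data[col].dtype, pd.SparseDtype):
        data[col] = pd.arrays.SparseArray(data[col])  # .sparse.to_coo() needs all-sparse columns
X = csr_matrix(data.sparse.to_coo())
y = labels.values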
+ """ + # TODO: replace with something real + file = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}*.pkl"))[0] + shard = pd.read_pickle(file) + shard["label"] = np.random.randint(0, 2, shard.shape[0]) + return shard[["patient_id", "timestamp", "label"]] + + def _filter_df(self, df: pd.DataFrame) -> pd.DataFrame: + """Filter the dynamic data frame based on the inclusion sets. + + Args: + - df (pd.DataFrame): Data frame to filter. + + Returns: + - pd.DataFrame: Filtered data frame. + """ + code_parts = ["/".join(col.split("/")[1:-2]) for col in df.columns] + frequency_parts = ["/".join(col.split("/")[1:-1]) for col in df.columns] + + filtered_columns = [ + col + for col, code_part, freq_part in zip(df.columns, code_parts, frequency_parts) + if (self.codes_set is None or code_part in self.codes_set) + and (self.min_frequency_set is None or freq_part in self.min_frequency_set) + ] + filtered_columns.extend([col for col in df.columns if col.endswith("/task")]) + + return df[filtered_columns] + + def _load_dynamic_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. Args: @@ -136,35 +198,26 @@ def _load_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: - y (numpy.ndarray): Labels. """ - if self.cfg.iterator.keep_static_data_in_memory: - static_df = self._static_shards[self._data_shards[idx]] - else: - static_df = self._get_sparse_dynamic_shard_from_file( - self.static_data_path / f"{self._data_shards[idx]}.parquet" - ) - - files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}*.parquet")) + files = list(self.dynamic_data_path.glob("*/*/*/*.pkl")) files = [file for file in files if self._validate_shard_file_inclusion(file)] dynamic_dfs = [self._get_sparse_dynamic_shard_from_file(file) for file in files] dynamic_df = pd.concat(dynamic_dfs, axis=1) + dynamic_df = self._filter_df(dynamic_df) # TODO: add in some type checking etc for safety + static_df = self._get_static_shard_by_index(idx) - # TODO: Figure out features vs labels - # --> look at esgpt_baseline for loading in labels based on tasks - # --> nassim told me to do something else - task_df = pd.read_parquet(self.data_path / "tasks.parquet") - df = task_df.join(static_df, on=["patient_id"], how="left") + task_df = self._get_task(idx) + task_df = task_df.rename( + columns={col: f"{col}/task" for col in task_df.columns if col not in ["patient_id", "timestamp"]} + ) + df = pd.merge(task_df, static_df, on=["patient_id"], how="left") self._assert_correct_sorting(df) - df = df.drop(columns=["patient_id", "timestamp"]) - df = df.rename({col: f"{col}/task" for col in df.columns}) - df = task_df.join(static_df, on=["patient_id"], how="left") + df = self._filter_df(df) df = pd.concat([df, dynamic_df], axis=1) - # TODO: Figure out best way to export this to dmatrix - # --> can we use scipy sparse matrix/array? --> likely we will not be able to collect in memory return self._sparsify_shard(df) def next(self, input_data: Callable): @@ -183,7 +236,7 @@ def next(self, input_data: Callable): # input_data is a function passed in by XGBoost who has the exact same signature of # ``DMatrix`` - X, y = self._load_shard_by_index(self._it) # self._data_shards[self._it]) + X, y = self._load_dynamic_shard_by_index(self._it) # self._data_shards[self._it]) input_data(data=X, label=y) self._it += 1 # Return 1 to let XGBoost know we haven't seen all the files yet. 
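(Aside: the column filter above keys off substrings of the flat feature names. The standalone sketch below assumes names shaped like "<window>/<code>/<agg kind>/<agg name>"; the naming is illustrative and comes from no particular shard.)

import pandas as pd


def filter_columns(df: pd.DataFrame, codes_set=None, min_frequency_set=None) -> pd.DataFrame:
    kept = []
    for col in df.columns:
        parts = col.split("/")
        code_part = "/".join(parts[1:-2])  # e.g. "LAB/HR"
        freq_part = "/".join(parts[1:-1])  # e.g. "LAB/HR/value"
        if (codes_set is None or code_part in codes_set) and (
            min_frequency_set is None or freq_part in min_frequency_set
        ):
            kept.append(col)
    kept.extend(c for c in df.columns if c.endswith("/task"))
    return df[list(dict.fromkeys(kept))]  # de-duplicate while preserving order


df = pd.DataFrame(columns=["1d/LAB/HR/value/sum", "1d/MED/ASA/code/count", "label/task"])
subset = filter_columns(df, codes_set={"LAB/HR"})  # keeps the LAB/HR column plus the task column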
@@ -202,7 +255,7 @@ def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: X = [] y = [] for i in range(len(self._data_shards)): - X_, y_ = self._load_shard_by_index(i) + X_, y_ = self._load_dynamic_shard_by_index(i) X.append(X_) y.append(y_) @@ -282,8 +335,8 @@ def evaluate(self) -> float: return mean_absolute_error(y_true, y_pred) -@hydra.main(version_base=None, config_path="configs", config_name="tabularize_sweep") -def optimize(cfg: DictConfig) -> float: +@hydra.main(version_base=None, config_path="../configs", config_name="xgboost_sweep") +def xgboost(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. Args: @@ -292,11 +345,19 @@ def optimize(cfg: DictConfig) -> float: Returns: - float: Evaluation result. """ - model = XGBoostModel(cfg) model.train() + # save model + save_dir = ( + Path(cfg.model_dir) + / "_".join(map(str, cfg.window_sizes)) + / "_".join([agg.replace("/", "") for agg in cfg.aggs]) + ) + save_dir.mkdir(parents=True, exist_ok=True) + + model.model.save_model(save_dir / f"{np.random.randint(100000, 999999)}_model.json") return model.evaluate() if __name__ == "__main__": - optimize() + xgboost() diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 1e763fb..ea55393 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -17,6 +17,7 @@ from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data from scripts.tabularize_ts import tabularize_ts_data +from scripts.xgboost_sweep import xgboost SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -107,6 +108,7 @@ def test_tabularize(): with tempfile.TemporaryDirectory() as d: MEDS_cohort_dir = Path(d) / "MEDS_cohort" tabularized_data_dir = Path(d) / "flat_reps" + model_dir = Path(d) / "save_model" # Create the directories MEDS_cohort_dir.mkdir() @@ -130,7 +132,7 @@ def test_tabularize(): "min_code_inclusion_frequency": 1, "window_sizes": ["30d", "365d", "full"], "aggs": ["code/count", "value/sum"], - "codes": None, + "codes": "null", "n_patients_per_sub_shard": 2, "do_overwrite": True, "do_update": True, @@ -206,3 +208,15 @@ def test_tabularize(): for f in output_files: df = pd.read_pickle(f) assert df.shape[0] > 0 + + xgboost_config_kwargs = { + "model_dir": str(model_dir.resolve()), + "hydra.mode": "MULTIRUN", + } + xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} + with initialize(version_base=None, config_path="../configs/"): # path to config.yaml + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="xgboost_sweep", overrides=overrides) # config.yaml + xgboost(cfg) + output_files = list(model_dir.glob("*/*/*_model.json")) + assert len(output_files) == 1 From c8f26ea62ca307a6e287f94628e5febd37f43faa Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 31 May 2024 13:41:41 +0000 Subject: [PATCH 034/106] collect in memory fixed --- scripts/xgboost_sweep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index b01eee5..9172f3a 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -160,7 +160,7 @@ def _get_task(self, idx: int) -> pd.DataFrame: - pd.DataFrame: Data frame with the task data. 
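(Aside: each sweep trial writes its booster out as JSON, so a saved model can be reloaded for inference roughly as below. Paths, file names, and parameters here are placeholders, not the sweep's actual outputs.)

from pathlib import Path

import numpy as np
import xgboost as xgb

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
booster = xgb.train({"objective": "binary:logistic"}, xgb.DMatrix(X, label=y), num_boost_round=5)

save_dir = Path("models/30d_full/codecount_valuesum")  # placeholder sweep directory
save_dir.mkdir(parents=True, exist_ok=True)
model_fp = save_dir / "123456_model.json"
booster.save_model(str(model_fp))

loaded = xgb.Booster()
loaded.load_model(str(model_fp))
preds = loaded.predict(xgb.DMatrix(X))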
""" # TODO: replace with something real - file = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}*.pkl"))[0] + file = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl"))[0] shard = pd.read_pickle(file) shard["label"] = np.random.randint(0, 2, shard.shape[0]) return shard[["patient_id", "timestamp", "label"]] @@ -198,7 +198,7 @@ def _load_dynamic_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndar - y (numpy.ndarray): Labels. """ - files = list(self.dynamic_data_path.glob("*/*/*/*.pkl")) + files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl")) files = [file for file in files if self._validate_shard_file_inclusion(file)] From f6a375114f14ea624c196c6e1076080e80deaad5 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Fri, 31 May 2024 14:06:36 +0000 Subject: [PATCH 035/106] added hf_cohort scripts --- configs/tabularize.yaml | 4 +- pyproject.toml | 4 +- scripts/hf_cohort_e2e.sh | 39 +++++++++++++++++ scripts/hf_cohort_shard.sh | 42 +++++++++++++++++++ scripts/summarize_over_windows.py | 13 ++++++ .../generate_summarized_reps.py | 14 ++++--- src/MEDS_tabular_automl/utils.py | 13 +++++- tests/test_tabularize.py | 1 + 8 files changed, 121 insertions(+), 9 deletions(-) create mode 100644 scripts/hf_cohort_e2e.sh create mode 100644 scripts/hf_cohort_shard.sh diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 10a834c..9f64161 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -25,7 +25,9 @@ n_patients_per_sub_shard: null do_overwrite: False do_update: True seed: 1 -tqdm: True +tqdm: False +worker: 1 +test: False # Hydra hydra: diff --git a/pyproject.toml b/pyproject.toml index 53155e6..4ecb8f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,11 +16,13 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas"] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "numba", "tqdm"] [project.optional-dependencies] dev = ["pre-commit"] tests = ["pytest", "pytest-cov", "rootutils"] +local_parallelism = ["hydra-joblib-launcher"] +slurm_parallelism = ["hydra-submitit-launcher"] [project.urls] Homepage = "https://github.com/mmcdermott/MEDS_polars_functions" diff --git a/scripts/hf_cohort_e2e.sh b/scripts/hf_cohort_e2e.sh new file mode 100644 index 0000000..2fbc235 --- /dev/null +++ b/scripts/hf_cohort_e2e.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort +OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +N_PARALLEL_WORKERS="$1" +WINDOW_SIZES="window_sizes=[1d]" +AGGS="aggs=[code/count,value/sum]" +# WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" +# AGGS="aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" + +echo "Running identify_columns.py: Caching feature names and frequencies." 
+rm -rf $OUTPUT_DIR +POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 $WINDOW_SIZES do_overwrite=False $AGGS + +echo "Running tabularize_static.py: tabularizing static data" +POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 $WINDOW_SIZES do_overwrite=False $AGGS + +# echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" +# POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=joblib \ +# MEDS_cohort_dir=$MEDS_DIR \ +# tabularized_data_dir=$OUTPUT_DIR \ +# min_code_inclusion_frequency=1 do_overwrite=False \ +# $WINDOW_SIZES $AGGS + +echo "Running summarize_over_windows.py" +POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 do_overwrite=False \ + $WINDOW_SIZES $AGGS diff --git a/scripts/hf_cohort_shard.sh b/scripts/hf_cohort_shard.sh new file mode 100644 index 0000000..f30878e --- /dev/null +++ b/scripts/hf_cohort_shard.sh @@ -0,0 +1,42 @@ + +OUTPUT_DIR=/data/storage/shared/meds_tabular_ml/ebcl_dataset/processed +PATIENTS_PER_SHARD="2500" +CHUNKSIZE="200_000_000" + +rm -rf $OUTPUT_DIR + +echo "Running shard_events.py" +POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/shard_events.py \ + raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ + MEDS_cohort_dir=$OUTPUT_DIR \ + event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ + split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ + n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True + +echo "Running split_and_shard_patients.py" +POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/split_and_shard_patients.py \ + raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ + MEDS_cohort_dir=$OUTPUT_DIR \ + event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ + split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ + n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True + +echo "Running convert_to_sharded_events.py" +POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/convert_to_sharded_events.py \ + raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ + MEDS_cohort_dir=$OUTPUT_DIR \ + event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ + split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ + n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True + +echo "Running merge_to_MEDS_cohort.py" +POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/merge_to_MEDS_cohort.py \ + raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ + MEDS_cohort_dir=$OUTPUT_DIR \ + event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + split_fracs.train=0.6666666666666666 
split_fracs.tuning=0.16666666666666666 \ + split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ + n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index c67a9f3..66a4c71 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -1,6 +1,8 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" +import os + import hydra import polars as pl from loguru import logger @@ -12,6 +14,15 @@ from MEDS_tabular_automl.utils import setup_environment, write_df +def hydra_loguru_init() -> None: + """Adds loguru output to the logs that hydra scrapes. + + Must be called from a hydra main! + """ + hydra_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir + logger.add(os.path.join(hydra_path, "main.log")) + + @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") def summarize_ts_data_over_windows( cfg: DictConfig, @@ -42,6 +53,8 @@ def summarize_ts_data_over_windows( FileNotFoundError: If specified directories or files in the configuration are not found. ValueError: If required columns like 'code' or 'value' are missing in the data files. """ + if not cfg.test: + hydra_loguru_init() flat_dir, split_to_fps, feature_columns = setup_environment(cfg, load_data=False) # Produce ts representation ts_subdir = flat_dir / "ts" diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 94158a8..ee9e569 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from datetime import datetime import pandas as pd from scipy.sparse import vstack @@ -178,25 +179,26 @@ def compute_agg(df, window_size: str, agg: str, use_tqdm=False): timedelta = df["timestamp"].max() - df["timestamp"].min() + pd.Timedelta(days=1) else: timedelta = pd.Timedelta(window_size) - logger.info("grouping by patient_id") + logger.info("Grouping by patient_ids -- this may take a while.") group = dict(list(df[["patient_id", "timestamp"]].groupby("patient_id"))) sparse_matrix = df[df.columns[2:]].sparse.to_coo() sparse_matrix = csr_matrix(sparse_matrix) - logger.info("done grouping") + logger.info("Grouping Complete! 
Starting sparse rolling.") out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) out_dfs = [] iter_wrapper = load_tqdm(use_tqdm) agg = agg.split("/")[1] - for patient_id, subset_df in iter_wrapper(group.items(), total=len(group)): - logger.info("sparse rolling setup") + start_time = datetime.now() + for i, (patient_id, subset_df) in enumerate(iter_wrapper(group.items(), total=len(group))): + if i % 10 == 0: + logger.info(f"Progress is {i}/{len(group)}") + logger.info(f"Time elapsed: {datetime.now() - start_time}") subset_sparse_matrix = sparse_matrix[subset_df.index] patient_df = subset_df[["patient_id", "timestamp"]] assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" - logger.info("sparse rolling start") patient_df, subset_sparse_matrix = sum_merge_timestamps(patient_df, subset_sparse_matrix, agg) patient_df, out_sparse = sparse_rolling(patient_df, subset_sparse_matrix, timedelta, agg) - logger.info("sparse rolling complete") out_dfs.append(patient_df) out_sparse_matrix = vstack([out_sparse_matrix, out_sparse]) out_df = pd.concat(out_dfs, axis=0) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 04bbc0d..ef3c016 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -13,6 +13,7 @@ import polars as pl import polars.selectors as cs import yaml +from loguru import logger from omegaconf import DictConfig, OmegaConf DF_T = pl.LazyFrame @@ -343,5 +344,15 @@ def setup_environment(cfg: DictConfig, load_data: bool = True): with open(flat_dir / "config.yaml") as file: yaml_config = yaml.safe_load(file) stored_config = OmegaConf.create(yaml_config) - assert stored_config == cfg, "Stored config does not match current config." + logger.info(f"Stored config: {stored_config}") + logger.info(f"Worker config: {cfg}") + assert cfg.keys() == stored_config.keys(), ( + f"Keys in stored config do not match current config.")`` + for key in cfg.keys(): + assert key in stored_config, f"Key {key} not found in stored config." + if key == "worker": + continue + assert ( + cfg[key] == stored_config[key] + ), f"Config key {key}, value is {cfg[key]} vs {stored_config[key]}" return flat_dir, split_to_df, feature_columns diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 1e763fb..cb6703d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -137,6 +137,7 @@ def test_tabularize(): "seed": 1, "hydra.verbose": True, "tqdm": False, + "test": True, } with initialize(version_base=None, config_path="../configs/"): # path to config.yaml From 2ec18604a2a339eeba3474daa6cc93a4c9cc1cd9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 10:55:32 -0400 Subject: [PATCH 036/106] Apply suggestions from code review Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- README.md | 3 +-- scripts/hf_cohort_e2e.sh | 2 +- scripts/hf_cohort_shard.sh | 2 +- src/MEDS_tabular_automl/utils.py | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 033c5c4..5a596d1 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,7 @@ This repository consists of two key pieces: ### Scripts and Examples See `tests/test_tabularize_integration.py` for an example of the end-to-end pipeline being run on synthetic data. This -script is a functional test that is also run with `pytest` to verify correctness of the algorithm. 
- +script is a functional test that is also run with `pytest` to verify the correctness of the algorithm. #### Core Scripts: 1. `scripts/tabularize/identify_columns.py` loads all training shard to identify which feature columns diff --git a/scripts/hf_cohort_e2e.sh b/scripts/hf_cohort_e2e.sh index 2fbc235..3c39ea5 100644 --- a/scripts/hf_cohort_e2e.sh +++ b/scripts/hf_cohort_e2e.sh @@ -13,7 +13,7 @@ rm -rf $OUTPUT_DIR POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ MEDS_cohort_dir=$MEDS_DIR \ tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 $WINDOW_SIZES do_overwrite=False $AGGS + min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" echo "Running tabularize_static.py: tabularizing static data" POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ diff --git a/scripts/hf_cohort_shard.sh b/scripts/hf_cohort_shard.sh index f30878e..351ef3f 100644 --- a/scripts/hf_cohort_shard.sh +++ b/scripts/hf_cohort_shard.sh @@ -1,4 +1,4 @@ - +#!/usr/bin/env bash OUTPUT_DIR=/data/storage/shared/meds_tabular_ml/ebcl_dataset/processed PATIENTS_PER_SHARD="2500" CHUNKSIZE="200_000_000" diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index ef3c016..538db88 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -77,7 +77,7 @@ def get_static_col_dtype(col: str) -> pl.DataType: case "count" | "has_values_count": return pl.UInt32 case _: - raise ValueError(f"Column name {col} malformed!") + raise ValueError(f"Column name {col} malformed! Expected aggregations: 'sum', 'sum_sqd', 'min', 'max', 'value', 'first', 'present', 'count', 'has_values_count'.") def add_static_missing_cols( @@ -347,7 +347,7 @@ def setup_environment(cfg: DictConfig, load_data: bool = True): logger.info(f"Stored config: {stored_config}") logger.info(f"Worker config: {cfg}") assert cfg.keys() == stored_config.keys(), ( - f"Keys in stored config do not match current config.")`` + f"Keys in stored config do not match current config.") for key in cfg.keys(): assert key in stored_config, f"Key {key} not found in stored config." 
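(Aside: the stored-versus-worker config check being edited here boils down to a key-by-key comparison that tolerates per-worker fields. A self-contained sketch with made-up values:)

from omegaconf import OmegaConf

stored_cfg = OmegaConf.create({"window_sizes": ["1d"], "aggs": ["code/count"], "worker": 0})
worker_cfg = OmegaConf.create({"window_sizes": ["1d"], "aggs": ["code/count"], "worker": 3})

assert set(worker_cfg.keys()) == set(stored_cfg.keys()), "Config keys differ across workers."
for key in worker_cfg:
    if key == "worker":  # expected to vary between parallel workers
        continue
    assert worker_cfg[key] == stored_cfg[key], (
        f"Config key {key} differs: {worker_cfg[key]} vs {stored_cfg[key]}"
    )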
if key == "worker": From db18dc5fc1aca4f3b427e25be7145cebecd3df63 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 31 May 2024 14:55:45 +0000 Subject: [PATCH 037/106] cleaning --- scripts/xgboost_sweep.py | 190 +++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 89 deletions(-) diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index 9172f3a..f8f74b4 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -27,7 +27,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.static_data_path = self.data_path / "static" / split self._data_shards = [shard.stem for shard in list(self.static_data_path.glob("*.parquet"))] if cfg.iterator.keep_static_data_in_memory: - self._static_shards = self._get_static_shards() + self._static_shards = self._collect_static_shards_in_memory() self.codes_set, self.aggs_set, self.min_frequency_set, self.window_set = self._get_inclusion_sets() @@ -46,32 +46,26 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: codes_set = None aggs_set = None min_frequency_set = None + window_set = None + if self.cfg.codes is not None: codes_set = set(self.cfg.codes) + if self.cfg.aggs is not None: aggs_set = set(self.cfg.aggs) + if self.cfg.min_code_inclusion_frequency is not None: with open(self.data_path / "feature_freqs.json") as f: feature_freqs = json.load(f) min_frequency_set = { key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency } - window_set = set(self.cfg.window_sizes) + if self.cfg.window_sizes is not None: + window_set = set(self.cfg.window_sizes) return codes_set, aggs_set, min_frequency_set, window_set - def _load_static_shard_by_index(self, idx: int) -> sp.csc_matrix: - """Load a static shard into memory. - - Args: - - idx (int): Index of the shard to load. - - Returns: - - sp.csc_matrix: Sparse matrix with the static shard. - """ - return pd.read_parquet(self.static_data_path / f"{self._data_shards[int(idx)]}.parquet") - - def _get_static_shards(self) -> dict: + def _collect_static_shards_in_memory(self) -> dict: """Load static shards into memory. Returns: @@ -82,59 +76,16 @@ def _get_static_shards(self) -> dict: static_shards[iter] = self._load_static_shard_by_index(iter) return static_shards - def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: - """Make X and y as scipy sparse arrays for XGBoost. - - Args: - - df (pandas.DataFrame): Data frame to sparsify. - - Returns: - - tuple[scipy.sparse.csr_matrix, numpy.ndarray]: Tuple of feature data and labels. 
- """ - labels = df.loc[:, [col for col in df.columns if col.endswith("/task")]] - data = df.drop(columns=labels.columns) - for col in data.columns: - if not isinstance(data[col].dtype, pd.SparseDtype): - data[col] = pd.arrays.SparseArray(data[col]) - sparse_matrix = data.sparse.to_coo() - return csr_matrix(sparse_matrix), labels.values - - def _validate_shard_file_inclusion(self, file: Path) -> bool: - parts = file.relative_to(self.dynamic_data_path).parts - if not parts: - return False - - windows_part = parts[0] - aggs_part = "/".join(parts[1:-1]) - - return (self.window_set is None or windows_part in self.window_set) and ( - self.aggs_set is None or aggs_part in self.aggs_set - ) - - def _assert_correct_sorting(self, shard: pd.DataFrame): - """Assert that the shard is sorted correctly.""" - if "timestamp" in shard.columns: - sort_columns = ["patient_id", "timestamp"] - else: - sort_columns = ["patient_id"] - assert shard[sort_columns].equals(shard[sort_columns].sort_values(by=sort_columns)), ( - "Shard is not sorted on correctly. " - "Please ensure that the data is sorted on patient_id and timestamp, if applicable." - ) - - def _get_sparse_dynamic_shard_from_file(self, path: Path) -> pd.DataFrame: - """Load a sparse shard into memory. This returns a shard as a pandas dataframe, asserted that it is - sorted on patient id and timestamp, if included. + def _load_static_shard_by_index(self, idx: int) -> sp.csc_matrix: + """Load a static shard into memory. Args: - - path (Path): Path to the sparse shard. + - idx (int): Index of the shard to load. Returns: - - pd.DataFrame: Data frame with the sparse shard. + - sp.csc_matrix: Sparse matrix with the static shard. """ - shard = pd.read_pickle(path) - self._assert_correct_sorting(shard) - return shard.drop(columns=["patient_id", "timestamp"]) + return pd.read_parquet(self.static_data_path / f"{self._data_shards[int(idx)]}.parquet") def _get_static_shard_by_index(self, idx: int) -> pd.DataFrame: """Get the static shard from memory or disk. @@ -150,7 +101,7 @@ def _get_static_shard_by_index(self, idx: int) -> pd.DataFrame: else: return self._load_static_shard_by_index(self._data_shards[idx]) - def _get_task(self, idx: int) -> pd.DataFrame: + def _get_task_by_index(self, idx: int) -> pd.DataFrame: """Get the task data for a specific shard. Args: @@ -165,29 +116,32 @@ def _get_task(self, idx: int) -> pd.DataFrame: shard["label"] = np.random.randint(0, 2, shard.shape[0]) return shard[["patient_id", "timestamp", "label"]] - def _filter_df(self, df: pd.DataFrame) -> pd.DataFrame: - """Filter the dynamic data frame based on the inclusion sets. + def _load_dynamic_shard_from_file(self, path: Path) -> pd.DataFrame: + """Load a sparse shard into memory. This returns a shard as a pandas dataframe, asserted that it is + sorted on patient id and timestamp, if included. Args: - - df (pd.DataFrame): Data frame to filter. + - path (Path): Path to the sparse shard. Returns: - - pd.DataFrame: Filtered data frame. + - pd.DataFrame: Data frame with the sparse shard. 
""" - code_parts = ["/".join(col.split("/")[1:-2]) for col in df.columns] - frequency_parts = ["/".join(col.split("/")[1:-1]) for col in df.columns] + shard = pd.read_pickle(path) + self._assert_correct_sorting(shard) + return shard.drop(columns=["patient_id", "timestamp"]) - filtered_columns = [ - col - for col, code_part, freq_part in zip(df.columns, code_parts, frequency_parts) - if (self.codes_set is None or code_part in self.codes_set) - and (self.min_frequency_set is None or freq_part in self.min_frequency_set) - ] - filtered_columns.extend([col for col in df.columns if col.endswith("/task")]) + def _get_dynamic_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: + """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering + column inclusion.""" + files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl")) - return df[filtered_columns] + files = [file for file in files if self._filter_shard_files_on_window_and_aggs(file)] - def _load_dynamic_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: + dynamic_dfs = [self._load_dynamic_shard_from_file(file) for file in files] + dynamic_df = pd.concat(dynamic_dfs, axis=1) + return self._filter_shard_on_codes_and_freqs(dynamic_df) + + def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. Args: @@ -198,28 +152,84 @@ def _load_dynamic_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndar - y (numpy.ndarray): Labels. """ - files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl")) - - files = [file for file in files if self._validate_shard_file_inclusion(file)] - - dynamic_dfs = [self._get_sparse_dynamic_shard_from_file(file) for file in files] - dynamic_df = pd.concat(dynamic_dfs, axis=1) - dynamic_df = self._filter_df(dynamic_df) + dynamic_df = self._get_dynamic_shard_by_index(idx) # TODO: add in some type checking etc for safety static_df = self._get_static_shard_by_index(idx) - task_df = self._get_task(idx) + task_df = self._get_task_by_index(idx) task_df = task_df.rename( columns={col: f"{col}/task" for col in task_df.columns if col not in ["patient_id", "timestamp"]} ) df = pd.merge(task_df, static_df, on=["patient_id"], how="left") self._assert_correct_sorting(df) - df = self._filter_df(df) + df = self._filter_shard_on_codes_and_freqs(df) df = pd.concat([df, dynamic_df], axis=1) return self._sparsify_shard(df) + def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: + """Make X and y as scipy sparse arrays for XGBoost. + + Args: + - df (pandas.DataFrame): Data frame to sparsify. + + Returns: + - tuple[scipy.sparse.csr_matrix, numpy.ndarray]: Tuple of feature data and labels. 
+ """ + labels = df.loc[:, [col for col in df.columns if col.endswith("/task")]] + data = df.drop(columns=labels.columns) + for col in data.columns: + if not isinstance(data[col].dtype, pd.SparseDtype): + data[col] = pd.arrays.SparseArray(data[col]) + sparse_matrix = data.sparse.to_coo() + return csr_matrix(sparse_matrix), labels.values + + def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: + parts = file.relative_to(self.dynamic_data_path).parts + if not parts: + return False + + windows_part = parts[0] + aggs_part = "/".join(parts[1:-1]) + + return (self.window_set is None or windows_part in self.window_set) and ( + self.aggs_set is None or aggs_part in self.aggs_set + ) + + def _filter_shard_on_codes_and_freqs(self, df: pd.DataFrame) -> pd.DataFrame: + """Filter the dynamic data frame based on the inclusion sets. + + Args: + - df (pd.DataFrame): Data frame to filter. + + Returns: + - pd.DataFrame: Filtered data frame. + """ + code_parts = ["/".join(col.split("/")[1:-2]) for col in df.columns] + frequency_parts = ["/".join(col.split("/")[1:-1]) for col in df.columns] + + filtered_columns = [ + col + for col, code_part, freq_part in zip(df.columns, code_parts, frequency_parts) + if (self.codes_set is None or code_part in self.codes_set) + and (self.min_frequency_set is None or freq_part in self.min_frequency_set) + ] + filtered_columns.extend([col for col in df.columns if col.endswith("/task")]) + + return df[filtered_columns] + + def _assert_correct_sorting(self, shard: pd.DataFrame): + """Assert that the shard is sorted correctly.""" + if "timestamp" in shard.columns: + sort_columns = ["patient_id", "timestamp"] + else: + sort_columns = ["patient_id"] + assert shard[sort_columns].equals(shard[sort_columns].sort_values(by=sort_columns)), ( + "Shard is not sorted on correctly. " + "Please ensure that the data is sorted on patient_id and timestamp, if applicable." + ) + def next(self, input_data: Callable): """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost during the construction of ``DMatrix`` @@ -236,7 +246,7 @@ def next(self, input_data: Callable): # input_data is a function passed in by XGBoost who has the exact same signature of # ``DMatrix`` - X, y = self._load_dynamic_shard_by_index(self._it) # self._data_shards[self._it]) + X, y = self._get_shard_by_index(self._it) # self._data_shards[self._it]) input_data(data=X, label=y) self._it += 1 # Return 1 to let XGBoost know we haven't seen all the files yet. @@ -255,7 +265,7 @@ def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: X = [] y = [] for i in range(len(self._data_shards)): - X_, y_ = self._load_dynamic_shard_by_index(i) + X_, y_ = self._get_shard_by_index(i) X.append(X_) y.append(y_) @@ -288,6 +298,8 @@ def __init__(self, cfg: DictConfig): def train(self): """Train the model.""" self._build() + # TODO: add in eval, early stopping, etc. + # TODO: check for Nan and inf in labels! self.model = xgb.train( OmegaConf.to_container(self.cfg.model), self.dtrain ) # do we want eval and things? 
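(Aside: one way the eval/early-stopping TODO above could be filled in, sketched with toy data rather than the project's iterators; parameter values are illustrative.)

import numpy as np
import xgboost as xgb

X = np.random.rand(300, 8)
y = np.random.randint(0, 2, 300)
dtrain = xgb.DMatrix(X[:200], label=y[:200])
dval = xgb.DMatrix(X[200:], label=y[200:])

params = {"objective": "binary:logistic", "tree_method": "hist", "eta": 0.3}
booster = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, "train"), (dval, "tuning")],
    early_stopping_rounds=10,  # stop once the tuning metric stalls
    verbose_eval=False,
)
best_round = booster.best_iteration  # the round to keep when saving or predicting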
From abba3d25b47ee8b4595533b9516876c13fb5a598 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 31 May 2024 17:39:57 +0000 Subject: [PATCH 038/106] local WIP--changing to sparse matrix implementation --- configs/xgboost_sweep.yaml | 35 ++++---- scripts/xgboost_sweep.py | 175 ++++++++++++------------------------- 2 files changed, 72 insertions(+), 138 deletions(-) diff --git a/configs/xgboost_sweep.yaml b/configs/xgboost_sweep.yaml index 72ccac0..15b7391 100644 --- a/configs/xgboost_sweep.yaml +++ b/configs/xgboost_sweep.yaml @@ -1,42 +1,38 @@ # Raw data -base_dir: /storage/teya/fake -MEDS_cohort_dir: ${base_dir}/MEDS_cohort -tabularized_data_dir: ${base_dir}/flat_reps -model_dir: ${base_dir}/models +MEDS_cohort_dir: ??? +tabularized_data_dir: /storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +model_dir: /storage/teya/test/ # Pre-processing min_code_inclusion_frequency: 1 -window_sizes: [30d] +window_sizes: [1d] codes: null aggs: - "code/count" - "value/sum" +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + # Sharding -n_patients_per_sub_shard: 2 +n_patients_per_sub_shard: null # Misc -do_overwrite: True +do_overwrite: False do_update: True seed: 1 -tqdm: False +tqdm: True model: booster: gbtree device: cpu - nthread: 4 - max_depth: 6 - eta: 0.3 - gamma: 0 - subsample: 1 - lambda: 1 - alpha: 0 + epochs: 1 tree_method: hist - objective: reg:squaredlogerror + objective: binary:logistic iterator: - keep_data_in_memory: True - keep_static_data_in_memory: True + keep_data_in_memory: False + keep_static_data_in_memory: False # Hydra settings for sweep defaults: @@ -44,8 +40,7 @@ defaults: - override hydra/sweeper/sampler: tpe hydra: - mode: MULTIRUN - verbose: False + verbose: True sweep: dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} run: diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index f8f74b4..034a175 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -1,6 +1,7 @@ import json import os from collections.abc import Callable +from datetime import datetime from pathlib import Path import hydra @@ -8,8 +9,8 @@ import pandas as pd import scipy.sparse as sp import xgboost as xgb +from loguru import logger from omegaconf import DictConfig, OmegaConf -from scipy.sparse import csr_matrix from sklearn.metrics import mean_absolute_error @@ -25,11 +26,14 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.data_path = Path(cfg.tabularized_data_dir) self.dynamic_data_path = self.data_path / "ts" / split self.static_data_path = self.data_path / "static" / split - self._data_shards = [shard.stem for shard in list(self.static_data_path.glob("*.parquet"))] - if cfg.iterator.keep_static_data_in_memory: - self._static_shards = self._collect_static_shards_in_memory() + self._data_shards = [ + 0, + 1, + 2, + 3, + ] # [shard.stem for shard in list(self.static_data_path.glob("*.parquet"))] - self.codes_set, self.aggs_set, self.min_frequency_set, self.window_set = self._get_inclusion_sets() + self.codes_set, self.aggs_set, self.codes_mask = self._get_inclusion_sets() self._it = 0 @@ -37,7 +41,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # "cache" super().__init__(cache_prefix=os.path.join(".", "cache")) - def _get_inclusion_sets(self) -> tuple[set, set, set]: + def _get_inclusion_sets(self) -> tuple[set, set, np.array]: """Get the inclusion sets for codes and aggregations. 
Returns: @@ -48,58 +52,37 @@ def _get_inclusion_sets(self) -> tuple[set, set, set]: min_frequency_set = None window_set = None - if self.cfg.codes is not None: - codes_set = set(self.cfg.codes) - if self.cfg.aggs is not None: aggs_set = set(self.cfg.aggs) + if self.cfg.window_sizes is not None: + window_set = set(self.cfg.window_sizes) + + feature_columns = json.load(self.data_path / "feature_columns.json") + + if self.cfg.codes is not None: + codes_mask = np.zeros(len(feature_columns), dtype=bool) + codes_set = set(self.cfg.codes) + for code in codes_set: + codes_mask |= np.array([code in col for col in feature_columns]) + else: + codes_mask = np.ones(len(feature_columns), dtype=bool) + if self.cfg.min_code_inclusion_frequency is not None: + frequency_mask = np.zeros(len(feature_columns), dtype=bool) with open(self.data_path / "feature_freqs.json") as f: feature_freqs = json.load(f) min_frequency_set = { key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency } - if self.cfg.window_sizes is not None: - window_set = set(self.cfg.window_sizes) - - return codes_set, aggs_set, min_frequency_set, window_set - - def _collect_static_shards_in_memory(self) -> dict: - """Load static shards into memory. - - Returns: - - dict: Dictionary with shard names as keys and data frames as values. - """ - static_shards = {} - for iter in self._data_shards: - static_shards[iter] = self._load_static_shard_by_index(iter) - return static_shards - - def _load_static_shard_by_index(self, idx: int) -> sp.csc_matrix: - """Load a static shard into memory. - - Args: - - idx (int): Index of the shard to load. - - Returns: - - sp.csc_matrix: Sparse matrix with the static shard. - """ - return pd.read_parquet(self.static_data_path / f"{self._data_shards[int(idx)]}.parquet") + for code in min_frequency_set: + frequency_mask |= np.array([code in col for col in feature_columns]) + else: + frequency_mask = np.ones(len(feature_columns), dtype=bool) - def _get_static_shard_by_index(self, idx: int) -> pd.DataFrame: - """Get the static shard from memory or disk. + mask = codes_mask | frequency_mask - Args: - - shard_name (str): Name of the shard. - - Returns: - - pd.DataFrame: Data frame with the static shard. - """ - if self.cfg.iterator.keep_static_data_in_memory: - return self._static_shards[self._data_shards[idx]] - else: - return self._load_static_shard_by_index(self._data_shards[idx]) + return aggs_set, window_set, mask def _get_task_by_index(self, idx: int) -> pd.DataFrame: """Get the task data for a specific shard. @@ -116,30 +99,28 @@ def _get_task_by_index(self, idx: int) -> pd.DataFrame: shard["label"] = np.random.randint(0, 2, shard.shape[0]) return shard[["patient_id", "timestamp", "label"]] - def _load_dynamic_shard_from_file(self, path: Path) -> pd.DataFrame: - """Load a sparse shard into memory. This returns a shard as a pandas dataframe, asserted that it is - sorted on patient id and timestamp, if included. + def _load_dynamic_shard_from_file(self, path: Path) -> sp.csr_matrix: + """Load a sparse shard into memory. Args: - path (Path): Path to the sparse shard. Returns: - - pd.DataFrame: Data frame with the sparse shard. + - sp.coo_matrix: Data frame with the sparse shard. 
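(Aside: the boolean mask built above selects feature columns positionally, so it can be applied directly to a sparse shard's columns. A toy sketch with invented feature names and frequencies:)

import numpy as np
import scipy.sparse as sp

feature_columns = ["LAB/HR/value/sum", "LAB/HR/code/count", "MED/ASA/code/count"]
feature_freqs = {"LAB/HR": 42, "MED/ASA": 3}

codes = {"LAB/HR"}  # illustrative code filter
min_freq = 10

codes_mask = np.array([any(c in col for c in codes) for col in feature_columns])
freq_set = {k for k, v in feature_freqs.items() if v >= min_freq}
freq_mask = np.array([any(c in col for c in freq_set) for col in feature_columns])
mask = codes_mask | freq_mask  # keep a column if either filter admits it

X = sp.random(5, len(feature_columns), density=0.5, format="csr")
X_filtered = X[:, np.flatnonzero(mask)]  # column-slice the sparse shard with the mask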
""" - shard = pd.read_pickle(path) - self._assert_correct_sorting(shard) - return shard.drop(columns=["patient_id", "timestamp"]) + shard = np.load(path) # TODO: check this with nassim + self._filter_shard_on_codes_and_freqs(shard) + return shard - def _get_dynamic_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: + def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering column inclusion.""" - files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl")) + files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl")) files = [file for file in files if self._filter_shard_files_on_window_and_aggs(file)] - dynamic_dfs = [self._load_dynamic_shard_from_file(file) for file in files] - dynamic_df = pd.concat(dynamic_dfs, axis=1) - return self._filter_shard_on_codes_and_freqs(dynamic_df) + dynamic_coos = [sp.csc_matrix(self._load_dynamic_shard_from_file(file)) for file in files] + return sp.hstack(dynamic_coos) def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. @@ -148,42 +129,17 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: - idx (int): Index of the shard to load. Returns: - - X (scipy.sparse.csr_matrix): Feature data frame. + - X (scipy.sparse.csr_matrix): Feature data frame.ß - y (numpy.ndarray): Labels. """ - + time = datetime.now() dynamic_df = self._get_dynamic_shard_by_index(idx) - - # TODO: add in some type checking etc for safety - static_df = self._get_static_shard_by_index(idx) - + logger.debug(f"Dynamic data loading took {datetime.now() - time}") + time = datetime.now() task_df = self._get_task_by_index(idx) - task_df = task_df.rename( - columns={col: f"{col}/task" for col in task_df.columns if col not in ["patient_id", "timestamp"]} - ) - df = pd.merge(task_df, static_df, on=["patient_id"], how="left") - self._assert_correct_sorting(df) - df = self._filter_shard_on_codes_and_freqs(df) - df = pd.concat([df, dynamic_df], axis=1) + logger.debug(f"Task data loading took {datetime.now() - time}") - return self._sparsify_shard(df) - - def _sparsify_shard(self, df: pd.DataFrame) -> tuple[sp.csc_matrix, np.ndarray]: - """Make X and y as scipy sparse arrays for XGBoost. - - Args: - - df (pandas.DataFrame): Data frame to sparsify. - - Returns: - - tuple[scipy.sparse.csr_matrix, numpy.ndarray]: Tuple of feature data and labels. - """ - labels = df.loc[:, [col for col in df.columns if col.endswith("/task")]] - data = df.drop(columns=labels.columns) - for col in data.columns: - if not isinstance(data[col].dtype, pd.SparseDtype): - data[col] = pd.arrays.SparseArray(data[col]) - sparse_matrix = data.sparse.to_coo() - return csr_matrix(sparse_matrix), labels.values + return sp.csr_matrix(dynamic_df), task_df["label"].values def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: parts = file.relative_to(self.dynamic_data_path).parts @@ -197,38 +153,17 @@ def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: self.aggs_set is None or aggs_part in self.aggs_set ) - def _filter_shard_on_codes_and_freqs(self, df: pd.DataFrame) -> pd.DataFrame: - """Filter the dynamic data frame based on the inclusion sets. + def _filter_shard_on_codes_and_freqs(self, df: sp.coo_matrix) -> sp.sp.csr_matrix: + """Filter the dynamic data frame based on the inclusion sets. 
Given the codes_mask, filter the data + frame to only include columns that are True in the mask. Args: - - df (pd.DataFrame): Data frame to filter. + - df (scipy.sparse.coo_matrix): Data frame to filter. Returns: - - pd.DataFrame: Filtered data frame. + - df (scipy.sparse.sp.csr_matrix): Filtered data frame. """ - code_parts = ["/".join(col.split("/")[1:-2]) for col in df.columns] - frequency_parts = ["/".join(col.split("/")[1:-1]) for col in df.columns] - - filtered_columns = [ - col - for col, code_part, freq_part in zip(df.columns, code_parts, frequency_parts) - if (self.codes_set is None or code_part in self.codes_set) - and (self.min_frequency_set is None or freq_part in self.min_frequency_set) - ] - filtered_columns.extend([col for col in df.columns if col.endswith("/task")]) - - return df[filtered_columns] - - def _assert_correct_sorting(self, shard: pd.DataFrame): - """Assert that the shard is sorted correctly.""" - if "timestamp" in shard.columns: - sort_columns = ["patient_id", "timestamp"] - else: - sort_columns = ["patient_id"] - assert shard[sort_columns].equals(shard[sort_columns].sort_values(by=sort_columns)), ( - "Shard is not sorted on correctly. " - "Please ensure that the data is sorted on patient_id and timestamp, if applicable." - ) + return sp.csr_matrix(df)[:, self.codes_mask] def next(self, input_data: Callable): """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost @@ -240,6 +175,7 @@ def next(self, input_data: Callable): Returns: - int: 0 if end of iteration, 1 otherwise. """ + start_time = datetime.now() if self._it == len(self._data_shards): # return 0 to let XGBoost know this is the end of iteration return 0 @@ -250,13 +186,14 @@ def next(self, input_data: Callable): input_data(data=X, label=y) self._it += 1 # Return 1 to let XGBoost know we haven't seen all the files yet. + logger.debug(f"******** One iteration took {datetime.now() - start_time}") return 1 def reset(self): """Reset the iterator to its beginning.""" self._it = 0 - def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: + def collect_in_memory(self) -> tuple[sp.sp.csr_matrix, np.ndarray]: """Collect the data in memory. 
Returns: @@ -306,12 +243,14 @@ def train(self): def _build(self): """Build necessary data structures for training.""" + start_time = datetime.now() if self.keep_data_in_memory: self._build_iterators() self._build_dmatrix_in_memory() else: self._build_iterators() self._build_dmatrix_from_iterators() + logger.debug(f"Data loading took {datetime.now() - start_time}") def _build_dmatrix_in_memory(self): """Build the DMatrix from the data in memory.""" From 77f296f716848b6297c1619e99e2f240ea7f0b50 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Fri, 31 May 2024 18:50:00 +0000 Subject: [PATCH 039/106] added merging of static and time series data --- scripts/tabularize_merge.py | 135 ++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 scripts/tabularize_merge.py diff --git a/scripts/tabularize_merge.py b/scripts/tabularize_merge.py new file mode 100644 index 0000000..9261e00 --- /dev/null +++ b/scripts/tabularize_merge.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +"""Tabularizes time-series data in MEDS format into tabular representations.""" +from pathlib import Path + +import hydra +import numpy as np +import pandas as pd +import polars as pl +from loguru import logger +from omegaconf import DictConfig +from scipy.sparse import coo_matrix, csc_matrix, hstack + +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap +from MEDS_tabular_automl.utils import load_tqdm, setup_environment, write_df + + +def merge_dfs(feature_columns, static_df, ts_df): + """Merges static and time-series dataframes. + + This function merges the static and time-series dataframes based on the patient_id column. + + Args: + - feature_columns (List[str]): A list of feature columns to include in the merged dataframe. + - static_df (pd.DataFrame): A dataframe containing static features. + - ts_df (pd.DataFrame): A dataframe containing time-series features. + + Returns: + - pd.DataFrame: A merged dataframe containing static and time-series features. + """ + # Make static data sparse and merge it with the time-series data + logger.info("Make static data sparse and merge it with the time-series data") + static_df[static_df.columns[1:]] = ( + static_df[static_df.columns[1:]].fillna(0).astype(pd.SparseDtype("float64", fill_value=0)) + ) + merge_df = pd.merge(ts_df, static_df, on=["patient_id"], how="left") + # indexes_df = merge_df[["patient_id", "timestamp"]] + # drop indexes + merge_df = merge_df.drop(columns=["patient_id", "timestamp"]) + # TODO: fix naming convention, we are generating value rows with zero frequency so remove those + merge_df = merge_df.rename( + columns={ + c: "/".join(c.split("/")[1:-1]) for c in merge_df.columns if c.split("/")[-2] in ["code", "value"] + } + ) + + # Convert to sparse matrix and remove 0 frequency columns (i.e. columns not in feature_columns) + logger.info( + "Convert to sparse matrix and remove 0 frequency columns (i.e. 
columns not in feature_columns)" + ) + original_sparse_matrix = merge_df.sparse.to_coo() + missing_columns = [col for col in feature_columns if col not in merge_df.columns] + + # reorder columns to be in order of feature_columns + logger.info("Reorder columns to be in order of feature_columns") + final_sparse_matrix = hstack( + [original_sparse_matrix, coo_matrix((merge_df.shape[0], len(missing_columns)))] + ) + index_map = {name: index for index, name in enumerate(feature_columns)} + reverse_map = [index_map[col] for col in feature_columns] + final_sparse_matrix = coo_matrix(csc_matrix(final_sparse_matrix)[:, reverse_map]) + + # convert to np matrix of data, row, col + logger.info(f"Final sparse matrix shape: {final_sparse_matrix.shape}") + data, row, col = final_sparse_matrix.data, final_sparse_matrix.row, final_sparse_matrix.col + final_matrix = np.matrix([data, row, col]) + return final_matrix + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def tabularize_ts_data( + cfg: DictConfig, +): + """Processes a medical dataset to generates and stores flat representatiosn of time-series data. + + This function handles MEDS format data and pivots tables to create two types of data files + with patient_id and timestamp indexes: + code data: containing a column for every code and 1 and 0 values indicating presence + value data: containing a column for every code which the numerical value observed. + + Args: + cfg: configuration dictionary containing the necessary parameters for tabularizing the data. + """ + iter_wrapper = load_tqdm(cfg.tqdm) + flat_dir, split_to_fp, feature_columns = setup_environment(cfg, load_data=False) + med_dir = Path(cfg.tabularized_data_dir) + ts_dir = med_dir / "ts" + static_dir = med_dir / "static" + shard_fps = list(ts_dir.glob("*/*/*/*/*.pkl")) + + # Produce ts representation + out_subdir = flat_dir / "sparse" + + for shard_fp in iter_wrapper(shard_fps): + split = shard_fp.parent.parent.parent.parent.stem + in_ts_fp = shard_fp + assert in_ts_fp.exists(), f"{in_ts_fp} does not exist!" + in_static_fp = static_dir / split / f"{shard_fp.stem}.parquet" + assert in_static_fp.exists(), f"{in_static_fp} does not exist!" 
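
A minimal sketch of the column-alignment step in `merge_dfs` above: features missing from a shard are padded with an all-zero sparse block, and the columns are then permuted into the global `feature_columns` order. As written above, both `index_map` and `reverse_map` are built from `feature_columns`, which reduces to an identity permutation; the sketch below instead permutes from the shard's actual (present + missing) column order, which appears to be the intent. The column names here are invented for illustration.

```python
import numpy as np
from scipy.sparse import coo_matrix, csc_matrix, hstack

feature_columns = ["A/code", "B/code", "A/value", "C/value"]   # global column order
shard_columns = ["B/code", "A/value"]                          # columns present in this shard
shard = coo_matrix(np.array([[1.0, 0.0], [0.0, 2.5]]))

# Pad with all-zero columns for features this shard never observed.
missing = [c for c in feature_columns if c not in set(shard_columns)]
padded = hstack([shard, coo_matrix((shard.shape[0], len(missing)))])

# Permute from the shard's concatenated order (present + missing) to the global order.
current_order = shard_columns + missing
position = {name: i for i, name in enumerate(current_order)}
reorder = [position[name] for name in feature_columns]
aligned = csc_matrix(padded)[:, reorder]

assert aligned.shape == (2, len(feature_columns))
# Row 0 -> [0, 1, 0, 0]; row 1 -> [0, 0, 2.5, 0] in the global feature order.
```
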
+ out_fp = out_subdir / f"{shard_fp.stem}" + out_fp.parent.mkdir(parents=True, exist_ok=True) + + def read_fn(in_fps): + in_static_fp, in_ts_fp = in_fps + static_df = pl.read_parquet(in_static_fp) + ts_df = pd.read_pickle(in_ts_fp) + return [static_df, ts_df] + + def compute_fn(shards): + static_df, shard_df = shards + return merge_dfs( + feature_columns=feature_columns, + static_df=static_df.to_pandas(), + ts_df=shard_df, + ) + + def write_fn(data, out_df): + write_df(data, out_df, do_overwrite=cfg.do_overwrite) + + in_fps = in_static_fp, in_ts_fp + logger.info(f"Processing {in_static_fp} and\n{in_ts_fp}") + logger.info(f"Writing to {out_fp}...") + rwlock_wrap( + in_fps, + out_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + logger.info("Generated TS flat representations.") + + +if __name__ == "__main__": + tabularize_ts_data() From 958906d0862c2fc6c6298de05f04e431a90023b2 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Fri, 31 May 2024 19:18:28 +0000 Subject: [PATCH 040/106] merging script runs, but the output is 50GB --- scripts/e2e.sh | 26 +++++++++++++++++ scripts/hf_cohort_e2e.sh | 49 +++++++++++++++++++------------- scripts/tabularize_merge.py | 4 +-- src/MEDS_tabular_automl/utils.py | 8 ++++-- 4 files changed, 62 insertions(+), 25 deletions(-) create mode 100644 scripts/e2e.sh diff --git a/scripts/e2e.sh b/scripts/e2e.sh new file mode 100644 index 0000000..fd1ff60 --- /dev/null +++ b/scripts/e2e.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort +OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +N_PARALLEL_WORKERS="2" #"$3" + +# echo "Running identify_columns.py: Caching feature names and frequencies." +# POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ +# MEDS_cohort_dir=$MEDS_DIR \ +# tabularized_data_dir=$OUTPUT_DIR \ +# min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, 30d, 365d, full]" do_overwrite=True + +# echo "Running tabularize_static.py: tabularizing static data" +# POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ +# MEDS_cohort_dir=$MEDS_DIR \ +# tabularized_data_dir=$OUTPUT_DIR \ +# min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, 30d, 365d, full]" do_overwrite=True + +echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" +POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, 30d, 365d, full]" do_overwrite=True diff --git a/scripts/hf_cohort_e2e.sh b/scripts/hf_cohort_e2e.sh index 3c39ea5..e32c781 100644 --- a/scripts/hf_cohort_e2e.sh +++ b/scripts/hf_cohort_e2e.sh @@ -2,38 +2,47 @@ MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -N_PARALLEL_WORKERS="$1" +# N_PARALLEL_WORKERS="$1" WINDOW_SIZES="window_sizes=[1d]" AGGS="aggs=[code/count,value/sum]" # WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" # AGGS="aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" -echo "Running identify_columns.py: Caching feature names and frequencies." 
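
The `read_fn`/`compute_fn`/`write_fn` callbacks above are composed by `rwlock_wrap`; the real wrapper in `MEDS_tabular_automl.mapper` also handles lock files and overwrite semantics, which the hypothetical `toy_wrap` below omits. The sketch only shows the per-shard data flow.

```python
import tempfile
from pathlib import Path

def toy_wrap(in_fps, out_fp, read_fn, write_fn, compute_fn, do_overwrite=False):
    """Hypothetical stand-in for rwlock_wrap: read inputs, compute, write one output."""
    if out_fp.exists() and not do_overwrite:
        return                        # mirror do_overwrite=False behaviour
    shards = read_fn(in_fps)          # e.g. [static_df, ts_df]
    result = compute_fn(shards)       # e.g. merge_dfs(...) -> (data, row, col) matrix
    write_fn(result, out_fp)          # e.g. write_df(...)

with tempfile.TemporaryDirectory() as d:
    a, b = Path(d) / "static.txt", Path(d) / "ts.txt"
    a.write_text("static+")
    b.write_text("time-series")
    out = Path(d) / "merged.txt"
    toy_wrap(
        [a, b],
        out,
        read_fn=lambda fps: [fp.read_text() for fp in fps],
        write_fn=lambda data, fp: fp.write_text(data),
        compute_fn=lambda parts: "".join(parts),
    )
    assert out.read_text() == "static+time-series"
```
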
-rm -rf $OUTPUT_DIR -POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" +# echo "Running identify_columns.py: Caching feature names and frequencies." +# rm -rf $OUTPUT_DIR +# POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ +# MEDS_cohort_dir=$MEDS_DIR \ +# tabularized_data_dir=$OUTPUT_DIR \ +# min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" -echo "Running tabularize_static.py: tabularizing static data" -POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 $WINDOW_SIZES do_overwrite=False $AGGS +# echo "Running tabularize_static.py: tabularizing static data" +# POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ +# MEDS_cohort_dir=$MEDS_DIR \ +# tabularized_data_dir=$OUTPUT_DIR \ +# min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" -# echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" +# # echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" +# # POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ +# # --multirun \ +# # worker="range(0,$N_PARALLEL_WORKERS)" \ +# # hydra/launcher=joblib \ +# # MEDS_cohort_dir=$MEDS_DIR \ +# # tabularized_data_dir=$OUTPUT_DIR \ +# # min_code_inclusion_frequency=1 do_overwrite=False \ +# # "$WINDOW_SIZES" "$AGGS" + +# echo "Running summarize_over_windows.py" # POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=joblib \ # MEDS_cohort_dir=$MEDS_DIR \ # tabularized_data_dir=$OUTPUT_DIR \ # min_code_inclusion_frequency=1 do_overwrite=False \ -# $WINDOW_SIZES $AGGS +# "$WINDOW_SIZES" "$AGGS" + -echo "Running summarize_over_windows.py" -POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ +echo "Running tabularize_merge.py" +rm -r "$OUTPUT_DIR/sparse" +POLARS_MAX_THREADS=10 python /home/nassim/projects/MEDS_Tabular_AutoML/scripts/tabularize_merge.py \ MEDS_cohort_dir=$MEDS_DIR \ tabularized_data_dir=$OUTPUT_DIR \ min_code_inclusion_frequency=1 do_overwrite=False \ - $WINDOW_SIZES $AGGS + "$WINDOW_SIZES" "$AGGS" diff --git a/scripts/tabularize_merge.py b/scripts/tabularize_merge.py index 9261e00..15be7c3 100644 --- a/scripts/tabularize_merge.py +++ b/scripts/tabularize_merge.py @@ -91,12 +91,12 @@ def tabularize_ts_data( out_subdir = flat_dir / "sparse" for shard_fp in iter_wrapper(shard_fps): - split = shard_fp.parent.parent.parent.parent.stem + split = shard_fp.parts[-5] in_ts_fp = shard_fp assert in_ts_fp.exists(), f"{in_ts_fp} does not exist!" in_static_fp = static_dir / split / f"{shard_fp.stem}.parquet" assert in_static_fp.exists(), f"{in_static_fp} does not exist!" 
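
The `split = shard_fp.parts[-5]` fix above relies on the time-series shard layout `ts/<split>/<window>/<agg family>/<agg name>/<shard>.pkl`, with the sparse outputs mirroring the same subdirectories. A small illustration with an invented path:

```python
from pathlib import Path

shard_fp = Path("processed/tabularize/ts/train/30d/value/sum/0.pkl")  # illustrative path
split = shard_fp.parts[-5]                          # "train"
mirrored = "/".join(shard_fp.parts[-5:-1])          # "train/30d/value/sum"
out_fp = Path("processed/tabularize/sparse") / mirrored / shard_fp.stem

assert split == "train"
assert out_fp.as_posix() == "processed/tabularize/sparse/train/30d/value/sum/0"
```
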
- out_fp = out_subdir / f"{shard_fp.stem}" + out_fp = out_subdir / "/".join(shard_fp.parts[-5:-1]) / f"{shard_fp.stem}" out_fp.parent.mkdir(parents=True, exist_ok=True) def read_fn(in_fps): diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 538db88..92027c4 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -9,6 +9,7 @@ from collections.abc import Mapping from pathlib import Path +import numpy as np import pandas as pd import polars as pl import polars.selectors as cs @@ -60,6 +61,8 @@ def write_df(df: DF_T, fp: Path, **kwargs): f"Expected DataFrame to have columns ['patient_id', 'timestamp'], got {df.columns[:2]}" ) df.to_pickle(fp) + elif isinstance(df, np.matrix): + np.save(fp, df) else: raise ValueError(f"Unsupported type for df: {type(df)}") @@ -77,7 +80,7 @@ def get_static_col_dtype(col: str) -> pl.DataType: case "count" | "has_values_count": return pl.UInt32 case _: - raise ValueError(f"Column name {col} malformed! Expected aggregations: 'sum', 'sum_sqd', 'min', 'max', 'value', 'first', 'present', 'count', 'has_values_count'.") + raise ValueError(f"Column name {col} malformed!") def add_static_missing_cols( @@ -346,8 +349,7 @@ def setup_environment(cfg: DictConfig, load_data: bool = True): stored_config = OmegaConf.create(yaml_config) logger.info(f"Stored config: {stored_config}") logger.info(f"Worker config: {cfg}") - assert cfg.keys() == stored_config.keys(), ( - f"Keys in stored config do not match current config.") + assert cfg.keys() == stored_config.keys(), "Keys in stored config do not match current config." for key in cfg.keys(): assert key in stored_config, f"Key {key} not found in stored config." if key == "worker": From 7668382cfc9a861cdb6b3d64a4c17284866a5e10 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Fri, 31 May 2024 20:11:53 +0000 Subject: [PATCH 041/106] merging script works and is efficient --- scripts/tabularize_merge.py | 44 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/scripts/tabularize_merge.py b/scripts/tabularize_merge.py index 15be7c3..eef9f59 100644 --- a/scripts/tabularize_merge.py +++ b/scripts/tabularize_merge.py @@ -8,7 +8,7 @@ import polars as pl from loguru import logger from omegaconf import DictConfig -from scipy.sparse import coo_matrix, csc_matrix, hstack +from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, hstack from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import load_tqdm, setup_environment, write_df @@ -29,31 +29,45 @@ def merge_dfs(feature_columns, static_df, ts_df): """ # Make static data sparse and merge it with the time-series data logger.info("Make static data sparse and merge it with the time-series data") - static_df[static_df.columns[1:]] = ( - static_df[static_df.columns[1:]].fillna(0).astype(pd.SparseDtype("float64", fill_value=0)) + assert static_df.patient_id.is_monotonic_increasing + assert ts_df.patient_id.is_monotonic_increasing + sparse_time_series = ts_df.drop(columns=["patient_id", "timestamp"]).sparse.to_coo() + duplication_index = ts_df["patient_id"].value_counts().sort_index() + + # load static data as sparse matrix + static_matrix = static_df.drop(columns="patient_id").values + data_list = [] + rows = [] + cols = [] + for row in range(static_matrix.shape[0]): + for col in range(static_matrix.shape[1]): + data = static_matrix[row, col] + if (data is not None) and (data != 0): + data_list.append(data) + rows.append(row) + 
cols.append(col) + static_matrix = csr_matrix( + (data_list, (rows, cols)), shape=(static_matrix.shape[0], static_matrix.shape[1]) ) - merge_df = pd.merge(ts_df, static_df, on=["patient_id"], how="left") - # indexes_df = merge_df[["patient_id", "timestamp"]] - # drop indexes - merge_df = merge_df.drop(columns=["patient_id", "timestamp"]) + duplication_index = ts_df["patient_id"].value_counts().sort_index().reset_index(drop=True) + reindex_slices = np.repeat(duplication_index.index.values, duplication_index.values) + static_matrix = static_matrix[reindex_slices, :] + # TODO: fix naming convention, we are generating value rows with zero frequency so remove those - merge_df = merge_df.rename( - columns={ - c: "/".join(c.split("/")[1:-1]) for c in merge_df.columns if c.split("/")[-2] in ["code", "value"] - } - ) + ts_columns = ["/".join(c.split("/")[1:-1]) for c in ts_df.columns] + sparse_columns = ts_columns + list(static_df.columns) # Convert to sparse matrix and remove 0 frequency columns (i.e. columns not in feature_columns) logger.info( "Convert to sparse matrix and remove 0 frequency columns (i.e. columns not in feature_columns)" ) - original_sparse_matrix = merge_df.sparse.to_coo() - missing_columns = [col for col in feature_columns if col not in merge_df.columns] + set_sparse_cols = set(sparse_columns) + missing_columns = [col for col in feature_columns if col not in set_sparse_cols] # reorder columns to be in order of feature_columns logger.info("Reorder columns to be in order of feature_columns") final_sparse_matrix = hstack( - [original_sparse_matrix, coo_matrix((merge_df.shape[0], len(missing_columns)))] + [sparse_time_series, static_matrix, coo_matrix((sparse_time_series.shape[0], len(missing_columns)))] ) index_map = {name: index for index, name in enumerate(feature_columns)} reverse_map = [index_map[col] for col in feature_columns] From b6b8d43166ac1c8bf86fd071693bddf16d257219 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Fri, 31 May 2024 23:22:31 +0000 Subject: [PATCH 042/106] fixed bug with sparse matrix shape being too small for merging static and time series dataframs --- scripts/tabularize_merge.py | 14 +++--- tests/test_tabularize.py | 89 ++++++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 33 deletions(-) diff --git a/scripts/tabularize_merge.py b/scripts/tabularize_merge.py index eef9f59..084ce58 100644 --- a/scripts/tabularize_merge.py +++ b/scripts/tabularize_merge.py @@ -27,12 +27,15 @@ def merge_dfs(feature_columns, static_df, ts_df): Returns: - pd.DataFrame: A merged dataframe containing static and time-series features. 
""" + # TODO - store static and ts data as numpy matrices + # TODO - Eventually do this duplication at the task specific stage after filtering patients and features # Make static data sparse and merge it with the time-series data logger.info("Make static data sparse and merge it with the time-series data") assert static_df.patient_id.is_monotonic_increasing assert ts_df.patient_id.is_monotonic_increasing sparse_time_series = ts_df.drop(columns=["patient_id", "timestamp"]).sparse.to_coo() - duplication_index = ts_df["patient_id"].value_counts().sort_index() + + num_patients = max(static_df.patient_id.nunique(), ts_df.patient_id.nunique()) # load static data as sparse matrix static_matrix = static_df.drop(columns="patient_id").values @@ -46,9 +49,8 @@ def merge_dfs(feature_columns, static_df, ts_df): data_list.append(data) rows.append(row) cols.append(col) - static_matrix = csr_matrix( - (data_list, (rows, cols)), shape=(static_matrix.shape[0], static_matrix.shape[1]) - ) + static_matrix = csr_matrix((data_list, (rows, cols)), shape=(num_patients, static_matrix.shape[1])) + # Duplicate static matrix rows to match time-series data duplication_index = ts_df["patient_id"].value_counts().sort_index().reset_index(drop=True) reindex_slices = np.repeat(duplication_index.index.values, duplication_index.values) static_matrix = static_matrix[reindex_slices, :] @@ -81,7 +83,7 @@ def merge_dfs(feature_columns, static_df, ts_df): @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def tabularize_ts_data( +def merge_data( cfg: DictConfig, ): """Processes a medical dataset to generates and stores flat representatiosn of time-series data. @@ -146,4 +148,4 @@ def write_fn(data, out_df): if __name__ == "__main__": - tabularize_ts_data() + merge_data() diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index cb6703d..3f4d46e 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -15,6 +15,7 @@ from scripts.identify_columns import store_columns from scripts.summarize_over_windows import summarize_ts_data_over_windows +from scripts.tabularize_merge import merge_data from scripts.tabularize_static import tabularize_static_data from scripts.tabularize_ts import tabularize_ts_data @@ -102,6 +103,60 @@ "tuning/0": MEDS_TUNING_0, } +SUMMARIZE_EXPECTED_FILES = [ + "train/365d/value/sum/0.pkl", + "train/365d/value/sum/1.pkl", + "train/365d/code/count/0.pkl", + "train/365d/code/count/1.pkl", + "train/full/value/sum/0.pkl", + "train/full/value/sum/1.pkl", + "train/full/code/count/0.pkl", + "train/full/code/count/1.pkl", + "train/30d/value/sum/0.pkl", + "train/30d/value/sum/1.pkl", + "train/30d/code/count/0.pkl", + "train/30d/code/count/1.pkl", + "held_out/365d/value/sum/0.pkl", + "held_out/365d/code/count/0.pkl", + "held_out/full/value/sum/0.pkl", + "held_out/full/code/count/0.pkl", + "held_out/30d/value/sum/0.pkl", + "held_out/30d/code/count/0.pkl", + "tuning/365d/value/sum/0.pkl", + "tuning/365d/code/count/0.pkl", + "tuning/full/value/sum/0.pkl", + "tuning/full/code/count/0.pkl", + "tuning/30d/value/sum/0.pkl", + "tuning/30d/code/count/0.pkl", +] + +MERGE_EXPECTED_FILES = [ + "train/365d/value/sum/0.npy", + "train/365d/value/sum/1.npy", + "train/365d/code/count/0.npy", + "train/365d/code/count/1.npy", + "train/full/value/sum/0.npy", + "train/full/value/sum/1.npy", + "train/full/code/count/0.npy", + "train/full/code/count/1.npy", + "train/30d/value/sum/0.npy", + "train/30d/value/sum/1.npy", + "train/30d/code/count/0.npy", + "train/30d/code/count/1.npy", + 
"held_out/365d/value/sum/0.npy", + "held_out/365d/code/count/0.npy", + "held_out/full/value/sum/0.npy", + "held_out/full/code/count/0.npy", + "held_out/30d/value/sum/0.npy", + "held_out/30d/code/count/0.npy", + "tuning/365d/value/sum/0.npy", + "tuning/365d/code/count/0.npy", + "tuning/full/value/sum/0.npy", + "tuning/full/code/count/0.npy", + "tuning/30d/value/sum/0.npy", + "tuning/30d/code/count/0.npy", +] + def test_tabularize(): with tempfile.TemporaryDirectory() as d: @@ -177,33 +232,13 @@ def test_tabularize(): # confirm summary files exist: output_files = list(tabularized_data_dir.glob("ts/*/*/*/*/*.pkl")) actual_files = [str(Path(*f.parts[-5:])) for f in output_files] - expected_files = [ - "train/365d/value/sum/0.pkl", - "train/365d/value/sum/1.pkl", - "train/365d/code/count/0.pkl", - "train/365d/code/count/1.pkl", - "train/full/value/sum/0.pkl", - "train/full/value/sum/1.pkl", - "train/full/code/count/0.pkl", - "train/full/code/count/1.pkl", - "train/30d/value/sum/0.pkl", - "train/30d/value/sum/1.pkl", - "train/30d/code/count/0.pkl", - "train/30d/code/count/1.pkl", - "held_out/365d/value/sum/0.pkl", - "held_out/365d/code/count/0.pkl", - "held_out/full/value/sum/0.pkl", - "held_out/full/code/count/0.pkl", - "held_out/30d/value/sum/0.pkl", - "held_out/30d/code/count/0.pkl", - "tuning/365d/value/sum/0.pkl", - "tuning/365d/code/count/0.pkl", - "tuning/full/value/sum/0.pkl", - "tuning/full/code/count/0.pkl", - "tuning/30d/value/sum/0.pkl", - "tuning/30d/code/count/0.pkl", - ] - assert set(actual_files) == set(expected_files) + + assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) for f in output_files: df = pd.read_pickle(f) assert df.shape[0] > 0 + + merge_data(cfg) + output_files = list(tabularized_data_dir.glob("sparse/*/*/*/*/*.npy")) + actual_files = [str(Path(*f.parts[-5:])) for f in output_files] + assert set(actual_files) == set(MERGE_EXPECTED_FILES) From e6a88a7fb341a6e081cd3374a4b13491a0d9a1f1 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Sat, 1 Jun 2024 00:56:03 +0000 Subject: [PATCH 043/106] changed to sparse format --- configs/xgboost_sweep.yaml | 3 +- scripts/xgboost_sweep.py | 185 +++++++++++++++++++++++-------------- 2 files changed, 115 insertions(+), 73 deletions(-) diff --git a/configs/xgboost_sweep.yaml b/configs/xgboost_sweep.yaml index 15b7391..f5f2275 100644 --- a/configs/xgboost_sweep.yaml +++ b/configs/xgboost_sweep.yaml @@ -32,7 +32,6 @@ model: iterator: keep_data_in_memory: False - keep_static_data_in_memory: False # Hydra settings for sweep defaults: @@ -40,7 +39,7 @@ defaults: - override hydra/sweeper/sampler: tpe hydra: - verbose: True + verbose: False sweep: dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} run: diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index 034a175..2caac22 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -6,7 +6,6 @@ import hydra import numpy as np -import pandas as pd import scipy.sparse as sp import xgboost as xgb from loguru import logger @@ -24,16 +23,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"): """ self.cfg = cfg self.data_path = Path(cfg.tabularized_data_dir) - self.dynamic_data_path = self.data_path / "ts" / split - self.static_data_path = self.data_path / "static" / split - self._data_shards = [ - 0, - 1, - 2, - 3, - ] # [shard.stem for shard in list(self.static_data_path.glob("*.parquet"))] - - self.codes_set, self.aggs_set, self.codes_mask = self._get_inclusion_sets() + self.dynamic_data_path = self.data_path / "sparse" / split + 
self.label_data_path = self.data_path / "task" / split + self._data_shards = [4] # [shard.stem for shard in list(self.static_data_path.glob("*."))] + # TODO: need to fix this path/logic + self.window_set, self.aggs_set, self.codes_set = self._get_inclusion_sets() self._it = 0 @@ -41,65 +35,78 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # "cache" super().__init__(cache_prefix=os.path.join(".", "cache")) - def _get_inclusion_sets(self) -> tuple[set, set, np.array]: - """Get the inclusion sets for codes and aggregations. - - Returns: - - tuple[set, set, set]: Tuple of sets for codes, aggregations, and minimum code frequency. - """ - codes_set = None - aggs_set = None - min_frequency_set = None - window_set = None - - if self.cfg.aggs is not None: - aggs_set = set(self.cfg.aggs) - - if self.cfg.window_sizes is not None: - window_set = set(self.cfg.window_sizes) - - feature_columns = json.load(self.data_path / "feature_columns.json") - + def _get_code_set(self) -> set: + """Get the set of codes to include in the data based on the configuration.""" + with open(self.data_path / "feature_columns.json") as f: + feature_columns = json.load(f) + feature_dict = {col: i for i, col in enumerate(feature_columns)} if self.cfg.codes is not None: - codes_mask = np.zeros(len(feature_columns), dtype=bool) - codes_set = set(self.cfg.codes) - for code in codes_set: - codes_mask |= np.array([code in col for col in feature_columns]) - else: - codes_mask = np.ones(len(feature_columns), dtype=bool) + codes_set = {feature_dict[code] for code in set(self.cfg.codes) if code in feature_dict} if self.cfg.min_code_inclusion_frequency is not None: - frequency_mask = np.zeros(len(feature_columns), dtype=bool) with open(self.data_path / "feature_freqs.json") as f: feature_freqs = json.load(f) min_frequency_set = { key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency } - for code in min_frequency_set: - frequency_mask |= np.array([code in col for col in feature_columns]) + frequency_set = {feature_dict[code] for code in min_frequency_set if code in feature_dict} + + if self.cfg.codes is not None and self.cfg.min_code_inclusion_frequency is not None: + codes_set = codes_set.intersection(frequency_set) + elif self.cfg.codes is not None: + codes_set = codes_set + elif self.cfg.min_code_inclusion_frequency is not None: + codes_set = frequency_set else: - frequency_mask = np.ones(len(feature_columns), dtype=bool) + codes_set = None # set(feature_columns) + return codes_set - mask = codes_mask | frequency_mask + def _get_inclusion_sets(self) -> tuple[set, set, np.array]: + """Get the inclusion sets for aggregations, window sizes, and a mask for minimum code frequency. - return aggs_set, window_set, mask + Returns: + - Tuple[Optional[Set[str]], Optional[Set[str]], np.ndarray]: Tuple containing: + - Set of aggregations. + - Set of window sizes. + - Boolean array mask indicating which feature columns meet the inclusion criteria. + + Examples: + >>> import tempfile + >>> from types import SimpleNamespace + >>> cfg = SimpleNamespace( + ... aggs=["code/count", "value/sum"], + ... window_sizes=None, + ... codes=["code1", "code2", "value1"], + ... min_code_inclusion_frequency=2 + ... ) + >>> with tempfile.TemporaryDirectory() as tempdir: + ... data_path = Path(tempdir) + ... cfg.tabularized_data_dir = str(data_path) + ... feature_columns = ["code1/code", "code2/code", "value1/value"] + ... feature_freqs = {"code1": 3, "code2": 1, "value1": 5} + ... 
with open(data_path / "feature_columns.json", "w") as f: + ... json.dump(feature_columns, f) + ... with open(data_path / "feature_freqs.json", "w") as f: + ... json.dump(feature_freqs, f) + ... iterator = Iterator(cfg) + ... aggs_set, window_set, mask = iterator._get_inclusion_sets() + ... assert aggs_set == {"code/count", "value/sum"} + ... assert window_set == None + ... assert np.array_equal(mask, [True, False, True]) + """ - def _get_task_by_index(self, idx: int) -> pd.DataFrame: - """Get the task data for a specific shard. + window_set = None + aggs_set = None - Args: - - idx (int): Index of the shard. + if self.cfg.aggs is not None: + aggs_set = set(self.cfg.aggs) - Returns: - - pd.DataFrame: Data frame with the task data. - """ - # TODO: replace with something real - file = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl"))[0] - shard = pd.read_pickle(file) - shard["label"] = np.random.randint(0, 2, shard.shape[0]) - return shard[["patient_id", "timestamp", "label"]] + if self.cfg.window_sizes is not None: + window_set = set(self.cfg.window_sizes) - def _load_dynamic_shard_from_file(self, path: Path) -> sp.csr_matrix: + return aggs_set, window_set, self._get_code_set() + + def _load_dynamic_shard_from_file(self, path: Path) -> sp.csc_matrix: """Load a sparse shard into memory. Args: @@ -107,20 +114,56 @@ def _load_dynamic_shard_from_file(self, path: Path) -> sp.csr_matrix: Returns: - sp.coo_matrix: Data frame with the sparse shard. + >>> import tempfile + >>> from types import SimpleNamespace + >>> with tempfile.TemporaryDirectory() as tempdir: + ... sample_shard_path = Path(tempdir) / "sample_shard.npy" + ... sample_shard_data = np.array([[0, 1, 0], + ... [1, 0, 1], + ... [0, 1, 0]]) + ... sample_filtered_data = np.array([[1, 0], + ... [0, 1], + ... [1, 0]]) + ... np.save(sample_shard_path, sample_shard_data) + ... cfg = SimpleNamespace( + ... aggs=None, + ... window_sizes=None, + ... codes=None, + ... min_code_inclusion_frequency=None, + ... tabularized_data_dir=Path(tempdir) + ... ) + ... feature_columns = ["code1/code", "code2/code", "value1/value"] + ... with open(Path(tempdir) / "feature_columns.json", "w") as f: + ... json.dump(feature_columns, f) + ... iterator_instance = Iterator(cfg) + ... iterator_instance.codes_mask = np.array([False, True, True]) + ... loaded_shard = iterator_instance._load_dynamic_shard_from_file(sample_shard_path) + ... assert isinstance(loaded_shard, sp.csc_matrix) + ... expected_csc = sp.csc_matrix(sample_filtered_data) + ... assert sp.issparse(loaded_shard) + ... assert np.array_equal(loaded_shard.data, expected_csc.data) + ... assert np.array_equal(loaded_shard.indices, expected_csc.indices) + ... assert np.array_equal(loaded_shard.indptr, expected_csc.indptr) """ - shard = np.load(path) # TODO: check this with nassim - self._filter_shard_on_codes_and_freqs(shard) - return shard + # column_shard is of form event_idx, feature_idx, value + column_shard = np.load(path).T # TODO: Fix this!!! 
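
A round-trip sketch of the `(data, row, col)` serialization this loader consumes: the merge step stores `np.matrix([data, row, col])` with `np.save`, and the loader transposes it back and rebuilds a sparse matrix. Because the triplet is saved as a single float matrix, the indices come back as floats, so the sketch casts them to integers before constructing the matrix; the shapes here are toy values, not the cohort's.

```python
import numpy as np
import scipy.sparse as sp

dense = np.array([[0.0, 3.0, 0.0],
                  [1.5, 0.0, 0.0]])
coo = sp.coo_matrix(dense)

# Writer side (as in merge_dfs): stack the triplet and persist it.
triplet = np.matrix([coo.data, coo.row, coo.col])   # shape (3, nnz)
# np.save(out_fp, triplet)

# Reader side: transpose back to (nnz, 3) and rebuild the sparse matrix.
column_shard = np.asarray(triplet).T
data = column_shard[:, 0]
rows = column_shard[:, 1].astype(np.int32)
cols = column_shard[:, 2].astype(np.int32)
rebuilt = sp.csc_matrix((data, (rows, cols)), shape=dense.shape)

assert (rebuilt.toarray() == dense).all()
```
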
+ shard = sp.csc_matrix( + (column_shard[:, 0], (column_shard[:, 1], column_shard[:, 2])), + shape=( + max(column_shard[:, 1].astype(np.int32) + 1), + max(column_shard[:, 2].astype(np.int32) + 1), + ), + ) + return self._filter_shard_on_codes_and_freqs(shard) - def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: + def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering column inclusion.""" - files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.pkl")) - files = [file for file in files if self._filter_shard_files_on_window_and_aggs(file)] - - dynamic_coos = [sp.csc_matrix(self._load_dynamic_shard_from_file(file)) for file in files] - return sp.hstack(dynamic_coos) + files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.npy")) + files = sorted([file for file in files if self._filter_shard_files_on_window_and_aggs(file)]) + dynamic_cscs = [self._load_dynamic_shard_from_file(file) for file in files] + return sp.hstack(dynamic_cscs).tocsr()[self._valid_event_ids[idx], :] def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. @@ -136,10 +179,10 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: dynamic_df = self._get_dynamic_shard_by_index(idx) logger.debug(f"Dynamic data loading took {datetime.now() - time}") time = datetime.now() - task_df = self._get_task_by_index(idx) + label_df = self._get_label_by_index(idx) logger.debug(f"Task data loading took {datetime.now() - time}") - return sp.csr_matrix(dynamic_df), task_df["label"].values + return dynamic_df, label_df["label"].values def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: parts = file.relative_to(self.dynamic_data_path).parts @@ -153,7 +196,7 @@ def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: self.aggs_set is None or aggs_part in self.aggs_set ) - def _filter_shard_on_codes_and_freqs(self, df: sp.coo_matrix) -> sp.sp.csr_matrix: + def _filter_shard_on_codes_and_freqs(self, df: sp.csc_matrix) -> sp.csc_matrix: """Filter the dynamic data frame based on the inclusion sets. Given the codes_mask, filter the data frame to only include columns that are True in the mask. @@ -163,7 +206,9 @@ def _filter_shard_on_codes_and_freqs(self, df: sp.coo_matrix) -> sp.sp.csr_matri Returns: - df (scipy.sparse.sp.csr_matrix): Filtered data frame. """ - return sp.csr_matrix(df)[:, self.codes_mask] + if self.codes_set is None: + return df + return df[:, list({index for index in self.codes_set if index < df.shape[1]})] def next(self, input_data: Callable): """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost @@ -193,7 +238,7 @@ def reset(self): """Reset the iterator to its beginning.""" self._it = 0 - def collect_in_memory(self) -> tuple[sp.sp.csr_matrix, np.ndarray]: + def collect_in_memory(self) -> tuple[sp.coo_matrix, np.ndarray]: """Collect the data in memory. 
Returns: @@ -243,14 +288,12 @@ def train(self): def _build(self): """Build necessary data structures for training.""" - start_time = datetime.now() if self.keep_data_in_memory: self._build_iterators() self._build_dmatrix_in_memory() else: self._build_iterators() self._build_dmatrix_from_iterators() - logger.debug(f"Data loading took {datetime.now() - start_time}") def _build_dmatrix_in_memory(self): """Build the DMatrix from the data in memory.""" From e8d64fd14dbe21697975dd898facd3b498698cf1 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 1 Jun 2024 01:30:25 +0000 Subject: [PATCH 044/106] added script for extracting tasks using aces --- hf_cohort/aces_task.sh | 13 ++++++ hf_cohort/aces_task_extraction.py | 51 +++++++++++++++++++++++ hf_cohort/config.yaml | 21 ++++++++++ {scripts => hf_cohort}/hf_cohort_e2e.sh | 0 {scripts => hf_cohort}/hf_cohort_shard.sh | 0 hf_cohort/task.yaml | 21 ++++++++++ scripts/e2e.sh | 26 ------------ 7 files changed, 106 insertions(+), 26 deletions(-) create mode 100644 hf_cohort/aces_task.sh create mode 100644 hf_cohort/aces_task_extraction.py create mode 100644 hf_cohort/config.yaml rename {scripts => hf_cohort}/hf_cohort_e2e.sh (100%) rename {scripts => hf_cohort}/hf_cohort_shard.sh (100%) create mode 100644 hf_cohort/task.yaml delete mode 100644 scripts/e2e.sh diff --git a/hf_cohort/aces_task.sh b/hf_cohort/aces_task.sh new file mode 100644 index 0000000..c8dcf5d --- /dev/null +++ b/hf_cohort/aces_task.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort +OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +# N_PARALLEL_WORKERS="$1" +WINDOW_SIZES="window_sizes=[1d]" +AGGS="aggs=[code/count,value/sum]" + +python /home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/aces_task_extraction.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 do_overwrite=False \ + "$WINDOW_SIZES" "$AGGS" diff --git a/hf_cohort/aces_task_extraction.py b/hf_cohort/aces_task_extraction.py new file mode 100644 index 0000000..6b86af1 --- /dev/null +++ b/hf_cohort/aces_task_extraction.py @@ -0,0 +1,51 @@ +""" +Setup Conda environment as described here: https://github.com/justin13601/ACES +""" +from pathlib import Path + +import hydra +import polars as pl +from aces import config, predicates, query +from tqdm import tqdm + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def main(cfg): + # create task configuration object + task_cfg = config.TaskExtractorConfig.load(config_path="hf_cohort/task.yaml") + + # setup directories + med_dir = Path(cfg.tabularized_data_dir) + + # location of MEDS format Data + cohort_dir = med_dir.parent / "final_cohort" + # output directory for tables with event_ids and labels + output_dir = med_dir / "task" + + shard_fps = list(cohort_dir.glob("*/*.parquet")) + + for in_fp in tqdm(shard_fps): + out_fp = output_dir / "/".join(in_fp.parts[-2:]) + out_fp.parent.mkdir(parents=True, exist_ok=True) + # one of the following + predicates_df = predicates.generate_predicates_df(task_cfg, in_fp, "meds") + + # execute query + df_result = query.query(task_cfg, predicates_df) + label_df = ( + df_result.select(pl.col(["subject_id", "trigger", "label"])) + .rename({"trigger": "timestamp", "subject_id": "patient_id"}) + .sort(by=["patient_id", "timestamp"]) + ) + data_df = pl.scan_parquet(in_fp) + data_df = data_df.unique(subset=["patient_id", "timestamp"]).sort(by=["patient_id", 
"timestamp"]) + data_df = data_df.with_row_index("event_id") + data_df = data_df.drop(["code", "numerical_value"]) + output_df = label_df.lazy().join_asof(other=data_df, by="patient_id", on="timestamp") + + # store it + output_df.collect().write_parquet(out_fp) + + +if __name__ == "__main__": + main() diff --git a/hf_cohort/config.yaml b/hf_cohort/config.yaml new file mode 100644 index 0000000..a9911fb --- /dev/null +++ b/hf_cohort/config.yaml @@ -0,0 +1,21 @@ +# Path to the task configuration file +config_path: task.yaml + +# Raw Data +data: + # Path to the data file or directory + path: /storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort/train/0.parquet + + # Data standard, one of (csv, meds, esgpt) + standard: meds + +# Output Directory (saves as .parquet file) +output_dir: results/ + +# Hydra +hydra: + job: + name: ACES_${now:%Y-%m-%d_%H-%M-%S} + run: + dir: ${ACES_dir}/.logs/${hydra.job.name} +# aces-cli --config-dir='./' --config-name='config.yaml' diff --git a/scripts/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh similarity index 100% rename from scripts/hf_cohort_e2e.sh rename to hf_cohort/hf_cohort_e2e.sh diff --git a/scripts/hf_cohort_shard.sh b/hf_cohort/hf_cohort_shard.sh similarity index 100% rename from scripts/hf_cohort_shard.sh rename to hf_cohort/hf_cohort_shard.sh diff --git a/hf_cohort/task.yaml b/hf_cohort/task.yaml new file mode 100644 index 0000000..19ff7f0 --- /dev/null +++ b/hf_cohort/task.yaml @@ -0,0 +1,21 @@ +# Task: 30-day Readmission Risk Prediction +predicates: + admission: + code: ADMIT_DATE + discharge: + code: DISCHARGE_DATE + +trigger: admission + +windows: + input: + start: trigger + end: start -> discharge + start_inclusive: False + end_inclusive: True + target: + start: input.end + end: start + 30 days + start_inclusive: False + end_inclusive: True + label: admission diff --git a/scripts/e2e.sh b/scripts/e2e.sh deleted file mode 100644 index fd1ff60..0000000 --- a/scripts/e2e.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort -OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -N_PARALLEL_WORKERS="2" #"$3" - -# echo "Running identify_columns.py: Caching feature names and frequencies." 
-# POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ -# MEDS_cohort_dir=$MEDS_DIR \ -# tabularized_data_dir=$OUTPUT_DIR \ -# min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, 30d, 365d, full]" do_overwrite=True - -# echo "Running tabularize_static.py: tabularizing static data" -# POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ -# MEDS_cohort_dir=$MEDS_DIR \ -# tabularized_data_dir=$OUTPUT_DIR \ -# min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, 30d, 365d, full]" do_overwrite=True - -echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" -POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ - --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, 30d, 365d, full]" do_overwrite=True From 5c5dc8e1573d91120aef6c73938829364cddef40 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 1 Jun 2024 01:44:00 +0000 Subject: [PATCH 045/106] added dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index eb188e3..4fcaec8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "numba", "tqdm", "xgboost"] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "numba", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper"] [project.optional-dependencies] dev = ["pre-commit"] From d99e2749ad7dbe8bbf0e0d7e4ded3f5e92615f54 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 1 Jun 2024 02:19:56 +0000 Subject: [PATCH 046/106] added support for loading cached labels and event indexes --- configs/xgboost_sweep.yaml | 4 ++-- hf_cohort/xgboost.sh | 6 ++++++ pyproject.toml | 1 - scripts/xgboost_sweep.py | 23 +++++++++++++++++++++-- 4 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 hf_cohort/xgboost.sh diff --git a/configs/xgboost_sweep.yaml b/configs/xgboost_sweep.yaml index f5f2275..89a99b2 100644 --- a/configs/xgboost_sweep.yaml +++ b/configs/xgboost_sweep.yaml @@ -1,7 +1,7 @@ # Raw data MEDS_cohort_dir: ??? -tabularized_data_dir: /storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -model_dir: /storage/teya/test/ +tabularized_data_dir: ??? 
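
Two Hydra/OmegaConf idioms appear in the config change above: `???` marks a mandatory value that must be supplied at runtime (typically as a `key=value` override), and `${...}` interpolation derives one key from another, so `model_dir` tracks `tabularized_data_dir`. A small OmegaConf sketch:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "MEDS_cohort_dir": "???",
    "tabularized_data_dir": "???",
    "model_dir": "${tabularized_data_dir}/model",
})

assert OmegaConf.is_missing(cfg, "MEDS_cohort_dir")   # must be provided, e.g. on the CLI
cfg.tabularized_data_dir = "/data/tabularize"         # what a Hydra override does
assert cfg.model_dir == "/data/tabularize/model"      # interpolation resolves lazily
```
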
+model_dir: ${tabularized_data_dir}/model # Pre-processing min_code_inclusion_frequency: 1 diff --git a/hf_cohort/xgboost.sh b/hf_cohort/xgboost.sh new file mode 100644 index 0000000..3ef570f --- /dev/null +++ b/hf_cohort/xgboost.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +BASE_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed +TAB_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize + +python -m scripts.xgboost_sweep MEDS_cohort_dir=$BASE_DIR tabularized_data_dir=$TAB_DIR diff --git a/pyproject.toml b/pyproject.toml index 4fcaec8..157c6d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", " dev = ["pre-commit"] tests = ["pytest", "pytest-cov", "rootutils"] local_parallelism = ["hydra-joblib-launcher"] -slurm_parallelism = ["hydra-submitit-launcher"] [project.urls] Homepage = "https://github.com/mmcdermott/MEDS_polars_functions" diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index 2caac22..26f3505 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -1,11 +1,12 @@ import json import os -from collections.abc import Callable +from collections.abc import Callable, Mapping from datetime import datetime from pathlib import Path import hydra import numpy as np +import polars as pl import scipy.sparse as sp import xgboost as xgb from loguru import logger @@ -25,7 +26,8 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.data_path = Path(cfg.tabularized_data_dir) self.dynamic_data_path = self.data_path / "sparse" / split self.label_data_path = self.data_path / "task" / split - self._data_shards = [4] # [shard.stem for shard in list(self.static_data_path.glob("*."))] + self._data_shards = [4] # sort([shard.stem for shard in list(self.static_data_path.glob("*."))]) + self.valid_event_ids, self.labels = self.load_labels() # TODO: need to fix this path/logic self.window_set, self.aggs_set, self.codes_set = self._get_inclusion_sets() @@ -35,6 +37,23 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # "cache" super().__init__(cache_prefix=os.path.join(".", "cache")) + def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: + """Loads valid event ids and labels for each shard. 
+ + Returns: + - Tuple[Mapping[int, list], Mapping[int, list]]: Tuple containing: + dictionary from shard number to list of valid event ids -- used for indexing rows + in the sparse matrix + dictionary from shard number to list of labels for these valid event ids + """ + label_fps = {shard: self.label_data_path / f"{shard}.parquet" for shard in self._data_shards} + cached_labels, cached_event_ids = dict(), dict() + for shard, label_fp in label_fps.items(): + label_df = pl.scan_parquet(label_fp) + cached_event_ids[shard] = label_df.select(pl.col("event_id")).collect().to_series() + cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() + return cached_event_ids, cached_labels + def _get_code_set(self) -> set: """Get the set of codes to include in the data based on the configuration.""" with open(self.data_path / "feature_columns.json") as f: From cadc603e67276cc0d007a29d6b1815433645f479 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 1 Jun 2024 02:43:18 +0000 Subject: [PATCH 047/106] updated readme --- README.md | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5a596d1..b1fd8dc 100644 --- a/README.md +++ b/README.md @@ -46,27 +46,29 @@ This repository consists of two key pieces: See `tests/test_tabularize_integration.py` for an example of the end-to-end pipeline being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm. + #### Core Scripts: -1. `scripts/tabularize/identify_columns.py` loads all training shard to identify which feature columns +1. `scripts/identify_columns.py` loads all training shard to identify which feature columns to generate tabular data for. - -```bash -POLARS_MAX_THREADS=32 python scripts/identify_columns.py MEDS_cohort_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort tabularized_data_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, full]" do_overwrite=True -``` - -2. `scripts/tabularize/tabularize_static.py` Iterates through shards and generates tabular vectors for +2. `scripts/tabularize_static.py` Iterates through shards and generates tabular vectors for each patient. There is a single row per patient for each shard. +3. `scripts/summarize_over_windows.py` For each shard, iterates through window sizes and aggregations to and + horizontally concatenates the outputs to generate the final tabular representations at every event time for + every patient. +4. `scripts/tabularize_merge` Aligns the time-series window aggregations (generated in the previous step) with + the static tabular vectors and caches them for training. +5. `scripts/hf_cohort/aces_task_extraction.py` Generates the task labels and caches them with the event_id + indexes which align them with the nearest prior event in the tabular data. +6. `scripts/xgboost_sweep.py` Tunes XGboost on methods. Iterates through the labels and corresponding tabular data. -```bash -POLARS_MAX_THREADS=32 python scripts/tabularize_static.py MEDS_cohort_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort tabularized_data_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, full]" do_overwrite=True -``` - -4. 
`scripts/tabularize/summarize_over_windows.py` For each shard, iterates through window sizes and aggregations to - and horizontally concatenates the outputs to generate the final tabular representations at every event time for every patient. +We run this on an example dataset using the following bash scripts in sequence: ```bash -POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py MEDS_cohort_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort tabularized_data_dir=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize min_code_inclusion_frequency=1 "window_sizes=[1d, 7d, full]" do_overwrite=True +bash hf_cohort_shard.sh # processes the dataset into meds format +bash hf_cohort_e2e.sh # performs (steps 1-4 above) +bash hf_cohort/aces_task.sh # generates labels (step 5) +bash xgboost.sh # trains xgboos (step 6) ``` ## Feature Construction, Storage, and Loading From 285ccbfa3015d23013359610324c8d86379f70cd Mon Sep 17 00:00:00 2001 From: teyaberg Date: Sat, 1 Jun 2024 19:48:35 +0000 Subject: [PATCH 048/106] size issues for loading sparse matrix --- configs/xgboost_sweep.yaml | 3 +- scripts/xgboost_sweep.py | 59 ++++++++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/configs/xgboost_sweep.yaml b/configs/xgboost_sweep.yaml index 89a99b2..eeec001 100644 --- a/configs/xgboost_sweep.yaml +++ b/configs/xgboost_sweep.yaml @@ -26,9 +26,8 @@ tqdm: True model: booster: gbtree device: cpu - epochs: 1 tree_method: hist - objective: binary:logistic + objective: reg:squarederror iterator: keep_data_in_memory: False diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index 26f3505..3cd06dd 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -3,6 +3,7 @@ from collections.abc import Callable, Mapping from datetime import datetime from pathlib import Path +from timeit import timeit import hydra import numpy as np @@ -25,11 +26,12 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.cfg = cfg self.data_path = Path(cfg.tabularized_data_dir) self.dynamic_data_path = self.data_path / "sparse" / split - self.label_data_path = self.data_path / "task" / split - self._data_shards = [4] # sort([shard.stem for shard in list(self.static_data_path.glob("*."))]) + self.task_data_path = self.data_path / "task" / split + self._data_shards = sorted( + [shard.stem for shard in list(self.task_data_path.glob("*.parquet"))] + ) # [2, 4, 5] # self.valid_event_ids, self.labels = self.load_labels() - # TODO: need to fix this path/logic - self.window_set, self.aggs_set, self.codes_set = self._get_inclusion_sets() + self.window_set, self.aggs_set, self.codes_set, self.num_features = self._get_inclusion_sets() self._it = 0 @@ -46,7 +48,7 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: in the sparse matrix dictionary from shard number to list of labels for these valid event ids """ - label_fps = {shard: self.label_data_path / f"{shard}.parquet" for shard in self._data_shards} + label_fps = {shard: self.task_data_path / f"{shard}.parquet" for shard in self._data_shards} cached_labels, cached_event_ids = dict(), dict() for shard, label_fp in label_fps.items(): label_df = pl.scan_parquet(label_fp) @@ -78,7 +80,8 @@ def _get_code_set(self) -> set: codes_set = frequency_set else: codes_set = None # set(feature_columns) - return codes_set + # TODO: make sure we aren't filtering out static columns!!! 
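
A sketch of the column-inclusion logic in `_get_code_set` above: requested codes and codes meeting the minimum frequency are both mapped to indices of `feature_columns`, and the intersection is kept when both filters are configured. The feature names and frequencies below are invented, and (per the TODO above) a real run would also need to keep the static columns.

```python
feature_columns = ["LAB//A/code/count", "LAB//B/code/count", "LAB//A/value/sum"]
feature_freqs = {"LAB//A/code/count": 10, "LAB//B/code/count": 1, "LAB//A/value/sum": 7}
requested_codes = ["LAB//A/code/count", "LAB//A/value/sum"]
min_code_inclusion_frequency = 2

feature_index = {col: i for i, col in enumerate(feature_columns)}
codes_set = {feature_index[c] for c in requested_codes if c in feature_index}
frequency_set = {
    feature_index[c] for c, f in feature_freqs.items() if f >= min_code_inclusion_frequency
}

included = sorted(codes_set & frequency_set)   # both filters active -> intersect
assert included == [0, 2]                      # column 1 is dropped by the frequency filter
```
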
+ return list(codes_set), len(feature_columns) def _get_inclusion_sets(self) -> tuple[set, set, np.array]: """Get the inclusion sets for aggregations, window sizes, and a mask for minimum code frequency. @@ -123,9 +126,11 @@ def _get_inclusion_sets(self) -> tuple[set, set, np.array]: if self.cfg.window_sizes is not None: window_set = set(self.cfg.window_sizes) - return aggs_set, window_set, self._get_code_set() + codes_set, num_features = self._get_code_set() - def _load_dynamic_shard_from_file(self, path: Path) -> sp.csc_matrix: + return window_set, aggs_set, codes_set, num_features + + def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Load a sparse shard into memory. Args: @@ -157,20 +162,21 @@ def _load_dynamic_shard_from_file(self, path: Path) -> sp.csc_matrix: ... iterator_instance = Iterator(cfg) ... iterator_instance.codes_mask = np.array([False, True, True]) ... loaded_shard = iterator_instance._load_dynamic_shard_from_file(sample_shard_path) - ... assert isinstance(loaded_shard, sp.csc_matrix) - ... expected_csc = sp.csc_matrix(sample_filtered_data) + ... assert isinstance(loaded_shard, sp.csr_matrix) + ... expected_csr = sp.csr_matrix(sample_filtered_data) ... assert sp.issparse(loaded_shard) - ... assert np.array_equal(loaded_shard.data, expected_csc.data) - ... assert np.array_equal(loaded_shard.indices, expected_csc.indices) - ... assert np.array_equal(loaded_shard.indptr, expected_csc.indptr) + ... assert np.array_equal(loaded_shard.data, expected_csr.data) + ... assert np.array_equal(loaded_shard.indices, expected_csr.indices) + ... assert np.array_equal(loaded_shard.indptr, expected_csr.indptr) """ # column_shard is of form event_idx, feature_idx, value column_shard = np.load(path).T # TODO: Fix this!!! + shard = sp.csc_matrix( (column_shard[:, 0], (column_shard[:, 1], column_shard[:, 2])), shape=( - max(column_shard[:, 1].astype(np.int32) + 1), - max(column_shard[:, 2].astype(np.int32) + 1), + max(self.valid_event_ids[self._data_shards[idx]]) + 1, + self.num_features, ), ) return self._filter_shard_on_codes_and_freqs(shard) @@ -181,8 +187,8 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.npy")) files = sorted([file for file in files if self._filter_shard_files_on_window_and_aggs(file)]) - dynamic_cscs = [self._load_dynamic_shard_from_file(file) for file in files] - return sp.hstack(dynamic_cscs).tocsr()[self._valid_event_ids[idx], :] + dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] + return sp.hstack(dynamic_csrs).tocsr()[self.valid_event_ids[self._data_shards[idx]], :] def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. 
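# --- Illustrative sketch (not part of the patch; toy shapes and indices) ---
# The hunk above horizontally stacks one sparse block per (window_size, aggregation) file and then
# keeps only the rows whose event ids appear in the task labels. The same pattern, reduced to plain
# scipy with made-up data, looks like this:
import numpy as np
import scipy.sparse as sp

n_events = 6
part_a = sp.random(n_events, 4, density=0.5, format="csr", random_state=0)  # e.g. one window/agg block
part_b = sp.random(n_events, 3, density=0.5, format="csr", random_state=1)  # e.g. another window/agg block

combined = sp.hstack([part_a, part_b], format="csr")  # (6, 7): all feature blocks side by side
valid_event_ids = np.array([0, 2, 5])                 # hypothetical task-aligned row indices
task_matrix = combined[valid_event_ids, :]            # keep only the labeled events
assert task_matrix.shape == (len(valid_event_ids), part_a.shape[1] + part_b.shape[1])
# ---------------------------------------------------------------------------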
@@ -198,10 +204,9 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: dynamic_df = self._get_dynamic_shard_by_index(idx) logger.debug(f"Dynamic data loading took {datetime.now() - time}") time = datetime.now() - label_df = self._get_label_by_index(idx) + label_df = self.labels[self._data_shards[idx]] logger.debug(f"Task data loading took {datetime.now() - time}") - - return dynamic_df, label_df["label"].values + return dynamic_df, label_df def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: parts = file.relative_to(self.dynamic_data_path).parts @@ -227,7 +232,7 @@ def _filter_shard_on_codes_and_freqs(self, df: sp.csc_matrix) -> sp.csc_matrix: """ if self.codes_set is None: return df - return df[:, list({index for index in self.codes_set if index < df.shape[1]})] + return df[:, self.codes_set] # [:, list({index for index in self.codes_set if index < df.shape[1]})] def next(self, input_data: Callable): """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost @@ -358,8 +363,12 @@ def xgboost(cfg: DictConfig) -> float: Returns: - float: Evaluation result. """ + logger.debug("Initializing XGBoost model") model = XGBoostModel(cfg) + logger.debug("Training XGBoost model") + time = datetime.now() model.train() + logger.debug(f"Training took {datetime.now() - time}") # save model save_dir = ( Path(cfg.model_dir) @@ -369,8 +378,14 @@ def xgboost(cfg: DictConfig) -> float: save_dir.mkdir(parents=True, exist_ok=True) model.model.save_model(save_dir / f"{np.random.randint(100000, 999999)}_model.json") + return model.evaluate() if __name__ == "__main__": - xgboost() + # start_time = datetime.now() + # xgboost() + # logger.debug(f"Total time: {datetime.now() - start_time}") + num = 10 + time = timeit(xgboost, number=num) / num + logger.debug(f"Training time averaged over {num} runs: {time}") From 795b532f81692e989f412581f64fdefd76ec74db Mon Sep 17 00:00:00 2001 From: teyaberg Date: Sat, 1 Jun 2024 20:43:21 +0000 Subject: [PATCH 049/106] push updates --- scripts/xgboost_sweep.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index 3cd06dd..a872307 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -128,7 +128,7 @@ def _get_inclusion_sets(self) -> tuple[set, set, np.array]: codes_set, num_features = self._get_code_set() - return window_set, aggs_set, codes_set, num_features + return sorted(window_set), sorted(aggs_set), sorted(codes_set), num_features def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Load a sparse shard into memory. @@ -175,7 +175,7 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: shard = sp.csc_matrix( (column_shard[:, 0], (column_shard[:, 1], column_shard[:, 2])), shape=( - max(self.valid_event_ids[self._data_shards[idx]]) + 1, + max(self.valid_event_ids[self._data_shards[idx]], column_shard[:, 1]) + 1, self.num_features, ), ) @@ -183,12 +183,22 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering - column inclusion.""" + column inclusion. 
- files = list(self.dynamic_data_path.glob(f"*/*/*/{self._data_shards[idx]}.npy")) - files = sorted([file for file in files if self._filter_shard_files_on_window_and_aggs(file)]) - dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] - return sp.hstack(dynamic_csrs).tocsr()[self.valid_event_ids[self._data_shards[idx]], :] + Args: + - idx (int): Index of the shard to load. + + Returns: + - sp.csr_matrix: Filtered sparse matrix. + """ + shard_name = self._data_shards[idx] + shard_pattern = f"*/*/*/{shard_name}.npy" + files = self.dynamic_data_path.glob(shard_pattern) + valid_files = sorted(file for file in files if self._filter_shard_files_on_window_and_aggs(file)) + dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in valid_files] + combined_csr = sp.hstack(dynamic_csrs, format="csr") + valid_indices = self.valid_event_ids[shard_name] + return combined_csr[valid_indices, :] def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. @@ -210,15 +220,19 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: parts = file.relative_to(self.dynamic_data_path).parts - if not parts: + if len(parts) < 2: return False windows_part = parts[0] aggs_part = "/".join(parts[1:-1]) - return (self.window_set is None or windows_part in self.window_set) and ( - self.aggs_set is None or aggs_part in self.aggs_set - ) + if self.window_set is not None and windows_part not in self.window_set: + return False + + if self.aggs_set is not None and aggs_part not in self.aggs_set: + return False + + return True def _filter_shard_on_codes_and_freqs(self, df: sp.csc_matrix) -> sp.csc_matrix: """Filter the dynamic data frame based on the inclusion sets. 
Given the codes_mask, filter the data From b9d057be1d7d434b60b1e425acf983cccf597e70 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sat, 1 Jun 2024 21:21:36 +0000 Subject: [PATCH 050/106] 4x speed increase for tabularization to sparse matrix by caching window sizes, using a csr_matrix for the input data, and using adding he aggregation outputs to lists as we iterate through windows and aggregate --- .../generate_summarized_reps.py | 187 ++++++++++-------- .../generate_ts_features.py | 68 ++++--- 2 files changed, 145 insertions(+), 110 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index ee9e569..33812eb 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,13 +1,12 @@ from collections.abc import Callable -from datetime import datetime import pandas as pd -from scipy.sparse import vstack pd.set_option("compute.use_numba", True) +import numpy as np import polars as pl from loguru import logger -from scipy.sparse import coo_matrix, csr_matrix +from scipy.sparse import coo_array, csr_array, sparray, vstack from MEDS_tabular_automl.generate_ts_features import get_ts_columns from MEDS_tabular_automl.utils import load_tqdm @@ -43,18 +42,18 @@ def f(c: str) -> str: def sparse_aggregate(sparse_matrix, agg): if agg == "sum": - merged_matrix = sparse_matrix.sum(axis=0) + merged_matrix = sparse_matrix.sum(axis=0, dtype=sparse_matrix.dtype) elif agg == "min": merged_matrix = sparse_matrix.min(axis=0) elif agg == "max": merged_matrix = sparse_matrix.max(axis=0) elif agg == "sum_sqd": - merged_matrix = sparse_matrix.power(2).sum(axis=0) + merged_matrix = sparse_matrix.power(2).sum(axis=0, dtype=sparse_matrix.dtype) elif agg == "count": merged_matrix = sparse_matrix.getnnz(axis=0) else: raise ValueError(f"Aggregation method '{agg}' not implemented.") - return csr_matrix(merged_matrix) + return merged_matrix def sum_merge_timestamps(df, sparse_matrix, agg): @@ -81,7 +80,7 @@ def sum_merge_timestamps(df, sparse_matrix, agg): # Create a new sparse matrix with summed rows per unique timestamp patient_id = df["patient_id"].iloc[0] timestamps = [] - output_matrix = csr_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) + output_matrix = csr_array((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) # Loop through each group and sum for timestamp, rows in indices.items(): @@ -116,7 +115,7 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): patient_id = df.iloc[0].patient_id df = df.drop(columns="patient_id").reset_index(drop=True).reset_index() timestamps = [] - out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) + out_sparse_matrix = coo_array((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) for each in df[["index", "timestamp"]].rolling(on="timestamp", window=timedelta): timestamps.append(each.index.max()) agg_subset_matrix = sparse_aggregate(sparse_matrix[each["index"]], agg) @@ -125,7 +124,49 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): return out_df, out_sparse_matrix -def compute_agg(df, window_size: str, agg: str, use_tqdm=False): +def get_rolling_window_indicies(index_df, window_size): + """Get the indices for the rolling windows.""" + if window_size == "full": + newest_date = df.select(pl.col("timestamp")).max().collect().item() + oldest_date = df.select(pl.col("timestamp")).min().collect().item() + timedelta = newest_date - oldest_date + pd.Timedelta(days=1) + 
else: + timedelta = pd.Timedelta(window_size) + return ( + index_df.with_row_index("index") + .rolling(index_column="timestamp", period=timedelta, group_by="patient_id") + .agg([pl.col("index").min().alias("min_index"), pl.col("index").max().alias("max_index")]) + .select(pl.col("min_index", "max_index")) + .collect() + ) + + +def aggregate_matrix(windows, matrix, agg, use_tqdm=False): + """Aggregate the matrix based on the windows.""" + tqdm = load_tqdm(use_tqdm) + agg = agg.split("/")[-1] + dtype = np.float32 + matrix = csr_array(matrix.astype(dtype)) + if agg.startswith("sum"): + out_dtype = np.float32 + else: + out_dtype = np.int32 + data, row, col = [], [], [] + for i, window in tqdm(enumerate(windows.iter_rows(named=True)), total=len(windows)): + min_index = window["min_index"] + max_index = window["max_index"] + subset_matrix = matrix[min_index : max_index + 1, :] + agg_matrix = sparse_aggregate(subset_matrix, agg).astype(out_dtype) + nozero_ind = np.nonzero(agg_matrix)[0] + col.append(nozero_ind) + data.append(agg_matrix[nozero_ind]) + row.append(np.repeat(np.array(i, dtype=np.int32), len(nozero_ind))) + row = np.concatenate(row) + out_matrix = coo_array((np.concatenate(data), (row, np.concatenate(col))), dtype=out_dtype) + return csr_array(out_matrix) + + +def compute_agg(index_df, matrix: sparray, window_size: str, agg: str, use_tqdm=False): """Applies aggreagtion to dataframe. Dataframe is expected to only have the relevant columns for aggregating @@ -175,45 +216,27 @@ def compute_agg(df, window_size: str, agg: str, use_tqdm=False): patient_id int64 dtype: object """ - if window_size == "full": - timedelta = df["timestamp"].max() - df["timestamp"].min() + pd.Timedelta(days=1) - else: - timedelta = pd.Timedelta(window_size) - logger.info("Grouping by patient_ids -- this may take a while.") - group = dict(list(df[["patient_id", "timestamp"]].groupby("patient_id"))) - sparse_matrix = df[df.columns[2:]].sparse.to_coo() - sparse_matrix = csr_matrix(sparse_matrix) - logger.info("Grouping Complete! 
Starting sparse rolling.") - out_sparse_matrix = coo_matrix((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) - - out_dfs = [] - iter_wrapper = load_tqdm(use_tqdm) - agg = agg.split("/")[1] - start_time = datetime.now() - for i, (patient_id, subset_df) in enumerate(iter_wrapper(group.items(), total=len(group))): - if i % 10 == 0: - logger.info(f"Progress is {i}/{len(group)}") - logger.info(f"Time elapsed: {datetime.now() - start_time}") - subset_sparse_matrix = sparse_matrix[subset_df.index] - patient_df = subset_df[["patient_id", "timestamp"]] - assert patient_df.timestamp.isnull().sum() == 0, "timestamp cannot be null" - patient_df, subset_sparse_matrix = sum_merge_timestamps(patient_df, subset_sparse_matrix, agg) - patient_df, out_sparse = sparse_rolling(patient_df, subset_sparse_matrix, timedelta, agg) - out_dfs.append(patient_df) - out_sparse_matrix = vstack([out_sparse_matrix, out_sparse]) - out_df = pd.concat(out_dfs, axis=0) - out_df = pd.concat( - [out_df.reset_index(drop=True), pd.DataFrame.sparse.from_spmatrix(out_sparse_matrix)], axis=1 + logger.info("Step 1: Grouping by same (patient_ids, timestamps) and aggregating") + group_df = ( + index_df.with_row_index("index") + .group_by(["patient_id", "timestamp"], maintain_order=True) + .agg([pl.col("index").min().alias("min_index"), pl.col("index").max().alias("max_index")]) + .collect() ) - out_df.columns = df.columns - out_df.rename(columns=time_aggd_col_alias_fntr(window_size, agg)) - - id_cols = ["patient_id", "timestamp"] - out_df = out_df.loc[:, id_cols + list(df.columns[2:])] - return out_df - - -def _generate_summary(df: pd.DataFrame, window_size: str, agg: str, use_tqdm=False) -> pl.LazyFrame: + index_df = group_df.lazy().select(pl.col("patient_id", "timestamp")) + windows = group_df.select(pl.col("min_index", "max_index")) + logger.info("Step 1.5: Running sparse aggregation.") + matrix = aggregate_matrix(windows, matrix, agg, use_tqdm) + logger.info("Step 2: computing rolling windows and aggregating.") + windows = get_rolling_window_indicies(index_df, window_size) + logger.info("Starting final sparse aggregations.") + matrix = aggregate_matrix(windows, matrix, agg, use_tqdm) + return matrix + + +def _generate_summary( + ts_columns: list[str], index_df: pd.DataFrame, matrix: sparray, window_size: str, agg: str, use_tqdm=False +) -> pl.LazyFrame: """Generate a summary of the data frame for a given window size and aggregation. Args: @@ -250,20 +273,12 @@ def _generate_summary(df: pd.DataFrame, window_size: str, agg: str, use_tqdm=Fal """ if agg not in VALID_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}. Valid options are: {VALID_AGGREGATIONS}") - code_cols = [c for c in df.columns if c.endswith("code")] - value_cols = [c for c in df.columns if c.endswith("value")] - if agg in CODE_AGGREGATIONS: - cols = code_cols - else: - cols = value_cols - id_cols = ["patient_id", "timestamp"] - df = df.loc[:, id_cols + cols] - out_df = compute_agg(df, window_size, agg, use_tqdm=use_tqdm) - return out_df + out_matrix = compute_agg(index_df, sparse_matrix, window_size, agg, use_tqdm=use_tqdm) + return out_matrix def generate_summary( - feature_columns: list[str], df: pd.DataFrame, window_size, agg: str, use_tqdm=False + feature_columns: list[str], index_df: pl.LazyFrame, matrix: sparray, window_size, agg: str, use_tqdm=False ) -> pl.LazyFrame: """Generate a summary of the data frame for given window sizes and aggregations. 
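# --- Illustrative sketch (not part of the patch; toy data and window size) ---
# `compute_agg` above works in two passes: (1) collapse rows that share (patient_id, timestamp),
# then (2) use polars' rolling frames to find, for every event, the [min_index, max_index] row span
# inside the lookback window and aggregate those sparse-matrix rows. Pass (2), reduced to a
# standalone snippet with made-up events and a 2-day trailing window (rows must already be sorted
# by patient_id and timestamp so each window covers a contiguous block of rows):
from datetime import datetime, timedelta

import numpy as np
import polars as pl
from scipy.sparse import csr_array

index_df = pl.DataFrame(
    {
        "patient_id": [1, 1, 1, 2, 2],
        "timestamp": [
            datetime(2024, 1, 1),
            datetime(2024, 1, 2),
            datetime(2024, 1, 5),
            datetime(2024, 1, 1),
            datetime(2024, 1, 3),
        ],
    }
)
matrix = csr_array(np.arange(15, dtype=np.float64).reshape(5, 3))  # one row per event, 3 features

windows = (
    index_df.with_row_index("index")
    .rolling(index_column="timestamp", period=timedelta(days=2), group_by="patient_id")
    .agg(pl.col("index").min().alias("min_index"), pl.col("index").max().alias("max_index"))
)

# Aggregate (here: sum) the contiguous block of sparse rows covered by each window.
summed = np.vstack(
    [
        matrix[w["min_index"] : w["max_index"] + 1, :].sum(axis=0)
        for w in windows.iter_rows(named=True)
    ]
)
assert summed.shape == (5, 3)  # one aggregated feature row per event
# -----------------------------------------------------------------------------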
@@ -318,34 +333,40 @@ def generate_summary( 0 NaN NaN 0 """ logger.info("Sorting sparse dataframe by patient_id and timestamp") - df = df.sort_values(["patient_id", "timestamp"]).reset_index(drop=True) assert len(feature_columns), "feature_columns must be a non-empty list" ts_columns = get_ts_columns(feature_columns) - code_value_ts_columns = [f"{c}/code" for c in ts_columns] + [f"{c}/value" for c in ts_columns] - final_columns = [] - out_dfs = [] # Generate summaries for each window size and aggregation code_type, agg_name = agg.split("/") - final_columns = [f"{window_size}/{c}/{agg_name}" for c in code_value_ts_columns if c.endswith(code_type)] # only iterate through code_types that exist in the dataframe columns - if any([c.endswith(code_type) for c in df.columns]): - logger.info(f"Generating aggregation {agg} for window_size {window_size}") - # timestamp_dtype = df.dtypes[df.columns.index("timestamp")] - # assert timestamp_dtype in [ - # pl.Datetime, - # pl.Date, - # ], f"timestamp must be of type Date, but is {timestamp_dtype}" - out_df = _generate_summary(df, window_size, agg, use_tqdm=use_tqdm) - out_dfs.append(out_df) - - final_columns = sorted(final_columns) - # Combine all dataframes using successive joins - result_df = pd.concat(out_dfs) - # Add in missing feature columns with default values - missing_columns = [col for col in final_columns if col not in result_df.columns] - - result_df[missing_columns] = pd.DataFrame.sparse.from_spmatrix( - coo_matrix((result_df.shape[0], len(missing_columns))) + assert any([c.endswith(code_type) for c in ts_columns]) + logger.info(f"Generating aggregation {agg} for window_size {window_size}") + out_matrix = _generate_summary(ts_columns, index_df, matrix, window_size, agg, use_tqdm=use_tqdm) + return out_matrix + + +if __name__ == "__main__": + import json + from pathlib import Path + + from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep + + feature_columns = json.load( + open( + Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize") / "feature_columns.json" + ) + ) + df = pl.scan_parquet( + Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed") + / "final_cohort" + / "train" + / "0.parquet" + ) + index_df, sparse_matrix = get_flat_ts_rep(feature_columns, df) + generate_summary( + feature_columns=feature_columns, + index_df=index_df, + matrix=sparse_matrix, + window_size="full", + agg="code/count", + use_tqdm=True, ) - result_df = result_df[["patient_id", "timestamp"] + final_columns] - return result_df diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 7ff413f..a2a120b 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -4,7 +4,7 @@ import pandas as pd import polars as pl from loguru import logger -from scipy.sparse import coo_matrix +from scipy.sparse import csr_array from MEDS_tabular_automl.generate_static_features import ( STATIC_CODE_COL, @@ -19,10 +19,7 @@ def get_ts_columns(feature_columns): def is_static(c): return c.endswith(STATIC_CODE_COL) or c.endswith(STATIC_VALUE_COL) - def get_code_name(c): - return "/".join(c.split("/")[0:-1]) - - ts_columns = sorted(list({get_code_name(c) for c in feature_columns if not is_static(c)})) + ts_columns = sorted(list({c for c in feature_columns if not is_static(c)})) return ts_columns @@ -33,19 +30,36 @@ def fill_missing_entries_with_nan(sparse_df, type, columns): return sparse_df -def get_long_code_df(df, ts_columns, 
col_offset): - column_to_int = {col: i + col_offset for i, col in enumerate(ts_columns)} - rows = range(len(df)) - cols = df["code"].map(column_to_int) - data = np.ones(len(df), dtype=np.bool_) +def get_long_code_df(df, ts_columns): + column_to_int = {col: i for i, col in enumerate(ts_columns)} + rows = range(df.select(pl.len()).collect().item()) + cols = ( + df.with_columns( + pl.concat_str([pl.col("code"), pl.lit("/code")]).replace(column_to_int).alias("code_index") + ) + .select("code_index") + .collect() + .to_series() + .to_numpy() + ) + data = np.ones(df.select(pl.len()).collect().item(), dtype=np.bool_) return data, (rows, cols) def get_long_value_df(df, ts_columns): column_to_int = {col: i for i, col in enumerate(ts_columns)} - rows = range(0, len(df)) - cols = df["code"].map(column_to_int) - data = df["numerical_value"] + value_df = df.drop_nulls("numerical_value") + rows = range(value_df.select(pl.len()).collect().item()) + cols = ( + value_df.with_columns( + pl.concat_str([pl.col("code"), pl.lit("/value")]).replace(column_to_int).alias("value_index") + ) + .select("value_index") + .collect() + .to_series() + .to_numpy() + ) + data = value_df.select(pl.col("numerical_value")).collect().to_series().to_numpy() return data, (rows, cols) @@ -85,27 +99,28 @@ def summarize_dynamic_measurements( 0 1 2021-01-01 1 0 1 0 1 1 2021-01-01 2 0 1 0 """ - logger.info("create code and value") + logger.info("Generating Sparse matrix for Time Series Features") id_cols = ["patient_id", "timestamp"] + + # Confirm dataframe is sorted + check_df = df.select(pl.col(id_cols)) + assert check_df.sort(by=id_cols).collect().equals(check_df.collect()), "data frame must be sorted" + + # Generate sparse matrices value_df = df.drop(columns=id_cols) value_data, (value_rows, value_cols) = get_long_value_df(value_df, ts_columns) - code_df = df.drop(columns=id_cols + ["numerical_value"]) - code_data, (code_rows, code_cols) = get_long_code_df(code_df, ts_columns, col_offset=len(ts_columns)) + code_data, (code_rows, code_cols) = get_long_code_df(code_df, ts_columns) - logger.info("merge") merge_data = np.concatenate([value_data, code_data]) merge_rows = np.concatenate([value_rows, code_rows]) merge_cols = np.concatenate([value_cols, code_cols]) - merge_columns = [f"{c}/value" for c in ts_columns] + [f"{c}/code" for c in ts_columns] - long_df = pd.DataFrame.sparse.from_spmatrix( - coo_matrix((merge_data, (merge_rows, merge_cols)), shape=(len(value_df), len(merge_columns))), - columns=merge_columns, + merge_columns = ts_columns + sp_matrix = csr_array( + (merge_data, (merge_rows, merge_cols)), + shape=(value_df.select(pl.len()).collect().item(), len(merge_columns)), ) - long_df["timestamp"] = df["timestamp"] - long_df["patient_id"] = df["patient_id"] - long_df = long_df[id_cols + merge_columns] - return long_df + return df.select(pl.col(id_cols)), sp_matrix def get_flat_ts_rep( @@ -151,5 +166,4 @@ def get_flat_ts_rep( ts_columns = get_ts_columns(feature_columns) ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) - pd_df = ts_shard_df.collect().to_pandas() - return summarize_dynamic_measurements(ts_columns, pd_df) + return summarize_dynamic_measurements(ts_columns, ts_shard_df) From 7ea32306862738ecf442dcaa6c0da074f12d32bb Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 00:16:55 +0000 Subject: [PATCH 051/106] standardized file storage using file_name.py and updated from using npy files for storing sparse matrices to using npz files that contain the array with data,row,cols and contain 
the shape of the sparse matrix --- scripts/identify_columns.py | 38 ++-- scripts/summarize_over_windows.py | 116 ++++++----- scripts/tabularize_static.py | 80 ++++---- scripts/tabularize_ts.py | 65 ------- src/MEDS_tabular_automl/file_name.py | 72 +++++++ .../generate_summarized_reps.py | 6 +- src/MEDS_tabular_automl/utils.py | 48 +++-- tests/test_tabularize.py | 183 +++++++++--------- 8 files changed, 316 insertions(+), 292 deletions(-) delete mode 100644 scripts/tabularize_ts.py create mode 100644 src/MEDS_tabular_automl/file_name.py diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py index 186b6c5..b52eb41 100644 --- a/scripts/identify_columns.py +++ b/scripts/identify_columns.py @@ -5,16 +5,14 @@ from pathlib import Path import hydra +import numpy as np import polars as pl from loguru import logger from omegaconf import DictConfig, OmegaConf +from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( - compute_feature_frequencies, - load_meds_data, - load_tqdm, -) +from MEDS_tabular_automl.utils import compute_feature_frequencies, load_tqdm def store_config_yaml(config_fp: Path, cfg: DictConfig): @@ -73,14 +71,12 @@ def store_columns( """ iter_wrapper = load_tqdm(cfg.tqdm) # create output dir - flat_dir = Path(cfg.tabularized_data_dir) + f_name_resolver = FileNameResolver(cfg) + flat_dir = f_name_resolver.tabularize_dir flat_dir.mkdir(exist_ok=True, parents=True) - # load MEDS data - split_to_fps = load_meds_data(cfg.MEDS_cohort_dir, load_data=False) - # store params in json file - config_fp = flat_dir / "config.yaml" + config_fp = f_name_resolver.get_config_path() store_config_yaml(config_fp, cfg) # 0. Identify Output Columns and Frequencies @@ -96,11 +92,11 @@ def read_fn(in_fp): return pl.scan_parquet(in_fp) # Map: Iterates through shards and caches feature frequencies - feature_freq_fp = flat_dir / "feature_freqs" - feature_freq_fp.mkdir(exist_ok=True) - for shard_fp in iter_wrapper(split_to_fps["train"]): - name = shard_fp.stem - out_fp = feature_freq_fp / f"{name}.json" + train_shards = f_name_resolver.list_meds_files(split="train") + np.random.shuffle(train_shards) + feature_dir = f_name_resolver.tabularize_dir + for shard_fp in iter_wrapper(train_shards): + out_fp = feature_dir / "identify_train_columns" / f"{shard_fp.stem}.json" rwlock_wrap( shard_fp, out_fp, @@ -123,16 +119,16 @@ def compute_fn(feature_freq_list): def write_fn(data, out_fp): feature_freqs, feature_columns = data - json.dump(feature_columns, open(out_fp / "feature_columns.json", "w")) - json.dump(feature_freqs, open(flat_dir / "feature_freqs.json", "w")) + json.dump(feature_columns, open(f_name_resolver.get_feature_columns_fp(), "w")) + json.dump(feature_freqs, open(f_name_resolver.get_feature_freqs_fp(), "w")) - def read_fn(in_fp): - files = list(in_fp.glob("*.json")) + def read_fn(feature_dir): + files = list(feature_dir.glob("*.json")) return [json.load(open(fp)) for fp in files] rwlock_wrap( - feature_freq_fp, - flat_dir, + feature_dir / "identify_train_columns", + feature_dir, read_fn, write_fn, compute_fn, diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index 66a4c71..fb9f4c5 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -1,26 +1,20 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" -import os +import json +from itertools import product import hydra 
+import numpy as np import polars as pl from loguru import logger from omegaconf import DictConfig +from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.generate_summarized_reps import generate_summary from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import setup_environment, write_df - - -def hydra_loguru_init() -> None: - """Adds loguru output to the logs that hydra scrapes. - - Must be called from a hydra main! - """ - hydra_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir - logger.add(os.path.join(hydra_path, "main.log")) +from MEDS_tabular_automl.utils import hydra_loguru_init, load_tqdm, write_df @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") @@ -53,58 +47,60 @@ def summarize_ts_data_over_windows( FileNotFoundError: If specified directories or files in the configuration are not found. ValueError: If required columns like 'code' or 'value' are missing in the data files. """ + iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.test: hydra_loguru_init() - flat_dir, split_to_fps, feature_columns = setup_environment(cfg, load_data=False) + f_name_resolver = FileNameResolver(cfg) # Produce ts representation - ts_subdir = flat_dir / "ts" - - for sp, shard_fps in split_to_fps.items(): - sp_dir = ts_subdir / sp - - for i, shard_fp in enumerate(shard_fps): - for window_size in cfg.window_sizes: - for agg in cfg.aggs: - pivot_fp = sp_dir / window_size / agg / f"{i}.pkl" - if pivot_fp.exists() and not cfg.do_overwrite: - raise FileExistsError( - f"do_overwrite is {cfg.do_overwrite} and {pivot_fp.exists()} exists!" - ) - - def read_fn(fp): - return pl.scan_parquet(fp) - - def compute_fn(shard_df): - # Load Sparse DataFrame - pivot_df = get_flat_ts_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - # Summarize data -- applying aggregations on various window sizes - summary_df = generate_summary( - feature_columns, - pivot_df, - window_size, - agg, - ) - assert summary_df.shape[1] > 2, "No data found in the summarized dataframe" - - logger.info("Writing pivot file") - return summary_df - - def write_fn(out_df, out_fp): - write_df(out_df, out_fp, do_overwrite=cfg.do_overwrite) - - rwlock_wrap( - shard_fp, - pivot_fp, - read_fn, - write_fn, - compute_fn, - do_overwrite=cfg.do_overwrite, - do_return=False, - ) + meds_shard_fps = f_name_resolver.list_meds_files() + feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + + # shuffle tasks + tabularization_tasks = list(product(meds_shard_fps, cfg.window_sizes, cfg.aggs)) + np.random.shuffle(tabularization_tasks) + + # iterate through them + for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): + shard_num = shard_fp.stem + split = shard_fp.parent.stem + assert split in ["train", "held_out", "tuning"], f"Invalid split {split}" + ts_fp = f_name_resolver.get_flat_ts_rep(split, shard_num, window_size, agg) + if ts_fp.exists() and not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {ts_fp.exists()} exists!") + + def read_fn(fp): + return pl.scan_parquet(fp) + + def compute_fn(shard_df): + # Load Sparse DataFrame + index_df, sparse_matrix = get_flat_ts_rep(feature_columns, shard_df) + + # Summarize data -- applying aggregations on a specific window size + aggregation combination + summary_df = generate_summary( + feature_columns, + index_df, + sparse_matrix, + window_size, + agg, + 
) + assert summary_df.shape[1] > 2, "No data found in the summarized dataframe" + + logger.info("Writing pivot file") + return summary_df + + def write_fn(out_matrix, out_fp): + coo_matrix = out_matrix.tocoo() + write_df(coo_matrix, out_fp, do_overwrite=cfg.do_overwrite) + + rwlock_wrap( + shard_fp, + ts_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) if __name__ == "__main__": diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index 8f19ae6..d5ba698 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -1,15 +1,19 @@ #!/usr/bin/env python """Tabularizes static data in MEDS format into tabular representations.""" +import json +from itertools import product from pathlib import Path import hydra +import numpy as np import polars as pl from omegaconf import DictConfig, OmegaConf +from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import setup_environment, write_df +from MEDS_tabular_automl.utils import hydra_loguru_init, load_tqdm, write_df pl.enable_string_cache() @@ -96,44 +100,46 @@ def tabularize_static_data( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ - flat_dir, split_to_fp, feature_columns = setup_environment(cfg, load_data=False) - - # Produce static representation - static_subdir = flat_dir / "static" - - static_dfs = {} - for sp, shard_fps in split_to_fp.items(): - static_dfs[sp] = [] - sp_dir = static_subdir / sp - - for i, shard_fp in enumerate(shard_fps): - fp = sp_dir / f"{i}.parquet" - static_dfs[sp].append(fp) - if fp.exists() and not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {fp} exists!") - - def read_fn(in_fp): - return pl.scan_parquet(in_fp) - - def compute_fn(shard_df): - return get_flat_static_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - def write_fn(data, out_df): - write_df(data, out_df, do_overwrite=cfg.do_overwrite) - - rwlock_wrap( - shard_fp, - fp, - read_fn, - write_fn, - compute_fn, - do_overwrite=cfg.do_overwrite, - do_return=False, + iter_wrapper = load_tqdm(cfg.tqdm) + if not cfg.test: + hydra_loguru_init() + f_name_resolver = FileNameResolver(cfg) + # Produce ts representation + meds_shard_fps = f_name_resolver.list_meds_files() + # f_name_resolver.get_meds_dir() + feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + + # shuffle tasks + tabularization_tasks = list(product(meds_shard_fps, cfg.window_sizes, cfg.aggs)) + np.random.shuffle(tabularization_tasks) + + for shard_fp in iter_wrapper(meds_shard_fps): + static_fp = f_name_resolver.get_flat_static_rep(shard_fp.parent.stem, shard_fp.stem) + if static_fp.exists() and not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {static_fp} exists!") + + def read_fn(in_fp): + return pl.scan_parquet(in_fp) + + def compute_fn(shard_df): + return get_flat_static_rep( + feature_columns=feature_columns, + shard_df=shard_df, ) + def write_fn(data, out_df): + write_df(data, out_df, do_overwrite=cfg.do_overwrite) + + rwlock_wrap( + shard_fp, + static_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + if __name__ == "__main__": tabularize_static_data() diff --git a/scripts/tabularize_ts.py 
b/scripts/tabularize_ts.py deleted file mode 100644 index ae39595..0000000 --- a/scripts/tabularize_ts.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -"""Tabularizes time-series data in MEDS format into tabular representations.""" - -import hydra -import polars as pl -from loguru import logger -from omegaconf import DictConfig - -from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import load_tqdm, setup_environment, write_df - - -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def tabularize_ts_data( - cfg: DictConfig, -): - """Processes a medical dataset to generates and stores flat representatiosn of time-series data. - - This function handles MEDS format data and pivots tables to create two types of data files - with patient_id and timestamp indexes: - code data: containing a column for every code and 1 and 0 values indicating presence - value data: containing a column for every code which the numerical value observed. - - Args: - cfg: configuration dictionary containing the necessary parameters for tabularizing the data. - """ - iter_wrapper = load_tqdm(cfg.tqdm) - flat_dir, split_to_fp, feature_columns = setup_environment(cfg, load_data=False) - - # Produce ts representation - ts_subdir = flat_dir / "ts" - - for sp, shard_fps in split_to_fp.items(): - sp_dir = ts_subdir / sp - - for i, shard_fp in enumerate(iter_wrapper(shard_fps)): - out_fp = sp_dir / f"{i}.pkl" - - def read_fn(in_fp): - return pl.scan_parquet(in_fp) - - def compute_fn(shard_df): - return get_flat_ts_rep( - feature_columns=feature_columns, - shard_df=shard_df, - ) - - def write_fn(data, out_df): - write_df(data, out_df, do_overwrite=cfg.do_overwrite) - - rwlock_wrap( - shard_fp, - out_fp, - read_fn, - write_fn, - compute_fn, - do_overwrite=cfg.do_overwrite, - do_return=False, - ) - logger.info("Generated TS flat representations.") - - -if __name__ == "__main__": - tabularize_ts_data() diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py new file mode 100644 index 0000000..7c5b1cf --- /dev/null +++ b/src/MEDS_tabular_automl/file_name.py @@ -0,0 +1,72 @@ +"""Help functions for getting file names and paths for MEDS tabular automl tasks.""" +from pathlib import Path + +from omegaconf import DictConfig + + +class FileNameResolver: + def __init__(self, cfg: DictConfig): + self.cfg = cfg + self.meds_dir = Path(cfg.MEDS_cohort_dir) + self.tabularize_dir = Path(cfg.tabularized_data_dir) + + def get_meds_dir(self): + return self.meds_dir / "final_cohort" + + def get_static_dir(self): + return self.tabularize_dir / "static" + + def get_ts_dir(self): + return self.tabularize_dir / "ts" + + def get_sparse_dir(self): + return self.tabularize_dir / "sparse" + + def get_feature_columns_fp(self): + return self.tabularize_dir / "feature_columns.json" + + def get_feature_freqs_fp(self): + return self.tabularize_dir / "feature_freqs.json" + + def get_config_path(self): + return self.tabularize_dir / "config.yaml" + + def get_meds_shard(self, shard_num: int): + # Given a shard number, return the MEDS format data + return self.get_meds_dir() / f"{shard_num}.parquet" + + def get_flat_static_rep(self, split: str, shard_num: int): + # Given a shard number, returns the static representation path + return self.get_static_dir() / split / f"{shard_num}.parquet" + + def get_flat_ts_rep(self, split: str, shard_num: int, window_size: int, agg: str): + # Given 
a shard number, returns the time series representation path + return self.get_ts_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + + def get_flat_sparse_rep(self, split: str, shard_num: int, window_size: int, agg: str): + # Given a shard number, returns the sparse representation path + return self.get_sparse_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + + def list_meds_files(self, split=None): + # List all MEDS files + if split: + return sorted(list(self.get_meds_dir().glob(f"{split}/*.parquet"))) + return sorted(list(self.get_meds_dir().glob("*/*.parquet"))) + + def list_static_files(self, split=None): + # List all static files + if split: + return sorted(list(self.get_static_dir().glob(f"{split}/*.parquet"))) + return sorted(list(self.get_static_dir().glob("*/*.parquet"))) + + def list_ts_files(self, split=None): + # List all time series files + if split: + return sorted(list(self.get_ts_dir().glob(f"{split}/*/*/*/*.npz"))) + return sorted(list(self.get_ts_dir().glob("*/*/*/*/*.npz"))) + + def list_sparse_files(self, split=None): + # List all sparse files + if split: + return sorted(list(self.get_sparse_dir().glob(f"{split}/*/*.npz"))) + return sorted(list(self.get_sparse_dir().glob("*/*/*.npz"))) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 33812eb..c77cb14 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -127,8 +127,8 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): def get_rolling_window_indicies(index_df, window_size): """Get the indices for the rolling windows.""" if window_size == "full": - newest_date = df.select(pl.col("timestamp")).max().collect().item() - oldest_date = df.select(pl.col("timestamp")).min().collect().item() + newest_date = index_df.select(pl.col("timestamp")).max().collect().item() + oldest_date = index_df.select(pl.col("timestamp")).min().collect().item() timedelta = newest_date - oldest_date + pd.Timedelta(days=1) else: timedelta = pd.Timedelta(window_size) @@ -273,7 +273,7 @@ def _generate_summary( """ if agg not in VALID_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}. Valid options are: {VALID_AGGREGATIONS}") - out_matrix = compute_agg(index_df, sparse_matrix, window_size, agg, use_tqdm=use_tqdm) + out_matrix = compute_agg(index_df, matrix, window_size, agg, use_tqdm=use_tqdm) return out_matrix diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 92027c4..2da18b0 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -6,22 +6,33 @@ DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. """ import json +import os from collections.abc import Mapping from pathlib import Path +import hydra import numpy as np -import pandas as pd import polars as pl import polars.selectors as cs import yaml from loguru import logger from omegaconf import DictConfig, OmegaConf +from scipy.sparse import coo_array DF_T = pl.LazyFrame WRITE_USE_PYARROW = True ROW_IDX_NAME = "__row_idx" +def hydra_loguru_init() -> None: + """Adds loguru output to the logs that hydra scrapes. + + Must be called from a hydra main! 
+ """ + hydra_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir + logger.add(os.path.join(hydra_path, "main.log")) + + def load_tqdm(use_tqdm): if use_tqdm: from tqdm import tqdm @@ -42,7 +53,28 @@ def parse_static_feature_column(c: str) -> tuple[str, str, str, str]: return ("/".join(parts[:-2]), parts[-2], parts[-1]) -def write_df(df: DF_T, fp: Path, **kwargs): +def array_to_sparse_matrix(array: np.ndarray, shape: tuple[int, int]): + assert array.shape[0] == 3 + data, row, col = array + return coo_array((data, (row, col)), shape=shape) + + +def sparse_matrix_to_array(coo_matrix: coo_array): + return np.array([coo_matrix.data, coo_matrix.row, coo_matrix.col]), coo_matrix.shape + + +def store_matrix(coo_matrix: coo_array, fp_path: Path): + array, shape = sparse_matrix_to_array(coo_matrix) + np.savez(fp_path, array=array, shape=shape) + + +def load_matrix(fp_path: Path): + npzfile = np.load(fp_path) + array, shape = npzfile["array"], npzfile["shape"] + return array_to_sparse_matrix(array, shape) + + +def write_df(df: coo_array, fp: Path, **kwargs): """Write shard to disk.""" do_overwrite = kwargs.get("do_overwrite", False) @@ -55,16 +87,10 @@ def write_df(df: DF_T, fp: Path, **kwargs): df.collect().write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) elif isinstance(df, pl.DataFrame): df.write_parquet(fp, use_pyarrow=WRITE_USE_PYARROW) - elif isinstance(df, pd.DataFrame): - if not all(df.columns[:2] == ["patient_id", "timestamp"]): - raise ValueError( - f"Expected DataFrame to have columns ['patient_id', 'timestamp'], got {df.columns[:2]}" - ) - df.to_pickle(fp) - elif isinstance(df, np.matrix): - np.save(fp, df) + elif isinstance(df, coo_array): + store_matrix(df, fp) else: - raise ValueError(f"Unsupported type for df: {type(df)}") + raise TypeError(f"Unsupported type for df: {type(df)}") def get_static_col_dtype(col: str) -> pl.DataType: diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index ed6988c..5755c98 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -3,21 +3,19 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import json -import shutil import tempfile from io import StringIO from pathlib import Path -import pandas as pd import polars as pl from hydra import compose, initialize from loguru import logger +from MEDS_tabular_automl.file_name import FileNameResolver +from MEDS_tabular_automl.utils import load_matrix from scripts.identify_columns import store_columns from scripts.summarize_over_windows import summarize_ts_data_over_windows -from scripts.tabularize_merge import merge_data from scripts.tabularize_static import tabularize_static_data -from scripts.tabularize_ts import tabularize_ts_data SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -104,80 +102,64 @@ } SUMMARIZE_EXPECTED_FILES = [ - "train/365d/value/sum/0.pkl", - "train/365d/value/sum/1.pkl", - "train/365d/code/count/0.pkl", - "train/365d/code/count/1.pkl", - "train/full/value/sum/0.pkl", - "train/full/value/sum/1.pkl", - "train/full/code/count/0.pkl", - "train/full/code/count/1.pkl", - "train/30d/value/sum/0.pkl", - "train/30d/value/sum/1.pkl", - "train/30d/code/count/0.pkl", - "train/30d/code/count/1.pkl", - "held_out/365d/value/sum/0.pkl", - "held_out/365d/code/count/0.pkl", - "held_out/full/value/sum/0.pkl", - "held_out/full/code/count/0.pkl", - "held_out/30d/value/sum/0.pkl", - "held_out/30d/code/count/0.pkl", - 
"tuning/365d/value/sum/0.pkl", - "tuning/365d/code/count/0.pkl", - "tuning/full/value/sum/0.pkl", - "tuning/full/code/count/0.pkl", - "tuning/30d/value/sum/0.pkl", - "tuning/30d/code/count/0.pkl", + "train/1/365d/value/sum.npz", + "train/1/365d/code/count.npz", + "train/1/full/value/sum.npz", + "train/1/full/code/count.npz", + "train/1/30d/value/sum.npz", + "train/1/30d/code/count.npz", + "train/0/365d/value/sum.npz", + "train/0/365d/code/count.npz", + "train/0/full/value/sum.npz", + "train/0/full/code/count.npz", + "train/0/30d/value/sum.npz", + "train/0/30d/code/count.npz", + "held_out/0/365d/value/sum.npz", + "held_out/0/365d/code/count.npz", + "held_out/0/full/value/sum.npz", + "held_out/0/full/code/count.npz", + "held_out/0/30d/value/sum.npz", + "held_out/0/30d/code/count.npz", + "tuning/0/365d/value/sum.npz", + "tuning/0/365d/code/count.npz", + "tuning/0/full/value/sum.npz", + "tuning/0/full/code/count.npz", + "tuning/0/30d/value/sum.npz", + "tuning/0/30d/code/count.npz", ] MERGE_EXPECTED_FILES = [ - "train/365d/value/sum/0.npy", - "train/365d/value/sum/1.npy", - "train/365d/code/count/0.npy", - "train/365d/code/count/1.npy", - "train/full/value/sum/0.npy", - "train/full/value/sum/1.npy", - "train/full/code/count/0.npy", - "train/full/code/count/1.npy", - "train/30d/value/sum/0.npy", - "train/30d/value/sum/1.npy", - "train/30d/code/count/0.npy", - "train/30d/code/count/1.npy", - "held_out/365d/value/sum/0.npy", - "held_out/365d/code/count/0.npy", - "held_out/full/value/sum/0.npy", - "held_out/full/code/count/0.npy", - "held_out/30d/value/sum/0.npy", - "held_out/30d/code/count/0.npy", - "tuning/365d/value/sum/0.npy", - "tuning/365d/code/count/0.npy", - "tuning/full/value/sum/0.npy", - "tuning/full/code/count/0.npy", - "tuning/30d/value/sum/0.npy", - "tuning/30d/code/count/0.npy", + "train/365d/value/sum/0.npz", + "train/365d/value/sum/1.npz", + "train/365d/code/count/0.npz", + "train/365d/code/count/1.npz", + "train/full/value/sum/0.npz", + "train/full/value/sum/1.npz", + "train/full/code/count/0.npz", + "train/full/code/count/1.npz", + "train/30d/value/sum/0.npz", + "train/30d/value/sum/1.npz", + "train/30d/code/count/0.npz", + "train/30d/code/count/1.npz", + "held_out/365d/value/sum/0.npz", + "held_out/365d/code/count/0.npz", + "held_out/full/value/sum/0.npz", + "held_out/full/code/count/0.npz", + "held_out/30d/value/sum/0.npz", + "held_out/30d/code/count/0.npz", + "tuning/365d/value/sum/0.npz", + "tuning/365d/code/count/0.npz", + "tuning/full/value/sum/0.npz", + "tuning/full/code/count/0.npz", + "tuning/30d/value/sum/0.npz", + "tuning/30d/code/count/0.npz", ] def test_tabularize(): with tempfile.TemporaryDirectory() as d: - MEDS_cohort_dir = Path(d) / "MEDS_cohort" - tabularized_data_dir = Path(d) / "flat_reps" - - # Create the directories - MEDS_cohort_dir.mkdir() - - # Store MEDS outputs - for split, data in MEDS_OUTPUTS.items(): - file_path = MEDS_cohort_dir / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) - df = pl.read_csv(StringIO(data)) - df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S.%f")).write_parquet( - file_path - ) - - split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = MEDS_cohort_dir / "splits.json" - json.dump(split_json, splits_fp.open("w")) + MEDS_cohort_dir = Path(d) / "processed" + tabularized_data_dir = Path(d) / "processed" / "tabularize" tabularize_config_kwargs = { "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), @@ -198,50 +180,61 @@ def test_tabularize(): with initialize(version_base=None, 
config_path="../configs/"): # path to config.yaml overrides = [f"{k}={v}" for k, v in tabularize_config_kwargs.items()] cfg = compose(config_name="tabularize", overrides=overrides) # config.yaml + + f_name_resolver = FileNameResolver(cfg) + + # Create the directories + (MEDS_cohort_dir / "final_cohort").mkdir(parents=True, exist_ok=True) + + # Store MEDS outputs + for split, data in MEDS_OUTPUTS.items(): + file_path = MEDS_cohort_dir / "final_cohort" / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True) + df = pl.read_csv(StringIO(data)) + df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S.%f")).write_parquet( + file_path + ) + + # Check the files are not empty + meds_files = f_name_resolver.list_meds_files() + assert len(meds_files) == 4, "MEDS Data Files Should be 4!" + for f in meds_files: + assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" + + split_json = json.load(StringIO(SPLITS_JSON)) + splits_fp = MEDS_cohort_dir / "splits.json" + json.dump(split_json, splits_fp.open("w")) logger.info("caching flat representation of MEDS data") store_columns(cfg) assert (tabularized_data_dir / "config.yaml").is_file() assert (tabularized_data_dir / "feature_columns.json").is_file() assert (tabularized_data_dir / "feature_freqs.json").is_file() tabularize_static_data(cfg) - actual_files = [ - (f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("static/*/*.parquet")) - ] + actual_files = [(f.parent.stem, f.stem) for f in f_name_resolver.list_static_files()] expected_files = [("train", "1"), ("train", "0"), ("held_out", "0"), ("tuning", "0")] + f_name_resolver.get_static_dir() assert set(actual_files) == set(expected_files) # Check the files are not empty for f in list(tabularized_data_dir.glob("static/*/*.parquet")): assert pl.read_parquet(f).shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" - tabularize_ts_data(cfg) - # confirm the time series files exist: - actual_files = [(f.parent.stem, f.stem) for f in list(tabularized_data_dir.glob("ts/*/*.pkl"))] - expected_files = [ - ("train", "1"), - ("train", "0"), - ("held_out", "0"), - ("tuning", "0"), - ] - assert set(actual_files) == set(expected_files) - for f in list(tabularized_data_dir.glob("ts/*/*.pkl")): - assert pd.read_pickle(f).shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" 
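# --- Illustrative sketch (not part of the patch; toy matrix and a temporary file) ---
# The replacement assertions a few lines below read each summary shard back with `load_matrix` and
# check its shape. That relies on the npz convention added to utils.py earlier in this patch: a COO
# matrix is written as a 3xN (data, row, col) array together with its shape. A standalone round
# trip of that convention:
import tempfile
from pathlib import Path

import numpy as np
from scipy.sparse import coo_array

original = coo_array(
    (np.array([1.0, 2.0, 3.0]), (np.array([0, 1, 3]), np.array([2, 0, 1]))), shape=(4, 3)
)

with tempfile.TemporaryDirectory() as tmp:
    fp = Path(tmp) / "shard.npz"
    # Save: pack the (data, row, col) triplets into one array and store the shape alongside it.
    np.savez(fp, array=np.array([original.data, original.row, original.col]), shape=original.shape)
    # Load: rebuild the COO matrix from the stored triplets and shape.
    npz = np.load(fp)
    data, row, col = npz["array"]
    restored = coo_array((data, (row.astype(int), col.astype(int))), shape=tuple(npz["shape"]))
    assert (restored.toarray() == original.toarray()).all()
# -------------------------------------------------------------------------------------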
- shutil.rmtree(tabularized_data_dir / "ts") - summarize_ts_data_over_windows(cfg) # confirm summary files exist: - output_files = list(tabularized_data_dir.glob("ts/*/*/*/*/*.pkl")) + output_files = list(tabularized_data_dir.glob("ts/*/*/*/*/*.npz")) + f_name_resolver.list_ts_files() actual_files = [str(Path(*f.parts[-5:])) for f in output_files] assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) for f in output_files: - df = pd.read_pickle(f) - assert df.shape[0] > 0 - - merge_data(cfg) - output_files = list(tabularized_data_dir.glob("sparse/*/*/*/*/*.npy")) - actual_files = [str(Path(*f.parts[-5:])) for f in output_files] - assert set(actual_files) == set(MERGE_EXPECTED_FILES) + sparse_array = load_matrix(f) + assert sparse_array.shape[0] > 0 + assert sparse_array.shape[1] > 0 + + # merge_data(cfg) + # output_files = list(tabularized_data_dir.glob("sparse/*/*/*/*/*.npz")) + # actual_files = [str(Path(*f.parts[-5:])) for f in output_files] + # assert set(actual_files) == set(MERGE_EXPECTED_FILES) # model_dir = Path(d) / "save_model" # xgboost_config_kwargs = { From 23a2e3b8874681e7bd664ef2340bdf8acabf42e5 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 07:24:20 +0000 Subject: [PATCH 052/106] cleaned up file paths so we can load all aggregations selectively and concatenate them. Now the get_feature_names and get_feature_indices functions will load the feature names and indices of the columns in the sparse matrix we generate for a given aggregation --- scripts/summarize_over_windows.py | 15 +- scripts/tabularize_merge.py | 151 -------------- scripts/tabularize_static.py | 18 +- scripts/xgboost_sweep.py | 3 +- src/MEDS_tabular_automl/file_name.py | 12 +- .../generate_static_features.py | 185 ++++++++++++------ .../generate_summarized_reps.py | 68 ++++--- .../generate_ts_features.py | 62 +++--- src/MEDS_tabular_automl/utils.py | 82 +++++--- tests/test_tabularize.py | 65 ++++-- 10 files changed, 324 insertions(+), 337 deletions(-) delete mode 100644 scripts/tabularize_merge.py diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index fb9f4c5..070232f 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -14,7 +14,13 @@ from MEDS_tabular_automl.generate_summarized_reps import generate_summary from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import hydra_loguru_init, load_tqdm, write_df +from MEDS_tabular_automl.utils import ( + STATIC_CODE_AGGREGATION, + STATIC_VALUE_AGGREGATION, + hydra_loguru_init, + load_tqdm, + write_df, +) @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") @@ -56,7 +62,8 @@ def summarize_ts_data_over_windows( feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) # shuffle tasks - tabularization_tasks = list(product(meds_shard_fps, cfg.window_sizes, cfg.aggs)) + aggs = [agg for agg in cfg.aggs if agg not in [STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION]] + tabularization_tasks = list(product(meds_shard_fps, cfg.window_sizes, aggs)) np.random.shuffle(tabularization_tasks) # iterate through them @@ -73,7 +80,7 @@ def read_fn(fp): def compute_fn(shard_df): # Load Sparse DataFrame - index_df, sparse_matrix = get_flat_ts_rep(feature_columns, shard_df) + index_df, sparse_matrix = get_flat_ts_rep(agg, feature_columns, shard_df) # Summarize data -- applying aggregations on a specific window size + aggregation 
combination summary_df = generate_summary( @@ -83,7 +90,7 @@ def compute_fn(shard_df): window_size, agg, ) - assert summary_df.shape[1] > 2, "No data found in the summarized dataframe" + assert summary_df.shape[1] > 0, "No data found in the summarized dataframe" logger.info("Writing pivot file") return summary_df diff --git a/scripts/tabularize_merge.py b/scripts/tabularize_merge.py deleted file mode 100644 index 084ce58..0000000 --- a/scripts/tabularize_merge.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python -"""Tabularizes time-series data in MEDS format into tabular representations.""" -from pathlib import Path - -import hydra -import numpy as np -import pandas as pd -import polars as pl -from loguru import logger -from omegaconf import DictConfig -from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, hstack - -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import load_tqdm, setup_environment, write_df - - -def merge_dfs(feature_columns, static_df, ts_df): - """Merges static and time-series dataframes. - - This function merges the static and time-series dataframes based on the patient_id column. - - Args: - - feature_columns (List[str]): A list of feature columns to include in the merged dataframe. - - static_df (pd.DataFrame): A dataframe containing static features. - - ts_df (pd.DataFrame): A dataframe containing time-series features. - - Returns: - - pd.DataFrame: A merged dataframe containing static and time-series features. - """ - # TODO - store static and ts data as numpy matrices - # TODO - Eventually do this duplication at the task specific stage after filtering patients and features - # Make static data sparse and merge it with the time-series data - logger.info("Make static data sparse and merge it with the time-series data") - assert static_df.patient_id.is_monotonic_increasing - assert ts_df.patient_id.is_monotonic_increasing - sparse_time_series = ts_df.drop(columns=["patient_id", "timestamp"]).sparse.to_coo() - - num_patients = max(static_df.patient_id.nunique(), ts_df.patient_id.nunique()) - - # load static data as sparse matrix - static_matrix = static_df.drop(columns="patient_id").values - data_list = [] - rows = [] - cols = [] - for row in range(static_matrix.shape[0]): - for col in range(static_matrix.shape[1]): - data = static_matrix[row, col] - if (data is not None) and (data != 0): - data_list.append(data) - rows.append(row) - cols.append(col) - static_matrix = csr_matrix((data_list, (rows, cols)), shape=(num_patients, static_matrix.shape[1])) - # Duplicate static matrix rows to match time-series data - duplication_index = ts_df["patient_id"].value_counts().sort_index().reset_index(drop=True) - reindex_slices = np.repeat(duplication_index.index.values, duplication_index.values) - static_matrix = static_matrix[reindex_slices, :] - - # TODO: fix naming convention, we are generating value rows with zero frequency so remove those - ts_columns = ["/".join(c.split("/")[1:-1]) for c in ts_df.columns] - sparse_columns = ts_columns + list(static_df.columns) - - # Convert to sparse matrix and remove 0 frequency columns (i.e. columns not in feature_columns) - logger.info( - "Convert to sparse matrix and remove 0 frequency columns (i.e. 
columns not in feature_columns)" - ) - set_sparse_cols = set(sparse_columns) - missing_columns = [col for col in feature_columns if col not in set_sparse_cols] - - # reorder columns to be in order of feature_columns - logger.info("Reorder columns to be in order of feature_columns") - final_sparse_matrix = hstack( - [sparse_time_series, static_matrix, coo_matrix((sparse_time_series.shape[0], len(missing_columns)))] - ) - index_map = {name: index for index, name in enumerate(feature_columns)} - reverse_map = [index_map[col] for col in feature_columns] - final_sparse_matrix = coo_matrix(csc_matrix(final_sparse_matrix)[:, reverse_map]) - - # convert to np matrix of data, row, col - logger.info(f"Final sparse matrix shape: {final_sparse_matrix.shape}") - data, row, col = final_sparse_matrix.data, final_sparse_matrix.row, final_sparse_matrix.col - final_matrix = np.matrix([data, row, col]) - return final_matrix - - -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def merge_data( - cfg: DictConfig, -): - """Processes a medical dataset to generates and stores flat representatiosn of time-series data. - - This function handles MEDS format data and pivots tables to create two types of data files - with patient_id and timestamp indexes: - code data: containing a column for every code and 1 and 0 values indicating presence - value data: containing a column for every code which the numerical value observed. - - Args: - cfg: configuration dictionary containing the necessary parameters for tabularizing the data. - """ - iter_wrapper = load_tqdm(cfg.tqdm) - flat_dir, split_to_fp, feature_columns = setup_environment(cfg, load_data=False) - med_dir = Path(cfg.tabularized_data_dir) - ts_dir = med_dir / "ts" - static_dir = med_dir / "static" - shard_fps = list(ts_dir.glob("*/*/*/*/*.pkl")) - - # Produce ts representation - out_subdir = flat_dir / "sparse" - - for shard_fp in iter_wrapper(shard_fps): - split = shard_fp.parts[-5] - in_ts_fp = shard_fp - assert in_ts_fp.exists(), f"{in_ts_fp} does not exist!" - in_static_fp = static_dir / split / f"{shard_fp.stem}.parquet" - assert in_static_fp.exists(), f"{in_static_fp} does not exist!" 
- out_fp = out_subdir / "/".join(shard_fp.parts[-5:-1]) / f"{shard_fp.stem}" - out_fp.parent.mkdir(parents=True, exist_ok=True) - - def read_fn(in_fps): - in_static_fp, in_ts_fp = in_fps - static_df = pl.read_parquet(in_static_fp) - ts_df = pd.read_pickle(in_ts_fp) - return [static_df, ts_df] - - def compute_fn(shards): - static_df, shard_df = shards - return merge_dfs( - feature_columns=feature_columns, - static_df=static_df.to_pandas(), - ts_df=shard_df, - ) - - def write_fn(data, out_df): - write_df(data, out_df, do_overwrite=cfg.do_overwrite) - - in_fps = in_static_fp, in_ts_fp - logger.info(f"Processing {in_static_fp} and\n{in_ts_fp}") - logger.info(f"Writing to {out_fp}...") - rwlock_wrap( - in_fps, - out_fp, - read_fn, - write_fn, - compute_fn, - do_overwrite=cfg.do_overwrite, - do_return=False, - ) - logger.info("Generated TS flat representations.") - - -if __name__ == "__main__": - merge_data() diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index d5ba698..abe6ea1 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -13,7 +13,13 @@ from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import hydra_loguru_init, load_tqdm, write_df +from MEDS_tabular_automl.utils import ( + STATIC_CODE_AGGREGATION, + STATIC_VALUE_AGGREGATION, + hydra_loguru_init, + load_tqdm, + write_df, +) pl.enable_string_cache() @@ -110,11 +116,14 @@ def tabularize_static_data( feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) # shuffle tasks - tabularization_tasks = list(product(meds_shard_fps, cfg.window_sizes, cfg.aggs)) + static_aggs = [agg for agg in cfg.aggs if agg in [STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION]] + tabularization_tasks = list(product(meds_shard_fps, static_aggs)) np.random.shuffle(tabularization_tasks) - for shard_fp in iter_wrapper(meds_shard_fps): - static_fp = f_name_resolver.get_flat_static_rep(shard_fp.parent.stem, shard_fp.stem) + for shard_fp, agg in iter_wrapper(tabularization_tasks): + static_fp = f_name_resolver.get_flat_static_rep( + shard_fp.parent.stem, shard_fp.stem, agg.split("/")[-1] + ) if static_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {static_fp} exists!") @@ -123,6 +132,7 @@ def read_fn(in_fp): def compute_fn(shard_df): return get_flat_static_rep( + agg=agg, feature_columns=feature_columns, shard_df=shard_df, ) diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index a872307..a149ce2 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -196,7 +196,8 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: files = self.dynamic_data_path.glob(shard_pattern) valid_files = sorted(file for file in files if self._filter_shard_files_on_window_and_aggs(file)) dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in valid_files] - combined_csr = sp.hstack(dynamic_csrs, format="csr") + combined_csr = sp.hstack(dynamic_csrs, format="csr") # TODO: check this + # Filter Rows valid_indices = self.valid_event_ids[shard_name] return combined_csr[valid_indices, :] diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 7c5b1cf..bfab7a3 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -31,13 +31,13 @@ def 
get_feature_freqs_fp(self): def get_config_path(self): return self.tabularize_dir / "config.yaml" - def get_meds_shard(self, shard_num: int): + def get_meds_shard(self, split: str, shard_num: int): # Given a shard number, return the MEDS format data - return self.get_meds_dir() / f"{shard_num}.parquet" + return self.get_meds_dir() / split / f"{shard_num}.parquet" - def get_flat_static_rep(self, split: str, shard_num: int): + def get_flat_static_rep(self, split: str, shard_num: int, agg: str): # Given a shard number, returns the static representation path - return self.get_static_dir() / split / f"{shard_num}.parquet" + return self.get_static_dir() / split / f"{shard_num}" / f"{agg}.npz" def get_flat_ts_rep(self, split: str, shard_num: int, window_size: int, agg: str): # Given a shard number, returns the time series representation path @@ -56,8 +56,8 @@ def list_meds_files(self, split=None): def list_static_files(self, split=None): # List all static files if split: - return sorted(list(self.get_static_dir().glob(f"{split}/*.parquet"))) - return sorted(list(self.get_static_dir().glob("*/*.parquet"))) + return sorted(list(self.get_static_dir().glob(f"{split}/*/*.npz"))) + return sorted(list(self.get_static_dir().glob("*/*/*.npz"))) def list_ts_files(self, split=None): # List all time series files diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 3786ffa..192f1e3 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -8,19 +8,82 @@ - get_flat_static_rep: Produces a tabular representation of static data features. """ +import numpy as np import polars as pl +from loguru import logger +from scipy.sparse import coo_array, csr_array from MEDS_tabular_automl.utils import ( DF_T, - add_static_missing_cols, + STATIC_CODE_AGGREGATION, + STATIC_VALUE_AGGREGATION, + get_events_df, + get_feature_names, parse_static_feature_column, ) -STATIC_CODE_COL = "/static/present" -STATIC_VALUE_COL = "/static/first" + +def convert_to_matrix(df, num_events, num_features): + """Converts a Polars DataFrame to a sparse matrix.""" + dense_matrix = df.drop(columns="patient_id").collect().to_numpy() + data_list = [] + rows = [] + cols = [] + for row in range(dense_matrix.shape[0]): + for col in range(dense_matrix.shape[1]): + data = dense_matrix[row, col] + if (data is not None) and (data != 0): + data_list.append(data) + rows.append(row) + cols.append(col) + matrix = csr_array((data_list, (rows, cols)), shape=(num_events, num_features)) + return matrix + + +def get_sparse_static_rep(static_features, static_df, meds_df, feature_columns) -> coo_array: + """Merges static and time-series dataframes. + + This function merges the static and time-series dataframes based on the patient_id column. + + Args: + - feature_columns (List[str]): A list of feature columns to include in the merged dataframe. + - static_df (pd.DataFrame): A dataframe containing static features. + - ts_df (pd.DataFrame): A dataframe containing time-series features. + + Returns: + - pd.DataFrame: A merged dataframe containing static and time-series features. 
+ """ + # TODO - Eventually do this duplication at the task specific stage after filtering patients and features + # Make static data sparse and merge it with the time-series data + logger.info("Make static data sparse and merge it with the time-series data") + # Check static_df is sorted and unique + assert static_df.select(pl.col("patient_id")).collect().to_series().is_sorted() + assert ( + static_df.select(pl.len()).collect().item() + == static_df.select(pl.col("patient_id").n_unique()).collect().item() + ) + meds_df = get_events_df(meds_df, feature_columns) + + # load static data as sparse matrix + static_matrix = convert_to_matrix( + static_df, num_events=meds_df.select(pl.len()).collect().item(), num_features=len(static_features) + ) + # Duplicate static matrix rows to match time-series data + events_per_patient = ( + meds_df.select(pl.col("patient_id").value_counts()) + .unnest("patient_id") + .sort(by="patient_id") + .select(pl.col("count")) + .collect() + .to_series() + ) + reindex_slices = np.repeat(range(len(events_per_patient)), events_per_patient) + static_matrix = static_matrix[reindex_slices, :] + return coo_array(static_matrix) def summarize_static_measurements( + agg: str, feature_columns: list[str], df: DF_T, ) -> pl.LazyFrame: @@ -39,58 +102,63 @@ def summarize_static_measurements( or simply as present, then performs a pivot to reshape the data for each patient, providing a tabular format where each row represents a patient and each column represents a static feature. """ - static_present = [c for c in feature_columns if c.endswith(STATIC_CODE_COL)] - static_first = [c for c in feature_columns if c.endswith(STATIC_VALUE_COL)] - - # Handling 'first' static values - static_first_codes = [parse_static_feature_column(c)[0] for c in static_first] - code_subset = df.filter(pl.col("code").is_in(static_first_codes)) - first_code_subset = code_subset.group_by(pl.col("patient_id")).first().collect() - static_value_pivot_df = first_code_subset.pivot( - index=["patient_id"], columns=["code"], values=["numerical_value"], aggregate_function=None - ) - # rename code to feature name - remap_cols = { - input_name: output_name - for input_name, output_name in zip(static_first_codes, static_first) - if input_name in static_value_pivot_df.columns - } - static_value_pivot_df = static_value_pivot_df.select( - *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] - ) - # pivot can be faster: https://stackoverflow.com/questions/73522017/replacing-a-pivot-with-a-lazy-groupby-operation # noqa: E501 - # TODO: consider casting with .cast(pl.Float32)) - - # Handling 'present' static indicators - static_present_codes = [parse_static_feature_column(c)[0] for c in static_present] - static_present_pivot_df = ( - df.select(*["patient_id", "code"]) - .filter(pl.col("code").is_in(static_present_codes)) - .with_columns(pl.lit(True).alias("__indicator")) - .collect() - .pivot( - index=["patient_id"], - columns=["code"], - values="__indicator", - aggregate_function=None, + if agg == STATIC_VALUE_AGGREGATION: + static_features = get_feature_names(agg=agg, feature_columns=feature_columns) + # Handling 'first' static values + static_first_codes = [parse_static_feature_column(c)[0] for c in static_features] + code_subset = df.filter(pl.col("code").is_in(static_first_codes)) + first_code_subset = code_subset.group_by(pl.col("patient_id")).first().collect() + static_value_pivot_df = first_code_subset.pivot( + index=["patient_id"], columns=["code"], values=["numerical_value"], 
aggregate_function=None ) - ) - remap_cols = { - input_name: output_name - for input_name, output_name in zip(static_present_codes, static_present) - if input_name in static_present_pivot_df.columns - } - # rename columns to final feature names - static_present_pivot_df = static_present_pivot_df.select( - *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] - ) - return pl.concat([static_value_pivot_df, static_present_pivot_df], how="align") + # rename code to feature name + remap_cols = { + input_name: output_name + for input_name, output_name in zip(static_first_codes, static_features) + if input_name in static_value_pivot_df.columns + } + static_value_pivot_df = static_value_pivot_df.select( + *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ).sort(by="patient_id") + # pivot can be faster: https://stackoverflow.com/questions/73522017/replacing-a-pivot-with-a-lazy-groupby-operation # noqa: E501 + # TODO: consider casting with .cast(pl.Float32)) + return static_value_pivot_df + elif agg == STATIC_CODE_AGGREGATION: + static_features = get_feature_names(agg=agg, feature_columns=feature_columns) + # Handling 'present' static indicators + static_present_codes = [parse_static_feature_column(c)[0] for c in static_features] + static_present_pivot_df = ( + df.select(*["patient_id", "code"]) + .filter(pl.col("code").is_in(static_present_codes)) + .with_columns(pl.lit(True).alias("__indicator")) + .collect() + .pivot( + index=["patient_id"], + columns=["code"], + values="__indicator", + aggregate_function=None, + ) + .sort(by="patient_id") + ) + remap_cols = { + input_name: output_name + for input_name, output_name in zip(static_present_codes, static_features) + if input_name in static_present_pivot_df.columns + } + # rename columns to final feature names + static_present_pivot_df = static_present_pivot_df.select( + *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ) + return static_present_pivot_df + else: + raise ValueError(f"Invalid aggregation type: {agg}") def get_flat_static_rep( + agg: str, feature_columns: list[str], shard_df: DF_T, -) -> pl.LazyFrame: +) -> coo_array: """Produces a raw representation for static data from a specified shard DataFrame. Parameters: @@ -104,14 +172,11 @@ def get_flat_static_rep( _summarize_static_measurements, and then normalizes the resulting data to ensure it is suitable for further analysis or machine learning tasks. 
""" - static_features = [ - c for c in feature_columns if c.endswith(STATIC_CODE_COL) or c.endswith(STATIC_VALUE_COL) - ] - static_measurements = summarize_static_measurements(static_features, df=shard_df) - # fill up missing feature columns with nulls - normalized_measurements = add_static_missing_cols( - static_measurements, - static_features, - set_count_0_to_null=False, - ) - return normalized_measurements + static_features = get_feature_names(agg=agg, feature_columns=feature_columns) + static_measurements = summarize_static_measurements(agg, static_features, df=shard_df) + # convert to sparse_matrix + matrix = get_sparse_static_rep(static_features, static_measurements.lazy(), shard_df, feature_columns) + assert matrix.shape[1] == len( + static_features + ), f"Expected {len(static_features)} features, got {matrix.shape[1]}" + return matrix diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index c77cb14..d0d85c6 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -8,23 +8,8 @@ from loguru import logger from scipy.sparse import coo_array, csr_array, sparray, vstack -from MEDS_tabular_automl.generate_ts_features import get_ts_columns -from MEDS_tabular_automl.utils import load_tqdm - -CODE_AGGREGATIONS = [ - "code/count", -] - -VALUE_AGGREGATIONS = [ - "value/count", - "value/has_values_count", - "value/sum", - "value/sum_sqd", - "value/min", - "value/max", -] - -VALID_AGGREGATIONS = CODE_AGGREGATIONS + VALUE_AGGREGATIONS +from MEDS_tabular_automl.generate_ts_features import get_feature_names, get_flat_ts_rep +from MEDS_tabular_automl.utils import CODE_AGGREGATIONS, VALUE_AGGREGATIONS, load_tqdm def time_aggd_col_alias_fntr(window_size: str, agg: str) -> Callable[[str], str]: @@ -141,7 +126,7 @@ def get_rolling_window_indicies(index_df, window_size): ) -def aggregate_matrix(windows, matrix, agg, use_tqdm=False): +def aggregate_matrix(windows, matrix, agg, num_features, use_tqdm=False): """Aggregate the matrix based on the windows.""" tqdm = load_tqdm(use_tqdm) agg = agg.split("/")[-1] @@ -162,11 +147,15 @@ def aggregate_matrix(windows, matrix, agg, use_tqdm=False): data.append(agg_matrix[nozero_ind]) row.append(np.repeat(np.array(i, dtype=np.int32), len(nozero_ind))) row = np.concatenate(row) - out_matrix = coo_array((np.concatenate(data), (row, np.concatenate(col))), dtype=out_dtype) + out_matrix = coo_array( + (np.concatenate(data), (row, np.concatenate(col))), + dtype=out_dtype, + shape=(windows.shape[0], num_features), + ) return csr_array(out_matrix) -def compute_agg(index_df, matrix: sparray, window_size: str, agg: str, use_tqdm=False): +def compute_agg(index_df, matrix: sparray, window_size: str, agg: str, num_features: int, use_tqdm=False): """Applies aggreagtion to dataframe. 
Dataframe is expected to only have the relevant columns for aggregating @@ -216,7 +205,6 @@ def compute_agg(index_df, matrix: sparray, window_size: str, agg: str, use_tqdm= patient_id int64 dtype: object """ - logger.info("Step 1: Grouping by same (patient_ids, timestamps) and aggregating") group_df = ( index_df.with_row_index("index") .group_by(["patient_id", "timestamp"], maintain_order=True) @@ -226,16 +214,22 @@ def compute_agg(index_df, matrix: sparray, window_size: str, agg: str, use_tqdm= index_df = group_df.lazy().select(pl.col("patient_id", "timestamp")) windows = group_df.select(pl.col("min_index", "max_index")) logger.info("Step 1.5: Running sparse aggregation.") - matrix = aggregate_matrix(windows, matrix, agg, use_tqdm) + matrix = aggregate_matrix(windows, matrix, agg, num_features, use_tqdm) logger.info("Step 2: computing rolling windows and aggregating.") windows = get_rolling_window_indicies(index_df, window_size) logger.info("Starting final sparse aggregations.") - matrix = aggregate_matrix(windows, matrix, agg, use_tqdm) + matrix = aggregate_matrix(windows, matrix, agg, num_features, use_tqdm) return matrix def _generate_summary( - ts_columns: list[str], index_df: pd.DataFrame, matrix: sparray, window_size: str, agg: str, use_tqdm=False + ts_columns: list[str], + index_df: pd.DataFrame, + matrix: sparray, + window_size: str, + agg: str, + num_features, + use_tqdm=False, ) -> pl.LazyFrame: """Generate a summary of the data frame for a given window size and aggregation. @@ -271,9 +265,11 @@ def _generate_summary( 2 3 2 0 2021-01-01 1 0 0 2 0 2021-01-04 2 """ - if agg not in VALID_AGGREGATIONS: - raise ValueError(f"Invalid aggregation: {agg}. Valid options are: {VALID_AGGREGATIONS}") - out_matrix = compute_agg(index_df, matrix, window_size, agg, use_tqdm=use_tqdm) + if agg not in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: + raise ValueError( + f"Invalid aggregation: {agg}. Valid options are: {CODE_AGGREGATIONS + VALUE_AGGREGATIONS}" + ) + out_matrix = compute_agg(index_df, matrix, window_size, agg, num_features, use_tqdm=use_tqdm) return out_matrix @@ -332,15 +328,18 @@ def generate_summary( 2 NaN NaN 0 0 NaN NaN 0 """ - logger.info("Sorting sparse dataframe by patient_id and timestamp") assert len(feature_columns), "feature_columns must be a non-empty list" - ts_columns = get_ts_columns(feature_columns) + ts_columns = get_feature_names(agg, feature_columns) # Generate summaries for each window size and aggregation - code_type, agg_name = agg.split("/") + code_type, _ = agg.split("/") # only iterate through code_types that exist in the dataframe columns assert any([c.endswith(code_type) for c in ts_columns]) - logger.info(f"Generating aggregation {agg} for window_size {window_size}") - out_matrix = _generate_summary(ts_columns, index_df, matrix, window_size, agg, use_tqdm=use_tqdm) + logger.info( + f"Generating aggregation {agg} for window_size {window_size}, with {len(ts_columns)} columns." 
+ ) + out_matrix = _generate_summary( + ts_columns, index_df, matrix, window_size, agg, len(ts_columns), use_tqdm=use_tqdm + ) return out_matrix @@ -348,8 +347,6 @@ def generate_summary( import json from pathlib import Path - from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep - feature_columns = json.load( open( Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize") / "feature_columns.json" @@ -361,7 +358,8 @@ def generate_summary( / "train" / "0.parquet" ) - index_df, sparse_matrix = get_flat_ts_rep(feature_columns, df) + agg = "value/count" + index_df, sparse_matrix = get_flat_ts_rep(agg, feature_columns, df) generate_summary( feature_columns=feature_columns, index_df=index_df, diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index a2a120b..0f4fe5d 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -6,31 +6,19 @@ from loguru import logger from scipy.sparse import csr_array -from MEDS_tabular_automl.generate_static_features import ( - STATIC_CODE_COL, - STATIC_VALUE_COL, +from MEDS_tabular_automl.utils import ( + CODE_AGGREGATIONS, + DF_T, + VALUE_AGGREGATIONS, + get_events_df, + get_feature_names, ) -from MEDS_tabular_automl.utils import DF_T warnings.simplefilter(action="ignore", category=FutureWarning) -def get_ts_columns(feature_columns): - def is_static(c): - return c.endswith(STATIC_CODE_COL) or c.endswith(STATIC_VALUE_COL) - - ts_columns = sorted(list({c for c in feature_columns if not is_static(c)})) - return ts_columns - - -def fill_missing_entries_with_nan(sparse_df, type, columns): - # Fill missing entries with NaN - for col in columns: - sparse_df[col] = sparse_df[col].astype(pd.SparseDtype(type, fill_value=np.nan)) - return sparse_df - - def get_long_code_df(df, ts_columns): + """Pivots the codes data frame to a long format one-hot rep for time series data.""" column_to_int = {col: i for i, col in enumerate(ts_columns)} rows = range(df.select(pl.len()).collect().item()) cols = ( @@ -47,9 +35,10 @@ def get_long_code_df(df, ts_columns): def get_long_value_df(df, ts_columns): + """Pivots the numerical value data frame to a long format for time series data.""" column_to_int = {col: i for i, col in enumerate(ts_columns)} - value_df = df.drop_nulls("numerical_value") - rows = range(value_df.select(pl.len()).collect().item()) + value_df = df.with_row_index("index").drop_nulls("numerical_value") + rows = value_df.select(pl.col("index")).collect().to_series().to_numpy() cols = ( value_df.with_columns( pl.concat_str([pl.col("code"), pl.lit("/value")]).replace(column_to_int).alias("value_index") @@ -64,6 +53,7 @@ def get_long_value_df(df, ts_columns): def summarize_dynamic_measurements( + agg: str, ts_columns: list[str], df: pd.DataFrame, ) -> pd.DataFrame: @@ -106,24 +96,23 @@ def summarize_dynamic_measurements( check_df = df.select(pl.col(id_cols)) assert check_df.sort(by=id_cols).collect().equals(check_df.collect()), "data frame must be sorted" - # Generate sparse matrices - value_df = df.drop(columns=id_cols) - value_data, (value_rows, value_cols) = get_long_value_df(value_df, ts_columns) - code_df = df.drop(columns=id_cols + ["numerical_value"]) - code_data, (code_rows, code_cols) = get_long_code_df(code_df, ts_columns) + # Generate sparse matrix + if agg in CODE_AGGREGATIONS: + code_df = df.drop(columns=id_cols + ["numerical_value"]) + data, (rows, cols) = get_long_code_df(code_df, ts_columns) + elif agg in 
VALUE_AGGREGATIONS: + value_df = df.drop(columns=id_cols) + data, (rows, cols) = get_long_value_df(value_df, ts_columns) - merge_data = np.concatenate([value_data, code_data]) - merge_rows = np.concatenate([value_rows, code_rows]) - merge_cols = np.concatenate([value_cols, code_cols]) - merge_columns = ts_columns sp_matrix = csr_array( - (merge_data, (merge_rows, merge_cols)), - shape=(value_df.select(pl.len()).collect().item(), len(merge_columns)), + (data, (rows, cols)), + shape=(df.select(pl.len()).collect().item(), len(ts_columns)), ) return df.select(pl.col(id_cols)), sp_matrix def get_flat_ts_rep( + agg: str, feature_columns: list[str], shard_df: DF_T, ) -> pl.LazyFrame: @@ -161,9 +150,6 @@ def get_flat_ts_rep( 3 2 2021-01-04 0 2 0 0 1 0 """ # Remove codes not in training set - raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] - shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) - - ts_columns = get_ts_columns(feature_columns) - ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) - return summarize_dynamic_measurements(ts_columns, ts_shard_df) + shard_df = get_events_df(shard_df, feature_columns) + ts_columns = get_feature_names(agg, feature_columns) + return summarize_dynamic_measurements(agg, ts_columns, shard_df) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 2da18b0..e63b190 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -5,7 +5,6 @@ dataframes, etc. DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. """ -import json import os from collections.abc import Mapping from pathlib import Path @@ -14,15 +13,30 @@ import numpy as np import polars as pl import polars.selectors as cs -import yaml from loguru import logger -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig from scipy.sparse import coo_array DF_T = pl.LazyFrame WRITE_USE_PYARROW = True ROW_IDX_NAME = "__row_idx" +STATIC_CODE_AGGREGATION = "static/present" +STATIC_VALUE_AGGREGATION = "static/first" + +CODE_AGGREGATIONS = [ + "code/count", +] + +VALUE_AGGREGATIONS = [ + "value/count", + "value/has_values_count", + "value/sum", + "value/sum_sqd", + "value/min", + "value/max", +] + def hydra_loguru_init() -> None: """Adds loguru output to the logs that hydra scrapes. @@ -360,27 +374,41 @@ def load_meds_data(MEDS_cohort_dir: str, load_data: bool = True) -> Mapping[str, return split_to_df -def setup_environment(cfg: DictConfig, load_data: bool = True): - # check output dir - flat_dir = Path(cfg.tabularized_data_dir) - assert flat_dir.exists() - - # load MEDS data - split_to_df = load_meds_data(cfg.MEDS_cohort_dir, load_data) - feature_columns = json.load(open(flat_dir / "feature_columns.json")) - - # Check that the stored config matches the current config - with open(flat_dir / "config.yaml") as file: - yaml_config = yaml.safe_load(file) - stored_config = OmegaConf.create(yaml_config) - logger.info(f"Stored config: {stored_config}") - logger.info(f"Worker config: {cfg}") - assert cfg.keys() == stored_config.keys(), "Keys in stored config do not match current config." - for key in cfg.keys(): - assert key in stored_config, f"Key {key} not found in stored config." 
- if key == "worker": - continue - assert ( - cfg[key] == stored_config[key] - ), f"Config key {key}, value is {cfg[key]} vs {stored_config[key]}" - return flat_dir, split_to_df, feature_columns +def get_events_df(shard_df: pl.DataFrame, feature_columns) -> pl.DataFrame: + """Extracts Events DataFrame with one row per observation (timestamps can be duplicated)""" + # raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] + # shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) + ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) + return ts_shard_df + + +def get_unique_time_events_df(events_df: pl.DataFrame): + """Updates Events DataFrame to have unique timestamps and sorted by patient_id and timestamp.""" + assert events_df.select(pl.col("timestamp")).is_nan().any().collect().item() == 0 + # Check events_df is sorted - so it aligns with the ts_matrix we generate later in the pipeline + events_df = ( + events_df.drop_nulls("timestamp") + .select(pl.col(["patient_id", "timestamp"])) + .unique(maintain_order=True) + ) + assert events_df.sort(by=["patient_id", "timestamp"]).collect().equals(events_df.collect()) + return events_df + + +def get_feature_names(agg, feature_columns) -> str: + """Indices of columns in feature_columns list.""" + if agg in [STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION]: + return [c for c in feature_columns if c.endswith(agg)] + elif agg in CODE_AGGREGATIONS: + return [c for c in feature_columns if c.endswith("/code")] + elif agg in VALUE_AGGREGATIONS: + return [c for c in feature_columns if c.endswith("/value")] + else: + raise ValueError(f"Unknown aggregation type {agg}") + + +def get_feature_indices(agg, feature_columns) -> str: + """Indices of columns in feature_columns list.""" + feature_to_index = {c: i for i, c in enumerate(feature_columns)} + agg_features = get_feature_names(agg, feature_columns) + return [feature_to_index[c] for c in agg_features] diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 5755c98..28921a7 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -12,7 +12,7 @@ from loguru import logger from MEDS_tabular_automl.file_name import FileNameResolver -from MEDS_tabular_automl.utils import load_matrix +from MEDS_tabular_automl.utils import VALUE_AGGREGATIONS, get_feature_names, load_matrix from scripts.identify_columns import store_columns from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data @@ -101,6 +101,35 @@ "tuning/0": MEDS_TUNING_0, } +CODE_COLS = [ + "ADMISSION//CARDIAC/code", + "ADMISSION//ORTHOPEDIC/code", + "ADMISSION//PULMONARY/code", + "DISCHARGE/code", + "DOB/code", + "HR/code", + "TEMP/code", +] +VALUE_COLS = ["HR/value", "TEMP/value"] +STATIC_PRESENT_COLS = [ + "EYE_COLOR//BLUE/static/present", + "EYE_COLOR//BROWN/static/present", + "EYE_COLOR//HAZEL/static/present", + "HEIGHT/static/present", +] +STATIC_FIRST_COLS = ["HEIGHT/static/first"] + +EXPECTED_STATIC_FILES = [ + "tabularize/static/held_out/0/first.npz", + "tabularize/static/held_out/0/present.npz", + "tabularize/static/train/0/first.npz", + "tabularize/static/train/0/present.npz", + "tabularize/static/train/1/first.npz", + "tabularize/static/train/1/present.npz", + "tabularize/static/tuning/0/first.npz", + "tabularize/static/tuning/0/present.npz", +] + SUMMARIZE_EXPECTED_FILES = [ "train/1/365d/value/sum.npz", "train/1/365d/code/count.npz", @@ -166,7 +195,7 @@ def test_tabularize(): 
"tabularized_data_dir": str(tabularized_data_dir.resolve()), "min_code_inclusion_frequency": 1, "window_sizes": ["30d", "365d", "full"], - "aggs": ["code/count", "value/sum"], + "aggs": ["code/count", "value/sum", "static/present", "static/first"], "codes": "null", "n_patients_per_sub_shard": 2, "do_overwrite": True, @@ -209,19 +238,33 @@ def test_tabularize(): assert (tabularized_data_dir / "config.yaml").is_file() assert (tabularized_data_dir / "feature_columns.json").is_file() assert (tabularized_data_dir / "feature_freqs.json").is_file() - tabularize_static_data(cfg) - actual_files = [(f.parent.stem, f.stem) for f in f_name_resolver.list_static_files()] - expected_files = [("train", "1"), ("train", "0"), ("held_out", "0"), ("tuning", "0")] - f_name_resolver.get_static_dir() - assert set(actual_files) == set(expected_files) + feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) + assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) + assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) + for value_agg in VALUE_AGGREGATIONS: + assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) + + # Check Static File Generation + tabularize_static_data(cfg) + actual_files = [str(Path(*f.parts[-5:])) for f in f_name_resolver.list_static_files()] + assert set(actual_files) == set(EXPECTED_STATIC_FILES) # Check the files are not empty - for f in list(tabularized_data_dir.glob("static/*/*.parquet")): - assert pl.read_parquet(f).shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + for f in f_name_resolver.list_static_files(): + static_matrix = load_matrix(f) + assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + logger.info((static_matrix.shape[1], expected_num_cols)) + logger.info(f_name_resolver.list_static_files()) + assert static_matrix.shape[1] == expected_num_cols, ( + f"Static Data Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {static_matrix.shape[1]}!" + ) summarize_ts_data_over_windows(cfg) # confirm summary files exist: - output_files = list(tabularized_data_dir.glob("ts/*/*/*/*/*.npz")) + output_files = f_name_resolver.list_ts_files() f_name_resolver.list_ts_files() actual_files = [str(Path(*f.parts[-5:])) for f in output_files] @@ -232,7 +275,7 @@ def test_tabularize(): assert sparse_array.shape[1] > 0 # merge_data(cfg) - # output_files = list(tabularized_data_dir.glob("sparse/*/*/*/*/*.npz")) + # output_files = f_name_resolver.list_sparse_files() # actual_files = [str(Path(*f.parts[-5:])) for f in output_files] # assert set(actual_files) == set(MERGE_EXPECTED_FILES) From c225c4796e0f855a8ff3972c0c33a72be896339f Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 08:35:52 +0000 Subject: [PATCH 053/106] fixed bug with codes that are only in the test and validation set (not in the training set) crashing the window summarization. Now we throw those events out. 
--- hf_cohort/hf_cohort_e2e.sh | 50 +++++++------------ scripts/identify_columns.py | 2 +- .../generate_summarized_reps.py | 10 ++-- .../generate_ts_features.py | 16 ++++-- src/MEDS_tabular_automl/utils.py | 6 ++- 5 files changed, 38 insertions(+), 46 deletions(-) diff --git a/hf_cohort/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh index e32c781..02d1a00 100644 --- a/hf_cohort/hf_cohort_e2e.sh +++ b/hf_cohort/hf_cohort_e2e.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort +MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize # N_PARALLEL_WORKERS="$1" WINDOW_SIZES="window_sizes=[1d]" @@ -8,40 +8,24 @@ AGGS="aggs=[code/count,value/sum]" # WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" # AGGS="aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" -# echo "Running identify_columns.py: Caching feature names and frequencies." -# rm -rf $OUTPUT_DIR -# POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ -# MEDS_cohort_dir=$MEDS_DIR \ -# tabularized_data_dir=$OUTPUT_DIR \ -# min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" - -# echo "Running tabularize_static.py: tabularizing static data" -# POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ -# MEDS_cohort_dir=$MEDS_DIR \ -# tabularized_data_dir=$OUTPUT_DIR \ -# min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" - -# # echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" -# # POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ -# # --multirun \ -# # worker="range(0,$N_PARALLEL_WORKERS)" \ -# # hydra/launcher=joblib \ -# # MEDS_cohort_dir=$MEDS_DIR \ -# # tabularized_data_dir=$OUTPUT_DIR \ -# # min_code_inclusion_frequency=1 do_overwrite=False \ -# # "$WINDOW_SIZES" "$AGGS" - -# echo "Running summarize_over_windows.py" -# POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ -# MEDS_cohort_dir=$MEDS_DIR \ -# tabularized_data_dir=$OUTPUT_DIR \ -# min_code_inclusion_frequency=1 do_overwrite=False \ -# "$WINDOW_SIZES" "$AGGS" +echo "Running identify_columns.py: Caching feature names and frequencies." 
+rm -rf $OUTPUT_DIR +POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" +echo "Running tabularize_static.py: tabularizing static data" +POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" -echo "Running tabularize_merge.py" -rm -r "$OUTPUT_DIR/sparse" -POLARS_MAX_THREADS=10 python /home/nassim/projects/MEDS_Tabular_AutoML/scripts/tabularize_merge.py \ +echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" +POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ MEDS_cohort_dir=$MEDS_DIR \ tabularized_data_dir=$OUTPUT_DIR \ min_code_inclusion_frequency=1 do_overwrite=False \ diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py index b52eb41..84c3a1a 100644 --- a/scripts/identify_columns.py +++ b/scripts/identify_columns.py @@ -128,7 +128,7 @@ def read_fn(feature_dir): rwlock_wrap( feature_dir / "identify_train_columns", - feature_dir, + f_name_resolver.get_feature_columns_fp(), read_fn, write_fn, compute_fn, diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index d0d85c6..50f9dc5 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -355,16 +355,16 @@ def generate_summary( df = pl.scan_parquet( Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed") / "final_cohort" - / "train" - / "0.parquet" + / "held_out" + / "7.parquet" ) - agg = "value/count" + agg = "code/count" index_df, sparse_matrix = get_flat_ts_rep(agg, feature_columns, df) generate_summary( feature_columns=feature_columns, index_df=index_df, matrix=sparse_matrix, - window_size="full", - agg="code/count", + window_size="1d", + agg=agg, use_tqdm=True, ) diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 0f4fe5d..5bc0b48 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -17,37 +17,43 @@ warnings.simplefilter(action="ignore", category=FutureWarning) +def feature_name_to_code(feature_name: str) -> str: + """Converts a feature name to a code name.""" + return "/".join(feature_name.split("/")[:-1]) + def get_long_code_df(df, ts_columns): """Pivots the codes data frame to a long format one-hot rep for time series data.""" - column_to_int = {col: i for i, col in enumerate(ts_columns)} + column_to_int = {feature_name_to_code(col): i for i, col in enumerate(ts_columns)} rows = range(df.select(pl.len()).collect().item()) cols = ( df.with_columns( - pl.concat_str([pl.col("code"), pl.lit("/code")]).replace(column_to_int).alias("code_index") + pl.col("code").cast(str).replace(column_to_int).cast(int).alias("code_index") ) .select("code_index") .collect() .to_series() .to_numpy() ) + assert np.issubdtype(cols.dtype, np.number), "numerical_value must be a numerical type" data = np.ones(df.select(pl.len()).collect().item(), dtype=np.bool_) return data, (rows, cols) def get_long_value_df(df, ts_columns): """Pivots the numerical value data frame to a long format for time series data.""" - column_to_int = {col: i for i, col in 
enumerate(ts_columns)} - value_df = df.with_row_index("index").drop_nulls("numerical_value") + column_to_int = {feature_name_to_code(col): i for i, col in enumerate(ts_columns)} + value_df = df.with_row_index("index").drop_nulls("numerical_value").filter(pl.col("code").is_in(ts_columns)) rows = value_df.select(pl.col("index")).collect().to_series().to_numpy() cols = ( value_df.with_columns( - pl.concat_str([pl.col("code"), pl.lit("/value")]).replace(column_to_int).alias("value_index") + pl.col("code").cast(str).replace(column_to_int).cast(int).alias("value_index") ) .select("value_index") .collect() .to_series() .to_numpy() ) + assert np.issubdtype(cols.dtype, np.number), "numerical_value must be a numerical type" data = value_df.select(pl.col("numerical_value")).collect().to_series().to_numpy() return data, (rows, cols) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index e63b190..2abd6d9 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -376,8 +376,10 @@ def load_meds_data(MEDS_cohort_dir: str, load_data: bool = True) -> Mapping[str, def get_events_df(shard_df: pl.DataFrame, feature_columns) -> pl.DataFrame: """Extracts Events DataFrame with one row per observation (timestamps can be duplicated)""" - # raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] - # shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) + # Filter out feature_columns that were not present in the training set + raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] + shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) + # Drop rows with missing timestamp or code to get events ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) return ts_shard_df From cb21821de5c974e2416f90fb0231ce0b81be0701 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 09:00:12 +0000 Subject: [PATCH 054/106] fixed bug with summarization script crashing for min and max value aggregations due to a coo matrix being returned rather than a dense matrix as with sum and count operations --- .../generate_summarized_reps.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 50f9dc5..46151d0 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -142,10 +142,17 @@ def aggregate_matrix(windows, matrix, agg, num_features, use_tqdm=False): max_index = window["max_index"] subset_matrix = matrix[min_index : max_index + 1, :] agg_matrix = sparse_aggregate(subset_matrix, agg).astype(out_dtype) - nozero_ind = np.nonzero(agg_matrix)[0] - col.append(nozero_ind) - data.append(agg_matrix[nozero_ind]) - row.append(np.repeat(np.array(i, dtype=np.int32), len(nozero_ind))) + if isinstance(agg_matrix, np.ndarray): + nozero_ind = np.nonzero(agg_matrix)[0] + col.append(nozero_ind) + data.append(agg_matrix[nozero_ind]) + row.append(np.repeat(np.array(i, dtype=np.int32), len(nozero_ind))) + elif isinstance(agg_matrix, coo_array): + col.append(agg_matrix.col) + data.append(agg_matrix.data) + row.append(agg_matrix.row) + else: + raise TypeError(f"Invalid matrix type {type(agg_matrix)}") row = np.concatenate(row) out_matrix = coo_array( (np.concatenate(data), (row, np.concatenate(col))), @@ -355,16 +362,16 @@ def generate_summary( df = pl.scan_parquet( 
Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed") / "final_cohort" - / "held_out" - / "7.parquet" + / "train" + / "3.parquet" ) - agg = "code/count" + agg = "value/min" index_df, sparse_matrix = get_flat_ts_rep(agg, feature_columns, df) generate_summary( feature_columns=feature_columns, index_df=index_df, matrix=sparse_matrix, - window_size="1d", + window_size="30d", agg=agg, use_tqdm=True, ) From 3a412a0af543561a1c9740bc41aa2497de9d5154 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 09:52:36 +0000 Subject: [PATCH 055/106] removed overwrite killing of jobs which causes errors in multirun --- scripts/summarize_over_windows.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/summarize_over_windows.py b/scripts/summarize_over_windows.py index 070232f..049c0d6 100644 --- a/scripts/summarize_over_windows.py +++ b/scripts/summarize_over_windows.py @@ -72,8 +72,6 @@ def summarize_ts_data_over_windows( split = shard_fp.parent.stem assert split in ["train", "held_out", "tuning"], f"Invalid split {split}" ts_fp = f_name_resolver.get_flat_ts_rep(split, shard_num, window_size, agg) - if ts_fp.exists() and not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {ts_fp.exists()} exists!") def read_fn(fp): return pl.scan_parquet(fp) From a4f184319366bb9207ac0345b53ba9b6047e3411 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 11:27:37 +0000 Subject: [PATCH 056/106] Xgboost is able to load all concatenated windows and aggregations. Fixed bugs related to event ids and column ids being incorrect. --- configs/xgboost_sweep.yaml | 1 + hf_cohort/aces_task_extraction.py | 29 ++++- scripts/tabularize_static.py | 4 +- scripts/xgboost_sweep.py | 122 +++++------------- src/MEDS_tabular_automl/file_name.py | 30 ++++- .../generate_static_features.py | 4 +- src/MEDS_tabular_automl/utils.py | 2 +- tests/test_tabularize.py | 63 ++++++--- 8 files changed, 142 insertions(+), 113 deletions(-) diff --git a/configs/xgboost_sweep.yaml b/configs/xgboost_sweep.yaml index eeec001..606b446 100644 --- a/configs/xgboost_sweep.yaml +++ b/configs/xgboost_sweep.yaml @@ -22,6 +22,7 @@ do_overwrite: False do_update: True seed: 1 tqdm: True +test: False model: booster: gbtree diff --git a/hf_cohort/aces_task_extraction.py b/hf_cohort/aces_task_extraction.py index 6b86af1..722c5c7 100644 --- a/hf_cohort/aces_task_extraction.py +++ b/hf_cohort/aces_task_extraction.py @@ -1,6 +1,7 @@ """ Setup Conda environment as described here: https://github.com/justin13601/ACES """ +import json from pathlib import Path import hydra @@ -9,6 +10,29 @@ from tqdm import tqdm +def get_events_df(shard_df: pl.DataFrame, feature_columns) -> pl.DataFrame: + """Extracts Events DataFrame with one row per observation (timestamps can be duplicated)""" + # Filter out feature_columns that were not present in the training set + raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] + shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) + # Drop rows with missing timestamp or code to get events + ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) + return ts_shard_df + + +def get_unique_time_events_df(events_df: pl.DataFrame): + """Updates Events DataFrame to have unique timestamps and sorted by patient_id and timestamp.""" + assert events_df.select(pl.col("timestamp")).null_count().collect().item() == 0 + # Check events_df is sorted - so it aligns with the ts_matrix we generate later in the pipeline + events_df = ( + 
events_df.drop_nulls("timestamp") + .select(pl.col(["patient_id", "timestamp"])) + .unique(maintain_order=True) + ) + assert events_df.sort(by=["patient_id", "timestamp"]).collect().equals(events_df.collect()) + return events_df + + @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") def main(cfg): # create task configuration object @@ -37,10 +61,11 @@ def main(cfg): .rename({"trigger": "timestamp", "subject_id": "patient_id"}) .sort(by=["patient_id", "timestamp"]) ) + feature_columns = json.read(Path(cfg.tabularized_data_dir) / "feature_columns.json") data_df = pl.scan_parquet(in_fp) - data_df = data_df.unique(subset=["patient_id", "timestamp"]).sort(by=["patient_id", "timestamp"]) - data_df = data_df.with_row_index("event_id") + data_df = get_unique_time_events_df(get_events_df(data_df, feature_columns)) data_df = data_df.drop(["code", "numerical_value"]) + data_df = data_df.with_row_index("event_id") output_df = label_df.lazy().join_asof(other=data_df, by="patient_id", on="timestamp") # store it diff --git a/scripts/tabularize_static.py b/scripts/tabularize_static.py index abe6ea1..4a6ceb6 100644 --- a/scripts/tabularize_static.py +++ b/scripts/tabularize_static.py @@ -121,9 +121,7 @@ def tabularize_static_data( np.random.shuffle(tabularization_tasks) for shard_fp, agg in iter_wrapper(tabularization_tasks): - static_fp = f_name_resolver.get_flat_static_rep( - shard_fp.parent.stem, shard_fp.stem, agg.split("/")[-1] - ) + static_fp = f_name_resolver.get_flat_static_rep(shard_fp.parent.stem, shard_fp.stem, agg) if static_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {static_fp} exists!") diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index a149ce2..8ad44d7 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -14,6 +14,9 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import mean_absolute_error +from MEDS_tabular_automl.file_name import FileNameResolver +from MEDS_tabular_automl.utils import get_feature_indices, load_matrix + class Iterator(xgb.DataIter): def __init__(self, cfg: DictConfig, split: str = "train"): @@ -24,14 +27,18 @@ def __init__(self, cfg: DictConfig, split: str = "train"): - split (str): The data split to use ("train", "tuning", or "held_out"). 
""" self.cfg = cfg - self.data_path = Path(cfg.tabularized_data_dir) - self.dynamic_data_path = self.data_path / "sparse" / split - self.task_data_path = self.data_path / "task" / split + self.file_name_resolver = FileNameResolver(cfg) + self.split = split + # self.data_path = Path(cfg.tabularized_data_dir) + # self.dynamic_data_path = self.data_path / "sparse" / split + # self.task_data_path = self.data_path / "task" / split self._data_shards = sorted( - [shard.stem for shard in list(self.task_data_path.glob("*.parquet"))] + [shard.stem for shard in self.file_name_resolver.list_label_files(split)] ) # [2, 4, 5] # self.valid_event_ids, self.labels = self.load_labels() - self.window_set, self.aggs_set, self.codes_set, self.num_features = self._get_inclusion_sets() + self.codes_set, self.num_features = self._get_code_set() + feature_columns = json.load(open(self.file_name_resolver.get_feature_columns_fp())) + self.agg_to_feature_ids = {agg: get_feature_indices(agg, feature_columns) for agg in cfg.aggs} self._it = 0 @@ -48,7 +55,9 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: in the sparse matrix dictionary from shard number to list of labels for these valid event ids """ - label_fps = {shard: self.task_data_path / f"{shard}.parquet" for shard in self._data_shards} + label_fps = { + shard: self.file_name_resolver.get_label(self.split, shard) for shard in self._data_shards + } cached_labels, cached_event_ids = dict(), dict() for shard, label_fp in label_fps.items(): label_df = pl.scan_parquet(label_fp) @@ -58,14 +67,14 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: def _get_code_set(self) -> set: """Get the set of codes to include in the data based on the configuration.""" - with open(self.data_path / "feature_columns.json") as f: + with open(self.file_name_resolver.get_feature_columns_fp()) as f: feature_columns = json.load(f) feature_dict = {col: i for i, col in enumerate(feature_columns)} if self.cfg.codes is not None: codes_set = {feature_dict[code] for code in set(self.cfg.codes) if code in feature_dict} if self.cfg.min_code_inclusion_frequency is not None: - with open(self.data_path / "feature_freqs.json") as f: + with open(self.file_name_resolver.get_feature_freqs_fp()) as f: feature_freqs = json.load(f) min_frequency_set = { key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency @@ -83,53 +92,6 @@ def _get_code_set(self) -> set: # TODO: make sure we aren't filtering out static columns!!! return list(codes_set), len(feature_columns) - def _get_inclusion_sets(self) -> tuple[set, set, np.array]: - """Get the inclusion sets for aggregations, window sizes, and a mask for minimum code frequency. - - Returns: - - Tuple[Optional[Set[str]], Optional[Set[str]], np.ndarray]: Tuple containing: - - Set of aggregations. - - Set of window sizes. - - Boolean array mask indicating which feature columns meet the inclusion criteria. - - Examples: - >>> import tempfile - >>> from types import SimpleNamespace - >>> cfg = SimpleNamespace( - ... aggs=["code/count", "value/sum"], - ... window_sizes=None, - ... codes=["code1", "code2", "value1"], - ... min_code_inclusion_frequency=2 - ... ) - >>> with tempfile.TemporaryDirectory() as tempdir: - ... data_path = Path(tempdir) - ... cfg.tabularized_data_dir = str(data_path) - ... feature_columns = ["code1/code", "code2/code", "value1/value"] - ... feature_freqs = {"code1": 3, "code2": 1, "value1": 5} - ... with open(data_path / "feature_columns.json", "w") as f: - ... 
json.dump(feature_columns, f) - ... with open(data_path / "feature_freqs.json", "w") as f: - ... json.dump(feature_freqs, f) - ... iterator = Iterator(cfg) - ... aggs_set, window_set, mask = iterator._get_inclusion_sets() - ... assert aggs_set == {"code/count", "value/sum"} - ... assert window_set == None - ... assert np.array_equal(mask, [True, False, True]) - """ - - window_set = None - aggs_set = None - - if self.cfg.aggs is not None: - aggs_set = set(self.cfg.aggs) - - if self.cfg.window_sizes is not None: - window_set = set(self.cfg.window_sizes) - - codes_set, num_features = self._get_code_set() - - return sorted(window_set), sorted(aggs_set), sorted(codes_set), num_features - def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Load a sparse shard into memory. @@ -170,16 +132,13 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: ... assert np.array_equal(loaded_shard.indptr, expected_csr.indptr) """ # column_shard is of form event_idx, feature_idx, value - column_shard = np.load(path).T # TODO: Fix this!!! - - shard = sp.csc_matrix( - (column_shard[:, 0], (column_shard[:, 1], column_shard[:, 2])), - shape=( - max(self.valid_event_ids[self._data_shards[idx]], column_shard[:, 1]) + 1, - self.num_features, - ), - ) - return self._filter_shard_on_codes_and_freqs(shard) + matrix = load_matrix(path) + if path.stem in ["first", "present"]: + agg = f"static/{path.stem}" + else: + agg = f"{path.parent.stem}/{path.stem}" + + return self._filter_shard_on_codes_and_freqs(agg, sp.csc_matrix(matrix)) def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering @@ -191,11 +150,14 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: Returns: - sp.csr_matrix: Filtered sparse matrix. """ + # TODO Nassim Fix this guy + # get all window_size x aggreagation files using the file resolver + files = self.file_name_resolver.get_model_files( + self.cfg.window_sizes, self.cfg.aggs, self.split, self._data_shards[idx] + ) + assert all([file.exists() for file in files]) shard_name = self._data_shards[idx] - shard_pattern = f"*/*/*/{shard_name}.npy" - files = self.dynamic_data_path.glob(shard_pattern) - valid_files = sorted(file for file in files if self._filter_shard_files_on_window_and_aggs(file)) - dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in valid_files] + dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] combined_csr = sp.hstack(dynamic_csrs, format="csr") # TODO: check this # Filter Rows valid_indices = self.valid_event_ids[shard_name] @@ -219,23 +181,7 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: logger.debug(f"Task data loading took {datetime.now() - time}") return dynamic_df, label_df - def _filter_shard_files_on_window_and_aggs(self, file: Path) -> bool: - parts = file.relative_to(self.dynamic_data_path).parts - if len(parts) < 2: - return False - - windows_part = parts[0] - aggs_part = "/".join(parts[1:-1]) - - if self.window_set is not None and windows_part not in self.window_set: - return False - - if self.aggs_set is not None and aggs_part not in self.aggs_set: - return False - - return True - - def _filter_shard_on_codes_and_freqs(self, df: sp.csc_matrix) -> sp.csc_matrix: + def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csc_matrix: """Filter the dynamic data frame based on the inclusion sets. 
Given the codes_mask, filter the data frame to only include columns that are True in the mask. @@ -247,7 +193,9 @@ def _filter_shard_on_codes_and_freqs(self, df: sp.csc_matrix) -> sp.csc_matrix: """ if self.codes_set is None: return df - return df[:, self.codes_set] # [:, list({index for index in self.codes_set if index < df.shape[1]})] + feature_ids = self.agg_to_feature_ids[agg] + code_mask = [True if idx in self.codes_set else False for idx in feature_ids] + return df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] def next(self, input_data: Callable): """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index bfab7a3..7700b82 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -22,6 +22,9 @@ def get_ts_dir(self): def get_sparse_dir(self): return self.tabularize_dir / "sparse" + def get_label_dir(self): + return self.tabularize_dir / "task" + def get_feature_columns_fp(self): return self.tabularize_dir / "feature_columns.json" @@ -37,7 +40,8 @@ def get_meds_shard(self, split: str, shard_num: int): def get_flat_static_rep(self, split: str, shard_num: int, agg: str): # Given a shard number, returns the static representation path - return self.get_static_dir() / split / f"{shard_num}" / f"{agg}.npz" + agg_name = agg.split("/")[-1] + return self.get_static_dir() / split / f"{shard_num}" / f"{agg_name}.npz" def get_flat_ts_rep(self, split: str, shard_num: int, window_size: int, agg: str): # Given a shard number, returns the time series representation path @@ -47,6 +51,10 @@ def get_flat_sparse_rep(self, split: str, shard_num: int, window_size: int, agg: # Given a shard number, returns the sparse representation path return self.get_sparse_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + def get_label(self, split: str, shard_num: int): + # Given a shard number, returns the label path + return self.get_label_dir() / split / f"{shard_num}.parquet" + def list_meds_files(self, split=None): # List all MEDS files if split: @@ -70,3 +78,23 @@ def list_sparse_files(self, split=None): if split: return sorted(list(self.get_sparse_dir().glob(f"{split}/*/*.npz"))) return sorted(list(self.get_sparse_dir().glob("*/*/*.npz"))) + + def list_label_files(self, split=None): + # List all label files + if split: + return sorted(list(self.get_label_dir().glob(f"{split}/*.parquet"))) + return sorted(list(self.get_label_dir().glob("*/*.parquet"))) + + def get_model_files(self, window_sizes, aggs, split, shard_num: int): + # Given a shard number, returns the model files + model_files = [] + for window_size in window_sizes: + for agg in aggs: + if agg.startswith("static"): + continue + else: + model_files.append(self.get_flat_ts_rep(split, shard_num, window_size, agg)) + for agg in aggs: + if agg.startswith("static"): + model_files.append(self.get_flat_static_rep(split, shard_num, agg)) + return sorted(model_files) diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 192f1e3..8ab60b5 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -19,6 +19,7 @@ STATIC_VALUE_AGGREGATION, get_events_df, get_feature_names, + get_unique_time_events_df, parse_static_feature_column, ) @@ -53,7 +54,6 @@ def get_sparse_static_rep(static_features, static_df, 
meds_df, feature_columns) Returns: - pd.DataFrame: A merged dataframe containing static and time-series features. """ - # TODO - Eventually do this duplication at the task specific stage after filtering patients and features # Make static data sparse and merge it with the time-series data logger.info("Make static data sparse and merge it with the time-series data") # Check static_df is sorted and unique @@ -62,7 +62,7 @@ def get_sparse_static_rep(static_features, static_df, meds_df, feature_columns) static_df.select(pl.len()).collect().item() == static_df.select(pl.col("patient_id").n_unique()).collect().item() ) - meds_df = get_events_df(meds_df, feature_columns) + meds_df = get_unique_time_events_df(get_events_df(meds_df, feature_columns)) # load static data as sparse matrix static_matrix = convert_to_matrix( diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 2abd6d9..3a88c27 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -386,7 +386,7 @@ def get_events_df(shard_df: pl.DataFrame, feature_columns) -> pl.DataFrame: def get_unique_time_events_df(events_df: pl.DataFrame): """Updates Events DataFrame to have unique timestamps and sorted by patient_id and timestamp.""" - assert events_df.select(pl.col("timestamp")).is_nan().any().collect().item() == 0 + assert events_df.select(pl.col("timestamp")).null_count().collect().item() == 0 # Check events_df is sorted - so it aligns with the ts_matrix we generate later in the pipeline events_df = ( events_df.drop_nulls("timestamp") diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 28921a7..39873c5 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -12,10 +12,16 @@ from loguru import logger from MEDS_tabular_automl.file_name import FileNameResolver -from MEDS_tabular_automl.utils import VALUE_AGGREGATIONS, get_feature_names, load_matrix +from MEDS_tabular_automl.utils import ( + VALUE_AGGREGATIONS, + get_events_df, + get_feature_names, + load_matrix, +) from scripts.identify_columns import store_columns from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data +from scripts.xgboost_sweep import xgboost SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -261,6 +267,11 @@ def test_tabularize(): f"Static Data Tabular Dataframe Should have {expected_num_cols}" f"Columns but has {static_matrix.shape[1]}!" 
) + static_first_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/first") + static_present_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/present") + assert ( + load_matrix(static_first_fp).shape[0] == load_matrix(static_present_fp).shape[0] + ), "static data first and present aggregations have different numbers of rows" summarize_ts_data_over_windows(cfg) # confirm summary files exist: @@ -273,21 +284,39 @@ def test_tabularize(): sparse_array = load_matrix(f) assert sparse_array.shape[0] > 0 assert sparse_array.shape[1] > 0 + ts_code_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "code/count") + ts_value_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "value/sum") + assert ( + load_matrix(ts_code_fp).shape[0] == load_matrix(ts_value_fp).shape[0] + ), "time series code and value have different numbers of rows" + assert ( + load_matrix(static_first_fp).shape[0] == load_matrix(ts_value_fp).shape[0] + ), "static data and time series have different numbers of rows" - # merge_data(cfg) - # output_files = f_name_resolver.list_sparse_files() - # actual_files = [str(Path(*f.parts[-5:])) for f in output_files] - # assert set(actual_files) == set(MERGE_EXPECTED_FILES) + # Create fake labels + for f in f_name_resolver.list_meds_files(): + df = pl.read_parquet(f) + df = get_events_df(df, feature_columns) + pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) + df = df.select(pl.col(["patient_id", "timestamp", "label"])) + df = df.unique(subset=["patient_id", "timestamp"]) + df = df.with_row_index("event_id") - # model_dir = Path(d) / "save_model" - # xgboost_config_kwargs = { - # "model_dir": str(model_dir.resolve()), - # "hydra.mode": "MULTIRUN", - # } - # xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} - # with initialize(version_base=None, config_path="../configs/"): # path to config.yaml - # overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - # cfg = compose(config_name="xgboost_sweep", overrides=overrides) # config.yaml - # xgboost(cfg) - # output_files = list(model_dir.glob("*/*/*_model.json")) - # assert len(output_files) == 1 + split = f.parent.stem + shard_num = f.stem + out_f = f_name_resolver.get_label(split, shard_num) + out_f.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(out_f) + model_dir = Path(d) / "save_model" + xgboost_config_kwargs = { + "model_dir": str(model_dir.resolve()), + "hydra.mode": "MULTIRUN", + } + xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} + with initialize(version_base=None, config_path="../configs/"): # path to config.yaml + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="xgboost_sweep", overrides=overrides) # config.yaml + xgboost(cfg) + output_files = list(model_dir.glob("*/*/*_model.json")) + assert len(output_files) == 1 From 800ab7e7d9f3d11edd5749dc9ad070545b815993 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 11:35:32 +0000 Subject: [PATCH 057/106] fixed timedelta overflow bug --- src/MEDS_tabular_automl/generate_summarized_reps.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 46151d0..21cc03b 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ 
-112,9 +112,7 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): def get_rolling_window_indicies(index_df, window_size): """Get the indices for the rolling windows.""" if window_size == "full": - newest_date = index_df.select(pl.col("timestamp")).max().collect().item() - oldest_date = index_df.select(pl.col("timestamp")).min().collect().item() - timedelta = newest_date - oldest_date + pd.Timedelta(days=1) + timedelta = pd.Timedelta(150*52, unit="W") # just use 150 years as time delta else: timedelta = pd.Timedelta(window_size) return ( @@ -363,15 +361,15 @@ def generate_summary( Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed") / "final_cohort" / "train" - / "3.parquet" + / "2.parquet" ) - agg = "value/min" + agg = "code/count" index_df, sparse_matrix = get_flat_ts_rep(agg, feature_columns, df) generate_summary( feature_columns=feature_columns, index_df=index_df, matrix=sparse_matrix, - window_size="30d", + window_size="full", agg=agg, use_tqdm=True, ) From 4b0637a1c4ec288a3ea3ffe8c779457b708d5a02 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 14:05:52 +0000 Subject: [PATCH 058/106] fixed bug with loading feature columns json for aces task script --- hf_cohort/aces_task_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hf_cohort/aces_task_extraction.py b/hf_cohort/aces_task_extraction.py index 722c5c7..851c0c1 100644 --- a/hf_cohort/aces_task_extraction.py +++ b/hf_cohort/aces_task_extraction.py @@ -61,7 +61,7 @@ def main(cfg): .rename({"trigger": "timestamp", "subject_id": "patient_id"}) .sort(by=["patient_id", "timestamp"]) ) - feature_columns = json.read(Path(cfg.tabularized_data_dir) / "feature_columns.json") + feature_columns = json.load(open(Path(cfg.tabularized_data_dir) / "feature_columns.json")) data_df = pl.scan_parquet(in_fp) data_df = get_unique_time_events_df(get_events_df(data_df, feature_columns)) data_df = data_df.drop(["code", "numerical_value"]) From 127d04a6b33ebf0c21e75ad78e3ed3909c6087ba Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 2 Jun 2024 14:07:26 +0000 Subject: [PATCH 059/106] added memory profiling to hf_cohort e2e script --- hf_cohort/hf_cohort_e2e.sh | 47 ++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/hf_cohort/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh index 02d1a00..9d52bd2 100644 --- a/hf_cohort/hf_cohort_e2e.sh +++ b/hf_cohort/hf_cohort_e2e.sh @@ -1,10 +1,15 @@ #!/usr/bin/env bash +METHOD=meds +N_RUNS="1" +OUTPUT_BASE=results +POLARS_MAX_THREADS=32 + MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -# N_PARALLEL_WORKERS="$1" -WINDOW_SIZES="window_sizes=[1d]" -AGGS="aggs=[code/count,value/sum]" +N_PARALLEL_WORKERS="$1" +WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" +AGGS="aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" # WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" # AGGS="aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" @@ -21,12 +26,30 @@ POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ tabularized_data_dir=$OUTPUT_DIR \ min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" -echo "Running summarize_over_windows.py with $N_PARALLEL_WORKERS workers in parallel" -POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - 
hydra/launcher=joblib \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 do_overwrite=False \ - "$WINDOW_SIZES" "$AGGS" +POLARS_MAX_THREADS=1 +ID=$RANDOM +LOG_DIR="logs/$METHOD/$ID-logs" +mkdir -p $LOG_DIR +{ time \ + mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ + python scripts/summarize_over_windows.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 do_overwrite=False \ + "$WINDOW_SIZES" "$AGGS" \ + 2> $LOG_DIR/cmd.stderr +} 2> $LOG_DIR/timings.txt + +cmd_exit_status=${PIPESTATUS[0]} +# Check the exit status of the second command in the pipeline (mprof run ...) +if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then + echo "build_dataset.sh failed with status $cmd_exit_status." + echo "Stderr from build_dataset.sh (see $LOG_DIR/cmd.stderr):" + tail $LOG_DIR/cmd.stderr + exit $cmd_exit_status +fi +mprof plot -o $LOG_DIR/mprofile.png $LOG_DIR/mprofile.dat +mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt From 36f54a344e6b677e79bd4cf1fe4796e0889cdf61 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 10:55:23 -0400 Subject: [PATCH 060/106] Made tests ignore the hf_cohort directory --- .github/workflows/tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 4a0dbf0..999fffb 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -35,7 +35,7 @@ jobs: #---------------------------------------------- - name: Run tests run: | - pytest -v --doctest-modules --cov + pytest -v --doctest-modules --cov --ignore=hf_cohort/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v4.0.1 From 81bf2d9b964caca162c28c1b93b32e616f3c5c7e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 10:58:52 -0400 Subject: [PATCH 061/106] Pre-commit fixes --- hf_cohort/hf_cohort_e2e.sh | 11 ++++------- src/MEDS_tabular_automl/generate_summarized_reps.py | 2 +- src/MEDS_tabular_automl/generate_ts_features.py | 13 ++++++------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/hf_cohort/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh index 9d52bd2..c9bb74d 100644 --- a/hf_cohort/hf_cohort_e2e.sh +++ b/hf_cohort/hf_cohort_e2e.sh @@ -1,9 +1,6 @@ #!/usr/bin/env bash METHOD=meds -N_RUNS="1" -OUTPUT_BASE=results -POLARS_MAX_THREADS=32 MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize @@ -26,13 +23,13 @@ POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ tabularized_data_dir=$OUTPUT_DIR \ min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" -POLARS_MAX_THREADS=1 + ID=$RANDOM LOG_DIR="logs/$METHOD/$ID-logs" mkdir -p $LOG_DIR { time \ mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ - python scripts/summarize_over_windows.py \ + POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ @@ -40,7 +37,7 @@ mkdir -p $LOG_DIR tabularized_data_dir=$OUTPUT_DIR \ min_code_inclusion_frequency=1 do_overwrite=False \ "$WINDOW_SIZES" "$AGGS" \ - 2> $LOG_DIR/cmd.stderr + 2> $LOG_DIR/cmd.stderr } 2> $LOG_DIR/timings.txt cmd_exit_status=${PIPESTATUS[0]} @@ -49,7 +46,7 @@ if [ -n "$cmd_exit_status" ] && [ 
"$cmd_exit_status" -ne 0 ]; then echo "build_dataset.sh failed with status $cmd_exit_status." echo "Stderr from build_dataset.sh (see $LOG_DIR/cmd.stderr):" tail $LOG_DIR/cmd.stderr - exit $cmd_exit_status + exit "$cmd_exit_status" fi mprof plot -o $LOG_DIR/mprofile.png $LOG_DIR/mprofile.dat mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 21cc03b..70f6e68 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -112,7 +112,7 @@ def sparse_rolling(df, sparse_matrix, timedelta, agg): def get_rolling_window_indicies(index_df, window_size): """Get the indices for the rolling windows.""" if window_size == "full": - timedelta = pd.Timedelta(150*52, unit="W") # just use 150 years as time delta + timedelta = pd.Timedelta(150 * 52, unit="W") # just use 150 years as time delta else: timedelta = pd.Timedelta(window_size) return ( diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 5bc0b48..0dfadfd 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -21,14 +21,13 @@ def feature_name_to_code(feature_name: str) -> str: """Converts a feature name to a code name.""" return "/".join(feature_name.split("/")[:-1]) + def get_long_code_df(df, ts_columns): """Pivots the codes data frame to a long format one-hot rep for time series data.""" column_to_int = {feature_name_to_code(col): i for i, col in enumerate(ts_columns)} rows = range(df.select(pl.len()).collect().item()) cols = ( - df.with_columns( - pl.col("code").cast(str).replace(column_to_int).cast(int).alias("code_index") - ) + df.with_columns(pl.col("code").cast(str).replace(column_to_int).cast(int).alias("code_index")) .select("code_index") .collect() .to_series() @@ -42,12 +41,12 @@ def get_long_code_df(df, ts_columns): def get_long_value_df(df, ts_columns): """Pivots the numerical value data frame to a long format for time series data.""" column_to_int = {feature_name_to_code(col): i for i, col in enumerate(ts_columns)} - value_df = df.with_row_index("index").drop_nulls("numerical_value").filter(pl.col("code").is_in(ts_columns)) + value_df = ( + df.with_row_index("index").drop_nulls("numerical_value").filter(pl.col("code").is_in(ts_columns)) + ) rows = value_df.select(pl.col("index")).collect().to_series().to_numpy() cols = ( - value_df.with_columns( - pl.col("code").cast(str).replace(column_to_int).cast(int).alias("value_index") - ) + value_df.with_columns(pl.col("code").cast(str).replace(column_to_int).cast(int).alias("value_index")) .select("value_index") .collect() .to_series() From 83c4eecbcbf0073f79fecc64fd54acb02cae2133 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 11:02:08 -0400 Subject: [PATCH 062/106] Resolving deprecation warnings --- src/MEDS_tabular_automl/generate_static_features.py | 2 +- src/MEDS_tabular_automl/generate_ts_features.py | 4 ++-- src/MEDS_tabular_automl/utils.py | 8 ++++---- tests/test_tabularize.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 8ab60b5..c2164c4 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -26,7 +26,7 @@ def convert_to_matrix(df, 
num_events, num_features): """Converts a Polars DataFrame to a sparse matrix.""" - dense_matrix = df.drop(columns="patient_id").collect().to_numpy() + dense_matrix = df.drop("patient_id").collect().to_numpy() data_list = [] rows = [] cols = [] diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 0dfadfd..c4d244e 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -103,10 +103,10 @@ def summarize_dynamic_measurements( # Generate sparse matrix if agg in CODE_AGGREGATIONS: - code_df = df.drop(columns=id_cols + ["numerical_value"]) + code_df = df.drop(*(id_cols + ["numerical_value"])) data, (rows, cols) = get_long_code_df(code_df, ts_columns) elif agg in VALUE_AGGREGATIONS: - value_df = df.drop(columns=id_cols) + value_df = df.drop(*id_cols) data, (rows, cols) = get_long_value_df(value_df, ts_columns) sp_matrix = csr_array( diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 3a88c27..f80b87b 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -259,14 +259,14 @@ def compute_feature_frequencies(cfg: DictConfig, shard_df: DF_T) -> list[str]: static_df = shard_df.filter( pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_null() ) - static_code_freqs_df = static_df.groupby("code").agg(pl.count("code").alias("count")).collect() + static_code_freqs_df = static_df.group_by("code").agg(pl.count("code").alias("count")).collect() static_code_freqs = { row["code"] + "/static/present": row["count"] for row in static_code_freqs_df.iter_rows(named=True) } static_value_df = static_df.filter(pl.col("numerical_value").is_not_null()) static_value_freqs_df = ( - static_value_df.groupby("code").agg(pl.count("numerical_value").alias("count")).collect() + static_value_df.group_by("code").agg(pl.count("numerical_value").alias("count")).collect() ) static_value_freqs = { row["code"] + "/static/first": row["count"] for row in static_value_freqs_df.iter_rows(named=True) @@ -275,11 +275,11 @@ def compute_feature_frequencies(cfg: DictConfig, shard_df: DF_T) -> list[str]: ts_df = shard_df.filter( pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_not_null() ) - code_freqs_df = ts_df.groupby("code").agg(pl.count("code").alias("count")).collect() + code_freqs_df = ts_df.group_by("code").agg(pl.count("code").alias("count")).collect() code_freqs = {row["code"] + "/code": row["count"] for row in code_freqs_df.iter_rows(named=True)} value_df = ts_df.filter(pl.col("numerical_value").is_not_null()) - value_freqs_df = value_df.groupby("code").agg(pl.count("numerical_value").alias("count")).collect() + value_freqs_df = value_df.group_by("code").agg(pl.count("numerical_value").alias("count")).collect() value_freqs = {row["code"] + "/value": row["count"] for row in value_freqs_df.iter_rows(named=True)} combined_freqs = {**static_code_freqs, **static_value_freqs, **code_freqs, **value_freqs} diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 39873c5..0967b64 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -226,7 +226,7 @@ def test_tabularize(): file_path = MEDS_cohort_dir / "final_cohort" / f"{split}.parquet" file_path.parent.mkdir(exist_ok=True) df = pl.read_csv(StringIO(data)) - df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S.%f")).write_parquet( + 
df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")).write_parquet( file_path ) From e7a85ba867c7e79e67fbc8974f63a8b82180857d Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 11:03:17 -0400 Subject: [PATCH 063/106] Fixed test installation instructions. --- .github/workflows/tests.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 999fffb..6aa8294 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -26,9 +26,7 @@ jobs: - name: Install packages run: | - pip install -e . - pip install pytest - pip install pytest-cov[toml] + pip install -e .[tests] #---------------------------------------------- # run test suite From bef63b639e506685feab345d7c5a796719f827bc Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 11:09:21 -0400 Subject: [PATCH 064/106] Resolved one error (or, rather, shifted it) by making some things properties, but don't know what the desired behavior is for the error --- src/MEDS_tabular_automl/file_name.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 7700b82..c621983 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -7,8 +7,14 @@ class FileNameResolver: def __init__(self, cfg: DictConfig): self.cfg = cfg - self.meds_dir = Path(cfg.MEDS_cohort_dir) - self.tabularize_dir = Path(cfg.tabularized_data_dir) + + @property + def meds_dir(self): + return Path(self.cfg.MEDS_cohort_dir) + + @property + def tabularize_dir(self): + return Path(self.cfg.tabularized_data_dir) def get_meds_dir(self): return self.meds_dir / "final_cohort" From e9775e24f637d8c1db9a952a6db41f62d070d61d Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 11:22:28 -0400 Subject: [PATCH 065/106] Shifted more test errors around, but the failures are deeper than expected. --- .../generate_summarized_reps.py | 70 +++++++------------ 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 70f6e68..2d49fdb 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -168,34 +168,17 @@ def compute_agg(index_df, matrix: sparray, window_size: str, agg: str, num_featu if agg is a code aggregation or only value columns if it is a value aggreagation. Example: - >>> from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep - >>> feature_columns = ['A/value/sum', 'A/code/count', 'B/value/sum', 'B/code/count', - ... "C/value/sum", "C/code/count", "A/static/present"] - >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], - ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], - ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], - ... 
'numerical_value': [1, 2, 2, 2, 3, 4]} - >>> df = pl.DataFrame(data).lazy() - >>> df = get_flat_ts_rep(feature_columns, df) - >>> df - patient_id timestamp A/value B/value C/value A/code B/code C/code - 0 1 2021-01-01 1 0 0 1 0 0 - 1 1 2021-01-01 2 0 0 1 0 0 - 2 1 2020-01-01 0 2 0 0 1 0 - 3 2 2021-01-04 0 2 0 0 1 0 - >>> df['timestamp'] = pd.to_datetime(df['timestamp']) - >>> df.dtypes - patient_id int64 - timestamp datetime64[ns] - A/value Sparse[int64, 0] - B/value Sparse[int64, 0] - C/value Sparse[int64, 0] - A/code Sparse[int64, 0] - B/code Sparse[int64, 0] - C/code Sparse[int64, 0] - dtype: object - >>> output = compute_agg(df[['patient_id', 'timestamp', 'A/code', 'B/code', 'C/code']], - ... "1d", "code/count") + >>> from datetime import datetime + >>> df = pd.DataFrame({ + ... "patient_id": [1, 1, 1, 2], + ... "timestamp": [ + ... datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2020, 1, 3), datetime(2021, 1, 4) + ... ], + ... "A/code": [1, 1, 0, 0], + ... "B/code": [0, 0, 1, 1], + ... "C/code": [0, 0, 0, 0], + ... }) + >>> output = compute_agg(df, "1d", "code/count") >>> output 1d/A/code/count 1d/B/code/count 1d/C/code/count timestamp patient_id 0 1 0 0 2021-01-01 1 @@ -247,23 +230,20 @@ def _generate_summary( - pl.LazyFrame: The summarized data frame. Expect: - >>> from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep - >>> feature_columns = ['A/value/sum', 'A/code/count', 'B/value/sum', 'B/code/count', - ... "C/value/sum", "C/code/count", "A/static/present"] - >>> data = {'patient_id': [1, 1, 1, 2, 2, 2], - ... 'code': ['A', 'A', 'B', 'B', 'C', 'C'], - ... 'timestamp': ['2021-01-01', '2021-01-01', '2020-01-01', '2021-01-04', None, None], - ... 'numerical_value': [1, 2, 2, 2, 3, 4]} - >>> df = pl.DataFrame(data).lazy() - >>> pivot_df = get_flat_ts_rep(feature_columns, df) - >>> pivot_df['timestamp'] = pd.to_datetime(pivot_df['timestamp']) - >>> pivot_df - patient_id timestamp A/value B/value C/value A/code B/code C/code - 0 1 2021-01-01 1 0 0 1 0 0 - 1 1 2021-01-01 2 0 0 1 0 0 - 2 1 2020-01-01 0 2 0 0 1 0 - 3 2 2021-01-04 0 2 0 0 1 0 - >>> _generate_summary(pivot_df, "full", "value/sum") + >>> from datetime import datetime + >>> wide_df = pd.DataFrame({ + ... "patient_id": [1, 1, 1, 2], + ... "timestamp": [ + ... datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2020, 1, 3), datetime(2021, 1, 4) + ... ], + ... "A/code": [1, 1, 0, 0], + ... "B/code": [0, 0, 1, 1], + ... "C/code": [0, 0, 0, 0], + ... "A/value": [1, 2, 0, 0], + ... "B/value": [0, 0, 2, 2], + ... "C/value": [0, 0, 0, 0], + ... 
}) + >>> _generate_summary(wide_df, "full", "value/sum") full/A/value/count full/B/value/count full/C/value/count timestamp patient_id 0 1 0 0 2021-01-01 1 1 3 0 0 2021-01-01 1 From 91a2056bb17da8d56e812de66e7e1741574eab1c Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Sun, 2 Jun 2024 18:14:26 +0000 Subject: [PATCH 066/106] update iterators to reference correct data --- scripts/xgboost_sweep.py | 73 ++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost_sweep.py index 8ad44d7..8897a07 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost_sweep.py @@ -1,16 +1,14 @@ import json import os from collections.abc import Callable, Mapping -from datetime import datetime from pathlib import Path -from timeit import timeit import hydra import numpy as np import polars as pl import scipy.sparse as sp import xgboost as xgb -from loguru import logger +from mixins import TimeableMixin from omegaconf import DictConfig, OmegaConf from sklearn.metrics import mean_absolute_error @@ -18,7 +16,7 @@ from MEDS_tabular_automl.utils import get_feature_indices, load_matrix -class Iterator(xgb.DataIter): +class Iterator(xgb.DataIter, TimeableMixin): def __init__(self, cfg: DictConfig, split: str = "train"): """Initialize the Iterator with the provided configuration and split. @@ -44,8 +42,12 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # XGBoost will generate some cache files under current directory with the prefix # "cache" - super().__init__(cache_prefix=os.path.join(".", "cache")) + super().__init__( + cache_prefix=os.path.join(".", "cache") + ) # TODO: Change where this is!! it should be in the same directory it comes from!! + # this is security issue! + @TimeableMixin.TimeAs def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: """Loads valid event ids and labels for each shard. @@ -65,6 +67,7 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() return cached_event_ids, cached_labels + @TimeableMixin.TimeAs def _get_code_set(self) -> set: """Get the set of codes to include in the data based on the configuration.""" with open(self.file_name_resolver.get_feature_columns_fp()) as f: @@ -92,6 +95,7 @@ def _get_code_set(self) -> set: # TODO: make sure we aren't filtering out static columns!!! return list(codes_set), len(feature_columns) + @TimeableMixin.TimeAs def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Load a sparse shard into memory. @@ -140,6 +144,7 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: return self._filter_shard_on_codes_and_freqs(agg, sp.csc_matrix(matrix)) + @TimeableMixin.TimeAs def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering column inclusion. @@ -163,6 +168,7 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: valid_indices = self.valid_event_ids[shard_name] return combined_csr[valid_indices, :] + @TimeableMixin.TimeAs def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. @@ -173,14 +179,11 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: - X (scipy.sparse.csr_matrix): Feature data frame.ß - y (numpy.ndarray): Labels. 
""" - time = datetime.now() dynamic_df = self._get_dynamic_shard_by_index(idx) - logger.debug(f"Dynamic data loading took {datetime.now() - time}") - time = datetime.now() label_df = self.labels[self._data_shards[idx]] - logger.debug(f"Task data loading took {datetime.now() - time}") return dynamic_df, label_df + @TimeableMixin.TimeAs def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csc_matrix: """Filter the dynamic data frame based on the inclusion sets. Given the codes_mask, filter the data frame to only include columns that are True in the mask. @@ -197,6 +200,7 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.cs code_mask = [True if idx in self.codes_set else False for idx in feature_ids] return df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] + @TimeableMixin.TimeAs def next(self, input_data: Callable): """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost during the construction of ``DMatrix`` @@ -207,7 +211,6 @@ def next(self, input_data: Callable): Returns: - int: 0 if end of iteration, 1 otherwise. """ - start_time = datetime.now() if self._it == len(self._data_shards): # return 0 to let XGBoost know this is the end of iteration return 0 @@ -218,13 +221,14 @@ def next(self, input_data: Callable): input_data(data=X, label=y) self._it += 1 # Return 1 to let XGBoost know we haven't seen all the files yet. - logger.debug(f"******** One iteration took {datetime.now() - start_time}") return 1 + @TimeableMixin.TimeAs def reset(self): """Reset the iterator to its beginning.""" self._it = 0 + @TimeableMixin.TimeAs def collect_in_memory(self) -> tuple[sp.coo_matrix, np.ndarray]: """Collect the data in memory. @@ -255,15 +259,16 @@ def __init__(self, cfg: DictConfig): self.keep_data_in_memory = getattr(getattr(cfg, "iterator", {}), "keep_data_in_memory", True) self.itrain = None - self.ival = None - self.itest = None + self.ituning = None + self.iheld_out = None self.dtrain = None - self.dval = None - self.dtest = None + self.dtuning = None + self.dheld_out = None self.model = None + @TimeableMixin.TimeAs def train(self): """Train the model.""" self._build() @@ -273,6 +278,7 @@ def train(self): OmegaConf.to_container(self.cfg.model), self.dtrain ) # do we want eval and things? 
+ @TimeableMixin.TimeAs def _build(self): """Build necessary data structures for training.""" if self.keep_data_in_memory: @@ -282,27 +288,31 @@ def _build(self): self._build_iterators() self._build_dmatrix_from_iterators() + @TimeableMixin.TimeAs def _build_dmatrix_in_memory(self): """Build the DMatrix from the data in memory.""" X_train, y_train = self.itrain.collect_in_memory() - X_val, y_val = self.ival.collect_in_memory() - X_test, y_test = self.itest.collect_in_memory() + X_tuning, y_tuning = self.ituning.collect_in_memory() + X_held_out, y_held_out = self.iheld_out.collect_in_memory() self.dtrain = xgb.DMatrix(X_train, label=y_train) - self.dval = xgb.DMatrix(X_val, label=y_val) - self.dtest = xgb.DMatrix(X_test, label=y_test) + self.dtuning = xgb.DMatrix(X_tuning, label=y_tuning) + self.dheld_out = xgb.DMatrix(X_held_out, label=y_held_out) + @TimeableMixin.TimeAs def _build_dmatrix_from_iterators(self): """Build the DMatrix from the iterators.""" - self.dtrain = xgb.DMatrix(self.ival) - self.dval = xgb.DMatrix(self.itest) - self.dtest = xgb.DMatrix(self.itest) + self.dtrain = xgb.DMatrix(self.irain) + self.dtuning = xgb.DMatrix(self.ituning) + self.dheld_out = xgb.DMatrix(self.iheld_out) + @TimeableMixin.TimeAs def _build_iterators(self): """Build the iterators for training, validation, and testing.""" self.itrain = Iterator(self.cfg, split="train") - self.ival = Iterator(self.cfg, split="tuning") - self.itest = Iterator(self.cfg, split="held_out") + self.ituning = Iterator(self.cfg, split="tuning") + self.iheld_out = Iterator(self.cfg, split="held_out") + @TimeableMixin.TimeAs def evaluate(self) -> float: """Evaluate the model on the test set. @@ -311,8 +321,8 @@ def evaluate(self) -> float: """ # TODO: Figure out exactly what we want to do here - y_pred = self.model.predict(self.dtest) - y_true = self.dtest.get_label() + y_pred = self.model.predict(self.dheld_out) + y_true = self.dheld_out.get_label() return mean_absolute_error(y_true, y_pred) @@ -326,12 +336,8 @@ def xgboost(cfg: DictConfig) -> float: Returns: - float: Evaluation result. """ - logger.debug("Initializing XGBoost model") model = XGBoostModel(cfg) - logger.debug("Training XGBoost model") - time = datetime.now() model.train() - logger.debug(f"Training took {datetime.now() - time}") # save model save_dir = ( Path(cfg.model_dir) @@ -346,9 +352,4 @@ def xgboost(cfg: DictConfig) -> float: if __name__ == "__main__": - # start_time = datetime.now() - # xgboost() - # logger.debug(f"Total time: {datetime.now() - start_time}") - num = 10 - time = timeit(xgboost, number=num) / num - logger.debug(f"Training time averaged over {num} runs: {time}") + xgboost() From 0bbe87945ffadd5a19914adc701b56c05bb30cab Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Sun, 2 Jun 2024 21:37:29 +0000 Subject: [PATCH 067/106] timeable --- configs/{xgboost_sweep.yaml => xgboost.yaml} | 16 ++-- scripts/{xgboost_sweep.py => xgboost.py} | 92 +++++++++----------- 2 files changed, 52 insertions(+), 56 deletions(-) rename configs/{xgboost_sweep.yaml => xgboost.yaml} (69%) rename scripts/{xgboost_sweep.py => xgboost.py} (83%) diff --git a/configs/xgboost_sweep.yaml b/configs/xgboost.yaml similarity index 69% rename from configs/xgboost_sweep.yaml rename to configs/xgboost.yaml index 606b446..b88f479 100644 --- a/configs/xgboost_sweep.yaml +++ b/configs/xgboost.yaml @@ -1,11 +1,13 @@ # Raw data MEDS_cohort_dir: ??? -tabularized_data_dir: ??? 
-model_dir: ${tabularized_data_dir}/model +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model_${now:%Y-%m-%d_%H-%M-%S}/ +cache_dir: ${tabularized_data_dir}/.cache # Pre-processing min_code_inclusion_frequency: 1 -window_sizes: [1d] +window_sizes: [full] codes: null aggs: - "code/count" @@ -28,10 +30,11 @@ model: booster: gbtree device: cpu tree_method: hist - objective: reg:squarederror + objective: binary:logistic iterator: keep_data_in_memory: False + binarize_task: True # Hydra settings for sweep defaults: @@ -41,9 +44,9 @@ defaults: hydra: verbose: False sweep: - dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} + dir: ${tabularized_data_dir}/.logs/${now:%Y-%m-%d_%H-%M-%S} run: - dir: ${tabularized_data_dir}/.logs/etl/${now:%Y-%m-%d_%H-%M-%S} + dir: ${tabularized_data_dir}/.logs/${now:%Y-%m-%d_%H-%M-%S} # Optuna Sweeper sweeper: @@ -57,5 +60,4 @@ hydra: # Define search space for Optuna params: window_sizes: choice([30d, 365d, full], [30d, full], [30d]) - # iterator.keep_static_data_in_memory: choice([True], [False]) # iterator.keep_data_in_memory: choice([True], [False]) diff --git a/scripts/xgboost_sweep.py b/scripts/xgboost.py similarity index 83% rename from scripts/xgboost_sweep.py rename to scripts/xgboost.py index 8897a07..40510e7 100644 --- a/scripts/xgboost_sweep.py +++ b/scripts/xgboost.py @@ -10,11 +10,16 @@ import xgboost as xgb from mixins import TimeableMixin from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import roc_auc_score from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.utils import get_feature_indices, load_matrix +from loguru import logger + +from datetime import datetime + + class Iterator(xgb.DataIter, TimeableMixin): def __init__(self, cfg: DictConfig, split: str = "train"): @@ -64,7 +69,12 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: for shard, label_fp in label_fps.items(): label_df = pl.scan_parquet(label_fp) cached_event_ids[shard] = label_df.select(pl.col("event_id")).collect().to_series() + + # TODO: check this for Nan or any other case we need to worry about cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() + # if self.cfg.iterator.binarize_task: + # cached_labels[shard] = cached_labels[shard].map_elements(lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8) + return cached_event_ids, cached_labels @TimeableMixin.TimeAs @@ -104,36 +114,6 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: Returns: - sp.coo_matrix: Data frame with the sparse shard. - >>> import tempfile - >>> from types import SimpleNamespace - >>> with tempfile.TemporaryDirectory() as tempdir: - ... sample_shard_path = Path(tempdir) / "sample_shard.npy" - ... sample_shard_data = np.array([[0, 1, 0], - ... [1, 0, 1], - ... [0, 1, 0]]) - ... sample_filtered_data = np.array([[1, 0], - ... [0, 1], - ... [1, 0]]) - ... np.save(sample_shard_path, sample_shard_data) - ... cfg = SimpleNamespace( - ... aggs=None, - ... window_sizes=None, - ... codes=None, - ... min_code_inclusion_frequency=None, - ... tabularized_data_dir=Path(tempdir) - ... ) - ... feature_columns = ["code1/code", "code2/code", "value1/value"] - ... with open(Path(tempdir) / "feature_columns.json", "w") as f: - ... json.dump(feature_columns, f) - ... iterator_instance = Iterator(cfg) - ... iterator_instance.codes_mask = np.array([False, True, True]) - ... 
loaded_shard = iterator_instance._load_dynamic_shard_from_file(sample_shard_path) - ... assert isinstance(loaded_shard, sp.csr_matrix) - ... expected_csr = sp.csr_matrix(sample_filtered_data) - ... assert sp.issparse(loaded_shard) - ... assert np.array_equal(loaded_shard.data, expected_csr.data) - ... assert np.array_equal(loaded_shard.indices, expected_csr.indices) - ... assert np.array_equal(loaded_shard.indptr, expected_csr.indptr) """ # column_shard is of form event_idx, feature_idx, value matrix = load_matrix(path) @@ -160,7 +140,9 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: files = self.file_name_resolver.get_model_files( self.cfg.window_sizes, self.cfg.aggs, self.split, self._data_shards[idx] ) - assert all([file.exists() for file in files]) + if not all(file.exists() for file in files): + raise ValueError("Not all files exist") + shard_name = self._data_shards[idx] dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] combined_csr = sp.hstack(dynamic_csrs, format="csr") # TODO: check this @@ -247,7 +229,7 @@ def collect_in_memory(self) -> tuple[sp.coo_matrix, np.ndarray]: return X, y -class XGBoostModel: +class XGBoostModel(TimeableMixin): def __init__(self, cfg: DictConfig): """Initialize the XGBoostClassifier with the provided configuration. @@ -267,16 +249,19 @@ def __init__(self, cfg: DictConfig): self.dheld_out = None self.model = None - @TimeableMixin.TimeAs - def train(self): + def _train(self): """Train the model.""" - self._build() # TODO: add in eval, early stopping, etc. # TODO: check for Nan and inf in labels! self.model = xgb.train( OmegaConf.to_container(self.cfg.model), self.dtrain - ) # do we want eval and things? + ) # TODO: fix eval etc. + @TimeableMixin.TimeAs + def train(self): + """Train the model.""" + self._build() + self._train() @TimeableMixin.TimeAs def _build(self): @@ -291,7 +276,7 @@ def _build(self): @TimeableMixin.TimeAs def _build_dmatrix_in_memory(self): """Build the DMatrix from the data in memory.""" - X_train, y_train = self.itrain.collect_in_memory() + X_train, y_train = self.ituning.collect_in_memory() X_tuning, y_tuning = self.ituning.collect_in_memory() X_held_out, y_held_out = self.iheld_out.collect_in_memory() self.dtrain = xgb.DMatrix(X_train, label=y_train) @@ -301,7 +286,7 @@ def _build_dmatrix_in_memory(self): @TimeableMixin.TimeAs def _build_dmatrix_from_iterators(self): """Build the DMatrix from the iterators.""" - self.dtrain = xgb.DMatrix(self.irain) + self.dtrain = xgb.DMatrix(self.itrain) self.dtuning = xgb.DMatrix(self.ituning) self.dheld_out = xgb.DMatrix(self.iheld_out) @@ -323,10 +308,10 @@ def evaluate(self) -> float: y_pred = self.model.predict(self.dheld_out) y_true = self.dheld_out.get_label() - return mean_absolute_error(y_true, y_pred) + return roc_auc_score(y_true, y_pred) -@hydra.main(version_base=None, config_path="../configs", config_name="xgboost_sweep") +@hydra.main(version_base=None, config_path="../configs", config_name="xgboost") def xgboost(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. 
@@ -338,17 +323,26 @@ def xgboost(cfg: DictConfig) -> float: """ model = XGBoostModel(cfg) model.train() + logger.info("Time Profiling:") + logger.info("Train Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model._profile_durations().items()))) + logger.info("Train Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.itrain._profile_durations().items()))) + logger.info("Tuning Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.ituning._profile_durations().items()))) + logger.info("Held Out Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.iheld_out._profile_durations().items()))) + + # print("Time Profiling:") + # print("Train Time: \n", model._profile_durations()) + # print("Train Iterator Time: \n", model.itrain._profile_durations()) + # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) + # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) + # save model - save_dir = ( - Path(cfg.model_dir) - / "_".join(map(str, cfg.window_sizes)) - / "_".join([agg.replace("/", "") for agg in cfg.aggs]) - ) + save_dir = Path(cfg.model_dir) save_dir.mkdir(parents=True, exist_ok=True) - model.model.save_model(save_dir / f"{np.random.randint(100000, 999999)}_model.json") - - return model.evaluate() + model.model.save_model(save_dir / "model.json") + auc = model.evaluate() + logger.info(f"ROC AUC: {auc}") + return auc if __name__ == "__main__": From e0914322031a3e65eb8cef36a693639990766dbd Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 17:45:39 -0400 Subject: [PATCH 068/106] Update xgboost.py --- scripts/xgboost.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/xgboost.py b/scripts/xgboost.py index 40510e7..911fd88 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -178,9 +178,13 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.cs """ if self.codes_set is None: return df + key=f"_filter_shard_on_codes_and_freqs/{agg}" + self._register_start(key=key) feature_ids = self.agg_to_feature_ids[agg] code_mask = [True if idx in self.codes_set else False for idx in feature_ids] - return df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] + df = df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] + self._register_end(key=key) + return df @TimeableMixin.TimeAs def next(self, input_data: Callable): From dc8a093420d39d5743793bf060748b58bf69a774 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 17:49:55 -0400 Subject: [PATCH 069/106] Add time tracking around row selection. 
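This commit registers separate timing keys around the `sp.hstack` call and the row-selection step inside `_get_dynamic_shard_by_index`, so each shows up as its own entry in the TimeableMixin duration profile. As a rough, illustrative sketch only (not part of the repository), the registered keys could be read back through the `_profile_durations()` accessor that later commits in this series call (e.g. `model.itrain._profile_durations()`):

    def print_shard_timings(it):
        """Illustrative helper, assuming TimeableMixin's `_profile_durations()`.

        `it` is an Iterator instance that has already loaded at least one
        dynamic shard, so the keys registered in this patch are populated.
        """
        durations = it._profile_durations()
        # These key names match the ones constructed in this patch.
        for key in ("_get_dynamic_shard_by_index/hstack",
                    "_get_dynamic_shard_by_index/filter"):
            print(key, durations.get(key))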
--- scripts/xgboost.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/xgboost.py b/scripts/xgboost.py index 911fd88..2a03846 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -145,10 +145,19 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: shard_name = self._data_shards[idx] dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] + + fn_name = "_get_dynamic_shard_by_index" + hstack_key = f"{fn_name}/hstack" + self._register_start(key=hstack_key) combined_csr = sp.hstack(dynamic_csrs, format="csr") # TODO: check this + self._register_end(key=hstack_key) # Filter Rows valid_indices = self.valid_event_ids[shard_name] - return combined_csr[valid_indices, :] + filter_key = f"{fn_name}/filter" + self._register_start(key=filter_key) + out = combined_csr[valid_indices, :] + self._register_end(key=filter_key) + return out @TimeableMixin.TimeAs def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: From 45470aea79a1a980f7d7daf4f230f704dca47a30 Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Sun, 2 Jun 2024 22:45:22 +0000 Subject: [PATCH 070/106] sparse updates --- configs/xgboost.yaml | 6 ++--- hf_cohort/xgboost.sh | 3 ++- scripts/xgboost.py | 38 ++++++++++++++-------------- src/MEDS_tabular_automl/file_name.py | 7 +++++ 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/configs/xgboost.yaml b/configs/xgboost.yaml index b88f479..ff2e273 100644 --- a/configs/xgboost.yaml +++ b/configs/xgboost.yaml @@ -6,8 +6,8 @@ model_dir: ${MEDS_cohort_dir}/model_${now:%Y-%m-%d_%H-%M-%S}/ cache_dir: ${tabularized_data_dir}/.cache # Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: [full] +min_code_inclusion_frequency: 10 +window_sizes: [1d] codes: null aggs: - "code/count" @@ -30,7 +30,7 @@ model: booster: gbtree device: cpu tree_method: hist - objective: binary:logistic + objective: reg:squarederror iterator: keep_data_in_memory: False diff --git a/hf_cohort/xgboost.sh b/hf_cohort/xgboost.sh index 3ef570f..f6dd525 100644 --- a/hf_cohort/xgboost.sh +++ b/hf_cohort/xgboost.sh @@ -2,5 +2,6 @@ BASE_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed TAB_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +KEEP_IN_MEMORY=True -python -m scripts.xgboost_sweep MEDS_cohort_dir=$BASE_DIR tabularized_data_dir=$TAB_DIR +python -m scripts.xgboost MEDS_cohort_dir=$BASE_DIR tabularized_data_dir=$TAB_DIR iterator.keep_data_in_memory=$KEEP_IN_MEMORY diff --git a/scripts/xgboost.py b/scripts/xgboost.py index 2a03846..a45c75d 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -10,7 +10,7 @@ import xgboost as xgb from mixins import TimeableMixin from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_auc_score, mean_absolute_error from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.utils import get_feature_indices, load_matrix @@ -36,7 +36,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # self.dynamic_data_path = self.data_path / "sparse" / split # self.task_data_path = self.data_path / "task" / split self._data_shards = sorted( - [shard.stem for shard in self.file_name_resolver.list_label_files(split)] + ["0"]#[shard.stem for shard in self.file_name_resolver.list_label_files(split)] ) # [2, 4, 5] # self.valid_event_ids, self.labels = self.load_labels() self.codes_set, self.num_features = self._get_code_set() @@ -48,7 +48,7 @@ def 
__init__(self, cfg: DictConfig, split: str = "train"): # XGBoost will generate some cache files under current directory with the prefix # "cache" super().__init__( - cache_prefix=os.path.join(".", "cache") + cache_prefix=os.path.join(self.file_name_resolver.get_cache_dir()) ) # TODO: Change where this is!! it should be in the same directory it comes from!! # this is security issue! @@ -106,7 +106,7 @@ def _get_code_set(self) -> set: return list(codes_set), len(feature_columns) @TimeableMixin.TimeAs - def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: + def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csr_matrix: """Load a sparse shard into memory. Args: @@ -175,7 +175,7 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: return dynamic_df, label_df @TimeableMixin.TimeAs - def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csc_matrix: + def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csr_matrix: """Filter the dynamic data frame based on the inclusion sets. Given the codes_mask, filter the data frame to only include columns that are True in the mask. @@ -193,7 +193,7 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.cs code_mask = [True if idx in self.codes_set else False for idx in feature_ids] df = df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] self._register_end(key=key) - return df + return sp.csr_matrix(df) @TimeableMixin.TimeAs def next(self, input_data: Callable): @@ -289,7 +289,7 @@ def _build(self): @TimeableMixin.TimeAs def _build_dmatrix_in_memory(self): """Build the DMatrix from the data in memory.""" - X_train, y_train = self.ituning.collect_in_memory() + X_train, y_train = self.itrain.collect_in_memory() X_tuning, y_tuning = self.ituning.collect_in_memory() X_held_out, y_held_out = self.iheld_out.collect_in_memory() self.dtrain = xgb.DMatrix(X_train, label=y_train) @@ -321,7 +321,7 @@ def evaluate(self) -> float: y_pred = self.model.predict(self.dheld_out) y_true = self.dheld_out.get_label() - return roc_auc_score(y_true, y_pred) + return mean_absolute_error(y_true, y_pred) @hydra.main(version_base=None, config_path="../configs", config_name="xgboost") @@ -336,17 +336,17 @@ def xgboost(cfg: DictConfig) -> float: """ model = XGBoostModel(cfg) model.train() - logger.info("Time Profiling:") - logger.info("Train Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model._profile_durations().items()))) - logger.info("Train Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.itrain._profile_durations().items()))) - logger.info("Tuning Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.ituning._profile_durations().items()))) - logger.info("Held Out Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.iheld_out._profile_durations().items()))) - - # print("Time Profiling:") - # print("Train Time: \n", model._profile_durations()) - # print("Train Iterator Time: \n", model.itrain._profile_durations()) - # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) - # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) + # logger.info("Time Profiling:") + # logger.info("Train Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model._profile_durations().items()))) + # logger.info("Train Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, 
value in model.itrain._profile_durations().items()))) + # logger.info("Tuning Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.ituning._profile_durations().items()))) + # logger.info("Held Out Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.iheld_out._profile_durations().items()))) + + print("Time Profiling:") + print("Train Time: \n", model._profile_durations()) + print("Train Iterator Time: \n", model.itrain._profile_durations()) + print("Tuning Iterator Time: \n", model.ituning._profile_durations()) + print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) # save model save_dir = Path(cfg.model_dir) diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index c621983..f19e9b6 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -16,6 +16,10 @@ def meds_dir(self): def tabularize_dir(self): return Path(self.cfg.tabularized_data_dir) + @property + def cache_dir(self): + return Path(self.cfg.cache_dir) + def get_meds_dir(self): return self.meds_dir / "final_cohort" @@ -90,6 +94,9 @@ def list_label_files(self, split=None): if split: return sorted(list(self.get_label_dir().glob(f"{split}/*.parquet"))) return sorted(list(self.get_label_dir().glob("*/*.parquet"))) + + def get_cache_dir(self): + return self.cache_dir def get_model_files(self, window_sizes, aggs, split, shard_num: int): # Given a shard number, returns the model files From 84f5a772e1b86a521f5ad3c584fa7a8ba01d3e2d Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 3 Jun 2024 01:08:48 +0000 Subject: [PATCH 071/106] speed up? --- scripts/xgboost.py | 101 +++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/scripts/xgboost.py b/scripts/xgboost.py index a45c75d..cd657ef 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -8,17 +8,13 @@ import polars as pl import scipy.sparse as sp import xgboost as xgb +from loguru import logger from mixins import TimeableMixin from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import roc_auc_score, mean_absolute_error +from sklearn.metrics import mean_absolute_error from MEDS_tabular_automl.file_name import FileNameResolver -from MEDS_tabular_automl.utils import get_feature_indices, load_matrix - -from loguru import logger - -from datetime import datetime - +from MEDS_tabular_automl.utils import get_feature_indices class Iterator(xgb.DataIter, TimeableMixin): @@ -32,15 +28,14 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.cfg = cfg self.file_name_resolver = FileNameResolver(cfg) self.split = split - # self.data_path = Path(cfg.tabularized_data_dir) - # self.dynamic_data_path = self.data_path / "sparse" / split - # self.task_data_path = self.data_path / "task" / split + self._data_shards = sorted( - ["0"]#[shard.stem for shard in self.file_name_resolver.list_label_files(split)] - ) # [2, 4, 5] # + [shard.stem for shard in self.file_name_resolver.list_label_files(split)] + ) self.valid_event_ids, self.labels = self.load_labels() - self.codes_set, self.num_features = self._get_code_set() + self.codes_set, self.code_masks, self.num_features = self._get_code_set() feature_columns = json.load(open(self.file_name_resolver.get_feature_columns_fp())) + self.agg_to_feature_ids = {agg: get_feature_indices(agg, feature_columns) for agg in cfg.aggs} self._it = 0 @@ -49,8 +44,34 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # "cache" 
super().__init__( cache_prefix=os.path.join(self.file_name_resolver.get_cache_dir()) - ) # TODO: Change where this is!! it should be in the same directory it comes from!! - # this is security issue! + ) + + @TimeableMixin.TimeAs + def _get_code_masks(self, feature_columns, codes_set): + code_masks = {} + for agg in set(self.cfg.aggs): + feature_ids = get_feature_indices(agg, feature_columns) + code_mask = [True if idx in codes_set else False for idx in feature_ids] + code_masks[agg] = code_mask + return code_masks + + + @TimeableMixin.TimeAs + def _load_matrix(self, path: Path) -> sp.csr_matrix: + """Load a sparse matrix from disk. + + Args: + - path (Path): Path to the sparse matrix. + + Returns: + - sp.csr_matrix: Sparse matrix. + """ + npzfile = np.load(path) + array, shape = npzfile["array"], npzfile["shape"] + if array.shape[0] != 3: + raise ValueError(f"Expected array to have 3 rows, but got {array.shape[0]} rows") + data, row, col = array + return sp.csr_matrix((data, (row, col)), shape=shape) @TimeableMixin.TimeAs def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: @@ -70,11 +91,11 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: label_df = pl.scan_parquet(label_fp) cached_event_ids[shard] = label_df.select(pl.col("event_id")).collect().to_series() - # TODO: check this for Nan or any other case we need to worry about + # TODO: check this for Nan or any other case we need to worry about cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() # if self.cfg.iterator.binarize_task: # cached_labels[shard] = cached_labels[shard].map_elements(lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8) - + return cached_event_ids, cached_labels @TimeableMixin.TimeAs @@ -102,8 +123,10 @@ def _get_code_set(self) -> set: codes_set = frequency_set else: codes_set = None # set(feature_columns) + if codes_set == set(feature_columns): + codes_set = None # TODO: make sure we aren't filtering out static columns!!! - return list(codes_set), len(feature_columns) + return codes_set, self._get_code_masks(feature_columns, codes_set), len(feature_columns) @TimeableMixin.TimeAs def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csr_matrix: @@ -116,13 +139,13 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csr_matrix: - sp.coo_matrix: Data frame with the sparse shard. """ # column_shard is of form event_idx, feature_idx, value - matrix = load_matrix(path) + matrix = self._load_matrix(path) if path.stem in ["first", "present"]: agg = f"static/{path.stem}" else: agg = f"{path.parent.stem}/{path.stem}" - return self._filter_shard_on_codes_and_freqs(agg, sp.csc_matrix(matrix)) + return self._filter_shard_on_codes_and_freqs(agg, matrix) @TimeableMixin.TimeAs def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: @@ -142,7 +165,7 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: ) if not all(file.exists() for file in files): raise ValueError("Not all files exist") - + shard_name = self._data_shards[idx] dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] @@ -175,7 +198,7 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: return dynamic_df, label_df @TimeableMixin.TimeAs - def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csr_matrix: + def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csr_matrix) -> sp.csr_matrix: """Filter the dynamic data frame based on the inclusion sets. 
Given the codes_mask, filter the data frame to only include columns that are True in the mask. @@ -187,13 +210,27 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.cs """ if self.codes_set is None: return df - key=f"_filter_shard_on_codes_and_freqs/{agg}" - self._register_start(key=key) - feature_ids = self.agg_to_feature_ids[agg] - code_mask = [True if idx in self.codes_set else False for idx in feature_ids] - df = df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] - self._register_end(key=key) - return sp.csr_matrix(df) + # key = f"_filter_shard_on_codes_and_freqs/{agg}" + # self._register_start(key=key) + + # feature_ids = self.agg_to_feature_ids[agg] + # code_mask = [True if idx in self.codes_set else False for idx in feature_ids] + # filtered_df = df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] + + # # df = df[:, self.code_masks[agg]] # [:, list({index for index in self.codes_set if index < df.shape[1]})] + + # self._register_end(key=key) + + # if not np.array_equal(code_mask, self.code_masks[agg]): + # raise ValueError("code_mask and another_mask are not the same") + ckey = f"precomputed_filter_shard_on_codes_and_freqs/{agg}" + self._register_start(key=ckey) + + df = df[:, self.code_masks[agg]] + + self._register_end(key=ckey) + + return df @TimeableMixin.TimeAs def next(self, input_data: Callable): @@ -262,14 +299,14 @@ def __init__(self, cfg: DictConfig): self.dheld_out = None self.model = None + @TimeableMixin.TimeAs def _train(self): """Train the model.""" # TODO: add in eval, early stopping, etc. # TODO: check for Nan and inf in labels! - self.model = xgb.train( - OmegaConf.to_container(self.cfg.model), self.dtrain - ) # TODO: fix eval etc. + self.model = xgb.train(OmegaConf.to_container(self.cfg.model), self.dtrain) # TODO: fix eval etc. 
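# (Editorial sketch, not part of the patch.) The speed-up in this commit comes from
# building the per-aggregation boolean masks once in ``_get_code_masks`` and reusing
# them for every shard, instead of re-testing set membership in the hot path. A
# stripped-down illustration with made-up values (scipy sparse matrices support
# boolean column indexing):
#
#     import numpy as np
#     import scipy.sparse as sp
#
#     feature_ids = list(range(6))
#     codes_set = {1, 3, 4}
#     code_mask = np.array([idx in codes_set for idx in feature_ids])  # computed once
#
#     shard = sp.random(8, 6, density=0.25, format="csc")
#     filtered = shard[:, code_mask]                                   # reused per shard
#     assert filtered.shape == (8, 3)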
+ @TimeableMixin.TimeAs def train(self): """Train the model.""" From a2fffa4654092217d6b5065e4e5f6296fc8dde47 Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 3 Jun 2024 01:27:45 +0000 Subject: [PATCH 072/106] wip --- scripts/xgboost.py | 17 ++++++----------- src/MEDS_tabular_automl/file_name.py | 4 ++-- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/scripts/xgboost.py b/scripts/xgboost.py index cd657ef..7ca74fd 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -29,9 +29,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.file_name_resolver = FileNameResolver(cfg) self.split = split - self._data_shards = sorted( - [shard.stem for shard in self.file_name_resolver.list_label_files(split)] - ) + self._data_shards = sorted([shard.stem for shard in self.file_name_resolver.list_label_files(split)]) self.valid_event_ids, self.labels = self.load_labels() self.codes_set, self.code_masks, self.num_features = self._get_code_set() feature_columns = json.load(open(self.file_name_resolver.get_feature_columns_fp())) @@ -42,9 +40,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): # XGBoost will generate some cache files under current directory with the prefix # "cache" - super().__init__( - cache_prefix=os.path.join(self.file_name_resolver.get_cache_dir()) - ) + super().__init__(cache_prefix=os.path.join(self.file_name_resolver.get_cache_dir())) @TimeableMixin.TimeAs def _get_code_masks(self, feature_columns, codes_set): @@ -54,7 +50,6 @@ def _get_code_masks(self, feature_columns, codes_set): code_mask = [True if idx in codes_set else False for idx in feature_ids] code_masks[agg] = code_mask return code_masks - @TimeableMixin.TimeAs def _load_matrix(self, path: Path) -> sp.csr_matrix: @@ -212,7 +207,7 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csr_matrix) -> sp.cs return df # key = f"_filter_shard_on_codes_and_freqs/{agg}" # self._register_start(key=key) - + # feature_ids = self.agg_to_feature_ids[agg] # code_mask = [True if idx in self.codes_set else False for idx in feature_ids] # filtered_df = df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] @@ -223,10 +218,10 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csr_matrix) -> sp.cs # if not np.array_equal(code_mask, self.code_masks[agg]): # raise ValueError("code_mask and another_mask are not the same") - ckey = f"precomputed_filter_shard_on_codes_and_freqs/{agg}" + ckey = f"_filter_shard_on_codes_and_freqs/{agg}" self._register_start(key=ckey) - - df = df[:, self.code_masks[agg]] + + df = df[:, self.code_masks[agg]] self._register_end(key=ckey) diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index f19e9b6..ed877ef 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -19,7 +19,7 @@ def tabularize_dir(self): @property def cache_dir(self): return Path(self.cfg.cache_dir) - + def get_meds_dir(self): return self.meds_dir / "final_cohort" @@ -94,7 +94,7 @@ def list_label_files(self, split=None): if split: return sorted(list(self.get_label_dir().glob(f"{split}/*.parquet"))) return sorted(list(self.get_label_dir().glob("*/*.parquet"))) - + def get_cache_dir(self): return self.cache_dir From d7c1c1324075f36d4864a5b1cae61f0eba4741de Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 3 Jun 2024 02:37:49 +0000 Subject: [PATCH 073/106] config updates --- configs/xgboost.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/configs/xgboost.yaml b/configs/xgboost.yaml index ff2e273..b3c18fb 100644 --- a/configs/xgboost.yaml +++ b/configs/xgboost.yaml @@ -2,11 +2,11 @@ MEDS_cohort_dir: ??? tabularized_data_dir: ${MEDS_cohort_dir}/tabularize task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model_${now:%Y-%m-%d_%H-%M-%S}/ +model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} cache_dir: ${tabularized_data_dir}/.cache # Pre-processing -min_code_inclusion_frequency: 10 +min_code_inclusion_frequency: 1 window_sizes: [1d] codes: null aggs: @@ -44,9 +44,9 @@ defaults: hydra: verbose: False sweep: - dir: ${tabularized_data_dir}/.logs/${now:%Y-%m-%d_%H-%M-%S} + dir: ${tmodel_dir}/.logs/ run: - dir: ${tabularized_data_dir}/.logs/${now:%Y-%m-%d_%H-%M-%S} + dir: ${model_dir}/.logs/ # Optuna Sweeper sweeper: From f6786f78e9746f0e191e7e1273aea81ee78f2595 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 03:26:04 +0000 Subject: [PATCH 074/106] fixed tests and added starter code for caching csc matrix --- configs/tabularize.yaml | 57 +++++++++++++++++---- configs/xgboost.yaml | 63 ----------------------- scripts/task_specific_caching.py | 75 ++++++++++++++++++++++++++++ scripts/xgboost.py | 2 +- src/MEDS_tabular_automl/file_name.py | 2 +- tests/test_tabularize.py | 9 ++-- 6 files changed, 127 insertions(+), 81 deletions(-) delete mode 100644 configs/xgboost.yaml create mode 100644 scripts/task_specific_caching.py diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 9f64161..d5c3338 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -1,10 +1,18 @@ # Raw data MEDS_cohort_dir: ??? -tabularized_data_dir: ??? +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache # Pre-processing -min_code_inclusion_frequency: ??? -window_sizes: ??? +min_code_inclusion_frequency: 1 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" codes: null aggs: - "static/present" @@ -15,6 +23,7 @@ aggs: - "value/sum_sqd" - "value/min" - "value/max" + dynamic_threshold: 0.01 numerical_value_threshold: 0.1 @@ -25,15 +34,41 @@ n_patients_per_sub_shard: null do_overwrite: False do_update: True seed: 1 -tqdm: False -worker: 1 +tqdm: True test: False -# Hydra +model: + booster: gbtree + device: cpu + tree_method: hist + objective: reg:squarederror + +iterator: + keep_data_in_memory: False + binarize_task: True + +# Hydra settings for sweep +defaults: + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + hydra: - job: - name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} + verbose: False sweep: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} + dir: ${tmodel_dir}/.logs/ + run: + dir: ${model_dir}/.logs/ + + # Optuna Sweeper + sweeper: + sampler: + seed: 1 + storage: null + study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} + direction: minimize + n_trials: 10 + + # Define search space for Optuna + params: + window_sizes: choice([30d, 365d, full], [30d, full], [30d]) + # iterator.keep_data_in_memory: choice([True], [False]) diff --git a/configs/xgboost.yaml b/configs/xgboost.yaml deleted file mode 100644 index b3c18fb..0000000 --- a/configs/xgboost.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# Raw data -MEDS_cohort_dir: ??? 
-tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: [1d] -codes: null -aggs: - - "code/count" - - "value/sum" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -do_update: True -seed: 1 -tqdm: True -test: False - -model: - booster: gbtree - device: cpu - tree_method: hist - objective: reg:squarederror - -iterator: - keep_data_in_memory: False - binarize_task: True - -# Hydra settings for sweep -defaults: - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - -hydra: - verbose: False - sweep: - dir: ${tmodel_dir}/.logs/ - run: - dir: ${model_dir}/.logs/ - - # Optuna Sweeper - sweeper: - sampler: - seed: 1 - storage: null - study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} - direction: minimize - n_trials: 10 - - # Define search space for Optuna - params: - window_sizes: choice([30d, 365d, full], [30d, full], [30d]) - # iterator.keep_data_in_memory: choice([True], [False]) diff --git a/scripts/task_specific_caching.py b/scripts/task_specific_caching.py new file mode 100644 index 0000000..2abc35f --- /dev/null +++ b/scripts/task_specific_caching.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +"""Aggregates time-series data for feature columns across different window sizes.""" +import json + +import hydra +import numpy as np +import polars as pl +from omegaconf import DictConfig + +from MEDS_tabular_automl.file_name import FileNameResolver +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap +from MEDS_tabular_automl.utils import ( + hydra_loguru_init, + load_tqdm, + write_df, +) + + +def generate_row_cached_matrix(matrix, label_df, feature_columns): + """Generates row-cached matrix for a given matrix and label_df.""" + return None # TODO + + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +def task_specific_cache( + cfg: DictConfig, +): + """Performs row splicing of tabularized data for a specific task.""" + iter_wrapper = load_tqdm(cfg.tqdm) + if not cfg.test: + hydra_loguru_init() + f_name_resolver = FileNameResolver(cfg) + # Produce ts representation + meds_shard_fps = f_name_resolver.list_meds_files() + feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + + # shuffle tasks + tabularization_tasks = f_name_resolver.list_static_files() + f_name_resolver.list_ts_files() + np.random.shuffle(tabularization_tasks) + + # iterate through them + for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): + agg, window_size = 0, 0 # TODO: fix + shard_num = shard_fp.stem + split = shard_fp.parent.stem + out_fp = f_name_resolver.get_task_specific_output( + split, shard_num, window_size, agg + ) # TODO make this function + + def read_fn(fps): + matrix_fp, label_fp = fps + return load_matrix(fp), pl.scan_parquet(label_fp) + + def compute_fn(shard_dfs): + matrix, label_df = shard_dfs + cache_matrix = generate_row_cached_matrix(matrix, label_df, feature_columns) + return cache_matrix + + def write_fn(cache_matrix, out_fp): + write_df(cache_matrix, out_fp, do_overwrite=cfg.do_overwrite) + + rwlock_wrap( + shard_fp, + ts_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + + +if __name__ == "__main__": + task_specific_cache() diff --git a/scripts/xgboost.py 
b/scripts/xgboost.py index 7ca74fd..c718580 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -356,7 +356,7 @@ def evaluate(self) -> float: return mean_absolute_error(y_true, y_pred) -@hydra.main(version_base=None, config_path="../configs", config_name="xgboost") +@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") def xgboost(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index ed877ef..96c0c76 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -33,7 +33,7 @@ def get_sparse_dir(self): return self.tabularize_dir / "sparse" def get_label_dir(self): - return self.tabularize_dir / "task" + return Path(self.cfg.task_dir) def get_feature_columns_fp(self): return self.tabularize_dir / "feature_columns.json" diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 0967b64..c45b9ef 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -21,7 +21,7 @@ from scripts.identify_columns import store_columns from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data -from scripts.xgboost_sweep import xgboost +from scripts.xgboost import xgboost SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -200,6 +200,7 @@ def test_tabularize(): "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), "tabularized_data_dir": str(tabularized_data_dir.resolve()), "min_code_inclusion_frequency": 1, + "model_dir": str(Path(d) / "save_model"), "window_sizes": ["30d", "365d", "full"], "aggs": ["code/count", "value/sum", "static/present", "static/first"], "codes": "null", @@ -314,9 +315,7 @@ def test_tabularize(): "hydra.mode": "MULTIRUN", } xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} - with initialize(version_base=None, config_path="../configs/"): # path to config.yaml - overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="xgboost_sweep", overrides=overrides) # config.yaml xgboost(cfg) - output_files = list(model_dir.glob("*/*/*_model.json")) + output_files = list(Path(cfg.model_dir).glob("*.json")) assert len(output_files) == 1 + assert output_files[0] == Path(cfg.model_dir) / "model.json" From 3c1b9108ff97f8644ff580d04389c8c9cef1e49f Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 3 Jun 2024 03:29:32 +0000 Subject: [PATCH 075/106] time profiling --- scripts/xgboost.py | 64 ++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/scripts/xgboost.py b/scripts/xgboost.py index 7ca74fd..99a6e2b 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -11,7 +11,7 @@ from loguru import logger from mixins import TimeableMixin from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import roc_auc_score from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.utils import get_feature_indices @@ -88,8 +88,10 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: # TODO: check this for Nan or any other case we need to worry about cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() - # if self.cfg.iterator.binarize_task: - # cached_labels[shard] = 
cached_labels[shard].map_elements(lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8) + if self.cfg.iterator.binarize_task: + cached_labels[shard] = cached_labels[shard].map_elements( + lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 + ) return cached_event_ids, cached_labels @@ -131,7 +133,7 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csr_matrix: - path (Path): Path to the sparse shard. Returns: - - sp.coo_matrix: Data frame with the sparse shard. + - sp.csr_matrix: Data frame with the sparse shard. """ # column_shard is of form event_idx, feature_idx, value matrix = self._load_matrix(path) @@ -198,26 +200,14 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csr_matrix) -> sp.cs frame to only include columns that are True in the mask. Args: - - df (scipy.sparse.coo_matrix): Data frame to filter. + - df (scipy.sparse.csr_matrix): Data frame to filter. Returns: - df (scipy.sparse.sp.csr_matrix): Filtered data frame. """ if self.codes_set is None: return df - # key = f"_filter_shard_on_codes_and_freqs/{agg}" - # self._register_start(key=key) - # feature_ids = self.agg_to_feature_ids[agg] - # code_mask = [True if idx in self.codes_set else False for idx in feature_ids] - # filtered_df = df[:, code_mask] # [:, list({index for index in self.codes_set if index < df.shape[1]})] - - # # df = df[:, self.code_masks[agg]] # [:, list({index for index in self.codes_set if index < df.shape[1]})] - - # self._register_end(key=key) - - # if not np.array_equal(code_mask, self.code_masks[agg]): - # raise ValueError("code_mask and another_mask are not the same") ckey = f"_filter_shard_on_codes_and_freqs/{agg}" self._register_start(key=ckey) @@ -256,7 +246,7 @@ def reset(self): self._it = 0 @TimeableMixin.TimeAs - def collect_in_memory(self) -> tuple[sp.coo_matrix, np.ndarray]: + def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: """Collect the data in memory. Returns: @@ -295,12 +285,32 @@ def __init__(self, cfg: DictConfig): self.model = None + # @TimeableMixin.TimeAs + # def _get_callbacks(self): + # """Get the callbacks for training.""" + # callbacks = [] + # if self.cfg.model.early_stopping_rounds is not None: + # es = xgb.callback.EarlyStopping( + # rounds=self.cfg.model.early_stopping_rounds, + # min_delta=1e-3, + # save_best=True, + # maximize=True, + # data_name="tuning", + # metric_name="auc", + # ) + # callbacks.append(es) + # return callbacks + @TimeableMixin.TimeAs def _train(self): """Train the model.""" - # TODO: add in eval, early stopping, etc. - # TODO: check for Nan and inf in labels! - self.model = xgb.train(OmegaConf.to_container(self.cfg.model), self.dtrain) # TODO: fix eval etc. 
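# (Editorial note, not part of the patch.) With ``xgb.train``, early stopping watches
# the last metric on the *last* entry in ``evals`` -- the "tuning" set below -- and halts
# once it fails to improve for ``early_stopping_rounds`` consecutive rounds; the best
# round is then available on the booster, e.g. (illustrative names):
#
#     booster = xgb.train(
#         params,
#         dtrain,
#         num_boost_round=1000,
#         evals=[(dtrain, "train"), (dtuning, "tuning")],
#         early_stopping_rounds=5,
#     )
#     best_round = booster.best_iteration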
+ self.model = xgb.train( + OmegaConf.to_container(self.cfg.model), + self.dtrain, + num_boost_round=self.cfg.num_boost_round, + early_stopping_rounds=self.cfg.early_stopping_rounds, + evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], + ) @TimeableMixin.TimeAs def train(self): @@ -353,7 +363,7 @@ def evaluate(self) -> float: y_pred = self.model.predict(self.dheld_out) y_true = self.dheld_out.get_label() - return mean_absolute_error(y_true, y_pred) + return roc_auc_score(y_true, y_pred) @hydra.main(version_base=None, config_path="../configs", config_name="xgboost") @@ -368,13 +378,11 @@ def xgboost(cfg: DictConfig) -> float: """ model = XGBoostModel(cfg) model.train() - # logger.info("Time Profiling:") - # logger.info("Train Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model._profile_durations().items()))) - # logger.info("Train Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.itrain._profile_durations().items()))) - # logger.info("Tuning Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.ituning._profile_durations().items()))) - # logger.info("Held Out Iterator Time:\n{}".format("\n".join(f"{key}: {value}" for key, value in model.iheld_out._profile_durations().items()))) - print("Time Profiling:") + print( + "Time Profiling for window sizes ", + f"{cfg.window_sizes} and min code frequency of {cfg.min_code_inclusion_frequency}:", + ) print("Train Time: \n", model._profile_durations()) print("Train Iterator Time: \n", model.itrain._profile_durations()) print("Tuning Iterator Time: \n", model.ituning._profile_durations()) From fce9bccc166005e91a36c23c732d7706a2417985 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 03:36:17 +0000 Subject: [PATCH 076/106] added code to reduce dtype and filter out zero or nan values from sparse matrix --- src/MEDS_tabular_automl/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index f80b87b..61c1950 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -73,8 +73,24 @@ def array_to_sparse_matrix(array: np.ndarray, shape: tuple[int, int]): return coo_array((data, (row, col)), shape=shape) +def get_min_dtype(array): + return np.result_type(np.min_scalar_type(array.min()), array.max()) + + def sparse_matrix_to_array(coo_matrix: coo_array): - return np.array([coo_matrix.data, coo_matrix.row, coo_matrix.col]), coo_matrix.shape + data, row, col = coo_matrix.data, coo_matrix.row, coo_matrix.col + # Remove invalid indices + valid_indices = (data == 0) | np.isnan(data) + data = data[~valid_indices] + row = row[~valid_indices] + col = col[~valid_indices] + # reduce dtypes + if len(data): + data = data.astype(get_min_dtype(data)) + row = row.astype(get_min_dtype(row)) + col = col.astype(get_min_dtype(col)) + + return np.array([data, row, col]), coo_matrix.shape def store_matrix(coo_matrix: coo_array, fp_path: Path): From 136204ea5b34d0027678f0b7a1e839a9a939c80e Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 14:49:05 +0000 Subject: [PATCH 077/106] updated fixes for yaml --- hf_cohort/cohort.yaml | 373 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 373 insertions(+) create mode 100644 hf_cohort/cohort.yaml diff --git a/hf_cohort/cohort.yaml b/hf_cohort/cohort.yaml new file mode 100644 index 0000000..51eb868 --- /dev/null +++ b/hf_cohort/cohort.yaml @@ -0,0 +1,373 @@ +patient_id_col: "empi" + 
+demographic: + sex: + code: + - SEX + - col(sex) + timestamp: null + race: + code: + - RACE + - col(race) + timestamp: null + country: + code: + - COUNTRY + - col(country) + timestamp: null + zip_code: + code: + - ZIP_CODE + - col(zip_code) + timestamp: null + birth: + code: BIRTH + timestamp: col(date_of_birth) + timestamp_format: "%Y-%m-%d" + death: + code: DEATH + timestamp: col(date_of_death) + timestamp_format: "%Y-%m-%d" + +diagnosis: + diagnosis: + code: + - DIAGNOSIS + - col(diagnosis_name) + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + +encounters_modified: + admit_date: + code: ADMIT_DATE + timestamp: col(admit_date) + timestamp_format: "%Y-%m-%d" + discharge_date: + code: DISCHARGE_DATE + timestamp: col(discharge_date) + timestamp_format: "%Y-%m-%d" + +physical: + physical: + code: + - PHYSICAL + - col(physical_name) + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + numerical_value: col(result) + +ecg: + ecg: + code: ECG + timestamp: col(date) + timestamp_format: "%Y-%m-%d %H:%M:%S" + +echo: + echo_type: + code: + - ECHO_TYPE + - col(echo_type) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + lv_wall_thickness: + code: + - LV_WALL_THICKNESS + - col(lv_wall_thickness) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + lv_hypertrophy: + code: + - LV_HYPERTROPHY + - col(lv_hypertrophy) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + lv_ef: + code: + - LV_EF + - col(lv_ef) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + lv_ef_method: + code: + - LV_EF_METHOD + - col(lv_ef_method) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + rv_size: + code: + - RV_SIZE + - col(rv_size) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + tv_regurg: + code: + - TV_REGURG + - col(tv_regurg) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + tv_regurg_severity: + code: + - TV_REGURG_SEVERITY + - col(tv_regurg_severity) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + av_abnormal: + code: + - AV_ABNORMAL + - col(av_abnormal) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + av_stenosis: + code: + - AV_STENOSIS + - col(av_stenosis) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + av_calcified: + code: + - AV_CALCIFIED + - col(av_calcified) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + mv_regurg: + code: + - MV_REGURG + - col(mv_regurg) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + mv_regurg_severity: + code: + - MV_REGURG_SEVERITY + - col(mv_regurg_severity) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + pv_stenosis_severity: + code: + - PV_STENOSIS_SEVERITY + - col(pv_stenosis_severity) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + pv_regurg: + code: + - PV_REGURG + - col(pv_regurg) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + pv_regurg_severity: + code: + - PV_REGURG_SEVERITY + - col(pv_regurg_severity) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + echo_quality: + code: + - ECHO_QUALITY + - col(echo_quality) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + la_size: + code: + - LA_SIZE + - 
col(la_size) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + ra_size: + code: + - RA_SIZE + - col(ra_size) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + pericardium_normal: + code: + - PERICARDIUM_NORMAL + - col(pericardium_normal) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + pericardial_effusion: + code: + - PERICARDIAL_EFFUSION + - col(pericardial_effusion) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + pleural_effusion: + code: + - PLEURAL_EFFUSION + - col(pleural_effusion) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + av_morphology: + code: + - AV_MORPHOLOGY + - col(av_morphology) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + indication_category: + code: + - INDICATION_CATEGORY + - col(indication_category) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + indication_subcategory: + code: + - INDICATION_SUBCATEGORY + - col(indication_subcategory) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + echo_comprehensive: + code: + - ECHO_COMPREHENSIVE + - col(echo_comprehensive) + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + lv_ef_value: + code: LV_EF_VALUE + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(lv_ef_value) + tv_rv_systolic_pressure: + code: TV_RV_SYSTOLIC_PRESSURE + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(tv_rv_systolic_pressure) + tv_ra_estimated_pressure: + code: + - TV_RA_ESTIMATED_PRESSURE + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(tv_ra_estimated_pressure) + body_surface_area: + code: BODY_SURFACE_AREA + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(body_surface_area) + height_cm: + code: HEIGHT_CM + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(height_cm) + weight_kg: + code: WEIGHT_KG + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(weight_kg) + heart_rate: + code: HEART_RATE + timestamp: col(date) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + numerical_value: col(heart_rate) + +pressure: + mean_ra_pressure: + code: MEAN_RA_PRESSURE + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + numerical_value: col(mean_ra_pressure) + mean_pa_pressure: + code: MEAN_PA_PRESSURE + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + numerical_value: col(mean_pa_pressure) + mean_wedge_pressure: + code: MEAN_WEDGE_PRESSURE + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + numerical_value: col(mean_wedge_pressure) + rvedp: + code: RVEDP + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + numerical_value: col(rvedp) + +lab: + lab: + code: + - LAB + - col(group) + timestamp: col(date) + timestamp_format: "%Y-%m-%d %H:%M:%S" + numerical_value: col(result) + +medication_after_1960: + medication: + code: + - MEDICATION + - col(medication_name) + timestamp: col(date) + timestamp_format: "%m/%d/%Y" + numerical_value: quantity + +procedure: + procedure: + code: + - PROCEDURE + - col(procedure_name) + timestamp: col(date) + timestamp_format: "%Y-%m-%d" + numerical_value: quantity From 
266d785b5cb6791e0de79ba8ba23560d48c1c0f6 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 14:52:50 +0000 Subject: [PATCH 078/106] updated scripts for launching hf cohort meds formatting and e2e tabularization --- hf_cohort/hf_cohort_e2e.sh | 5 +++-- hf_cohort/hf_cohort_shard.sh | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/hf_cohort/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh index c9bb74d..e4c9726 100644 --- a/hf_cohort/hf_cohort_e2e.sh +++ b/hf_cohort/hf_cohort_e2e.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash +# bash hf_cohort/hf_cohort_e2e.sh hf_cohort 80 METHOD=meds MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -N_PARALLEL_WORKERS="$1" +ID=$1 +N_PARALLEL_WORKERS="$2" WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" AGGS="aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" # WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" @@ -24,7 +26,6 @@ POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" -ID=$RANDOM LOG_DIR="logs/$METHOD/$ID-logs" mkdir -p $LOG_DIR { time \ diff --git a/hf_cohort/hf_cohort_shard.sh b/hf_cohort/hf_cohort_shard.sh index 351ef3f..6b81051 100644 --- a/hf_cohort/hf_cohort_shard.sh +++ b/hf_cohort/hf_cohort_shard.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash OUTPUT_DIR=/data/storage/shared/meds_tabular_ml/ebcl_dataset/processed -PATIENTS_PER_SHARD="2500" +PATIENTS_PER_SHARD="5_000" CHUNKSIZE="200_000_000" rm -rf $OUTPUT_DIR @@ -9,7 +9,7 @@ echo "Running shard_events.py" POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/shard_events.py \ raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True @@ -18,7 +18,7 @@ echo "Running split_and_shard_patients.py" POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/split_and_shard_patients.py \ raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True @@ -27,7 +27,7 @@ echo "Running convert_to_sharded_events.py" POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/convert_to_sharded_events.py \ raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ split_fracs.held_out=0.16666666666666666 
row_chunksize=$CHUNKSIZE \ n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True @@ -36,7 +36,7 @@ echo "Running merge_to_MEDS_cohort.py" POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/merge_to_MEDS_cohort.py \ raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/data/storage/shared/meds_tabular_ml/ebcl_dataset/cohort.yaml \ + event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True From 0606a78f3deef8482fe9eae1b0ce5cf3fe006aca Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 15:44:19 +0000 Subject: [PATCH 079/106] updated tests and added task specific caching script --- hf_cohort/hf_cohort_e2e.sh | 10 +++++- scripts/task_specific_caching.py | 48 +++++++++++++++++++++------- src/MEDS_tabular_automl/file_name.py | 36 +++++++++++++++++++-- tests/test_tabularize.py | 6 ++-- 4 files changed, 83 insertions(+), 17 deletions(-) diff --git a/hf_cohort/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh index e4c9726..3d0963e 100644 --- a/hf_cohort/hf_cohort_e2e.sh +++ b/hf_cohort/hf_cohort_e2e.sh @@ -26,11 +26,12 @@ POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" +POLARS_MAX_THREADS=1 LOG_DIR="logs/$METHOD/$ID-logs" mkdir -p $LOG_DIR { time \ mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ - POLARS_MAX_THREADS=1 python scripts/summarize_over_windows.py \ + python scripts/summarize_over_windows.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ @@ -51,3 +52,10 @@ if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then fi mprof plot -o $LOG_DIR/mprofile.png $LOG_DIR/mprofile.dat mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt + + +echo "Running task_specific_caching.py: tabularizing static data" +POLARS_MAX_THREADS=32 python scripts/task_specific_caching.py \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularized_data_dir=$OUTPUT_DIR \ + min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" diff --git a/scripts/task_specific_caching.py b/scripts/task_specific_caching.py index 2abc35f..a34683c 100644 --- a/scripts/task_specific_caching.py +++ b/scripts/task_specific_caching.py @@ -6,20 +6,41 @@ import hydra import numpy as np import polars as pl +import scipy.sparse as sp from omegaconf import DictConfig from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import ( + CODE_AGGREGATIONS, + STATIC_CODE_AGGREGATION, + STATIC_VALUE_AGGREGATION, + VALUE_AGGREGATIONS, hydra_loguru_init, + load_matrix, load_tqdm, write_df, ) +VALID_AGGREGATIONS = [ + *VALUE_AGGREGATIONS, + *CODE_AGGREGATIONS, + STATIC_CODE_AGGREGATION, + STATIC_VALUE_AGGREGATION, +] + def generate_row_cached_matrix(matrix, label_df, feature_columns): """Generates row-cached matrix for a given matrix and label_df.""" - return None # TODO + label_len = label_df.select(pl.len()).collect().item() + if not matrix.shape[0] == label_len: + raise ValueError( + f"Matrix and label_df must have the same number of rows: {matrix.shape[0]} != {label_len}" + ) + csr = sp.csr_array(matrix) + valid_ids = 
label_df.select(pl.col("event_id")).collect().to_series().to_numpy() + csr = csr[valid_ids, :] + return sp.coo_array(csr) @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") @@ -32,7 +53,6 @@ def task_specific_cache( hydra_loguru_init() f_name_resolver = FileNameResolver(cfg) # Produce ts representation - meds_shard_fps = f_name_resolver.list_meds_files() feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) # shuffle tasks @@ -40,17 +60,20 @@ def task_specific_cache( np.random.shuffle(tabularization_tasks) # iterate through them - for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): - agg, window_size = 0, 0 # TODO: fix - shard_num = shard_fp.stem - split = shard_fp.parent.stem - out_fp = f_name_resolver.get_task_specific_output( - split, shard_num, window_size, agg - ) # TODO make this function + for data_fp in iter_wrapper(tabularization_tasks): + # parse as time series agg + try: + split, shard_num, agg = f_name_resolver.parse_static_file_path(data_fp) + window_size = None + except ValueError: + split, shard_num, window_size, agg = f_name_resolver.parse_ts_file_path(data_fp) + label_fp = f_name_resolver.get_label(split, shard_num) + out_fp = f_name_resolver.get_task_specific_path(split, shard_num, window_size, agg) + assert label_fp.exists(), f"Output file {label_fp} does not exist." def read_fn(fps): matrix_fp, label_fp = fps - return load_matrix(fp), pl.scan_parquet(label_fp) + return load_matrix(matrix_fp), pl.scan_parquet(label_fp) def compute_fn(shard_dfs): matrix, label_df = shard_dfs @@ -60,9 +83,10 @@ def compute_fn(shard_dfs): def write_fn(cache_matrix, out_fp): write_df(cache_matrix, out_fp, do_overwrite=cfg.do_overwrite) + in_fps = [data_fp, label_fp] rwlock_wrap( - shard_fp, - ts_fp, + in_fps, + out_fp, read_fn, write_fn, compute_fn, diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 96c0c76..fa2461b 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -3,6 +3,13 @@ from omegaconf import DictConfig +from MEDS_tabular_automl.utils import ( + CODE_AGGREGATIONS, + STATIC_CODE_AGGREGATION, + STATIC_VALUE_AGGREGATION, + VALUE_AGGREGATIONS, +) + class FileNameResolver: def __init__(self, cfg: DictConfig): @@ -106,8 +113,33 @@ def get_model_files(self, window_sizes, aggs, split, shard_num: int): if agg.startswith("static"): continue else: - model_files.append(self.get_flat_ts_rep(split, shard_num, window_size, agg)) + model_files.append(self.get_task_specific_path(split, shard_num, window_size, agg)) for agg in aggs: if agg.startswith("static"): - model_files.append(self.get_flat_static_rep(split, shard_num, agg)) + window_size = None + model_files.append(self.get_task_specific_path(split, shard_num, window_size, agg)) return sorted(model_files) + + def parse_ts_file_path(self, data_fp): + agg = f"{data_fp.parent.stem}/{data_fp.stem}" + if not agg in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: + raise ValueError(f"Invalid aggregation: {agg}") + window_size = data_fp.parts[-3] + shard_num = data_fp.parts[-4] + split = data_fp.parts[-5] + return split, shard_num, window_size, agg + + def parse_static_file_path(self, data_fp): + # parse as static agg + agg = f"{data_fp.parent.parent.parent.stem}/{data_fp.stem}" + if not agg in [STATIC_VALUE_AGGREGATION, STATIC_CODE_AGGREGATION]: + raise ValueError(f"Invalid aggregation: {agg}") + shard_num = data_fp.parent.stem + split = data_fp.parts[-3] + return split, shard_num, agg + + def 
get_task_specific_path(self, split, shard_num, window_size, agg): + if window_size: + return self.get_label_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + else: + return self.get_label_dir() / split / f"{shard_num}" / f"{agg}.npz" diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index c45b9ef..8e9484b 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -21,6 +21,7 @@ from scripts.identify_columns import store_columns from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data +from scripts.task_specific_caching import task_specific_cache from scripts.xgboost import xgboost SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -309,9 +310,10 @@ def test_tabularize(): out_f = f_name_resolver.get_label(split, shard_num) out_f.parent.mkdir(parents=True, exist_ok=True) df.write_parquet(out_f) - model_dir = Path(d) / "save_model" + + task_specific_cache(cfg) + xgboost_config_kwargs = { - "model_dir": str(model_dir.resolve()), "hydra.mode": "MULTIRUN", } xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} From 9aa2d73a025dd0946c4ac7952ccca4e86174cb65 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 15:56:45 +0000 Subject: [PATCH 080/106] fixed misspelled variable in yaml --- configs/tabularize.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index d5c3338..d41068d 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -55,7 +55,7 @@ defaults: hydra: verbose: False sweep: - dir: ${tmodel_dir}/.logs/ + dir: ${model_dir}/.logs/ run: dir: ${model_dir}/.logs/ From 51c86c67d142df5d01c3dd4066032dbac224bdcc Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 3 Jun 2024 16:41:53 +0000 Subject: [PATCH 081/106] all working --- configs/tabularize.yaml | 18 ++++--- hf_cohort/xgboost.sh | 4 +- scripts/xgboost.py | 112 +++++++++++++++++++++++++++------------- 3 files changed, 90 insertions(+), 44 deletions(-) diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index d5c3338..43b86cb 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -2,7 +2,7 @@ MEDS_cohort_dir: ??? 
tabularized_data_dir: ${MEDS_cohort_dir}/tabularize task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} +model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} cache_dir: ${tabularized_data_dir}/.cache # Pre-processing @@ -15,8 +15,8 @@ window_sizes: - "full" codes: null aggs: - - "static/present" - - "static/first" + # - "static/present" + # - "static/first" - "code/count" - "value/count" - "value/sum" @@ -37,11 +37,13 @@ seed: 1 tqdm: True test: False +num_boost_round: 1000 +early_stopping_rounds: 5 model: booster: gbtree device: cpu tree_method: hist - objective: reg:squarederror + objective: binary:logistic iterator: keep_data_in_memory: False @@ -55,7 +57,7 @@ defaults: hydra: verbose: False sweep: - dir: ${tmodel_dir}/.logs/ + dir: ${model_dir}/.logs/ run: dir: ${model_dir}/.logs/ @@ -70,5 +72,7 @@ hydra: # Define search space for Optuna params: - window_sizes: choice([30d, 365d, full], [30d, full], [30d]) - # iterator.keep_data_in_memory: choice([True], [False]) + window_sizes: + _target_: hydra.utils.call(${hydra.utils.cross_product}, + values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) + iterator.keep_data_in_memory: choice([True], [False]) diff --git a/hf_cohort/xgboost.sh b/hf_cohort/xgboost.sh index f6dd525..d45793a 100644 --- a/hf_cohort/xgboost.sh +++ b/hf_cohort/xgboost.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -BASE_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed -TAB_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +BASE_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed_bad_code_cohort +TAB_DIR=$BASE_DIR/tabularize KEEP_IN_MEMORY=True python -m scripts.xgboost MEDS_cohort_dir=$BASE_DIR tabularized_data_dir=$TAB_DIR iterator.keep_data_in_memory=$KEEP_IN_MEMORY diff --git a/scripts/xgboost.py b/scripts/xgboost.py index 3db3429..abeeede 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -18,12 +18,38 @@ class Iterator(xgb.DataIter, TimeableMixin): + """Iterator class for loading and processing data shards. + + This class provides functionality for iterating through data shards, loading + feature data and labels, and processing them based on the provided configuration. + + Args: + cfg: A configuration dictionary containing parameters for + data processing, feature selection, and other settings. + split: The data split to use, which can be one of "train", "tuning", + or "held_out". This determines which subset of the data is loaded and processed. + + Attributes: + cfg: Configuration dictionary containing parameters for + data processing, feature selection, and other settings. + file_name_resolver: Object for resolving file names and paths based on the configuration. + split: The data split being used for loading and processing data shards. + _data_shards: List of data shard names. + valid_event_ids: Dictionary mapping shard number to a list of valid event IDs. + labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs. + codes_set: Set of codes to include in the data. + code_masks: Dictionary of code masks for filtering features based on aggregation. + num_features: Total number of features in the data. + """ + def __init__(self, cfg: DictConfig, split: str = "train"): - """Initialize the Iterator with the provided configuration and split. + """Initializes the Iterator with the provided configuration and data split. Args: - - cfg (DictConfig): Configuration dictionary. 
- - split (str): The data split to use ("train", "tuning", or "held_out"). + cfg: A configuration dictionary containing parameters for + data processing, feature selection, and other settings. + split: The data split to use, which can be one of "train", "tuning", + or "held_out". This determines which subset of the data is loaded and processed. """ self.cfg = cfg self.file_name_resolver = FileNameResolver(cfg) @@ -32,18 +58,24 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self._data_shards = sorted([shard.stem for shard in self.file_name_resolver.list_label_files(split)]) self.valid_event_ids, self.labels = self.load_labels() self.codes_set, self.code_masks, self.num_features = self._get_code_set() - feature_columns = json.load(open(self.file_name_resolver.get_feature_columns_fp())) - - self.agg_to_feature_ids = {agg: get_feature_indices(agg, feature_columns) for agg in cfg.aggs} - self._it = 0 - # XGBoost will generate some cache files under current directory with the prefix - # "cache" super().__init__(cache_prefix=os.path.join(self.file_name_resolver.get_cache_dir())) @TimeableMixin.TimeAs - def _get_code_masks(self, feature_columns, codes_set): + def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]: + """Create boolean masks for filtering features. + + Creates a dictionary of boolean masks for each aggregation type. The masks are used to filter + the feature columns based on the specified included codes and minimum code inclusion frequency. + + Args: + feature_columns: List of feature columns. + codes_set: Set of codes to include. + + Returns: + Dictionary of code masks for each aggregation. + """ code_masks = {} for agg in set(self.cfg.aggs): feature_ids = get_feature_indices(agg, feature_columns) @@ -52,21 +84,21 @@ def _get_code_masks(self, feature_columns, codes_set): return code_masks @TimeableMixin.TimeAs - def _load_matrix(self, path: Path) -> sp.csr_matrix: + def _load_matrix(self, path: Path) -> sp.csc_matrix: """Load a sparse matrix from disk. Args: - path (Path): Path to the sparse matrix. Returns: - - sp.csr_matrix: Sparse matrix. + - sp.csc_matrix: Sparse matrix. """ npzfile = np.load(path) array, shape = npzfile["array"], npzfile["shape"] if array.shape[0] != 3: raise ValueError(f"Expected array to have 3 rows, but got {array.shape[0]} rows") data, row, col = array - return sp.csr_matrix((data, (row, col)), shape=shape) + return sp.csc_matrix((data, (row, col)), shape=shape) @TimeableMixin.TimeAs def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: @@ -96,7 +128,7 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: return cached_event_ids, cached_labels @TimeableMixin.TimeAs - def _get_code_set(self) -> set: + def _get_code_set(self) -> tuple[set, Mapping[int, list], int]: """Get the set of codes to include in the data based on the configuration.""" with open(self.file_name_resolver.get_feature_columns_fp()) as f: feature_columns = json.load(f) @@ -122,18 +154,22 @@ def _get_code_set(self) -> set: codes_set = None # set(feature_columns) if codes_set == set(feature_columns): codes_set = None - # TODO: make sure we aren't filtering out static columns!!! 
- return codes_set, self._get_code_masks(feature_columns, codes_set), len(feature_columns) + + return ( + codes_set, + self._get_code_masks(feature_columns, codes_set), + len(feature_columns), + ) @TimeableMixin.TimeAs - def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csr_matrix: + def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Load a sparse shard into memory. Args: - path (Path): Path to the sparse shard. Returns: - - sp.csr_matrix: Data frame with the sparse shard. + - sp.csc_matrix: Data frame with the sparse shard. """ # column_shard is of form event_idx, feature_idx, value matrix = self._load_matrix(path) @@ -145,7 +181,7 @@ def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csr_matrix: return self._filter_shard_on_codes_and_freqs(agg, matrix) @TimeableMixin.TimeAs - def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: + def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: """Load a specific shard of dynamic data from disk and return it as a sparse matrix after filtering column inclusion. @@ -153,41 +189,43 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csr_matrix: - idx (int): Index of the shard to load. Returns: - - sp.csr_matrix: Filtered sparse matrix. + - sp.csc_matrix: Filtered sparse matrix. """ # TODO Nassim Fix this guy # get all window_size x aggreagation files using the file resolver files = self.file_name_resolver.get_model_files( self.cfg.window_sizes, self.cfg.aggs, self.split, self._data_shards[idx] ) + if not all(file.exists() for file in files): - raise ValueError("Not all files exist") + raise ValueError(f"Not all files exist for shard {self._data_shards[idx]}") shard_name = self._data_shards[idx] - dynamic_csrs = [self._load_dynamic_shard_from_file(file, idx) for file in files] + dynamic_cscs = [self._load_dynamic_shard_from_file(file, idx) for file in files] fn_name = "_get_dynamic_shard_by_index" hstack_key = f"{fn_name}/hstack" self._register_start(key=hstack_key) - combined_csr = sp.hstack(dynamic_csrs, format="csr") # TODO: check this + + combined_csc = sp.hstack(dynamic_cscs, format="csc") # TODO: check this self._register_end(key=hstack_key) # Filter Rows valid_indices = self.valid_event_ids[shard_name] filter_key = f"{fn_name}/filter" self._register_start(key=filter_key) - out = combined_csr[valid_indices, :] + out = combined_csc[valid_indices, :] self._register_end(key=filter_key) return out @TimeableMixin.TimeAs - def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: + def _get_shard_by_index(self, idx: int) -> tuple[sp.csc_matrix, np.ndarray]: """Load a specific shard of data from disk and concatenate with static data. Args: - idx (int): Index of the shard to load. Returns: - - X (scipy.sparse.csr_matrix): Feature data frame.ß + - X (scipy.sparse.csc_matrix): Feature data frame.ß - y (numpy.ndarray): Labels. """ dynamic_df = self._get_dynamic_shard_by_index(idx) @@ -195,15 +233,15 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csr_matrix, np.ndarray]: return dynamic_df, label_df @TimeableMixin.TimeAs - def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csr_matrix) -> sp.csr_matrix: + def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csc_matrix: """Filter the dynamic data frame based on the inclusion sets. Given the codes_mask, filter the data frame to only include columns that are True in the mask. Args: - - df (scipy.sparse.csr_matrix): Data frame to filter. 
+ - df (scipy.sparse.csc_matrix): Data frame to filter. Returns: - - df (scipy.sparse.sp.csr_matrix): Filtered data frame. + - df (scipy.sparse.sp.csc_matrix): Filtered data frame. """ if self.codes_set is None: return df @@ -246,12 +284,18 @@ def reset(self): self._it = 0 @TimeableMixin.TimeAs - def collect_in_memory(self) -> tuple[sp.csr_matrix, np.ndarray]: - """Collect the data in memory. + def collect_in_memory(self) -> tuple[sp.csc_matrix, np.ndarray]: + """Collects data from all shards into memory and returns it. + + This method iterates through all data shards, retrieves the feature data + and labels from each shard, and then concatenates them into a single + sparse matrix and a single array, respectively. Returns: - - tuple[np.ndarray, np.ndarray]: Tuple of feature data and labels. + A tuple where the first element is a sparse matrix containing the + feature data, and the second element is a numpy array containing the labels. """ + # TODO: Make this more efficient especially if it is in csc format already X = [] y = [] for i in range(len(self._data_shards)): @@ -359,8 +403,6 @@ def evaluate(self) -> float: Returns: - float: Evaluation metric (mae). """ - # TODO: Figure out exactly what we want to do here - y_pred = self.model.predict(self.dheld_out) y_true = self.dheld_out.get_label() return roc_auc_score(y_true, y_pred) @@ -394,7 +436,7 @@ def xgboost(cfg: DictConfig) -> float: model.model.save_model(save_dir / "model.json") auc = model.evaluate() - logger.info(f"ROC AUC: {auc}") + logger.info(f"AUC: {auc}") return auc From 95f5694cba920fedf6473d1525936d6c4cc8777a Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 3 Jun 2024 19:22:27 +0000 Subject: [PATCH 082/106] updates for task cached shard --- scripts/xgboost.py | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/scripts/xgboost.py b/scripts/xgboost.py index abeeede..c5b5f15 100644 --- a/scripts/xgboost.py +++ b/scripts/xgboost.py @@ -200,7 +200,6 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: if not all(file.exists() for file in files): raise ValueError(f"Not all files exist for shard {self._data_shards[idx]}") - shard_name = self._data_shards[idx] dynamic_cscs = [self._load_dynamic_shard_from_file(file, idx) for file in files] fn_name = "_get_dynamic_shard_by_index" @@ -208,14 +207,14 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: self._register_start(key=hstack_key) combined_csc = sp.hstack(dynamic_cscs, format="csc") # TODO: check this - self._register_end(key=hstack_key) - # Filter Rows - valid_indices = self.valid_event_ids[shard_name] - filter_key = f"{fn_name}/filter" - self._register_start(key=filter_key) - out = combined_csc[valid_indices, :] - self._register_end(key=filter_key) - return out + # self._register_end(key=hstack_key) + # # Filter Rows + # valid_indices = self.valid_event_ids[shard_name] + # filter_key = f"{fn_name}/filter" + # self._register_start(key=filter_key) + # out = combined_csc[valid_indices, :] + # self._register_end(key=filter_key) + return combined_csc @TimeableMixin.TimeAs def _get_shard_by_index(self, idx: int) -> tuple[sp.csc_matrix, np.ndarray]: @@ -273,7 +272,7 @@ def next(self, input_data: Callable): # input_data is a function passed in by XGBoost who has the exact same signature of # ``DMatrix`` X, y = self._get_shard_by_index(self._it) # self._data_shards[self._it]) - input_data(data=X, label=y) + input_data(data=sp.csr_matrix(X), label=y) self._it += 1 # Return 1 to let XGBoost 
know we haven't seen all the files yet. return 1 @@ -295,7 +294,7 @@ def collect_in_memory(self) -> tuple[sp.csc_matrix, np.ndarray]: A tuple where the first element is a sparse matrix containing the feature data, and the second element is a numpy array containing the labels. """ - # TODO: Make this more efficient especially if it is in csc format already + X = [] y = [] for i in range(len(self._data_shards)): @@ -329,22 +328,6 @@ def __init__(self, cfg: DictConfig): self.model = None - # @TimeableMixin.TimeAs - # def _get_callbacks(self): - # """Get the callbacks for training.""" - # callbacks = [] - # if self.cfg.model.early_stopping_rounds is not None: - # es = xgb.callback.EarlyStopping( - # rounds=self.cfg.model.early_stopping_rounds, - # min_delta=1e-3, - # save_best=True, - # maximize=True, - # data_name="tuning", - # metric_name="auc", - # ) - # callbacks.append(es) - # return callbacks - @TimeableMixin.TimeAs def _train(self): """Train the model.""" @@ -353,6 +336,7 @@ def _train(self): self.dtrain, num_boost_round=self.cfg.num_boost_round, early_stopping_rounds=self.cfg.early_stopping_rounds, + # nthreads=self.cfg.nthreads, evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], ) From 15f045d4e4d1c2355495ef274ecefbfd2c10c914 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 20:11:33 +0000 Subject: [PATCH 083/106] added cli base code --- cli/describe_codes.sh | 4 +++ cli/profile_tabularization.sh | 26 +++++++++++++++ cli/tabularization.sh | 14 ++++++++ cli/task_specific_caching.sh | 4 +++ cli/xgboost.sh | 4 +++ pyproject.toml | 12 +++++-- src/MEDS_tabular_automl/__main__.py | 51 +++++++++++++++++++++++++++++ 7 files changed, 112 insertions(+), 3 deletions(-) create mode 100755 cli/describe_codes.sh create mode 100755 cli/profile_tabularization.sh create mode 100755 cli/tabularization.sh create mode 100755 cli/task_specific_caching.sh create mode 100755 cli/xgboost.sh create mode 100644 src/MEDS_tabular_automl/__main__.py diff --git a/cli/describe_codes.sh b/cli/describe_codes.sh new file mode 100755 index 0000000..79187f5 --- /dev/null +++ b/cli/describe_codes.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +echo "Describing Codes: Caching feature names and frequencies." +python scripts/identify_columns.py "$@" diff --git a/cli/profile_tabularization.sh b/cli/profile_tabularization.sh new file mode 100755 index 0000000..e4c647a --- /dev/null +++ b/cli/profile_tabularization.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +PROFILE_LOG_DIR="$1" + +shift 1 + +SCRIPT_DIR=$(dirname "$0") +SCRIPT_NAME=$(basename "$0") + +mkdir -p "$PROFILE_LOG_DIR" +{ time \ + mprof run --include-children --exit-code --output "${PROFILE_LOG_DIR}/mprofile.dat" \ + bash "${SCRIPT_DIR}/tabularization.sh" "@" \ + 2> "${PROFILE_LOG_DIR}/cmd.stderr" +} 2> "${PROFILE_LOG_DIR}/timings.txt" + +cmd_exit_status=${PIPESTATUS[0]} +# Check the exit status of the second command in the pipeline (mprof run ...) +if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then + echo "${SCRIPT_NAME} failed with status $cmd_exit_status." 
+ echo "Stderr from ${SCRIPT_NAME} (see ${PROFILE_LOG_DIR}/cmd.stderr):" + tail "${PROFILE_LOG_DIR}/cmd.stderr" + exit "$cmd_exit_status" +fi +mprof plot -o "${PROFILE_LOG_DIR}/mprofile.png" "${PROFILE_LOG_DIR}/mprofile.dat" +mprof peak "${PROFILE_LOG_DIR}/mprofile.dat" > "${PROFILE_LOG_DIR}/peak_memory_usage.txt" diff --git a/cli/tabularization.sh b/cli/tabularization.sh new file mode 100755 index 0000000..587bdb8 --- /dev/null +++ b/cli/tabularization.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +N_PARALLEL_WORKERS="$1" +shift 1 + +echo "Tabularizing Static Data" +python scripts/tabularize_static.py "$@" + + +echo "Tabularizing Time-Series Data" +python scripts/summarize_over_windows.py "@" \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ diff --git a/cli/task_specific_caching.sh b/cli/task_specific_caching.sh new file mode 100755 index 0000000..92d8273 --- /dev/null +++ b/cli/task_specific_caching.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +echo "Caching Training Data for Task" +python scripts/task_specific_caching.py "$@" diff --git a/cli/xgboost.sh b/cli/xgboost.sh new file mode 100755 index 0000000..cf6b91c --- /dev/null +++ b/cli/xgboost.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +echo "Running Task Specific Caching" +python scripts/task_specific_caching.py "$@" diff --git a/pyproject.toml b/pyproject.toml index 157c6d2..1e5b961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,10 @@ name = "MEDS_tabularization" version = "0.0.1" authors = [ { name="Matthew McDermott", email="mattmcdermott8@gmail.com" }, + { name="Nassim Oufattole", email="noufattole@gmail.com" }, + { name="Teya Bergamaschi", email="teyabergamaschi@gmail.com" }, ] -description = "TODO" +description = "Scalable Tabularization of MEDS format Time-Series data" readme = "README.md" requires-python = ">=3.12" classifiers = [ @@ -18,11 +20,15 @@ classifiers = [ ] dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "numba", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper"] +[project.scripts] +meds_tab = "MEDS_tabular_automl.__main__:main" + [project.optional-dependencies] dev = ["pre-commit"] tests = ["pytest", "pytest-cov", "rootutils"] local_parallelism = ["hydra-joblib-launcher"] +profiling = ["mprofile", "matplotlib"] [project.urls] -Homepage = "https://github.com/mmcdermott/MEDS_polars_functions" -Issues = "https://github.com/mmcdermott/MEDS_polars_functions/issues" +Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML" +Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues" diff --git a/src/MEDS_tabular_automl/__main__.py b/src/MEDS_tabular_automl/__main__.py new file mode 100644 index 0000000..742073a --- /dev/null +++ b/src/MEDS_tabular_automl/__main__.py @@ -0,0 +1,51 @@ +"""Main script for end-to-end task querying.""" + +import enum +import subprocess +import sys +from importlib.resources import files + +CLI_SCRIPTS_DIR = files("MEDS_tabular_automl").parent.parent / "cli" + + +class Program(enum.Enum): + DESCRIBE_CODES = "describe_codes.sh" + TABULARIZATION = "tabularization.sh" + TASK_SPECIFIC_CACHING = "task_specific_caching.sh" + XGBOOST = "xgboost.sh" + PROFILE_TABULARIZATION = "profile_tabularization.sh" + + @staticmethod + def from_str(program_arg): + match program_arg: + case "describe_codes": + return Program.DESCRIBE_CODES + case "tabularization": + return Program.TABULARIZATION + case "task_specific_caching": + return Program.TASK_SPECIFIC_CACHING + case "xgboost": + return Program.XGBOOST + 
case "profile_tabularization": + return Program.PROFILE_TABULARIZATION + case _: + raise ValueError( + f"Invalid program name {program_arg}, valid programs are {[p.name for p in Program]}" + ) + + @staticmethod + def get_script(program): + return CLI_SCRIPTS_DIR / program.value + + +def main(): + program = sys.argv[1] + args = sys.argv[2:] + program = Program.from_str(program) + script_path = Program.get_script(program) + command_parts = [str(script_path.resolve()), *args] + subprocess.run(" ".join(command_parts), shell=True) + + +if __name__ == "__main__": + main() From a7b1c14595d264b51642152db2b3635b61044a69 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 21:27:03 +0000 Subject: [PATCH 084/106] added cli tests and started adding config hierarchy --- cli/profile_tabularization.sh | 2 +- cli/tabularization.sh | 3 +- cli/xgboost.sh | 4 +- configs/tabularize.yaml | 50 +---- configs/xgboost.yaml | 79 +++++++ scripts/{xgboost.py => launch_xgboost.py} | 4 +- src/MEDS_tabular_automl/file_name.py | 4 +- .../generate_summarized_reps.py | 89 +------- tests/cli_test.py | 199 ++++++++++++++++++ tests/test_tabularize.py | 4 +- 10 files changed, 301 insertions(+), 137 deletions(-) create mode 100644 configs/xgboost.yaml rename scripts/{xgboost.py => launch_xgboost.py} (99%) create mode 100644 tests/cli_test.py diff --git a/cli/profile_tabularization.sh b/cli/profile_tabularization.sh index e4c647a..4b34366 100755 --- a/cli/profile_tabularization.sh +++ b/cli/profile_tabularization.sh @@ -10,7 +10,7 @@ SCRIPT_NAME=$(basename "$0") mkdir -p "$PROFILE_LOG_DIR" { time \ mprof run --include-children --exit-code --output "${PROFILE_LOG_DIR}/mprofile.dat" \ - bash "${SCRIPT_DIR}/tabularization.sh" "@" \ + bash "${SCRIPT_DIR}/tabularization.sh" "$@" \ 2> "${PROFILE_LOG_DIR}/cmd.stderr" } 2> "${PROFILE_LOG_DIR}/timings.txt" diff --git a/cli/tabularization.sh b/cli/tabularization.sh index 587bdb8..ceb147f 100755 --- a/cli/tabularization.sh +++ b/cli/tabularization.sh @@ -8,7 +8,8 @@ python scripts/tabularize_static.py "$@" echo "Tabularizing Time-Series Data" -python scripts/summarize_over_windows.py "@" \ +python scripts/summarize_over_windows.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ + "$@" diff --git a/cli/xgboost.sh b/cli/xgboost.sh index cf6b91c..9d9b286 100755 --- a/cli/xgboost.sh +++ b/cli/xgboost.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash -echo "Running Task Specific Caching" -python scripts/task_specific_caching.py "$@" +echo "Running XGBoost" +python scripts/launch_xgboost.py "$@" diff --git a/configs/tabularize.yaml b/configs/tabularize.yaml index 43b86cb..047a52c 100644 --- a/configs/tabularize.yaml +++ b/configs/tabularize.yaml @@ -2,7 +2,7 @@ MEDS_cohort_dir: ??? 
tabularized_data_dir: ${MEDS_cohort_dir}/tabularize task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} cache_dir: ${tabularized_data_dir}/.cache # Pre-processing @@ -15,8 +15,8 @@ window_sizes: - "full" codes: null aggs: - # - "static/present" - # - "static/first" + - "static/present" + - "static/first" - "code/count" - "value/count" - "value/sum" @@ -35,44 +35,14 @@ do_overwrite: False do_update: True seed: 1 tqdm: True +worker: 1 test: False -num_boost_round: 1000 -early_stopping_rounds: 5 -model: - booster: gbtree - device: cpu - tree_method: hist - objective: binary:logistic - -iterator: - keep_data_in_memory: False - binarize_task: True - -# Hydra settings for sweep -defaults: - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - +# Hydra hydra: - verbose: False - sweep: - dir: ${model_dir}/.logs/ + job: + name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} run: - dir: ${model_dir}/.logs/ - - # Optuna Sweeper - sweeper: - sampler: - seed: 1 - storage: null - study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} - direction: minimize - n_trials: 10 - - # Define search space for Optuna - params: - window_sizes: - _target_: hydra.utils.call(${hydra.utils.cross_product}, - values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) - iterator.keep_data_in_memory: choice([True], [False]) + dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} + sweep: + dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} diff --git a/configs/xgboost.yaml b/configs/xgboost.yaml new file mode 100644 index 0000000..0b4cbba --- /dev/null +++ b/configs/xgboost.yaml @@ -0,0 +1,79 @@ +# Raw data +MEDS_cohort_dir: ??? +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache + +# Pre-processing +min_code_inclusion_frequency: 1 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" +codes: null +aggs: + # - "static/present" + # - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +do_update: True +seed: 1 +tqdm: True +worker: 0 +test: False + +num_boost_round: 1000 +early_stopping_rounds: 5 +model: + booster: gbtree + device: cpu + tree_method: hist + objective: binary:logistic + +iterator: + keep_data_in_memory: False + binarize_task: True + +# Hydra settings for sweep +defaults: + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + +hydra: + verbose: False + sweep: + dir: ${model_dir}/.logs/ + run: + dir: ${model_dir}/.logs/ + + # Optuna Sweeper + sweeper: + sampler: + seed: 1 + storage: null + study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} + direction: minimize + n_trials: 10 + + # Define search space for Optuna + params: + window_sizes: + _target_: hydra.utils.call(${hydra.utils.cross_product}, + values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) + iterator.keep_data_in_memory: choice([True], [False]) diff --git a/scripts/xgboost.py b/scripts/launch_xgboost.py similarity index 99% rename from scripts/xgboost.py rename to scripts/launch_xgboost.py index c5b5f15..d903cfd 100644 --- a/scripts/xgboost.py +++ b/scripts/launch_xgboost.py @@ -393,7 +393,7 @@ def evaluate(self) -> 
float: @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def xgboost(cfg: DictConfig) -> float: +def launch_xgboost(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. Args: @@ -425,4 +425,4 @@ def xgboost(cfg: DictConfig) -> float: if __name__ == "__main__": - xgboost() + launch_xgboost() diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index fa2461b..1492dbf 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -122,7 +122,7 @@ def get_model_files(self, window_sizes, aggs, split, shard_num: int): def parse_ts_file_path(self, data_fp): agg = f"{data_fp.parent.stem}/{data_fp.stem}" - if not agg in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: + if agg not in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: raise ValueError(f"Invalid aggregation: {agg}") window_size = data_fp.parts[-3] shard_num = data_fp.parts[-4] @@ -132,7 +132,7 @@ def parse_ts_file_path(self, data_fp): def parse_static_file_path(self, data_fp): # parse as static agg agg = f"{data_fp.parent.parent.parent.stem}/{data_fp.stem}" - if not agg in [STATIC_VALUE_AGGREGATION, STATIC_CODE_AGGREGATION]: + if agg not in [STATIC_VALUE_AGGREGATION, STATIC_CODE_AGGREGATION]: raise ValueError(f"Invalid aggregation: {agg}") shard_num = data_fp.parent.stem split = data_fp.parts[-3] diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 2d49fdb..da79cd5 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,30 +1,13 @@ -from collections.abc import Callable - -import pandas as pd - -pd.set_option("compute.use_numba", True) import numpy as np +import pandas as pd import polars as pl from loguru import logger -from scipy.sparse import coo_array, csr_array, sparray, vstack +from scipy.sparse import coo_array, csr_array, sparray from MEDS_tabular_automl.generate_ts_features import get_feature_names, get_flat_ts_rep from MEDS_tabular_automl.utils import CODE_AGGREGATIONS, VALUE_AGGREGATIONS, load_tqdm -def time_aggd_col_alias_fntr(window_size: str, agg: str) -> Callable[[str], str]: - if agg is None: - raise ValueError("Aggregation type 'agg' must be provided") - - def f(c: str) -> str: - if c in ["patient_id", "timestamp"]: - return c - else: - return "/".join([window_size] + c.split("/") + [agg]) - - return f - - def sparse_aggregate(sparse_matrix, agg): if agg == "sum": merged_matrix = sparse_matrix.sum(axis=0, dtype=sparse_matrix.dtype) @@ -41,74 +24,6 @@ def sparse_aggregate(sparse_matrix, agg): return merged_matrix -def sum_merge_timestamps(df, sparse_matrix, agg): - """Groups by timestamp and combines rows that are on the same date. - - The combining is done by summing the rows in the sparse matrix that correspond to the same date. - - Args: - df (DataFrame): The DataFrame with 'timestamp' and 'patient_id'. - sparse_matrix (csr_matrix): The corresponding sparse matrix with data. - agg (str): Aggregation method, currently only 'sum' is implemented. - - Returns: - DataFrame, csr_matrix: Tuple containing the DataFrame with aggregated timestamps and the corresponding - sparse matrix. 
- """ - # Assuming 'timestamp' is already sorted; if not, uncomment the next line: - # df = df.sort_values(by='timestamp') - - # Group by timestamp and sum the data - grouped = df.groupby("timestamp") - indices = grouped.indices - - # Create a new sparse matrix with summed rows per unique timestamp - patient_id = df["patient_id"].iloc[0] - timestamps = [] - output_matrix = csr_array((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) - - # Loop through each group and sum - for timestamp, rows in indices.items(): - # Combine the rows in the sparse matrix for the current group (respecting the aggregation being used) - merged_matrix = sparse_aggregate(sparse_matrix[rows], agg) - # Save the non-zero elements - output_matrix = vstack([output_matrix, merged_matrix]) - timestamps.extend([timestamp]) - - # Create output DataFrame - out_df = pd.DataFrame({"patient_id": [patient_id] * len(timestamps), "timestamp": timestamps}) - return out_df, output_matrix - - -def sparse_rolling(df, sparse_matrix, timedelta, agg): - """Iterates through rolling windows while maintaining sparsity. - - Example: - - >>> df = pd.DataFrame({'patient_id': {0: 1, 1: 1, 2: 1}, - ... 'timestamp': {0: pd.Timestamp('2021-01-01 00:00:00'), - ... 1: pd.Timestamp('2021-01-01 00:00:00'), 2: pd.Timestamp('2020-01-01 00:00:00')}, - ... 'A/code': {0: 1, 1: 1, 2: 0}, 'B/code': {0: 0, 1: 0, 2: 1}, 'C/code': {0: 0, 1: 0, 2: 0}}) - >>> for col in ["A/code", "B/code", "C/code"]: df[col] = pd.arrays.SparseArray(df[col]) - >>> sparse_rolling(df, pd.Timedelta("1d"), "sum").dtypes - A/code Sparse[int64, 0] - B/code Sparse[int64, 0] - C/code Sparse[int64, 0] - timestamp datetime64[ns] - dtype: object - """ - patient_id = df.iloc[0].patient_id - df = df.drop(columns="patient_id").reset_index(drop=True).reset_index() - timestamps = [] - out_sparse_matrix = coo_array((0, sparse_matrix.shape[1]), dtype=sparse_matrix.dtype) - for each in df[["index", "timestamp"]].rolling(on="timestamp", window=timedelta): - timestamps.append(each.index.max()) - agg_subset_matrix = sparse_aggregate(sparse_matrix[each["index"]], agg) - out_sparse_matrix = vstack([out_sparse_matrix, agg_subset_matrix]) - out_df = pd.DataFrame({"patient_id": [patient_id] * len(timestamps), "timestamp": timestamps}) - return out_df, out_sparse_matrix - - def get_rolling_window_indicies(index_df, window_size): """Get the indices for the rolling windows.""" if window_size == "full": diff --git a/tests/cli_test.py b/tests/cli_test.py new file mode 100644 index 0000000..b7b9e0d --- /dev/null +++ b/tests/cli_test.py @@ -0,0 +1,199 @@ +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import json +import subprocess +import tempfile +from io import StringIO +from pathlib import Path + +import polars as pl +from loguru import logger +from omegaconf import DictConfig +from test_tabularize import ( + CODE_COLS, + EXPECTED_STATIC_FILES, + MEDS_OUTPUTS, + SPLITS_JSON, + STATIC_FIRST_COLS, + STATIC_PRESENT_COLS, + SUMMARIZE_EXPECTED_FILES, + VALUE_COLS, +) + +from MEDS_tabular_automl.file_name import FileNameResolver +from MEDS_tabular_automl.utils import ( + VALUE_AGGREGATIONS, + get_events_df, + get_feature_names, + load_matrix, +) +from scripts.identify_columns import store_columns +from scripts.tabularize_static import tabularize_static_data + + +def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): + command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] + command_out = 
subprocess.run(" ".join(command_parts), shell=True, capture_output=True) + stderr = command_out.stderr.decode() + stdout = command_out.stdout.decode() + if command_out.returncode != 0: + raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") + return stderr, stdout + + +def test_tabularize(): + # Step 0: Setup Environment + with tempfile.TemporaryDirectory() as d: + MEDS_cohort_dir = Path(d) / "processed" + tabularized_data_dir = Path(d) / "processed" / "tabularize" + # Create the directories + (MEDS_cohort_dir / "final_cohort").mkdir(parents=True, exist_ok=True) + + # Store MEDS outputs + for split, data in MEDS_OUTPUTS.items(): + file_path = MEDS_cohort_dir / "final_cohort" / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True) + df = pl.read_csv(StringIO(data)) + df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")).write_parquet( + file_path + ) + + tabularize_config_kwargs = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "tabularized_data_dir": str(tabularized_data_dir.resolve()), + "min_code_inclusion_frequency": 1, + "model_dir": str(Path(d) / "save_model"), + "window_sizes": "[30d,365d,full]", + "aggs": "[code/count,value/sum,static/present,static/first]", + "codes": "null", + "n_patients_per_sub_shard": 2, + "do_overwrite": True, + "do_update": True, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "test": True, + "task_dir": str((tabularized_data_dir / "task").resolve()), + } + cfg = DictConfig(tabularize_config_kwargs) + f_name_resolver = FileNameResolver(cfg) + meds_files = f_name_resolver.list_meds_files() + assert len(meds_files) == 4, "MEDS Data Files Should be 4!" + for f in meds_files: + assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" + + split_json = json.load(StringIO(SPLITS_JSON)) + splits_fp = MEDS_cohort_dir / "splits.json" + json.dump(split_json, splits_fp.open("w")) + logger.info("caching flat representation of MEDS data") + + # Step 1: Run the describe_codes script + stderr, stdout = run_command( + "meds_tab describe_codes", + [], + tabularize_config_kwargs, + "describe_codes", + ) + + store_columns(cfg) + assert (tabularized_data_dir / "config.yaml").is_file() + assert (tabularized_data_dir / "feature_columns.json").is_file() + assert (tabularized_data_dir / "feature_freqs.json").is_file() + + feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) + assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) + assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) + for value_agg in VALUE_AGGREGATIONS: + assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) + + # Step 2: Run the tabularization script + n_workers = "1" + stderr, stdout = run_command( + "meds_tab tabularization", + [n_workers], + tabularize_config_kwargs, + "tabularization", + ) + # Check Static File Generation + tabularize_static_data(cfg) + actual_files = [str(Path(*f.parts[-5:])) for f in f_name_resolver.list_static_files()] + assert set(actual_files) == set(EXPECTED_STATIC_FILES) + # Check the files are not empty + for f in f_name_resolver.list_static_files(): + static_matrix = load_matrix(f) + assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" 
+ expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + logger.info((static_matrix.shape[1], expected_num_cols)) + logger.info(f_name_resolver.list_static_files()) + assert static_matrix.shape[1] == expected_num_cols, ( + f"Static Data Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {static_matrix.shape[1]}!" + ) + static_first_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/first") + static_present_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/present") + assert ( + load_matrix(static_first_fp).shape[0] == load_matrix(static_present_fp).shape[0] + ), "static data first and present aggregations have different numbers of rows" + + # Check Time Series File Generation + output_files = f_name_resolver.list_ts_files() + f_name_resolver.list_ts_files() + actual_files = [str(Path(*f.parts[-5:])) for f in output_files] + + assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) + for f in output_files: + sparse_array = load_matrix(f) + assert sparse_array.shape[0] > 0 + assert sparse_array.shape[1] > 0 + ts_code_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "code/count") + ts_value_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "value/sum") + assert ( + load_matrix(ts_code_fp).shape[0] == load_matrix(ts_value_fp).shape[0] + ), "time series code and value have different numbers of rows" + assert ( + load_matrix(static_first_fp).shape[0] == load_matrix(ts_value_fp).shape[0] + ), "static data and time series have different numbers of rows" + + # Create Fake Labels + feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + for f in f_name_resolver.list_meds_files(): + df = pl.read_parquet(f) + df = get_events_df(df, feature_columns) + pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) + df = df.select(pl.col(["patient_id", "timestamp", "label"])) + df = df.unique(subset=["patient_id", "timestamp"]) + df = df.with_row_index("event_id") + + split = f.parent.stem + shard_num = f.stem + out_f = f_name_resolver.get_label(split, shard_num) + out_f.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(out_f) + + # Step 3: Run the task_specific_caching script + stderr, stdout = run_command( + "meds_tab task_specific_caching", + [], + tabularize_config_kwargs, + "task_specific_caching", + ) + # Check the files are not empty + + # Step 4: Run the xgboost script + xgboost_config_kwargs = { + "hydra.mode": "MULTIRUN", + } + xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} + stderr, stdout = run_command( + "meds_tab xgboost", + [], + xgboost_config_kwargs, + "xgboost", + ) + output_files = list(Path(cfg.model_dir).glob("*.json")) + assert len(output_files) == 1 + assert output_files[0] == Path(cfg.model_dir) / "model.json" diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 8e9484b..93e26f7 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -19,10 +19,10 @@ load_matrix, ) from scripts.identify_columns import store_columns +from scripts.launch_xgboost import launch_xgboost from scripts.summarize_over_windows import summarize_ts_data_over_windows from scripts.tabularize_static import tabularize_static_data from scripts.task_specific_caching import task_specific_cache -from scripts.xgboost import xgboost SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" 
# noqa: E501 @@ -317,7 +317,7 @@ def test_tabularize(): "hydra.mode": "MULTIRUN", } xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} - xgboost(cfg) + launch_xgboost(cfg) output_files = list(Path(cfg.model_dir).glob("*.json")) assert len(output_files) == 1 assert output_files[0] == Path(cfg.model_dir) / "model.json" From d225961bc082a8bbcb8d423062f5eb91bf50070b Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 22:17:04 +0000 Subject: [PATCH 085/106] added messy configs --- configs/base_config.yaml | 97 ++++++++++++++++++++++++++++++ configs/{ => bk}/tabularize.yaml | 0 configs/{ => bk}/xgboost.yaml | 0 configs/describe_codes.yaml | 79 ++++++++++++++++++++++++ configs/launch_xgboost.yaml | 79 ++++++++++++++++++++++++ configs/tabularization.yaml | 48 +++++++++++++++ configs/task_specific_caching.yaml | 0 7 files changed, 303 insertions(+) create mode 100644 configs/base_config.yaml rename configs/{ => bk}/tabularize.yaml (100%) rename configs/{ => bk}/xgboost.yaml (100%) create mode 100644 configs/describe_codes.yaml create mode 100644 configs/launch_xgboost.yaml create mode 100644 configs/tabularization.yaml create mode 100644 configs/task_specific_caching.yaml diff --git a/configs/base_config.yaml b/configs/base_config.yaml new file mode 100644 index 0000000..44172e1 --- /dev/null +++ b/configs/base_config.yaml @@ -0,0 +1,97 @@ +# Raw data +MEDS_cohort_dir: ??? +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache + +# Pre-processing +min_code_inclusion_frequency: 1 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" +codes: null +aggs: + - "static/present" + - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +seed: 1 +tqdm: False +worker: 1 +loguru_init: False + +# Hydra +hydra: + job: + name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} + run: + dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} + sweep: + dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} + + + + +Describe codes: +MEDS_cohort_dir: ??? +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +feature_path: ${tabularized_data_dir}/feature_description.parquet + +Final inputs: +MEDS final cohort directory (see tabularization) +Output filepath + +User inputs: +MEDS Base Directory +-> ... see tabularization + +Output: Dataframe with a column “code” and a column “code/n_occurrences” + +Tabularization: +static is fast +multirun only for time series + +Final inputs: +MEDS final cohort directory (directory within which there are shard keys and MEDS formatted files for tabularization) +Tabularization parameters (see above) + +User inputs: +Tabularizaton parameters +MEDS Base Directory +-> MEDS final cohort directory via final_cohort subdir or something +-> code metadata file via code_metadata.parquet file +Task-specific Caching +Final inputs: +Task directory: In the config file, somehow, specify a filepath to a directory which contains, in the same shard keys (e.g., “train/0”, “prospective_test_set/2”) parquet files with patient_id, index_datetime, and label). index_datetime is assumed to be inclusive but there is a configuration parameter that controls this. 
+Full tabularized data directory: directory within which there are shard keys mapping to tabularized sparse matrices. + +User inputs: +MEDS Base Directory +Task name +XGBoost: +Final Inputs: +Tabular feature selection parameters (see above) +Final task specific cached data directory. +XGBoost Model Parameters +In memory vs. external memory. + +User Inputs: +MEDS Base Directory +Task name diff --git a/configs/tabularize.yaml b/configs/bk/tabularize.yaml similarity index 100% rename from configs/tabularize.yaml rename to configs/bk/tabularize.yaml diff --git a/configs/xgboost.yaml b/configs/bk/xgboost.yaml similarity index 100% rename from configs/xgboost.yaml rename to configs/bk/xgboost.yaml diff --git a/configs/describe_codes.yaml b/configs/describe_codes.yaml new file mode 100644 index 0000000..0b4cbba --- /dev/null +++ b/configs/describe_codes.yaml @@ -0,0 +1,79 @@ +# Raw data +MEDS_cohort_dir: ??? +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache + +# Pre-processing +min_code_inclusion_frequency: 1 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" +codes: null +aggs: + # - "static/present" + # - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +do_update: True +seed: 1 +tqdm: True +worker: 0 +test: False + +num_boost_round: 1000 +early_stopping_rounds: 5 +model: + booster: gbtree + device: cpu + tree_method: hist + objective: binary:logistic + +iterator: + keep_data_in_memory: False + binarize_task: True + +# Hydra settings for sweep +defaults: + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + +hydra: + verbose: False + sweep: + dir: ${model_dir}/.logs/ + run: + dir: ${model_dir}/.logs/ + + # Optuna Sweeper + sweeper: + sampler: + seed: 1 + storage: null + study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} + direction: minimize + n_trials: 10 + + # Define search space for Optuna + params: + window_sizes: + _target_: hydra.utils.call(${hydra.utils.cross_product}, + values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) + iterator.keep_data_in_memory: choice([True], [False]) diff --git a/configs/launch_xgboost.yaml b/configs/launch_xgboost.yaml new file mode 100644 index 0000000..0b4cbba --- /dev/null +++ b/configs/launch_xgboost.yaml @@ -0,0 +1,79 @@ +# Raw data +MEDS_cohort_dir: ??? 
+tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache + +# Pre-processing +min_code_inclusion_frequency: 1 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" +codes: null +aggs: + # - "static/present" + # - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +do_update: True +seed: 1 +tqdm: True +worker: 0 +test: False + +num_boost_round: 1000 +early_stopping_rounds: 5 +model: + booster: gbtree + device: cpu + tree_method: hist + objective: binary:logistic + +iterator: + keep_data_in_memory: False + binarize_task: True + +# Hydra settings for sweep +defaults: + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + +hydra: + verbose: False + sweep: + dir: ${model_dir}/.logs/ + run: + dir: ${model_dir}/.logs/ + + # Optuna Sweeper + sweeper: + sampler: + seed: 1 + storage: null + study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} + direction: minimize + n_trials: 10 + + # Define search space for Optuna + params: + window_sizes: + _target_: hydra.utils.call(${hydra.utils.cross_product}, + values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) + iterator.keep_data_in_memory: choice([True], [False]) diff --git a/configs/tabularization.yaml b/configs/tabularization.yaml new file mode 100644 index 0000000..047a52c --- /dev/null +++ b/configs/tabularization.yaml @@ -0,0 +1,48 @@ +# Raw data +MEDS_cohort_dir: ??? +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache + +# Pre-processing +min_code_inclusion_frequency: 1 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" +codes: null +aggs: + - "static/present" + - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +do_update: True +seed: 1 +tqdm: True +worker: 1 +test: False + +# Hydra +hydra: + job: + name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} + run: + dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} + sweep: + dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} diff --git a/configs/task_specific_caching.yaml b/configs/task_specific_caching.yaml new file mode 100644 index 0000000..e69de29 From 447d23beaacbcbd1845cb3e589c0e303d89797f9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 3 Jun 2024 18:36:26 -0400 Subject: [PATCH 086/106] added starter configs --- configs/default.yaml | 15 ++++++ configs/describe_codes.yaml | 82 +++-------------------------- configs/tabularization.yaml | 52 +++--------------- configs/tabularization/default.yaml | 22 ++++++++ 4 files changed, 51 insertions(+), 120 deletions(-) create mode 100644 configs/default.yaml create mode 100644 configs/tabularization/default.yaml diff --git a/configs/default.yaml b/configs/default.yaml new file mode 100644 index 0000000..b81742b --- /dev/null +++ b/configs/default.yaml @@ -0,0 +1,15 @@ +do_overwrite: False +seed: 1 +tqdm: False +worker: 0 + +log_dir: ${output_dir}/.logs/ + +hydra: + verbose: False + job: + name: 
MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ${log_dir} + run: + dir: ${log_dir} diff --git a/configs/describe_codes.yaml b/configs/describe_codes.yaml index 0b4cbba..dd158ac 100644 --- a/configs/describe_codes.yaml +++ b/configs/describe_codes.yaml @@ -1,79 +1,11 @@ -# Raw data -MEDS_cohort_dir: ??? -tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: - - "1d" - - "7d" - - "30d" - - "365d" - - "full" -codes: null -aggs: - # - "static/present" - # - "static/first" - - "code/count" - - "value/count" - - "value/sum" - - "value/sum_sqd" - - "value/min" - - "value/max" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -do_update: True -seed: 1 -tqdm: True -worker: 0 -test: False - -num_boost_round: 1000 -early_stopping_rounds: 5 -model: - booster: gbtree - device: cpu - tree_method: hist - objective: binary:logistic - -iterator: - keep_data_in_memory: False - binarize_task: True - -# Hydra settings for sweep defaults: - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe + - default + - _self_ -hydra: - verbose: False - sweep: - dir: ${model_dir}/.logs/ - run: - dir: ${model_dir}/.logs/ +# Raw data +MEDS_cohort_dir: ??? - # Optuna Sweeper - sweeper: - sampler: - seed: 1 - storage: null - study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} - direction: minimize - n_trials: 10 +input_directory: ${MEDS_cohort_dir}/final_cohort - # Define search space for Optuna - params: - window_sizes: - _target_: hydra.utils.call(${hydra.utils.cross_product}, - values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) - iterator.keep_data_in_memory: choice([True], [False]) +output_dir: ${MEDS_cohort_dir} +output_filepath: ${output_dir}/code_metadata.parquet diff --git a/configs/tabularization.yaml b/configs/tabularization.yaml index 047a52c..33cb27a 100644 --- a/configs/tabularization.yaml +++ b/configs/tabularization.yaml @@ -1,48 +1,10 @@ +defaults: + - default + - tabularization: default + - _self_ + # Raw data MEDS_cohort_dir: ??? 
-tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: - - "1d" - - "7d" - - "30d" - - "365d" - - "full" -codes: null -aggs: - - "static/present" - - "static/first" - - "code/count" - - "value/count" - - "value/sum" - - "value/sum_sqd" - - "value/min" - - "value/max" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -do_update: True -seed: 1 -tqdm: True -worker: 1 -test: False -# Hydra -hydra: - job: - name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} - sweep: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} +input_directory: ${MEDS_cohort_dir}/final_cohort +output_dir: ${MEDS_cohort_dir}/tabularize diff --git a/configs/tabularization/default.yaml b/configs/tabularization/default.yaml new file mode 100644 index 0000000..bb01064 --- /dev/null +++ b/configs/tabularization/default.yaml @@ -0,0 +1,22 @@ +# User inputs +code_metadata_fp: ${MEDS_cohort_dir}/code_metadata.parquet +allowed_codes: null +min_code_inclusion_frequency: 10 +window_sizes: + - "1d" + - "7d" + - "30d" + - "365d" + - "full" +aggs: + - "static/present" + - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +# Resolved inputs +# _resolved_codes: ${filter_to_codes:${allowed_codes},${min_code_inclusion_frequency},${code_metadata_fp}} From c2b881bb1571e5b3e5e13d72741351355ed3ae6c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 3 Jun 2024 22:58:06 +0000 Subject: [PATCH 087/106] updated configs to include all tasks --- configs/base_config.yaml | 97 ----------------------------- configs/bk/tabularize.yaml | 48 -------------- configs/bk/xgboost.yaml | 79 ----------------------- configs/default.yaml | 1 + configs/describe_codes.yaml | 4 +- configs/launch_xgboost.yaml | 84 ++++++++----------------- configs/tabularization.yaml | 2 - configs/tabularization/default.yaml | 2 +- configs/task_specific_caching.yaml | 11 ++++ 9 files changed, 39 insertions(+), 289 deletions(-) delete mode 100644 configs/base_config.yaml delete mode 100644 configs/bk/tabularize.yaml delete mode 100644 configs/bk/xgboost.yaml diff --git a/configs/base_config.yaml b/configs/base_config.yaml deleted file mode 100644 index 44172e1..0000000 --- a/configs/base_config.yaml +++ /dev/null @@ -1,97 +0,0 @@ -# Raw data -MEDS_cohort_dir: ??? 
-tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: - - "1d" - - "7d" - - "30d" - - "365d" - - "full" -codes: null -aggs: - - "static/present" - - "static/first" - - "code/count" - - "value/count" - - "value/sum" - - "value/sum_sqd" - - "value/min" - - "value/max" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -seed: 1 -tqdm: False -worker: 1 -loguru_init: False - -# Hydra -hydra: - job: - name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} - sweep: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} - - - - -Describe codes: -MEDS_cohort_dir: ??? -tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -feature_path: ${tabularized_data_dir}/feature_description.parquet - -Final inputs: -MEDS final cohort directory (see tabularization) -Output filepath - -User inputs: -MEDS Base Directory --> ... see tabularization - -Output: Dataframe with a column “code” and a column “code/n_occurrences” - -Tabularization: -static is fast -multirun only for time series - -Final inputs: -MEDS final cohort directory (directory within which there are shard keys and MEDS formatted files for tabularization) -Tabularization parameters (see above) - -User inputs: -Tabularizaton parameters -MEDS Base Directory --> MEDS final cohort directory via final_cohort subdir or something --> code metadata file via code_metadata.parquet file -Task-specific Caching -Final inputs: -Task directory: In the config file, somehow, specify a filepath to a directory which contains, in the same shard keys (e.g., “train/0”, “prospective_test_set/2”) parquet files with patient_id, index_datetime, and label). index_datetime is assumed to be inclusive but there is a configuration parameter that controls this. -Full tabularized data directory: directory within which there are shard keys mapping to tabularized sparse matrices. - -User inputs: -MEDS Base Directory -Task name -XGBoost: -Final Inputs: -Tabular feature selection parameters (see above) -Final task specific cached data directory. -XGBoost Model Parameters -In memory vs. external memory. - -User Inputs: -MEDS Base Directory -Task name diff --git a/configs/bk/tabularize.yaml b/configs/bk/tabularize.yaml deleted file mode 100644 index 047a52c..0000000 --- a/configs/bk/tabularize.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Raw data -MEDS_cohort_dir: ??? 
-tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: - - "1d" - - "7d" - - "30d" - - "365d" - - "full" -codes: null -aggs: - - "static/present" - - "static/first" - - "code/count" - - "value/count" - - "value/sum" - - "value/sum_sqd" - - "value/min" - - "value/max" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -do_update: True -seed: 1 -tqdm: True -worker: 1 -test: False - -# Hydra -hydra: - job: - name: tabularize_step_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} - sweep: - dir: ${tabularized_data_dir}/.logs/etl/${hydra.job.name} diff --git a/configs/bk/xgboost.yaml b/configs/bk/xgboost.yaml deleted file mode 100644 index 0b4cbba..0000000 --- a/configs/bk/xgboost.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# Raw data -MEDS_cohort_dir: ??? -tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: - - "1d" - - "7d" - - "30d" - - "365d" - - "full" -codes: null -aggs: - # - "static/present" - # - "static/first" - - "code/count" - - "value/count" - - "value/sum" - - "value/sum_sqd" - - "value/min" - - "value/max" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -do_update: True -seed: 1 -tqdm: True -worker: 0 -test: False - -num_boost_round: 1000 -early_stopping_rounds: 5 -model: - booster: gbtree - device: cpu - tree_method: hist - objective: binary:logistic - -iterator: - keep_data_in_memory: False - binarize_task: True - -# Hydra settings for sweep -defaults: - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - -hydra: - verbose: False - sweep: - dir: ${model_dir}/.logs/ - run: - dir: ${model_dir}/.logs/ - - # Optuna Sweeper - sweeper: - sampler: - seed: 1 - storage: null - study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} - direction: minimize - n_trials: 10 - - # Define search space for Optuna - params: - window_sizes: - _target_: hydra.utils.call(${hydra.utils.cross_product}, - values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) - iterator.keep_data_in_memory: choice([True], [False]) diff --git a/configs/default.yaml b/configs/default.yaml index b81742b..7c7a86f 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -1,3 +1,4 @@ +MEDS_cohort_dir: ??? do_overwrite: False seed: 1 tqdm: False diff --git a/configs/describe_codes.yaml b/configs/describe_codes.yaml index dd158ac..b74ae7d 100644 --- a/configs/describe_codes.yaml +++ b/configs/describe_codes.yaml @@ -3,9 +3,7 @@ defaults: - _self_ # Raw data -MEDS_cohort_dir: ??? - input_directory: ${MEDS_cohort_dir}/final_cohort - +# Where to store output code frequency data output_dir: ${MEDS_cohort_dir} output_filepath: ${output_dir}/code_metadata.parquet diff --git a/configs/launch_xgboost.yaml b/configs/launch_xgboost.yaml index 0b4cbba..ceb5406 100644 --- a/configs/launch_xgboost.yaml +++ b/configs/launch_xgboost.yaml @@ -1,67 +1,33 @@ -# Raw data -MEDS_cohort_dir: ??? 
-tabularized_data_dir: ${MEDS_cohort_dir}/tabularize -task_dir: ${tabularized_data_dir}/task -model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} -cache_dir: ${tabularized_data_dir}/.cache - -# Pre-processing -min_code_inclusion_frequency: 1 -window_sizes: - - "1d" - - "7d" - - "30d" - - "365d" - - "full" -codes: null -aggs: - # - "static/present" - # - "static/first" - - "code/count" - - "value/count" - - "value/sum" - - "value/sum_sqd" - - "value/min" - - "value/max" - -dynamic_threshold: 0.01 -numerical_value_threshold: 0.1 - -# Sharding -n_patients_per_sub_shard: null - -# Misc -do_overwrite: False -do_update: True -seed: 1 -tqdm: True -worker: 0 -test: False - -num_boost_round: 1000 -early_stopping_rounds: 5 -model: - booster: gbtree - device: cpu - tree_method: hist - objective: binary:logistic - -iterator: - keep_data_in_memory: False - binarize_task: True - -# Hydra settings for sweep defaults: + - default + - tabularization: default - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe + - _self_ +# Raw data +MEDS_cohort_dir: ??? +input_directory: ${MEDS_cohort_dir}/task +# Where to output the model and cached data +output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +output_filepath: ${output_dir}/model_metadata.parquet +cache_dir: ${MEDS_cohort_dir}/.cache + +# Model parameters +model_params: + num_boost_round: 1000 + early_stopping_rounds: 5 + model: + booster: gbtree + device: cpu + tree_method: hist + objective: binary:logistic + iterator: + keep_data_in_memory: False + binarize_task: True + +# Sweep parameters for Optuna hydra: - verbose: False - sweep: - dir: ${model_dir}/.logs/ - run: - dir: ${model_dir}/.logs/ - # Optuna Sweeper sweeper: sampler: diff --git a/configs/tabularization.yaml b/configs/tabularization.yaml index 33cb27a..5aa644d 100644 --- a/configs/tabularization.yaml +++ b/configs/tabularization.yaml @@ -4,7 +4,5 @@ defaults: - _self_ # Raw data -MEDS_cohort_dir: ??? 
- input_directory: ${MEDS_cohort_dir}/final_cohort output_dir: ${MEDS_cohort_dir}/tabularize diff --git a/configs/tabularization/default.yaml b/configs/tabularization/default.yaml index bb01064..9b67776 100644 --- a/configs/tabularization/default.yaml +++ b/configs/tabularization/default.yaml @@ -19,4 +19,4 @@ aggs: - "value/max" # Resolved inputs -# _resolved_codes: ${filter_to_codes:${allowed_codes},${min_code_inclusion_frequency},${code_metadata_fp}} +_resolved_codes: ${filter_to_codes:${allowed_codes},${min_code_inclusion_frequency},${code_metadata_fp}} diff --git a/configs/task_specific_caching.yaml b/configs/task_specific_caching.yaml index e69de29..5bb8ac8 100644 --- a/configs/task_specific_caching.yaml +++ b/configs/task_specific_caching.yaml @@ -0,0 +1,11 @@ +defaults: + - default + - tabularization: default + - _self_ + +# Tabularized Data +input_directory: ${MEDS_cohort_dir}/tabularize +# Where the labels are stored, with columns patient_id, timestamp, label +input_label_directory: ${MEDS_cohort_dir}/labels +# Where to output the task specific tabularized data +output_dir: ${MEDS_cohort_dir}/task From 9fd33f4b0e5f662f8887ea88f6287ca369b0e8d2 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 01:28:27 +0000 Subject: [PATCH 088/106] updated describe_codes --- configs/describe_codes.yaml | 9 - configs/tabularization.yaml | 8 - pyproject.toml | 9 +- scripts/identify_columns.py | 142 --------- src/MEDS_tabular_automl/__main__.py | 51 ---- src/MEDS_tabular_automl/configs/__init__.py | 0 .../MEDS_tabular_automl/configs}/default.yaml | 1 + .../configs/describe_codes.yaml | 12 + .../configs}/launch_xgboost.yaml | 14 +- .../configs/tabularization.yaml | 10 + .../configs}/tabularization/default.yaml | 0 .../configs}/task_specific_caching.yaml | 7 +- src/MEDS_tabular_automl/describe_codes.py | 97 +++++++ src/MEDS_tabular_automl/file_name.py | 272 +++++++++--------- src/MEDS_tabular_automl/scripts/__init__.py | 0 .../MEDS_tabular_automl/scripts/cache_task.py | 16 +- .../scripts/describe_codes.py | 101 +++++++ .../scripts}/launch_xgboost.py | 7 +- .../scripts}/tabularize_static.py | 9 +- .../scripts/tabularize_time_series.py | 9 +- src/MEDS_tabular_automl/utils.py | 101 +++---- tests/{cli_test.py => cli_bk.py} | 4 +- tests/test_tabularize.py | 180 ++++++------ 23 files changed, 526 insertions(+), 533 deletions(-) delete mode 100644 configs/describe_codes.yaml delete mode 100644 configs/tabularization.yaml delete mode 100644 scripts/identify_columns.py delete mode 100644 src/MEDS_tabular_automl/__main__.py create mode 100644 src/MEDS_tabular_automl/configs/__init__.py rename {configs => src/MEDS_tabular_automl/configs}/default.yaml (92%) create mode 100644 src/MEDS_tabular_automl/configs/describe_codes.yaml rename {configs => src/MEDS_tabular_automl/configs}/launch_xgboost.yaml (77%) create mode 100644 src/MEDS_tabular_automl/configs/tabularization.yaml rename {configs => src/MEDS_tabular_automl/configs}/tabularization/default.yaml (100%) rename {configs => src/MEDS_tabular_automl/configs}/task_specific_caching.yaml (55%) create mode 100644 src/MEDS_tabular_automl/describe_codes.py create mode 100644 src/MEDS_tabular_automl/scripts/__init__.py rename scripts/task_specific_caching.py => src/MEDS_tabular_automl/scripts/cache_task.py (88%) create mode 100644 src/MEDS_tabular_automl/scripts/describe_codes.py rename {scripts => src/MEDS_tabular_automl/scripts}/launch_xgboost.py (98%) rename {scripts => src/MEDS_tabular_automl/scripts}/tabularize_static.py (97%) rename 
scripts/summarize_over_windows.py => src/MEDS_tabular_automl/scripts/tabularize_time_series.py (95%) rename tests/{cli_test.py => cli_bk.py} (100%) diff --git a/configs/describe_codes.yaml b/configs/describe_codes.yaml deleted file mode 100644 index b74ae7d..0000000 --- a/configs/describe_codes.yaml +++ /dev/null @@ -1,9 +0,0 @@ -defaults: - - default - - _self_ - -# Raw data -input_directory: ${MEDS_cohort_dir}/final_cohort -# Where to store output code frequency data -output_dir: ${MEDS_cohort_dir} -output_filepath: ${output_dir}/code_metadata.parquet diff --git a/configs/tabularization.yaml b/configs/tabularization.yaml deleted file mode 100644 index 5aa644d..0000000 --- a/configs/tabularization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -defaults: - - default - - tabularization: default - - _self_ - -# Raw data -input_directory: ${MEDS_cohort_dir}/final_cohort -output_dir: ${MEDS_cohort_dir}/tabularize diff --git a/pyproject.toml b/pyproject.toml index 1e5b961..9b21e4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,15 +18,18 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "numba", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper"] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher"] [project.scripts] -meds_tab = "MEDS_tabular_automl.__main__:main" +meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" +meds_tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" +meds_tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main" +meds_tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main" +meds_tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main" [project.optional-dependencies] dev = ["pre-commit"] tests = ["pytest", "pytest-cov", "rootutils"] -local_parallelism = ["hydra-joblib-launcher"] profiling = ["mprofile", "matplotlib"] [project.urls] diff --git a/scripts/identify_columns.py b/scripts/identify_columns.py deleted file mode 100644 index 84c3a1a..0000000 --- a/scripts/identify_columns.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python -"""This Python script, stores the configuration parameters and feature columns used in the output.""" -import json -from collections import defaultdict -from pathlib import Path - -import hydra -import numpy as np -import polars as pl -from loguru import logger -from omegaconf import DictConfig, OmegaConf - -from MEDS_tabular_automl.file_name import FileNameResolver -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import compute_feature_frequencies, load_tqdm - - -def store_config_yaml(config_fp: Path, cfg: DictConfig): - """Stores configuration parameters into a JSON file. - - This function writes a dictionary of parameters, which includes patient partitioning - information and configuration details, to a specified JSON file. - - Args: - - config_fp (Path): The file path for the JSON file where config should be stored. - - cfg (DictConfig): A configuration object containing settings like the number of patients - per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. 
- - Behavior: - - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a - FileExistsError is raised to prevent unintentional data loss. - - Raises: - - ValueError: If there are discrepancies between old and new parameters during an update. - - FileExistsError: If the file exists and overwriting is not allowed. - - Example: - >>> cfg = DictConfig({ - ... "n_patients_per_sub_shard": 100, - ... "min_code_inclusion_frequency": 5, - ... "do_overwrite": True, - ... }) - >>> import tempfile - >>> from pathlib import Path - >>> with tempfile.NamedTemporaryFile() as temp_f: - ... config_fp = Path(temp_f.name) - ... store_config_yaml(config_fp, cfg) - ... assert config_fp.exists() - ... store_config_yaml(config_fp, cfg) - ... cfg.do_overwrite = False - ... try: - ... store_config_yaml(config_fp, cfg) - ... except FileExistsError as e: - ... print("FileExistsError Error Triggered") - FileExistsError Error Triggered - """ - if config_fp.exists(): - if not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") - OmegaConf.save(cfg, config_fp) - - -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def store_columns( - cfg: DictConfig, -): - """Stores the configuration parameters and feature columns tabularized data we will be generated for. - - Args: - cfg: The configuration object for the tabularization process. - """ - iter_wrapper = load_tqdm(cfg.tqdm) - # create output dir - f_name_resolver = FileNameResolver(cfg) - flat_dir = f_name_resolver.tabularize_dir - flat_dir.mkdir(exist_ok=True, parents=True) - - # store params in json file - config_fp = f_name_resolver.get_config_path() - store_config_yaml(config_fp, cfg) - - # 0. Identify Output Columns and Frequencies - logger.info("Iterating through shards and caching feature frequencies.") - - def compute_fn(shard_df): - return compute_feature_frequencies(cfg, shard_df) - - def write_fn(data, out_fp): - json.dump(data, open(out_fp, "w")) - - def read_fn(in_fp): - return pl.scan_parquet(in_fp) - - # Map: Iterates through shards and caches feature frequencies - train_shards = f_name_resolver.list_meds_files(split="train") - np.random.shuffle(train_shards) - feature_dir = f_name_resolver.tabularize_dir - for shard_fp in iter_wrapper(train_shards): - out_fp = feature_dir / "identify_train_columns" / f"{shard_fp.stem}.json" - rwlock_wrap( - shard_fp, - out_fp, - read_fn, - write_fn, - compute_fn, - do_overwrite=cfg.do_overwrite, - do_return=False, - ) - - logger.info("Summing frequency computations.") - # Reduce: sum the frequency computations - - def compute_fn(feature_freq_list): - feature_freqs = defaultdict(int) - for shard_feature_freq in feature_freq_list: - for feature, freq in shard_feature_freq.items(): - feature_freqs[feature] += freq - return feature_freqs, sorted(list(feature_freqs.keys())) - - def write_fn(data, out_fp): - feature_freqs, feature_columns = data - json.dump(feature_columns, open(f_name_resolver.get_feature_columns_fp(), "w")) - json.dump(feature_freqs, open(f_name_resolver.get_feature_freqs_fp(), "w")) - - def read_fn(feature_dir): - files = list(feature_dir.glob("*.json")) - return [json.load(open(fp)) for fp in files] - - rwlock_wrap( - feature_dir / "identify_train_columns", - f_name_resolver.get_feature_columns_fp(), - read_fn, - write_fn, - compute_fn, - do_overwrite=cfg.do_overwrite, - do_return=False, - ) - logger.info("Stored feature columns and frequencies.") - - -if __name__ == "__main__": - store_columns() 
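The deleted identify_columns.py above computed per-shard feature frequencies in a map step and summed them in a reduce step, and the describe_codes module introduced later in this patch stores code frequencies as code_metadata.parquet with "code" and "count" columns. The sketch below illustrates that map/reduce pattern using plain polars and raw code counts for simplicity; the paths and helper names are placeholders, not the package's actual API.

from collections import defaultdict
from pathlib import Path

import polars as pl


def shard_code_counts(shard_fp: Path) -> dict[str, int]:
    """Map step: count how often each code appears in one MEDS shard."""
    counts = pl.scan_parquet(shard_fp).group_by("code").agg(pl.len().alias("count")).collect()
    return dict(zip(counts["code"].to_list(), counts["count"].to_list()))


def reduce_code_counts(per_shard: list[dict[str, int]]) -> pl.DataFrame:
    """Reduce step: sum the per-shard counts into a single (code, count) frame."""
    totals: dict[str, int] = defaultdict(int)
    for freqs in per_shard:
        for code, n in freqs.items():
            totals[code] += n
    return pl.DataFrame({"code": list(totals.keys()), "count": list(totals.values())})


if __name__ == "__main__":
    # Placeholder paths; the real pipeline resolves these from the Hydra config.
    train_shards = sorted(Path("MEDS_cohort/final_cohort/train").glob("*.parquet"))
    code_metadata = reduce_code_counts([shard_code_counts(fp) for fp in train_shards])
    code_metadata.write_parquet("MEDS_cohort/code_metadata.parquet")

The actual scripts additionally wrap each shard's computation in rwlock_wrap so parallel workers do not overwrite each other's cached outputs; the sketch above runs serially for clarity.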
diff --git a/src/MEDS_tabular_automl/__main__.py b/src/MEDS_tabular_automl/__main__.py deleted file mode 100644 index 742073a..0000000 --- a/src/MEDS_tabular_automl/__main__.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Main script for end-to-end task querying.""" - -import enum -import subprocess -import sys -from importlib.resources import files - -CLI_SCRIPTS_DIR = files("MEDS_tabular_automl").parent.parent / "cli" - - -class Program(enum.Enum): - DESCRIBE_CODES = "describe_codes.sh" - TABULARIZATION = "tabularization.sh" - TASK_SPECIFIC_CACHING = "task_specific_caching.sh" - XGBOOST = "xgboost.sh" - PROFILE_TABULARIZATION = "profile_tabularization.sh" - - @staticmethod - def from_str(program_arg): - match program_arg: - case "describe_codes": - return Program.DESCRIBE_CODES - case "tabularization": - return Program.TABULARIZATION - case "task_specific_caching": - return Program.TASK_SPECIFIC_CACHING - case "xgboost": - return Program.XGBOOST - case "profile_tabularization": - return Program.PROFILE_TABULARIZATION - case _: - raise ValueError( - f"Invalid program name {program_arg}, valid programs are {[p.name for p in Program]}" - ) - - @staticmethod - def get_script(program): - return CLI_SCRIPTS_DIR / program.value - - -def main(): - program = sys.argv[1] - args = sys.argv[2:] - program = Program.from_str(program) - script_path = Program.get_script(program) - command_parts = [str(script_path.resolve()), *args] - subprocess.run(" ".join(command_parts), shell=True) - - -if __name__ == "__main__": - main() diff --git a/src/MEDS_tabular_automl/configs/__init__.py b/src/MEDS_tabular_automl/configs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml similarity index 92% rename from configs/default.yaml rename to src/MEDS_tabular_automl/configs/default.yaml index 7c7a86f..8f8513c 100644 --- a/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -3,6 +3,7 @@ do_overwrite: False seed: 1 tqdm: False worker: 0 +loguru_init: False log_dir: ${output_dir}/.logs/ diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml new file mode 100644 index 0000000..f0e56dc --- /dev/null +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -0,0 +1,12 @@ +defaults: + - default + - _self_ + +# split we wish to get metadata for +split: train +# Raw data, must have a subdirectory "train" with the training data split +input_dir: ${MEDS_cohort_dir}/final_cohort/${split} +# Where to store output code frequency data +cache_dir: ${MEDS_cohort_dir}/.cache +output_dir: ${MEDS_cohort_dir} +output_filepath: ${output_dir}/code_metadata.parquet diff --git a/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml similarity index 77% rename from configs/launch_xgboost.yaml rename to src/MEDS_tabular_automl/configs/launch_xgboost.yaml index ceb5406..6eb7084 100644 --- a/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -5,9 +5,11 @@ defaults: - override hydra/sweeper/sampler: tpe - _self_ +task_name: task + # Raw data -MEDS_cohort_dir: ??? 
-input_directory: ${MEDS_cohort_dir}/task +input_dir: ${MEDS_cohort_dir}/${task_name} +input_code_metadata: ${MEDS_cohort_dir}/code_metadata.parquet # Where to output the model and cached data output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} output_filepath: ${output_dir}/model_metadata.parquet @@ -20,10 +22,11 @@ model_params: model: booster: gbtree device: cpu + nthread: 1 tree_method: hist objective: binary:logistic iterator: - keep_data_in_memory: False + keep_data_in_memory: True binarize_task: True # Sweep parameters for Optuna @@ -31,7 +34,7 @@ hydra: # Optuna Sweeper sweeper: sampler: - seed: 1 + seed: ${seed} storage: null study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} direction: minimize @@ -41,5 +44,4 @@ hydra: params: window_sizes: _target_: hydra.utils.call(${hydra.utils.cross_product}, - values=["1d", "7d", "30d", "365d", "full"]) # , max_options=5) - iterator.keep_data_in_memory: choice([True], [False]) + values=["1d", "7d", "30d", "365d", "full"], min_options=1) diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml new file mode 100644 index 0000000..dc2c48d --- /dev/null +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -0,0 +1,10 @@ +defaults: + - default + - tabularization: default + - _self_ + +# Raw data +# Where the code metadata is stored +input_code_metadata: ${MEDS_cohort_dir}/code_metadata.parquet +input_dir: ${MEDS_cohort_dir}/final_cohort +output_dir: ${MEDS_cohort_dir}/tabularize diff --git a/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml similarity index 100% rename from configs/tabularization/default.yaml rename to src/MEDS_tabular_automl/configs/tabularization/default.yaml diff --git a/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml similarity index 55% rename from configs/task_specific_caching.yaml rename to src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 5bb8ac8..27135f8 100644 --- a/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -2,10 +2,11 @@ defaults: - default - tabularization: default - _self_ +task_name: task # Tabularized Data -input_directory: ${MEDS_cohort_dir}/tabularize +input_dir: ${MEDS_cohort_dir}/tabularize # Where the labels are stored, with columns patient_id, timestamp, label -input_label_directory: ${MEDS_cohort_dir}/labels +input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels # Where to output the task specific tabularized data -output_dir: ${MEDS_cohort_dir}/task +output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py new file mode 100644 index 0000000..50c4fa6 --- /dev/null +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -0,0 +1,97 @@ +import polars as pl +from omegaconf import DictConfig + +from MEDS_tabular_automl.utils import DF_T + + +def convert_to_df(freq_dict): + return pl.DataFrame([[col, freq] for col, freq in freq_dict.items()], schema=["code", "count"]) + + +def compute_feature_frequencies(cfg: DictConfig, shard_df: DF_T) -> list[str]: + """Generates a list of feature column names from the data within each shard based on specified + configurations. + + Parameters: + - cfg (DictConfig): Configuration dictionary specifying how features should be evaluated and aggregated. 
+ - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). + + Returns: + - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties + identified during the evaluation. + + This function evaluates the properties of codes within training data and applies configured + aggregations to generate a comprehensive list of feature columns for modeling purposes. + Examples: + >>> import polars as pl + >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], + ... 'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], + ... 'numerical_value': [1, None, 2, 2, None, None, 3]} + >>> df = pl.DataFrame(data).lazy() + >>> aggs = ['value/sum', 'code/count'] + >>> get_ts_feature_cols(aggs, df) + ['A/code', 'A/value', 'C/code', 'C/value'] + """ + static_df = shard_df.filter( + pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_null() + ) + static_code_freqs_df = static_df.group_by("code").agg(pl.count("code").alias("count")).collect() + static_code_freqs = { + row["code"] + "/static/present": row["count"] for row in static_code_freqs_df.iter_rows(named=True) + } + + static_value_df = static_df.filter(pl.col("numerical_value").is_not_null()) + static_value_freqs_df = ( + static_value_df.group_by("code").agg(pl.count("numerical_value").alias("count")).collect() + ) + static_value_freqs = { + row["code"] + "/static/first": row["count"] for row in static_value_freqs_df.iter_rows(named=True) + } + + ts_df = shard_df.filter( + pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_not_null() + ) + code_freqs_df = ts_df.group_by("code").agg(pl.count("code").alias("count")).collect() + code_freqs = {row["code"] + "/code": row["count"] for row in code_freqs_df.iter_rows(named=True)} + + value_df = ts_df.filter(pl.col("numerical_value").is_not_null()) + value_freqs_df = value_df.group_by("code").agg(pl.count("numerical_value").alias("count")).collect() + value_freqs = {row["code"] + "/value": row["count"] for row in value_freqs_df.iter_rows(named=True)} + + combined_freqs = {**static_code_freqs, **static_value_freqs, **code_freqs, **value_freqs} + return convert_to_df(combined_freqs) + + +def convert_to_freq_dict(df: pl.LazyFrame) -> dict: + """Converts a DataFrame to a dictionary of frequencies. + + This function converts a DataFrame to a dictionary of frequencies, where the keys are the + column names and the values are dictionaries of code frequencies. + + Args: + - df (pl.DataFrame): The DataFrame to be converted. + + Returns: + - dict: A dictionary of frequencies, where the keys are the column names and the values are + dictionaries of code frequencies. + + Example: + >>> import polars as pl + >>> df = pl.DataFrame({ + ... "code": [1, 2, 3, 4, 5], + ... "value": [10, 20, 30, 40, 50] + ... 
}) + >>> convert_to_freq_dict(df) + {'code': {1: 1, 2: 1, 3: 1, 4: 1, 5: 1}, 'value': {10: 1, 20: 1, 30: 1, 40: 1, 50: 1}} + """ + if not df.columns == ["code", "count"]: + raise ValueError(f"DataFrame must have columns 'code' and 'count', but has columns {df.columns}!") + return dict(df.collect().iter_rows()) + + +def get_feature_columns(fp): + return sorted(list(convert_to_freq_dict(pl.scan_parquet(fp)))) + + +def get_feature_freqs(fp): + return convert_to_freq_dict(pl.scan_parquet(fp)) diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 1492dbf..3ae6c07 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -1,8 +1,6 @@ """Help functions for getting file names and paths for MEDS tabular automl tasks.""" from pathlib import Path -from omegaconf import DictConfig - from MEDS_tabular_automl.utils import ( CODE_AGGREGATIONS, STATIC_CODE_AGGREGATION, @@ -11,135 +9,145 @@ ) -class FileNameResolver: - def __init__(self, cfg: DictConfig): - self.cfg = cfg - - @property - def meds_dir(self): - return Path(self.cfg.MEDS_cohort_dir) - - @property - def tabularize_dir(self): - return Path(self.cfg.tabularized_data_dir) - - @property - def cache_dir(self): - return Path(self.cfg.cache_dir) - - def get_meds_dir(self): - return self.meds_dir / "final_cohort" - - def get_static_dir(self): - return self.tabularize_dir / "static" - - def get_ts_dir(self): - return self.tabularize_dir / "ts" - - def get_sparse_dir(self): - return self.tabularize_dir / "sparse" - - def get_label_dir(self): - return Path(self.cfg.task_dir) - - def get_feature_columns_fp(self): - return self.tabularize_dir / "feature_columns.json" - - def get_feature_freqs_fp(self): - return self.tabularize_dir / "feature_freqs.json" - - def get_config_path(self): - return self.tabularize_dir / "config.yaml" - - def get_meds_shard(self, split: str, shard_num: int): - # Given a shard number, return the MEDS format data - return self.get_meds_dir() / split / f"{shard_num}.parquet" - - def get_flat_static_rep(self, split: str, shard_num: int, agg: str): - # Given a shard number, returns the static representation path - agg_name = agg.split("/")[-1] - return self.get_static_dir() / split / f"{shard_num}" / f"{agg_name}.npz" - - def get_flat_ts_rep(self, split: str, shard_num: int, window_size: int, agg: str): - # Given a shard number, returns the time series representation path - return self.get_ts_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" - - def get_flat_sparse_rep(self, split: str, shard_num: int, window_size: int, agg: str): - # Given a shard number, returns the sparse representation path - return self.get_sparse_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" - - def get_label(self, split: str, shard_num: int): - # Given a shard number, returns the label path - return self.get_label_dir() / split / f"{shard_num}.parquet" - - def list_meds_files(self, split=None): - # List all MEDS files - if split: - return sorted(list(self.get_meds_dir().glob(f"{split}/*.parquet"))) - return sorted(list(self.get_meds_dir().glob("*/*.parquet"))) - - def list_static_files(self, split=None): - # List all static files - if split: - return sorted(list(self.get_static_dir().glob(f"{split}/*/*.npz"))) - return sorted(list(self.get_static_dir().glob("*/*/*.npz"))) - - def list_ts_files(self, split=None): - # List all time series files - if split: - return sorted(list(self.get_ts_dir().glob(f"{split}/*/*/*/*.npz"))) - return 
sorted(list(self.get_ts_dir().glob("*/*/*/*/*.npz"))) - - def list_sparse_files(self, split=None): - # List all sparse files - if split: - return sorted(list(self.get_sparse_dir().glob(f"{split}/*/*.npz"))) - return sorted(list(self.get_sparse_dir().glob("*/*/*.npz"))) - - def list_label_files(self, split=None): - # List all label files - if split: - return sorted(list(self.get_label_dir().glob(f"{split}/*.parquet"))) - return sorted(list(self.get_label_dir().glob("*/*.parquet"))) - - def get_cache_dir(self): - return self.cache_dir - - def get_model_files(self, window_sizes, aggs, split, shard_num: int): - # Given a shard number, returns the model files - model_files = [] - for window_size in window_sizes: - for agg in aggs: - if agg.startswith("static"): - continue - else: - model_files.append(self.get_task_specific_path(split, shard_num, window_size, agg)) +def get_meds_dir(cfg): + return cfg.meds_dir / "final_cohort" + + +def get_static_dir(cfg): + return cfg.tabularize_dir / "static" + + +def get_ts_dir(cfg): + return cfg.tabularize_dir / "ts" + + +def get_sparse_dir(cfg): + return cfg.tabularize_dir / "sparse" + + +def get_label_dir(cfg): + return Path(cfg.task_dir) + + +def get_feature_columns_fp(cfg): + return cfg.tabularize_dir / "feature_columns.json" + + +def get_feature_freqs_fp(cfg): + return cfg.tabularize_dir / "feature_freqs.json" + + +def get_config_path(cfg): + return cfg.tabularize_dir / "config.yaml" + + +def get_meds_shard(cfg, split: str, shard_num: int): + # Given a shard number, return the MEDS format data + return get_meds_dir(cfg) / split / f"{shard_num}.parquet" + + +def get_flat_static_rep(cfg, split: str, shard_num: int, agg: str): + # Given a shard number, returns the static representation path + agg_name = agg.split("/")[-1] + return cfg.get_static_dir() / split / f"{shard_num}" / f"{agg_name}.npz" + + +def get_flat_ts_rep(cfg, split: str, shard_num: int, window_size: int, agg: str): + # Given a shard number, returns the time series representation path + return cfg.get_ts_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + + +def get_flat_sparse_rep(cfg, split: str, shard_num: int, window_size: int, agg: str): + # Given a shard number, returns the sparse representation path + return cfg.get_sparse_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + + +def get_label(cfg, split: str, shard_num: int): + # Given a shard number, returns the label path + return cfg.get_label_dir() / split / f"{shard_num}.parquet" + + +def list_meds_files(meds_dir: Path, split=None): + # List all MEDS files + if split: + return sorted(list(meds_dir.glob(f"{split}/*.parquet"))) + return sorted(list(meds_dir.glob("**/*.parquet"))) + + +def list_subdir_parquets(dir: [Path | str]): + return sorted(list(Path(dir).glob("**/*.parquet"))) + + +def list_static_files(cfg, split=None): + # List all static files + if split: + return sorted(list(cfg.get_static_dir().glob(f"{split}/*/*.npz"))) + return sorted(list(cfg.get_static_dir().glob("*/*/*.npz"))) + + +def list_ts_files(cfg, split=None): + # List all time series files + if split: + return sorted(list(cfg.get_ts_dir().glob(f"{split}/*/*/*/*.npz"))) + return sorted(list(cfg.get_ts_dir().glob("*/*/*/*/*.npz"))) + + +def list_sparse_files(cfg, split=None): + # List all sparse files + if split: + return sorted(list(cfg.get_sparse_dir().glob(f"{split}/*/*.npz"))) + return sorted(list(cfg.get_sparse_dir().glob("*/*/*.npz"))) + + +def list_label_files(cfg, split=None): + # List all label files + if split: + return 
sorted(list(cfg.get_label_dir().glob(f"{split}/*.parquet"))) + return sorted(list(cfg.get_label_dir().glob("*/*.parquet"))) + + +def get_cache_dir(cfg): + return cfg.cache_dir + + +def get_model_files(cfg, window_sizes, aggs, split, shard_num: int): + # Given a shard number, returns the model files + model_files = [] + for window_size in window_sizes: for agg in aggs: if agg.startswith("static"): - window_size = None - model_files.append(self.get_task_specific_path(split, shard_num, window_size, agg)) - return sorted(model_files) - - def parse_ts_file_path(self, data_fp): - agg = f"{data_fp.parent.stem}/{data_fp.stem}" - if agg not in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: - raise ValueError(f"Invalid aggregation: {agg}") - window_size = data_fp.parts[-3] - shard_num = data_fp.parts[-4] - split = data_fp.parts[-5] - return split, shard_num, window_size, agg - - def parse_static_file_path(self, data_fp): - # parse as static agg - agg = f"{data_fp.parent.parent.parent.stem}/{data_fp.stem}" - if agg not in [STATIC_VALUE_AGGREGATION, STATIC_CODE_AGGREGATION]: - raise ValueError(f"Invalid aggregation: {agg}") - shard_num = data_fp.parent.stem - split = data_fp.parts[-3] - return split, shard_num, agg - - def get_task_specific_path(self, split, shard_num, window_size, agg): - if window_size: - return self.get_label_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" - else: - return self.get_label_dir() / split / f"{shard_num}" / f"{agg}.npz" + continue + else: + model_files.append(cfg.get_task_specific_path(split, shard_num, window_size, agg)) + for agg in aggs: + if agg.startswith("static"): + window_size = None + model_files.append(cfg.get_task_specific_path(split, shard_num, window_size, agg)) + return sorted(model_files) + + +def parse_ts_file_path(cfg, data_fp): + agg = f"{data_fp.parent.stem}/{data_fp.stem}" + if agg not in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: + raise ValueError(f"Invalid aggregation: {agg}") + window_size = data_fp.parts[-3] + shard_num = data_fp.parts[-4] + split = data_fp.parts[-5] + return split, shard_num, window_size, agg + + +def parse_static_file_path(cfg, data_fp): + # parse as static agg + agg = f"{data_fp.parent.parent.parent.stem}/{data_fp.stem}" + if agg not in [STATIC_VALUE_AGGREGATION, STATIC_CODE_AGGREGATION]: + raise ValueError(f"Invalid aggregation: {agg}") + shard_num = data_fp.parent.stem + split = data_fp.parts[-3] + return split, shard_num, agg + + +def get_task_specific_path(cfg, split, shard_num, window_size, agg): + if window_size: + return cfg.get_label_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" + else: + return cfg.get_label_dir() / split / f"{shard_num}" / f"{agg}.npz" diff --git a/src/MEDS_tabular_automl/scripts/__init__.py b/src/MEDS_tabular_automl/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/task_specific_caching.py b/src/MEDS_tabular_automl/scripts/cache_task.py similarity index 88% rename from scripts/task_specific_caching.py rename to src/MEDS_tabular_automl/scripts/cache_task.py index a34683c..f597344 100644 --- a/scripts/task_specific_caching.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -1,15 +1,12 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" -import json - import hydra import numpy as np import polars as pl import scipy.sparse as sp from omegaconf import DictConfig -from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.mapper import wrap as rwlock_wrap 
from MEDS_tabular_automl.utils import ( CODE_AGGREGATIONS, @@ -30,7 +27,7 @@ ] -def generate_row_cached_matrix(matrix, label_df, feature_columns): +def generate_row_cached_matrix(matrix, label_df): """Generates row-cached matrix for a given matrix and label_df.""" label_len = label_df.select(pl.len()).collect().item() if not matrix.shape[0] == label_len: @@ -44,16 +41,15 @@ def generate_row_cached_matrix(matrix, label_df, feature_columns): @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def task_specific_cache( +def main( cfg: DictConfig, ): """Performs row splicing of tabularized data for a specific task.""" iter_wrapper = load_tqdm(cfg.tqdm) - if not cfg.test: + if not cfg.loguru_init: hydra_loguru_init() - f_name_resolver = FileNameResolver(cfg) + f_name_resolver = cfg # Produce ts representation - feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) # shuffle tasks tabularization_tasks = f_name_resolver.list_static_files() + f_name_resolver.list_ts_files() @@ -77,7 +73,7 @@ def read_fn(fps): def compute_fn(shard_dfs): matrix, label_df = shard_dfs - cache_matrix = generate_row_cached_matrix(matrix, label_df, feature_columns) + cache_matrix = generate_row_cached_matrix(matrix, label_df) return cache_matrix def write_fn(cache_matrix, out_fp): @@ -96,4 +92,4 @@ def write_fn(cache_matrix, out_fp): if __name__ == "__main__": - task_specific_cache() + main() diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py new file mode 100644 index 0000000..4038b06 --- /dev/null +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +"""This Python script, stores the configuration parameters and feature columns used in the output.""" +from collections import defaultdict +from pathlib import Path + +import hydra +import numpy as np +import polars as pl +from loguru import logger +from omegaconf import DictConfig + +from MEDS_tabular_automl.describe_codes import ( + compute_feature_frequencies, + convert_to_df, + convert_to_freq_dict, +) +from MEDS_tabular_automl.file_name import list_subdir_parquets +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap +from MEDS_tabular_automl.utils import load_tqdm, store_config_yaml, write_df + + +@hydra.main(version_base=None, config_path="../configs", config_name="describe_codes") +def main( + cfg: DictConfig, +): + """Stores the configuration parameters and feature columns tabularized data we will be generated for. + + Args: + cfg: The configuration object for the tabularization process. + """ + iter_wrapper = load_tqdm(cfg.tqdm) + + # Store Config + output_dir = Path(cfg.output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + store_config_yaml(output_dir / "config.yaml", cfg) + + # Create output dir + input_dir = Path(cfg.input_dir) + input_dir.mkdir(exist_ok=True, parents=True) + + # 0. 
Identify Output Columns and Frequencies + logger.info("Iterating through shards and caching feature frequencies.") + + def compute_fn(shard_df): + return compute_feature_frequencies(cfg, shard_df) + + def write_fn(df, out_fp): + write_df(df, out_fp) + + def read_fn(in_fp): + return pl.scan_parquet(in_fp) + + # Map: Iterates through shards and caches feature frequencies + train_shards = list_subdir_parquets(cfg.input_dir) + np.random.shuffle(train_shards) + for shard_fp in iter_wrapper(train_shards): + out_fp = Path(cfg.cache_dir) / shard_fp.name + rwlock_wrap( + shard_fp, + out_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + + logger.info("Summing frequency computations.") + # Reduce: sum the frequency computations + + def compute_fn(freq_df_list): + feature_freqs = defaultdict(int) + for shard_freq_df in freq_df_list: + shard_freq_dict = convert_to_freq_dict(shard_freq_df) + for feature, freq in shard_freq_dict.items(): + feature_freqs[feature] += freq + feature_df = convert_to_df(feature_freqs) + return feature_df + + def write_fn(df, out_fp): + write_df(df, out_fp) + + def read_fn(feature_dir): + files = list_subdir_parquets(feature_dir) + return [pl.scan_parquet(fp) for fp in files] + + rwlock_wrap( + Path(cfg.cache_dir), + Path(cfg.output_filepath), + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + logger.info("Stored feature columns and frequencies.") + + +if __name__ == "__main__": + main() diff --git a/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py similarity index 98% rename from scripts/launch_xgboost.py rename to src/MEDS_tabular_automl/scripts/launch_xgboost.py index d903cfd..4f445e9 100644 --- a/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -13,7 +13,6 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score -from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.utils import get_feature_indices @@ -52,7 +51,7 @@ def __init__(self, cfg: DictConfig, split: str = "train"): or "held_out". This determines which subset of the data is loaded and processed. """ self.cfg = cfg - self.file_name_resolver = FileNameResolver(cfg) + self.file_name_resolver = cfg self.split = split self._data_shards = sorted([shard.stem for shard in self.file_name_resolver.list_label_files(split)]) @@ -393,7 +392,7 @@ def evaluate(self) -> float: @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def launch_xgboost(cfg: DictConfig) -> float: +def main(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. 
Args: @@ -425,4 +424,4 @@ def launch_xgboost(cfg: DictConfig) -> float: if __name__ == "__main__": - launch_xgboost() + main() diff --git a/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py similarity index 97% rename from scripts/tabularize_static.py rename to src/MEDS_tabular_automl/scripts/tabularize_static.py index 4a6ceb6..70bc4ef 100644 --- a/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -10,7 +10,6 @@ import polars as pl from omegaconf import DictConfig, OmegaConf -from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import ( @@ -64,7 +63,7 @@ def store_config_yaml(config_fp: Path, cfg: DictConfig): @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def tabularize_static_data( +def main( cfg: DictConfig, ): """Writes a flat (historically summarized) representation of the dataset to disk. @@ -107,9 +106,9 @@ def tabularize_static_data( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ iter_wrapper = load_tqdm(cfg.tqdm) - if not cfg.test: + if not cfg.loguru_init: hydra_loguru_init() - f_name_resolver = FileNameResolver(cfg) + f_name_resolver = cfg # Produce ts representation meds_shard_fps = f_name_resolver.list_meds_files() # f_name_resolver.get_meds_dir() @@ -150,4 +149,4 @@ def write_fn(data, out_df): if __name__ == "__main__": - tabularize_static_data() + main() diff --git a/scripts/summarize_over_windows.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py similarity index 95% rename from scripts/summarize_over_windows.py rename to src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 049c0d6..e13679d 100644 --- a/scripts/summarize_over_windows.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -10,7 +10,6 @@ from loguru import logger from omegaconf import DictConfig -from MEDS_tabular_automl.file_name import FileNameResolver from MEDS_tabular_automl.generate_summarized_reps import generate_summary from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap @@ -24,7 +23,7 @@ @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def summarize_ts_data_over_windows( +def main( cfg: DictConfig, ): """Processes time-series data by summarizing it across different windows, creating a flat, summarized @@ -54,9 +53,9 @@ def summarize_ts_data_over_windows( ValueError: If required columns like 'code' or 'value' are missing in the data files. 
""" iter_wrapper = load_tqdm(cfg.tqdm) - if not cfg.test: + if not cfg.loguru_init: hydra_loguru_init() - f_name_resolver = FileNameResolver(cfg) + f_name_resolver = cfg # Produce ts representation meds_shard_fps = f_name_resolver.list_meds_files() feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) @@ -109,4 +108,4 @@ def write_fn(out_matrix, out_fp): if __name__ == "__main__": - summarize_ts_data_over_windows() + main() diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 61c1950..5de40c6 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -14,7 +14,7 @@ import polars as pl import polars.selectors as cs from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from scipy.sparse import coo_array DF_T = pl.LazyFrame @@ -248,60 +248,6 @@ def get_ts_feature_cols(shard_df: DF_T) -> list[str]: return sorted(feature_columns) -def compute_feature_frequencies(cfg: DictConfig, shard_df: DF_T) -> list[str]: - """Generates a list of feature column names from the data within each shard based on specified - configurations. - - Parameters: - - cfg (DictConfig): Configuration dictionary specifying how features should be evaluated and aggregated. - - split_to_shard_df (dict): A dictionary of DataFrames, divided by data split (e.g., 'train', 'test'). - - Returns: - - tuple[list[str], dict]: A tuple containing a list of feature columns and a dictionary of code properties - identified during the evaluation. - - This function evaluates the properties of codes within training data and applies configured - aggregations to generate a comprehensive list of feature columns for modeling purposes. - Examples: - >>> import polars as pl - >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], - ... 'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], - ... 
'numerical_value': [1, None, 2, 2, None, None, 3]} - >>> df = pl.DataFrame(data).lazy() - >>> aggs = ['value/sum', 'code/count'] - >>> get_ts_feature_cols(aggs, df) - ['A/code', 'A/value', 'C/code', 'C/value'] - """ - static_df = shard_df.filter( - pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_null() - ) - static_code_freqs_df = static_df.group_by("code").agg(pl.count("code").alias("count")).collect() - static_code_freqs = { - row["code"] + "/static/present": row["count"] for row in static_code_freqs_df.iter_rows(named=True) - } - - static_value_df = static_df.filter(pl.col("numerical_value").is_not_null()) - static_value_freqs_df = ( - static_value_df.group_by("code").agg(pl.count("numerical_value").alias("count")).collect() - ) - static_value_freqs = { - row["code"] + "/static/first": row["count"] for row in static_value_freqs_df.iter_rows(named=True) - } - - ts_df = shard_df.filter( - pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_not_null() - ) - code_freqs_df = ts_df.group_by("code").agg(pl.count("code").alias("count")).collect() - code_freqs = {row["code"] + "/code": row["count"] for row in code_freqs_df.iter_rows(named=True)} - - value_df = ts_df.filter(pl.col("numerical_value").is_not_null()) - value_freqs_df = value_df.group_by("code").agg(pl.count("numerical_value").alias("count")).collect() - value_freqs = {row["code"] + "/value": row["count"] for row in value_freqs_df.iter_rows(named=True)} - - combined_freqs = {**static_code_freqs, **static_value_freqs, **code_freqs, **value_freqs} - return combined_freqs - - def get_prediction_ts_cols( aggregations: list[str], ts_feature_cols: DF_T, window_sizes: list[str] | None = None ) -> list[str]: @@ -430,3 +376,48 @@ def get_feature_indices(agg, feature_columns) -> str: feature_to_index = {c: i for i, c in enumerate(feature_columns)} agg_features = get_feature_names(agg, feature_columns) return [feature_to_index[c] for c in agg_features] + + +def store_config_yaml(config_fp: Path, cfg: DictConfig): + """Stores configuration parameters into a JSON file. + + This function writes a dictionary of parameters, which includes patient partitioning + information and configuration details, to a specified JSON file. + + Args: + - config_fp (Path): The file path for the JSON file where config should be stored. + - cfg (DictConfig): A configuration object containing settings like the number of patients + per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. + + Behavior: + - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a + FileExistsError is raised to prevent unintentional data loss. + + Raises: + - ValueError: If there are discrepancies between old and new parameters during an update. + - FileExistsError: If the file exists and overwriting is not allowed. + + Example: + >>> cfg = DictConfig({ + ... "n_patients_per_sub_shard": 100, + ... "min_code_inclusion_frequency": 5, + ... "do_overwrite": True, + ... }) + >>> import tempfile + >>> from pathlib import Path + >>> with tempfile.NamedTemporaryFile() as temp_f: + ... config_fp = Path(temp_f.name) + ... store_config_yaml(config_fp, cfg) + ... assert config_fp.exists() + ... store_config_yaml(config_fp, cfg) + ... cfg.do_overwrite = False + ... try: + ... store_config_yaml(config_fp, cfg) + ... except FileExistsError as e: + ... 
print("FileExistsError Error Triggered") + FileExistsError Error Triggered + """ + if config_fp.exists(): + if not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") + OmegaConf.save(cfg, config_fp) diff --git a/tests/cli_test.py b/tests/cli_bk.py similarity index 100% rename from tests/cli_test.py rename to tests/cli_bk.py index b7b9e0d..a0d437e 100644 --- a/tests/cli_test.py +++ b/tests/cli_bk.py @@ -11,6 +11,8 @@ import polars as pl from loguru import logger from omegaconf import DictConfig +from scripts.identify_columns import store_columns +from scripts.tabularize_static import tabularize_static_data from test_tabularize import ( CODE_COLS, EXPECTED_STATIC_FILES, @@ -29,8 +31,6 @@ get_feature_names, load_matrix, ) -from scripts.identify_columns import store_columns -from scripts.tabularize_static import tabularize_static_data def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 93e26f7..0707f8b 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -11,18 +11,10 @@ from hydra import compose, initialize from loguru import logger -from MEDS_tabular_automl.file_name import FileNameResolver -from MEDS_tabular_automl.utils import ( - VALUE_AGGREGATIONS, - get_events_df, - get_feature_names, - load_matrix, -) -from scripts.identify_columns import store_columns -from scripts.launch_xgboost import launch_xgboost -from scripts.summarize_over_windows import summarize_ts_data_over_windows -from scripts.tabularize_static import tabularize_static_data -from scripts.task_specific_caching import task_specific_cache +from MEDS_tabular_automl.describe_codes import get_feature_columns +from MEDS_tabular_automl.file_name import list_subdir_parquets +from MEDS_tabular_automl.scripts import describe_codes +from MEDS_tabular_automl.utils import VALUE_AGGREGATIONS, get_feature_names SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -195,30 +187,21 @@ def test_tabularize(): with tempfile.TemporaryDirectory() as d: MEDS_cohort_dir = Path(d) / "processed" - tabularized_data_dir = Path(d) / "processed" / "tabularize" - tabularize_config_kwargs = { + describe_codes_config = { "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "tabularized_data_dir": str(tabularized_data_dir.resolve()), - "min_code_inclusion_frequency": 1, - "model_dir": str(Path(d) / "save_model"), - "window_sizes": ["30d", "365d", "full"], - "aggs": ["code/count", "value/sum", "static/present", "static/first"], - "codes": "null", - "n_patients_per_sub_shard": 2, - "do_overwrite": True, - "do_update": True, + "do_overwrite": False, "seed": 1, "hydra.verbose": True, "tqdm": False, - "test": True, + "loguru_init": True, } - with initialize(version_base=None, config_path="../configs/"): # path to config.yaml - overrides = [f"{k}={v}" for k, v in tabularize_config_kwargs.items()] - cfg = compose(config_name="tabularize", overrides=overrides) # config.yaml - - f_name_resolver = FileNameResolver(cfg) + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] + cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml # Create the directories (MEDS_cohort_dir / "final_cohort").mkdir(parents=True, exist_ok=True) @@ -233,91 +216,92 @@ def 
test_tabularize(): ) # Check the files are not empty - meds_files = f_name_resolver.list_meds_files() - assert len(meds_files) == 4, "MEDS Data Files Should be 4!" + meds_files = list_subdir_parquets(Path(cfg.input_dir)) + assert ( + len(list_subdir_parquets(Path(cfg.input_dir).parent)) == 4 + ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" - split_json = json.load(StringIO(SPLITS_JSON)) splits_fp = MEDS_cohort_dir / "splits.json" json.dump(split_json, splits_fp.open("w")) logger.info("caching flat representation of MEDS data") - store_columns(cfg) - assert (tabularized_data_dir / "config.yaml").is_file() - assert (tabularized_data_dir / "feature_columns.json").is_file() - assert (tabularized_data_dir / "feature_freqs.json").is_file() + describe_codes.main(cfg) - feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + assert (Path(cfg.output_dir) / "config.yaml").is_file() + assert Path(cfg.output_filepath).is_file() + + feature_columns = get_feature_columns(cfg.output_filepath) assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) for value_agg in VALUE_AGGREGATIONS: assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) - # Check Static File Generation - tabularize_static_data(cfg) - actual_files = [str(Path(*f.parts[-5:])) for f in f_name_resolver.list_static_files()] - assert set(actual_files) == set(EXPECTED_STATIC_FILES) - # Check the files are not empty - for f in f_name_resolver.list_static_files(): - static_matrix = load_matrix(f) - assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" - expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) - logger.info((static_matrix.shape[1], expected_num_cols)) - logger.info(f_name_resolver.list_static_files()) - assert static_matrix.shape[1] == expected_num_cols, ( - f"Static Data Tabular Dataframe Should have {expected_num_cols}" - f"Columns but has {static_matrix.shape[1]}!" - ) - static_first_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/first") - static_present_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/present") - assert ( - load_matrix(static_first_fp).shape[0] == load_matrix(static_present_fp).shape[0] - ), "static data first and present aggregations have different numbers of rows" + # # Check Static File Generation + # tabularize_static(cfg) + # actual_files = [str(Path(*f.parts[-5:])) for f in f_name_resolver.list_static_files()] + # assert set(actual_files) == set(EXPECTED_STATIC_FILES) + # # Check the files are not empty + # for f in f_name_resolver.list_static_files(): + # static_matrix = load_matrix(f) + # assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + # expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + # logger.info((static_matrix.shape[1], expected_num_cols)) + # logger.info(f_name_resolver.list_static_files()) + # assert static_matrix.shape[1] == expected_num_cols, ( + # f"Static Data Tabular Dataframe Should have {expected_num_cols}" + # f"Columns but has {static_matrix.shape[1]}!" 
+ # ) + # static_first_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/first") + # static_present_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/present") + # assert ( + # load_matrix(static_first_fp).shape[0] == load_matrix(static_present_fp).shape[0] + # ), "static data first and present aggregations have different numbers of rows" - summarize_ts_data_over_windows(cfg) - # confirm summary files exist: - output_files = f_name_resolver.list_ts_files() - f_name_resolver.list_ts_files() - actual_files = [str(Path(*f.parts[-5:])) for f in output_files] + # tabularize_time_series(cfg) + # # confirm summary files exist: + # output_files = f_name_resolver.list_ts_files() + # f_name_resolver.list_ts_files() + # actual_files = [str(Path(*f.parts[-5:])) for f in output_files] - assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) - for f in output_files: - sparse_array = load_matrix(f) - assert sparse_array.shape[0] > 0 - assert sparse_array.shape[1] > 0 - ts_code_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "code/count") - ts_value_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "value/sum") - assert ( - load_matrix(ts_code_fp).shape[0] == load_matrix(ts_value_fp).shape[0] - ), "time series code and value have different numbers of rows" - assert ( - load_matrix(static_first_fp).shape[0] == load_matrix(ts_value_fp).shape[0] - ), "static data and time series have different numbers of rows" + # assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) + # for f in output_files: + # sparse_array = load_matrix(f) + # assert sparse_array.shape[0] > 0 + # assert sparse_array.shape[1] > 0 + # ts_code_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "code/count") + # ts_value_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "value/sum") + # assert ( + # load_matrix(ts_code_fp).shape[0] == load_matrix(ts_value_fp).shape[0] + # ), "time series code and value have different numbers of rows" + # assert ( + # load_matrix(static_first_fp).shape[0] == load_matrix(ts_value_fp).shape[0] + # ), "static data and time series have different numbers of rows" - # Create fake labels - for f in f_name_resolver.list_meds_files(): - df = pl.read_parquet(f) - df = get_events_df(df, feature_columns) - pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) - df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) - df = df.select(pl.col(["patient_id", "timestamp", "label"])) - df = df.unique(subset=["patient_id", "timestamp"]) - df = df.with_row_index("event_id") + # # Create fake labels + # for f in f_name_resolver.list_meds_files(): + # df = pl.read_parquet(f) + # df = get_events_df(df, feature_columns) + # pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + # df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) + # df = df.select(pl.col(["patient_id", "timestamp", "label"])) + # df = df.unique(subset=["patient_id", "timestamp"]) + # df = df.with_row_index("event_id") - split = f.parent.stem - shard_num = f.stem - out_f = f_name_resolver.get_label(split, shard_num) - out_f.parent.mkdir(parents=True, exist_ok=True) - df.write_parquet(out_f) + # split = f.parent.stem + # shard_num = f.stem + # out_f = f_name_resolver.get_label(split, shard_num) + # out_f.parent.mkdir(parents=True, exist_ok=True) + # df.write_parquet(out_f) - task_specific_cache(cfg) + # cache_task(cfg) - xgboost_config_kwargs = { - "hydra.mode": "MULTIRUN", - } - xgboost_config_kwargs = {**tabularize_config_kwargs, 
**xgboost_config_kwargs} - launch_xgboost(cfg) - output_files = list(Path(cfg.model_dir).glob("*.json")) - assert len(output_files) == 1 - assert output_files[0] == Path(cfg.model_dir) / "model.json" + # xgboost_config_kwargs = { + # "hydra.mode": "MULTIRUN", + # } + # xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} + # launch_xgboost(cfg) + # output_files = list(Path(cfg.model_dir).glob("*.json")) + # assert len(output_files) == 1 + # assert output_files[0] == Path(cfg.model_dir) / "model.json" From e122d6603926ed7693c51c4d08ae10a85e0dd9f8 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 04:26:48 +0000 Subject: [PATCH 089/106] updated configs for the tabularization --- .../configs/tabularization/default.yaml | 2 +- src/MEDS_tabular_automl/describe_codes.py | 45 ++++- src/MEDS_tabular_automl/file_name.py | 4 +- .../scripts/describe_codes.py | 22 ++- .../scripts/launch_xgboost.py | 5 +- .../scripts/tabularize_static.py | 66 ++------ .../scripts/tabularize_time_series.py | 32 ++-- src/MEDS_tabular_automl/utils.py | 24 +++ tests/test_tabularize.py | 158 ++++++++++++------ 9 files changed, 228 insertions(+), 130 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index 9b67776..b7de04f 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -19,4 +19,4 @@ aggs: - "value/max" # Resolved inputs -_resolved_codes: ${filter_to_codes:${allowed_codes},${min_code_inclusion_frequency},${code_metadata_fp}} +_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.code_metadata_fp}} diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 50c4fa6..3046295 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -1,5 +1,9 @@ +from collections import defaultdict +from collections.abc import Mapping +from pathlib import Path + import polars as pl -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.utils import DF_T @@ -90,8 +94,45 @@ def convert_to_freq_dict(df: pl.LazyFrame) -> dict: def get_feature_columns(fp): - return sorted(list(convert_to_freq_dict(pl.scan_parquet(fp)))) + return sorted(list(convert_to_freq_dict(pl.scan_parquet(fp)).keys())) def get_feature_freqs(fp): return convert_to_freq_dict(pl.scan_parquet(fp)) + + +def filter_to_codes( + allowed_codes: list[str] | None, + min_code_inclusion_frequency: Mapping[str, int], + code_metadata_fp: Path, +): + """Returns allowed codes if they are specified, otherwise filters to codes based on inclusion + frequency.""" + if allowed_codes is None: + feature_freqs = get_feature_freqs(code_metadata_fp) + + def clear_code_aggregation_suffix(code): + if code.endswith("/code"): + return code[:-5] + elif code.endswith("/value"): + return code[:-6] + elif code.endswith("/static/present"): + return code[:-15] + elif code.endswith("/static/first"): + return code[:-13] + + code_freqs_agg = defaultdict(list) + + for code, freq in feature_freqs.items(): + code_freqs_agg[clear_code_aggregation_suffix(code)].append(freq) + code_freqs = {code: sum(freqs) for code, freqs in code_freqs_agg.items()} + return sorted([code for code, freq in code_freqs.items() if freq >= min_code_inclusion_frequency]) + else: + return 
allowed_codes + + +OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes) + + +def filter_parquet(fp, allowed_codes: list[str]): + return pl.scan_parquet(fp).filter(pl.col("code").is_in(allowed_codes)) diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 3ae6c07..9ddb580 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -74,8 +74,8 @@ def list_meds_files(meds_dir: Path, split=None): return sorted(list(meds_dir.glob("**/*.parquet"))) -def list_subdir_parquets(dir: [Path | str]): - return sorted(list(Path(dir).glob("**/*.parquet"))) +def list_subdir_files(dir: [Path | str], fmt: str): + return sorted(list(Path(dir).glob(f"**/*.{fmt}"))) def list_static_files(cfg, split=None): diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index 4038b06..fa40573 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -14,21 +14,29 @@ convert_to_df, convert_to_freq_dict, ) -from MEDS_tabular_automl.file_name import list_subdir_parquets +from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import load_tqdm, store_config_yaml, write_df +from MEDS_tabular_automl.utils import ( + get_shard_prefix, + hydra_loguru_init, + load_tqdm, + store_config_yaml, + write_df, +) @hydra.main(version_base=None, config_path="../configs", config_name="describe_codes") def main( cfg: DictConfig, ): - """Stores the configuration parameters and feature columns tabularized data we will be generated for. + """Computes the feature frequencies so we can filter out infrequent events. Args: cfg: The configuration object for the tabularization process. """ iter_wrapper = load_tqdm(cfg.tqdm) + if not cfg.loguru_init: + hydra_loguru_init() # Store Config output_dir = Path(cfg.output_dir) @@ -52,10 +60,12 @@ def read_fn(in_fp): return pl.scan_parquet(in_fp) # Map: Iterates through shards and caches feature frequencies - train_shards = list_subdir_parquets(cfg.input_dir) + train_shards = list_subdir_files(cfg.input_dir, "parquet") np.random.shuffle(train_shards) for shard_fp in iter_wrapper(train_shards): - out_fp = Path(cfg.cache_dir) / shard_fp.name + out_fp = (Path(cfg.cache_dir) / get_shard_prefix(cfg.input_dir, shard_fp)).with_suffix( + shard_fp.suffix + ) rwlock_wrap( shard_fp, out_fp, @@ -82,7 +92,7 @@ def write_fn(df, out_fp): write_df(df, out_fp) def read_fn(feature_dir): - files = list_subdir_parquets(feature_dir) + files = list_subdir_files(feature_dir, "parquet") return [pl.scan_parquet(fp) for fp in files] rwlock_wrap( diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 4f445e9..c2c4af7 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -13,7 +13,7 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score -from MEDS_tabular_automl.utils import get_feature_indices +from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init class Iterator(xgb.DataIter, TimeableMixin): @@ -401,6 +401,9 @@ def main(cfg: DictConfig) -> float: Returns: - float: Evaluation result. 
""" + if not cfg.loguru_init: + hydra_loguru_init() + model = XGBoostModel(cfg) model.train() diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 70bc4ef..c0be01d 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -1,20 +1,22 @@ #!/usr/bin/env python """Tabularizes static data in MEDS format into tabular representations.""" -import json from itertools import product from pathlib import Path import hydra import numpy as np import polars as pl -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig +from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns +from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import ( STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, + get_shard_prefix, hydra_loguru_init, load_tqdm, write_df, @@ -23,45 +25,6 @@ pl.enable_string_cache() -def store_config_yaml(config_fp: Path, cfg: DictConfig): - """Stores configuration parameters into a JSON file. - - This function writes a dictionary of parameters, which includes patient partitioning - information and configuration details, to a specified JSON file. - - Args: - - config_fp (Path): The file path for the JSON file where config should be stored. - - cfg (DictConfig): A configuration object containing settings like the number of patients - per sub-shard, minimum code inclusion frequency, and flags for updating or overwriting existing files. - - Behavior: - - If config_fp exists and cfg.do_overwrite is False (without do_update being True), a - FileExistsError is raised to prevent unintentional data loss. - - Raises: - - ValueError: If there are discrepancies between old and new parameters during an update. - - FileExistsError: If the file exists and neither updating nor overwriting is allowed. - - Example: - >>> cfg = DictConfig({ - ... "n_patients_per_sub_shard": 100, - ... "min_code_inclusion_frequency": 5, - ... "do_update": False, - ... "do_overwrite": True - ... }) - >>> import tempfile - >>> from pathlib import Path - >>> with tempfile.TemporaryDirectory() as d: - ... config_fp = Path(d) / "config.yaml" - ... store_config_yaml(config_fp, cfg) - ... 
assert config_fp.exists() - """ - if config_fp.exists(): - if not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") - OmegaConf.save(cfg, config_fp) - - @hydra.main(version_base=None, config_path="../configs", config_name="tabularize") def main( cfg: DictConfig, @@ -108,24 +71,25 @@ def main( iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() - f_name_resolver = cfg # Produce ts representation - meds_shard_fps = f_name_resolver.list_meds_files() - # f_name_resolver.get_meds_dir() - feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + meds_shard_fps = list_subdir_files(cfg.input_dir, "parquet") + feature_columns = get_feature_columns(cfg.input_code_metadata) # shuffle tasks - static_aggs = [agg for agg in cfg.aggs if agg in [STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION]] + aggs = cfg.tabularization.aggs + static_aggs = [agg for agg in aggs if agg in [STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION]] tabularization_tasks = list(product(meds_shard_fps, static_aggs)) np.random.shuffle(tabularization_tasks) for shard_fp, agg in iter_wrapper(tabularization_tasks): - static_fp = f_name_resolver.get_flat_static_rep(shard_fp.parent.stem, shard_fp.stem, agg) - if static_fp.exists() and not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {static_fp} exists!") + out_fp = ( + Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg + ).with_suffix(".npz") + if out_fp.exists() and not cfg.do_overwrite: + raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {out_fp} exists!") def read_fn(in_fp): - return pl.scan_parquet(in_fp) + return filter_parquet(in_fp, cfg.tabularization._resolved_codes) def compute_fn(shard_df): return get_flat_static_rep( @@ -139,7 +103,7 @@ def write_fn(data, out_df): rwlock_wrap( shard_fp, - static_fp, + out_fp, read_fn, write_fn, compute_fn, diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index e13679d..9d95ddb 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -1,21 +1,23 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" -import json from itertools import product +from pathlib import Path import hydra import numpy as np -import polars as pl from loguru import logger from omegaconf import DictConfig +from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns +from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.generate_summarized_reps import generate_summary from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import ( STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, + get_shard_prefix, hydra_loguru_init, load_tqdm, write_df, @@ -55,25 +57,27 @@ def main( iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() - f_name_resolver = cfg # Produce ts representation - meds_shard_fps = f_name_resolver.list_meds_files() - feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) + meds_shard_fps = list_subdir_files(cfg.input_dir, "parquet") + feature_columns = get_feature_columns(cfg.input_code_metadata) # shuffle tasks - aggs = [agg for agg in cfg.aggs if agg not in 
[STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION]] - tabularization_tasks = list(product(meds_shard_fps, cfg.window_sizes, aggs)) + aggs = [ + agg + for agg in cfg.tabularization.aggs + if agg not in [STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION] + ] + tabularization_tasks = list(product(meds_shard_fps, cfg.tabularization.window_sizes, aggs)) np.random.shuffle(tabularization_tasks) # iterate through them for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): - shard_num = shard_fp.stem - split = shard_fp.parent.stem - assert split in ["train", "held_out", "tuning"], f"Invalid split {split}" - ts_fp = f_name_resolver.get_flat_ts_rep(split, shard_num, window_size, agg) + out_fp = ( + Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg + ).with_suffix(".npz") - def read_fn(fp): - return pl.scan_parquet(fp) + def read_fn(in_fp): + return filter_parquet(in_fp, cfg.tabularization._resolved_codes) def compute_fn(shard_df): # Load Sparse DataFrame @@ -98,7 +102,7 @@ def write_fn(out_matrix, out_fp): rwlock_wrap( shard_fp, - ts_fp, + out_fp, read_fn, write_fn, compute_fn, diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 5de40c6..a77fba9 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -421,3 +421,27 @@ def store_config_yaml(config_fp: Path, cfg: DictConfig): if not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") OmegaConf.save(cfg, config_fp) + + +def get_shard_prefix(base_path: Path, fp: Path) -> str: + """Extracts the shard prefix from a file path by removing the raw_cohort_dir. + + Args: + base_path: The base path to remove. + fp: The file path to extract the shard prefix from. + + Returns: + The shard prefix (the file path relative to the base path with the suffix removed). 
+ + Examples: + >>> get_shard_prefix(Path("/a/b/c"), Path("/a/b/c/d.parquet")) + 'd' + >>> get_shard_prefix(Path("/a/b/c"), Path("/a/b/c/d/e.csv.gz")) + 'd/e' + """ + + relative_path = fp.relative_to(base_path) + relative_parent = relative_path.parent + file_name = relative_path.name.split(".")[0] + + return str(relative_parent / file_name) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 0707f8b..a639b9a 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -9,12 +9,22 @@ import polars as pl from hydra import compose, initialize -from loguru import logger from MEDS_tabular_automl.describe_codes import get_feature_columns -from MEDS_tabular_automl.file_name import list_subdir_parquets -from MEDS_tabular_automl.scripts import describe_codes -from MEDS_tabular_automl.utils import VALUE_AGGREGATIONS, get_feature_names +from MEDS_tabular_automl.file_name import list_subdir_files +from MEDS_tabular_automl.scripts import ( + describe_codes, + tabularize_static, + tabularize_time_series, +) +from MEDS_tabular_automl.utils import ( + VALUE_AGGREGATIONS, + get_events_df, + get_feature_names, + get_shard_prefix, + get_unique_time_events_df, + load_matrix, +) SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 @@ -119,14 +129,14 @@ STATIC_FIRST_COLS = ["HEIGHT/static/first"] EXPECTED_STATIC_FILES = [ - "tabularize/static/held_out/0/first.npz", - "tabularize/static/held_out/0/present.npz", - "tabularize/static/train/0/first.npz", - "tabularize/static/train/0/present.npz", - "tabularize/static/train/1/first.npz", - "tabularize/static/train/1/present.npz", - "tabularize/static/tuning/0/first.npz", - "tabularize/static/tuning/0/present.npz", + "held_out/0/none/static/first.npz", + "held_out/0/none/static/present.npz", + "train/0/none/static/first.npz", + "train/0/none/static/present.npz", + "train/1/none/static/first.npz", + "train/1/none/static/present.npz", + "tuning/0/none/static/first.npz", + "tuning/0/none/static/present.npz", ] SUMMARIZE_EXPECTED_FILES = [ @@ -216,16 +226,16 @@ def test_tabularize(): ) # Check the files are not empty - meds_files = list_subdir_parquets(Path(cfg.input_dir)) + meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_parquets(Path(cfg.input_dir).parent)) == 4 + len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" split_json = json.load(StringIO(SPLITS_JSON)) splits_fp = MEDS_cohort_dir / "splits.json" json.dump(split_json, splits_fp.open("w")) - logger.info("caching flat representation of MEDS data") + # Step 1: Describe Codes - compute code frequencies describe_codes.main(cfg) assert (Path(cfg.output_dir) / "config.yaml").is_file() @@ -238,46 +248,88 @@ def test_tabularize(): for value_agg in VALUE_AGGREGATIONS: assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) - # # Check Static File Generation - # tabularize_static(cfg) - # actual_files = [str(Path(*f.parts[-5:])) for f in f_name_resolver.list_static_files()] - # assert set(actual_files) == set(EXPECTED_STATIC_FILES) - # # Check the files are not empty - # for f in f_name_resolver.list_static_files(): - # static_matrix = load_matrix(f) - # assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" 
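For clarity on the new on-disk layout checked by EXPECTED_STATIC_FILES, here is a minimal sketch of how an output path is derived from a MEDS shard path. The directories are hypothetical; the helper mirrors the get_shard_prefix added above, and static aggregations land under a "none" window directory.

from pathlib import Path

def get_shard_prefix(base_path: Path, fp: Path) -> str:
    relative_path = fp.relative_to(base_path)
    return str(relative_path.parent / relative_path.name.split(".")[0])

# Hypothetical directories; only the layout matters here.
input_dir = Path("/data/final_cohort")
output_dir = Path("/data/tabularize")
shard_fp = input_dir / "train" / "0.parquet"

out_fp = (
    output_dir / get_shard_prefix(input_dir, shard_fp) / "none" / "static/present"
).with_suffix(".npz")
print(out_fp)  # /data/tabularize/train/0/none/static/present.npz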
- # expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) - # logger.info((static_matrix.shape[1], expected_num_cols)) - # logger.info(f_name_resolver.list_static_files()) - # assert static_matrix.shape[1] == expected_num_cols, ( - # f"Static Data Tabular Dataframe Should have {expected_num_cols}" - # f"Columns but has {static_matrix.shape[1]}!" - # ) - # static_first_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/first") - # static_present_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/present") - # assert ( - # load_matrix(static_first_fp).shape[0] == load_matrix(static_present_fp).shape[0] - # ), "static data first and present aggregations have different numbers of rows" + # Step 2: Tabularization + tabularize_static_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] + cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + num_allowed_codes = len(cfg.tabularization._resolved_codes) + num_codes = ( + pl.scan_parquet(list_subdir_files(Path(cfg.input_dir), "parquet")) + .select(pl.col("code")) + .collect() + .n_unique() + ) + assert num_allowed_codes == num_codes, f"Should have {num_codes} codes but has {num_allowed_codes}" + tabularize_static.main(cfg) + output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] + assert set(actual_files) == set(EXPECTED_STATIC_FILES) + # Check the files are not empty + for f in output_files: + static_matrix = load_matrix(f) + assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + assert static_matrix.shape[1] == expected_num_cols, ( + f"Static Data Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {static_matrix.shape[1]}!" + ) + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert static_matrix.shape[0] == expected_num_rows, ( + f"Static Data matrix Should have {expected_num_rows}" + f" rows but has {static_matrix.shape[0]}!" 
+ ) - # tabularize_time_series(cfg) - # # confirm summary files exist: - # output_files = f_name_resolver.list_ts_files() - # f_name_resolver.list_ts_files() - # actual_files = [str(Path(*f.parts[-5:])) for f in output_files] + tabularize_time_series.main(cfg) - # assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) - # for f in output_files: - # sparse_array = load_matrix(f) - # assert sparse_array.shape[0] > 0 - # assert sparse_array.shape[1] > 0 - # ts_code_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "code/count") - # ts_value_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "value/sum") - # assert ( - # load_matrix(ts_code_fp).shape[0] == load_matrix(ts_value_fp).shape[0] - # ), "time series code and value have different numbers of rows" - # assert ( - # load_matrix(static_first_fp).shape[0] == load_matrix(ts_value_fp).shape[0] - # ), "static data and time series have different numbers of rows" + # confirm summary files exist: + output_files = list_subdir_files(cfg.output_dir, "npz") + actual_files = [ + get_shard_prefix(Path(cfg.output_dir), each) + ".npz" + for each in output_files + if "none/static" not in str(each) + ] + assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) + for f in output_files: + ts_matrix = load_matrix(f) + assert ts_matrix.shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"{f.parent.stem}/{f.stem}", feature_columns)) + assert ts_matrix.shape[1] == expected_num_cols, ( + f"Time-Series Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {ts_matrix.shape[1]}!" + ) + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert ts_matrix.shape[0] == expected_num_rows, ( + f"Time-Series Data matrix Should have {expected_num_rows}" + f" rows but has {ts_matrix.shape[0]}!" 
+ ) # # Create fake labels # for f in f_name_resolver.list_meds_files(): From 43ac162a544d43889cdfe54e6f97e55ce8aa26f0 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 06:09:33 +0000 Subject: [PATCH 090/106] updated task caching to use the updated config --- src/MEDS_tabular_automl/scripts/cache_task.py | 17 +++--- tests/test_tabularize.py | 58 ++++++++++++++----- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index f597344..c596524 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -1,18 +1,22 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" +from pathlib import Path + import hydra import numpy as np import polars as pl import scipy.sparse as sp from omegaconf import DictConfig +from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from MEDS_tabular_automl.utils import ( CODE_AGGREGATIONS, STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, VALUE_AGGREGATIONS, + get_shard_prefix, hydra_loguru_init, load_matrix, load_tqdm, @@ -48,23 +52,18 @@ def main( iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() - f_name_resolver = cfg # Produce ts representation # shuffle tasks - tabularization_tasks = f_name_resolver.list_static_files() + f_name_resolver.list_ts_files() + tabularization_tasks = list_subdir_files(cfg.input_dir, "npz") np.random.shuffle(tabularization_tasks) # iterate through them for data_fp in iter_wrapper(tabularization_tasks): # parse as time series agg - try: - split, shard_num, agg = f_name_resolver.parse_static_file_path(data_fp) - window_size = None - except ValueError: - split, shard_num, window_size, agg = f_name_resolver.parse_ts_file_path(data_fp) - label_fp = f_name_resolver.get_label(split, shard_num) - out_fp = f_name_resolver.get_task_specific_path(split, shard_num, window_size, agg) + split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] + label_fp = Path(cfg.input_label_dir) / split / f"{shard_num}.parquet" + out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, data_fp)).with_suffix(".npz") assert label_fp.exists(), f"Output file {label_fp} does not exist." def read_fn(fps): diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index a639b9a..578a897 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -13,6 +13,7 @@ from MEDS_tabular_automl.describe_codes import get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.scripts import ( + cache_task, describe_codes, tabularize_static, tabularize_time_series, @@ -331,27 +332,54 @@ def test_tabularize(): f" rows but has {ts_matrix.shape[0]}!" 
) - # # Create fake labels - # for f in f_name_resolver.list_meds_files(): - # df = pl.read_parquet(f) - # df = get_events_df(df, feature_columns) - # pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) - # df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) - # df = df.select(pl.col(["patient_id", "timestamp", "label"])) - # df = df.unique(subset=["patient_id", "timestamp"]) - # df = df.with_row_index("event_id") + # Step 3: Cache Task data + cache_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in cache_config.items()] + cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + + # Create fake labels + for f in list_subdir_files(Path(cfg.MEDS_cohort_dir) / "final_cohort", "parquet"): + df = pl.scan_parquet(f) + df = get_unique_time_events_df(get_events_df(df, feature_columns)).collect() + pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) + df = df.select(pl.col(["patient_id", "timestamp", "label"])) + df = df.with_row_index("event_id") - # split = f.parent.stem - # shard_num = f.stem - # out_f = f_name_resolver.get_label(split, shard_num) - # out_f.parent.mkdir(parents=True, exist_ok=True) - # df.write_parquet(out_f) + split = f.parent.stem + shard_num = f.stem + out_f = Path(cfg.input_label_dir) / Path( + get_shard_prefix(Path(cfg.MEDS_cohort_dir) / "final_cohort", f) + ).with_suffix(".parquet") + out_f.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(out_f) - # cache_task(cfg) + cache_task.main(cfg) # xgboost_config_kwargs = { # "hydra.mode": "MULTIRUN", # } + + # with initialize( + # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + # ): # path to config.yaml + # overrides = [f"{k}={v}" for k, v in cache_config.items()] + # cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + # xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} # launch_xgboost(cfg) # output_files = list(Path(cfg.model_dir).glob("*.json")) From 6856ee8bacda09281e4d387ab19e10750769f981 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 10:27:09 +0000 Subject: [PATCH 091/106] added xgboost support for the updated configs --- .../configs/launch_xgboost.yaml | 9 +- src/MEDS_tabular_automl/describe_codes.py | 126 ++++++++++++---- src/MEDS_tabular_automl/file_name.py | 141 +----------------- .../scripts/launch_xgboost.py | 66 ++++---- tests/test_tabularize.py | 57 ++++--- 5 files changed, 178 insertions(+), 221 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml index 6eb7084..3767400 100644 --- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -6,9 +6,14 @@ defaults: - _self_ task_name: task +# min code frequency used for modeling, can potentially sweep over different values. 
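For clarity on the task-caching test above: it builds alternating pseudo-labels keyed by an event_id row index. A condensed, standalone sketch of that construction follows; the events are toy values, whereas the test derives them from the MEDS shards.

import polars as pl

df = pl.DataFrame({
    "patient_id": [1, 1, 2],
    "timestamp": ["2021-01-01", "2021-01-02", "2021-01-01"],
})
pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]])
df = (
    df.with_columns(pl.Series(name="label", values=pseudo_labels))
    .select(pl.col(["patient_id", "timestamp", "label"]))
    .with_row_index("event_id")
)
# event_id later aligns each label with a row of the cached sparse matrices.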
+modeling_min_code_freq: 10 -# Raw data -input_dir: ${MEDS_cohort_dir}/${task_name} +# Task cached data dir +input_dir: ${MEDS_cohort_dir}/${task_name}/task_cache +# Directory with task labels +input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels +# Feature Columns input_code_metadata: ${MEDS_cohort_dir}/code_metadata.parquet # Where to output the model and cached data output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 3046295..5e6162d 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -1,11 +1,10 @@ -from collections import defaultdict from collections.abc import Mapping from pathlib import Path import polars as pl from omegaconf import DictConfig, OmegaConf -from MEDS_tabular_automl.utils import DF_T +from MEDS_tabular_automl.utils import DF_T, get_feature_names def convert_to_df(freq_dict): @@ -27,14 +26,14 @@ def compute_feature_frequencies(cfg: DictConfig, shard_df: DF_T) -> list[str]: This function evaluates the properties of codes within training data and applies configured aggregations to generate a comprehensive list of feature columns for modeling purposes. Examples: - >>> import polars as pl - >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], - ... 'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], - ... 'numerical_value': [1, None, 2, 2, None, None, 3]} - >>> df = pl.DataFrame(data).lazy() - >>> aggs = ['value/sum', 'code/count'] - >>> get_ts_feature_cols(aggs, df) - ['A/code', 'A/value', 'C/code', 'C/value'] + # >>> import polars as pl + # >>> data = {'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], + # ... 'timestamp': [None, '2021-01-01', None, None, '2021-01-03', '2021-01-04', None], + # ... 'numerical_value': [1, None, 2, 2, None, None, 3]} + # >>> df = pl.DataFrame(data).lazy() + # >>> aggs = ['value/sum', 'code/count'] + # >>> compute_feature_frequencies(aggs, df) + # ['A/code', 'A/value', 'C/code', 'C/value'] """ static_df = shard_df.filter( pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("timestamp").is_null() @@ -80,13 +79,13 @@ def convert_to_freq_dict(df: pl.LazyFrame) -> dict: dictionaries of code frequencies. Example: - >>> import polars as pl - >>> df = pl.DataFrame({ - ... "code": [1, 2, 3, 4, 5], - ... "value": [10, 20, 30, 40, 50] - ... }) - >>> convert_to_freq_dict(df) - {'code': {1: 1, 2: 1, 3: 1, 4: 1, 5: 1}, 'value': {10: 1, 20: 1, 30: 1, 40: 1, 50: 1}} + # >>> import polars as pl + # >>> df = pl.DataFrame({ + # ... "code": [1, 2, 3, 4, 5], + # ... "value": [10, 20, 30, 40, 50] + # ... 
}) + # >>> convert_to_freq_dict(df) + # {'code': {1: 1, 2: 1, 3: 1, 4: 1, 5: 1}, 'value': {10: 1, 20: 1, 30: 1, 40: 1, 50: 1}} """ if not df.columns == ["code", "count"]: raise ValueError(f"DataFrame must have columns 'code' and 'count', but has columns {df.columns}!") @@ -111,21 +110,9 @@ def filter_to_codes( if allowed_codes is None: feature_freqs = get_feature_freqs(code_metadata_fp) - def clear_code_aggregation_suffix(code): - if code.endswith("/code"): - return code[:-5] - elif code.endswith("/value"): - return code[:-6] - elif code.endswith("/static/present"): - return code[:-15] - elif code.endswith("/static/first"): - return code[:-13] - - code_freqs_agg = defaultdict(list) - - for code, freq in feature_freqs.items(): - code_freqs_agg[clear_code_aggregation_suffix(code)].append(freq) - code_freqs = {code: sum(freqs) for code, freqs in code_freqs_agg.items()} + code_freqs = { + code: freq for code, freq in feature_freqs.items() if freq >= min_code_inclusion_frequency + } return sorted([code for code, freq in code_freqs.items() if freq >= min_code_inclusion_frequency]) else: return allowed_codes @@ -134,5 +121,78 @@ def clear_code_aggregation_suffix(code): OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes) +def clear_code_aggregation_suffix(code): + if code.endswith("/code"): + return code[:-5] + elif code.endswith("/value"): + return code[:-6] + elif code.endswith("/static/present"): + return code[:-15] + elif code.endswith("/static/first"): + return code[:-13] + + def filter_parquet(fp, allowed_codes: list[str]): - return pl.scan_parquet(fp).filter(pl.col("code").is_in(allowed_codes)) + """Loads Parquet with Polars and filters to allowed codes. + + Args: + fp: Path to the Meds cohort shard + allowed_codes: List of codes to filter to. + + Expect: + >>> from tempfile import NamedTemporaryFile + >>> fp = NamedTemporaryFile() + >>> pl.DataFrame({ + ... "code": ["A", "A", "A", "A", "D", "D", "E", "E"], + ... "timestamp": [None, None, "2021-01-01", "2021-01-01", None, None, "2021-01-03", "2021-01-04"], + ... "numerical_value": [1, None, 2, 2, None, 5, None, 3] + ... 
}).write_parquet(fp.name) + >>> filter_parquet(fp.name, ["A/code", "D/static/present", "E/code", "E/value"]).collect() + shape: (6, 3) + ┌──────┬────────────┬─────────────────┐ + │ code ┆ timestamp ┆ numerical_value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞══════╪════════════╪═════════════════╡ + │ A ┆ 2021-01-01 ┆ null │ + │ A ┆ 2021-01-01 ┆ null │ + │ D ┆ null ┆ null │ + │ D ┆ null ┆ null │ + │ E ┆ 2021-01-03 ┆ null │ + │ E ┆ 2021-01-04 ┆ 3 │ + └──────┴────────────┴─────────────────┘ + >>> fp.close() + """ + df = pl.scan_parquet(fp) + # Drop values that are rare + # Drop Rare Static Codes + static_present_feature_columns = [ + clear_code_aggregation_suffix(each) for each in get_feature_names("static/present", allowed_codes) + ] + static_first_feature_columns = [ + clear_code_aggregation_suffix(each) for each in get_feature_names("static/first", allowed_codes) + ] + code_feature_columns = [ + clear_code_aggregation_suffix(each) for each in get_feature_names("code/count", allowed_codes) + ] + value_feature_columns = [ + clear_code_aggregation_suffix(each) for each in get_feature_names("value/sum", allowed_codes) + ] + + is_static_code = pl.col("timestamp").is_null() + is_numeric_code = pl.col("numerical_value").is_not_null() + rare_static_code = is_static_code & ~pl.col("code").is_in(static_present_feature_columns) + rare_ts_code = ~is_static_code & ~pl.col("code").is_in(code_feature_columns) + rare_ts_value = ~is_static_code & ~pl.col("code").is_in(value_feature_columns) & is_numeric_code + rare_static_value = is_static_code & ~pl.col("code").is_in(static_first_feature_columns) & is_numeric_code + + # Remove rare numeric values by converting them to null + df = df.with_columns( + pl.when(rare_static_value | rare_ts_value) + .then(None) + .otherwise(pl.col("numerical_value")) + .alias("numerical_value") + ) + # Drop rows with rare codes + df = df.filter(~(rare_static_code | rare_ts_code)) + return df diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 9ddb580..898d11e 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -1,116 +1,18 @@ """Help functions for getting file names and paths for MEDS tabular automl tasks.""" from pathlib import Path -from MEDS_tabular_automl.utils import ( - CODE_AGGREGATIONS, - STATIC_CODE_AGGREGATION, - STATIC_VALUE_AGGREGATION, - VALUE_AGGREGATIONS, -) - - -def get_meds_dir(cfg): - return cfg.meds_dir / "final_cohort" - - -def get_static_dir(cfg): - return cfg.tabularize_dir / "static" - - -def get_ts_dir(cfg): - return cfg.tabularize_dir / "ts" - - -def get_sparse_dir(cfg): - return cfg.tabularize_dir / "sparse" - - -def get_label_dir(cfg): - return Path(cfg.task_dir) - - -def get_feature_columns_fp(cfg): - return cfg.tabularize_dir / "feature_columns.json" - - -def get_feature_freqs_fp(cfg): - return cfg.tabularize_dir / "feature_freqs.json" - - -def get_config_path(cfg): - return cfg.tabularize_dir / "config.yaml" - - -def get_meds_shard(cfg, split: str, shard_num: int): - # Given a shard number, return the MEDS format data - return get_meds_dir(cfg) / split / f"{shard_num}.parquet" - - -def get_flat_static_rep(cfg, split: str, shard_num: int, agg: str): - # Given a shard number, returns the static representation path - agg_name = agg.split("/")[-1] - return cfg.get_static_dir() / split / f"{shard_num}" / f"{agg_name}.npz" - - -def get_flat_ts_rep(cfg, split: str, shard_num: int, window_size: int, agg: str): - # Given a shard number, returns the time series 
representation path - return cfg.get_ts_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" - - -def get_flat_sparse_rep(cfg, split: str, shard_num: int, window_size: int, agg: str): - # Given a shard number, returns the sparse representation path - return cfg.get_sparse_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" - - -def get_label(cfg, split: str, shard_num: int): - # Given a shard number, returns the label path - return cfg.get_label_dir() / split / f"{shard_num}.parquet" - - -def list_meds_files(meds_dir: Path, split=None): - # List all MEDS files - if split: - return sorted(list(meds_dir.glob(f"{split}/*.parquet"))) - return sorted(list(meds_dir.glob("**/*.parquet"))) - def list_subdir_files(dir: [Path | str], fmt: str): return sorted(list(Path(dir).glob(f"**/*.{fmt}"))) -def list_static_files(cfg, split=None): - # List all static files - if split: - return sorted(list(cfg.get_static_dir().glob(f"{split}/*/*.npz"))) - return sorted(list(cfg.get_static_dir().glob("*/*/*.npz"))) - - -def list_ts_files(cfg, split=None): - # List all time series files - if split: - return sorted(list(cfg.get_ts_dir().glob(f"{split}/*/*/*/*.npz"))) - return sorted(list(cfg.get_ts_dir().glob("*/*/*/*/*.npz"))) - - -def list_sparse_files(cfg, split=None): - # List all sparse files - if split: - return sorted(list(cfg.get_sparse_dir().glob(f"{split}/*/*.npz"))) - return sorted(list(cfg.get_sparse_dir().glob("*/*/*.npz"))) - - -def list_label_files(cfg, split=None): - # List all label files - if split: - return sorted(list(cfg.get_label_dir().glob(f"{split}/*.parquet"))) - return sorted(list(cfg.get_label_dir().glob("*/*.parquet"))) - - -def get_cache_dir(cfg): - return cfg.cache_dir +def get_task_specific_path(cfg, split, shard_num, window_size, agg): + return Path(cfg.input_dir) / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" -def get_model_files(cfg, window_sizes, aggs, split, shard_num: int): +def get_model_files(cfg, split: str, shard_num: int): + window_sizes = cfg.tabularization.window_sizes + aggs = cfg.tabularization.aggs # Given a shard number, returns the model files model_files = [] for window_size in window_sizes: @@ -118,36 +20,9 @@ def get_model_files(cfg, window_sizes, aggs, split, shard_num: int): if agg.startswith("static"): continue else: - model_files.append(cfg.get_task_specific_path(split, shard_num, window_size, agg)) + model_files.append(get_task_specific_path(cfg, split, shard_num, window_size, agg)) for agg in aggs: if agg.startswith("static"): - window_size = None - model_files.append(cfg.get_task_specific_path(split, shard_num, window_size, agg)) + window_size = "none" + model_files.append(get_task_specific_path(cfg, split, shard_num, window_size, agg)) return sorted(model_files) - - -def parse_ts_file_path(cfg, data_fp): - agg = f"{data_fp.parent.stem}/{data_fp.stem}" - if agg not in CODE_AGGREGATIONS + VALUE_AGGREGATIONS: - raise ValueError(f"Invalid aggregation: {agg}") - window_size = data_fp.parts[-3] - shard_num = data_fp.parts[-4] - split = data_fp.parts[-5] - return split, shard_num, window_size, agg - - -def parse_static_file_path(cfg, data_fp): - # parse as static agg - agg = f"{data_fp.parent.parent.parent.stem}/{data_fp.stem}" - if agg not in [STATIC_VALUE_AGGREGATION, STATIC_CODE_AGGREGATION]: - raise ValueError(f"Invalid aggregation: {agg}") - shard_num = data_fp.parent.stem - split = data_fp.parts[-3] - return split, shard_num, agg - - -def get_task_specific_path(cfg, split, shard_num, window_size, agg): - if window_size: - 
return cfg.get_label_dir() / split / f"{shard_num}" / f"{window_size}" / f"{agg}.npz" - else: - return cfg.get_label_dir() / split / f"{shard_num}" / f"{agg}.npz" diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index c2c4af7..ca20103 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -1,5 +1,3 @@ -import json -import os from collections.abc import Callable, Mapping from pathlib import Path @@ -13,6 +11,8 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score +from MEDS_tabular_automl.describe_codes import get_feature_columns, get_feature_freqs +from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init @@ -51,15 +51,16 @@ def __init__(self, cfg: DictConfig, split: str = "train"): or "held_out". This determines which subset of the data is loaded and processed. """ self.cfg = cfg - self.file_name_resolver = cfg self.split = split - - self._data_shards = sorted([shard.stem for shard in self.file_name_resolver.list_label_files(split)]) + # Load shards for this split + self._data_shards = sorted( + [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] + ) self.valid_event_ids, self.labels = self.load_labels() self.codes_set, self.code_masks, self.num_features = self._get_code_set() self._it = 0 - super().__init__(cache_prefix=os.path.join(self.file_name_resolver.get_cache_dir())) + super().__init__(cache_prefix=Path(cfg.cache_dir)) @TimeableMixin.TimeAs def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]: @@ -76,7 +77,7 @@ def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, Dictionary of code masks for each aggregation. 
""" code_masks = {} - for agg in set(self.cfg.aggs): + for agg in set(self.cfg.tabularization.aggs): feature_ids = get_feature_indices(agg, feature_columns) code_mask = [True if idx in codes_set else False for idx in feature_ids] code_masks[agg] = code_mask @@ -110,7 +111,9 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: dictionary from shard number to list of labels for these valid event ids """ label_fps = { - shard: self.file_name_resolver.get_label(self.split, shard) for shard in self._data_shards + shard: (Path(self.cfg.input_label_dir) / self.split / shard).with_suffix(".parquet") + for shard in self._data_shards + for shard in self._data_shards } cached_labels, cached_event_ids = dict(), dict() for shard, label_fp in label_fps.items(): @@ -119,7 +122,7 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: # TODO: check this for Nan or any other case we need to worry about cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() - if self.cfg.iterator.binarize_task: + if self.cfg.model_params.iterator.binarize_task: cached_labels[shard] = cached_labels[shard].map_elements( lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 ) @@ -129,25 +132,30 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: @TimeableMixin.TimeAs def _get_code_set(self) -> tuple[set, Mapping[int, list], int]: """Get the set of codes to include in the data based on the configuration.""" - with open(self.file_name_resolver.get_feature_columns_fp()) as f: - feature_columns = json.load(f) + feature_columns = get_feature_columns(self.cfg.input_code_metadata) + feature_freqs = get_feature_freqs(self.cfg.input_code_metadata) + feature_columns = [ + col + for col in feature_columns + if feature_freqs[col] >= self.cfg.tabularization.min_code_inclusion_frequency + ] feature_dict = {col: i for i, col in enumerate(feature_columns)} - if self.cfg.codes is not None: - codes_set = {feature_dict[code] for code in set(self.cfg.codes) if code in feature_dict} + allowed_codes = self.cfg.tabularization.allowed_codes + if self.cfg.tabularization.allowed_codes is not None: + codes_set = {feature_dict[code] for code in set(allowed_codes) if code in feature_dict} - if self.cfg.min_code_inclusion_frequency is not None: - with open(self.file_name_resolver.get_feature_freqs_fp()) as f: - feature_freqs = json.load(f) + if self.cfg.modeling_min_code_freq is not None: + feature_freqs = get_feature_freqs(self.cfg.input_code_metadata) min_frequency_set = { - key for key, value in feature_freqs.items() if value >= self.cfg.min_code_inclusion_frequency + key for key, value in feature_freqs.items() if value >= self.cfg.modeling_min_code_freq } frequency_set = {feature_dict[code] for code in min_frequency_set if code in feature_dict} - if self.cfg.codes is not None and self.cfg.min_code_inclusion_frequency is not None: + if allowed_codes is not None and self.cfg.modeling_min_code_freq is not None: codes_set = codes_set.intersection(frequency_set) - elif self.cfg.codes is not None: + elif allowed_codes is not None: codes_set = codes_set - elif self.cfg.min_code_inclusion_frequency is not None: + elif self.cfg.modeling_min_code_freq is not None: codes_set = frequency_set else: codes_set = None # set(feature_columns) @@ -190,11 +198,8 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: Returns: - sp.csc_matrix: Filtered sparse matrix. 
""" - # TODO Nassim Fix this guy # get all window_size x aggreagation files using the file resolver - files = self.file_name_resolver.get_model_files( - self.cfg.window_sizes, self.cfg.aggs, self.split, self._data_shards[idx] - ) + files = get_model_files(self.cfg, self.split, self._data_shards[idx]) if not all(file.exists() for file in files): raise ValueError(f"Not all files exist for shard {self._data_shards[idx]}") @@ -315,7 +320,7 @@ def __init__(self, cfg: DictConfig): """ self.cfg = cfg - self.keep_data_in_memory = getattr(getattr(cfg, "iterator", {}), "keep_data_in_memory", True) + self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory self.itrain = None self.ituning = None @@ -331,10 +336,10 @@ def __init__(self, cfg: DictConfig): def _train(self): """Train the model.""" self.model = xgb.train( - OmegaConf.to_container(self.cfg.model), + OmegaConf.to_container(self.cfg.model_params.model), self.dtrain, - num_boost_round=self.cfg.num_boost_round, - early_stopping_rounds=self.cfg.early_stopping_rounds, + num_boost_round=self.cfg.model_params.num_boost_round, + early_stopping_rounds=self.cfg.model_params.early_stopping_rounds, # nthreads=self.cfg.nthreads, evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], ) @@ -409,7 +414,8 @@ def main(cfg: DictConfig) -> float: print( "Time Profiling for window sizes ", - f"{cfg.window_sizes} and min code frequency of {cfg.min_code_inclusion_frequency}:", + f"{cfg.tabularization.window_sizes} and min ", + "code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", ) print("Train Time: \n", model._profile_durations()) print("Train Iterator Time: \n", model.itrain._profile_durations()) @@ -417,7 +423,7 @@ def main(cfg: DictConfig) -> float: print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) # save model - save_dir = Path(cfg.model_dir) + save_dir = Path(cfg.output_dir) save_dir.mkdir(parents=True, exist_ok=True) model.model.save_model(save_dir / "model.json") diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 578a897..bd2929f 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -15,6 +15,7 @@ from MEDS_tabular_automl.scripts import ( cache_task, describe_codes, + launch_xgboost, tabularize_static, tabularize_time_series, ) @@ -267,14 +268,17 @@ def test_tabularize(): ): # path to config.yaml overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml - num_allowed_codes = len(cfg.tabularization._resolved_codes) - num_codes = ( - pl.scan_parquet(list_subdir_files(Path(cfg.input_dir), "parquet")) - .select(pl.col("code")) - .collect() - .n_unique() - ) - assert num_allowed_codes == num_codes, f"Should have {num_codes} codes but has {num_allowed_codes}" + allowed_codes = cfg.tabularization._resolved_codes + num_allowed_codes = len(allowed_codes) + # num_codes = ( + # pl.scan_parquet(list_subdir_files(Path(cfg.input_dir), "parquet")) + # .select(pl.col("code")) + # .collect() + # .n_unique() + # ) + assert num_allowed_codes == len( + feature_columns + ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" tabularize_static.main(cfg) output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] @@ -370,18 +374,25 @@ def test_tabularize(): cache_task.main(cfg) - # xgboost_config_kwargs = { - # "hydra.mode": "MULTIRUN", - # } - - # with initialize( - # 
version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - # ): # path to config.yaml - # overrides = [f"{k}={v}" for k, v in cache_config.items()] - # cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml - - # xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} - # launch_xgboost(cfg) - # output_files = list(Path(cfg.model_dir).glob("*.json")) - # assert len(output_files) == 1 - # assert output_files[0] == Path(cfg.model_dir) / "model.json" + xgboost_config_kwargs = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml + + launch_xgboost.main(cfg) + output_files = list(Path(cfg.output_dir).glob("*.json")) + assert len(output_files) == 1 + assert output_files[0] == Path(cfg.output_dir) / "model.json" From f426263623a89e87d20e9ef9f226323d01f6f80d Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 11:42:19 +0000 Subject: [PATCH 092/106] updated hydra to reference the correct config files, cli seems to be working --- src/MEDS_tabular_automl/configs/describe_codes.yaml | 2 ++ src/MEDS_tabular_automl/configs/launch_xgboost.yaml | 2 ++ src/MEDS_tabular_automl/configs/tabularization.yaml | 2 ++ src/MEDS_tabular_automl/configs/task_specific_caching.yaml | 2 ++ src/MEDS_tabular_automl/scripts/cache_task.py | 2 +- src/MEDS_tabular_automl/scripts/launch_xgboost.py | 2 +- src/MEDS_tabular_automl/scripts/tabularize_static.py | 2 +- src/MEDS_tabular_automl/scripts/tabularize_time_series.py | 5 ++++- 8 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index f0e56dc..9aad365 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -10,3 +10,5 @@ input_dir: ${MEDS_cohort_dir}/final_cohort/${split} cache_dir: ${MEDS_cohort_dir}/.cache output_dir: ${MEDS_cohort_dir} output_filepath: ${output_dir}/code_metadata.parquet + +name: describe_codes \ No newline at end of file diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml index 3767400..46ab52d 100644 --- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -50,3 +50,5 @@ hydra: window_sizes: _target_: hydra.utils.call(${hydra.utils.cross_product}, values=["1d", "7d", "30d", "365d", "full"], min_options=1) + +name: launch_xgboost \ No newline at end of file diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index dc2c48d..e734e56 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -8,3 +8,5 @@ defaults: input_code_metadata: ${MEDS_cohort_dir}/code_metadata.parquet input_dir: ${MEDS_cohort_dir}/final_cohort output_dir: ${MEDS_cohort_dir}/tabularize + +name: tabularization \ 
No newline at end of file diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 27135f8..776f6a5 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -10,3 +10,5 @@ input_dir: ${MEDS_cohort_dir}/tabularize input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels # Where to output the task specific tabularized data output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache + +name: task_specific_caching \ No newline at end of file diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index c596524..f7eb5b0 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -44,7 +44,7 @@ def generate_row_cached_matrix(matrix, label_df): return sp.coo_array(csr) -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +@hydra.main(version_base=None, config_path="../configs", config_name="task_specific_caching") def main( cfg: DictConfig, ): diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index ca20103..88ee7c7 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -396,7 +396,7 @@ def evaluate(self) -> float: return roc_auc_score(y_true, y_pred) -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +@hydra.main(version_base=None, config_path="../configs", config_name="launch_xgboost") def main(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index c0be01d..3703801 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -25,7 +25,7 @@ pl.enable_string_cache() -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") +@hydra.main(version_base=None, config_path="../configs", config_name="tabularization") def main( cfg: DictConfig, ): diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 9d95ddb..d13916e 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -8,6 +8,7 @@ import numpy as np from loguru import logger from omegaconf import DictConfig +import polars as pl from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files @@ -23,8 +24,10 @@ write_df, ) +pl.enable_string_cache() -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") + +@hydra.main(version_base=None, config_path="../configs", config_name="tabularization") def main( cfg: DictConfig, ): From 6d68a1e8f78cc4cb0191e1d40b037134ee974b05 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 11:42:57 +0000 Subject: [PATCH 093/106] updated e2e script for heart failure cohort using the cli --- hf_cohort/hf_cohort_cli.sh | 62 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 hf_cohort/hf_cohort_cli.sh diff --git a/hf_cohort/hf_cohort_cli.sh b/hf_cohort/hf_cohort_cli.sh new file mode 100644 index 
0000000..ddb8c74 --- /dev/null +++ b/hf_cohort/hf_cohort_cli.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# bash hf_cohort/hf_cohort_e2e.sh hf_cohort 80 + +METHOD=meds + +MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed +OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize +ID=$1 +N_PARALLEL_WORKERS="$2" +WINDOW_SIZES="tabularization.window_sizes=[1d,7d,30d,365d,full]" +AGGS="tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" + +echo "Running identify_columns.py: Caching feature names and frequencies." +rm -rf $OUTPUT_DIR +meds-tab-describe MEDS_cohort_dir=$MEDS_DIR + +echo "Running tabularize_static.py: tabularizing static data" +meds_tab-tabularize-static \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularization.min_code_inclusion_frequency=10 "$WINDOW_SIZES" do_overwrite=False "$AGGS" + + +POLARS_MAX_THREADS=1 +LOG_DIR="logs/$METHOD/$ID-logs" +mkdir -p $LOG_DIR +{ time \ + mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ + meds_tab-tabularize-time-series \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularization.min_code_inclusion_frequency=10 do_overwrite=False \ + "$WINDOW_SIZES" "$AGGS" \ + 2> $LOG_DIR/cmd.stderr +} 2> $LOG_DIR/timings.txt + +cmd_exit_status=${PIPESTATUS[0]} +# Check the exit status of the second command in the pipeline (mprof run ...) +if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then + echo "build_dataset.sh failed with status $cmd_exit_status." + echo "Stderr from build_dataset.sh (see $LOG_DIR/cmd.stderr):" + tail $LOG_DIR/cmd.stderr + exit "$cmd_exit_status" +fi +mprof plot -o $LOG_DIR/mprofile.png $LOG_DIR/mprofile.dat +mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt + + +echo "Running task_specific_caching.py: tabularizing static data" +meds_tab-cache-task \ + MEDS_cohort_dir=$MEDS_DIR \ + tabularization.min_code_inclusion_frequency=10 "$WINDOW_SIZES" do_overwrite=False "$AGGS" + +echo "Running xgboost: tabularizing static data" +meds_tab-xgboost \ + MEDS_cohort_dir=$MEDS_DIR \ + modeling_min_code_freq=10 + tabularization.min_code_inclusion_frequency=10 "$WINDOW_SIZES" do_overwrite=False "$AGGS" + + + From 1cda7f5f4778891687d67e6ff671b0ba2453c90b Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 12:04:17 +0000 Subject: [PATCH 094/106] fixed polars string cache bug --- src/MEDS_tabular_automl/generate_summarized_reps.py | 1 + src/MEDS_tabular_automl/scripts/tabularize_time_series.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index da79cd5..a917714 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import polars as pl +pl.enable_string_cache() from loguru import logger from scipy.sparse import coo_array, csr_array, sparray diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index d13916e..5bb1f57 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -1,14 +1,17 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" +import polars as pl 
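For context on the pl.enable_string_cache() calls added here: Polars Categorical columns built in different frames can error or misbehave when combined unless the global string cache is active, which is why these scripts enable it at import time. A minimal, standalone illustration with toy data:

import polars as pl

pl.enable_string_cache()

a = pl.DataFrame({"code": ["A", "B"]}).with_columns(pl.col("code").cast(pl.Categorical))
b = pl.DataFrame({"code": ["B", "C"]}).with_columns(pl.col("code").cast(pl.Categorical))
print(pl.concat([a, b])["code"].to_list())  # ['A', 'B', 'B', 'C']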
+pl.enable_string_cache() + from itertools import product from pathlib import Path - import hydra import numpy as np from loguru import logger from omegaconf import DictConfig -import polars as pl + + from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files @@ -24,7 +27,6 @@ write_df, ) -pl.enable_string_cache() @hydra.main(version_base=None, config_path="../configs", config_name="tabularization") From 0111abacb9c934b8c9d3dfbe32e63a7604b12776 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 4 Jun 2024 23:57:46 +0000 Subject: [PATCH 095/106] added caching of filtered codes --- pyproject.toml | 2 +- .../configs/launch_xgboost.yaml | 2 -- .../configs/tabularization.yaml | 2 +- .../configs/tabularization/default.yaml | 4 +-- src/MEDS_tabular_automl/describe_codes.py | 19 ++++++----- .../scripts/launch_xgboost.py | 27 +++------------ .../scripts/tabularize_static.py | 33 +++++++++++++++++-- .../scripts/tabularize_time_series.py | 2 +- tests/test_tabularize.py | 17 ++++------ 9 files changed, 55 insertions(+), 53 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b21e4a..fdef251 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher"] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins"] [project.scripts] meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml index 46ab52d..306a158 100644 --- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -13,8 +13,6 @@ modeling_min_code_freq: 10 input_dir: ${MEDS_cohort_dir}/${task_name}/task_cache # Directory with task labels input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels -# Feature Columns -input_code_metadata: ${MEDS_cohort_dir}/code_metadata.parquet # Where to output the model and cached data output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} output_filepath: ${output_dir}/model_metadata.parquet diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index e734e56..222b7af 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -5,7 +5,7 @@ defaults: # Raw data # Where the code metadata is stored -input_code_metadata: ${MEDS_cohort_dir}/code_metadata.parquet +input_code_metadata_fp: ${MEDS_cohort_dir}/code_metadata.parquet input_dir: ${MEDS_cohort_dir}/final_cohort output_dir: ${MEDS_cohort_dir}/tabularize diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index b7de04f..d11dd62 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,7 +1,7 @@ # User inputs -code_metadata_fp: ${MEDS_cohort_dir}/code_metadata.parquet allowed_codes: null min_code_inclusion_frequency: 10 +filtered_code_metadata_fp: 
${MEDS_cohort_dir}/tabularized_code_metadata.parquet window_sizes: - "1d" - "7d" @@ -19,4 +19,4 @@ aggs: - "value/max" # Resolved inputs -_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.code_metadata_fp}} +_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.filtered_code_metadata_fp}} diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 5e6162d..a92730f 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -102,20 +102,21 @@ def get_feature_freqs(fp): def filter_to_codes( allowed_codes: list[str] | None, - min_code_inclusion_frequency: Mapping[str, int], + min_code_inclusion_frequency: int, code_metadata_fp: Path, ): """Returns allowed codes if they are specified, otherwise filters to codes based on inclusion frequency.""" if allowed_codes is None: - feature_freqs = get_feature_freqs(code_metadata_fp) - - code_freqs = { - code: freq for code, freq in feature_freqs.items() if freq >= min_code_inclusion_frequency - } - return sorted([code for code, freq in code_freqs.items() if freq >= min_code_inclusion_frequency]) - else: - return allowed_codes + allowed_codes = get_feature_columns(code_metadata_fp) + feature_freqs = get_feature_freqs(code_metadata_fp) + + code_freqs = { + code: freq for code, freq in feature_freqs.items() if ( + freq >= min_code_inclusion_frequency and code in allowed_codes + ) + } + return sorted([code for code, freq in code_freqs.items() if freq >= min_code_inclusion_frequency]) OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes) diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 88ee7c7..8474a99 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -132,35 +132,16 @@ def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: @TimeableMixin.TimeAs def _get_code_set(self) -> tuple[set, Mapping[int, list], int]: """Get the set of codes to include in the data based on the configuration.""" - feature_columns = get_feature_columns(self.cfg.input_code_metadata) - feature_freqs = get_feature_freqs(self.cfg.input_code_metadata) + feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) + feature_freqs = get_feature_freqs(self.cfg.tabularization.filtered_code_metadata_fp) feature_columns = [ col for col in feature_columns if feature_freqs[col] >= self.cfg.tabularization.min_code_inclusion_frequency ] feature_dict = {col: i for i, col in enumerate(feature_columns)} - allowed_codes = self.cfg.tabularization.allowed_codes - if self.cfg.tabularization.allowed_codes is not None: - codes_set = {feature_dict[code] for code in set(allowed_codes) if code in feature_dict} - - if self.cfg.modeling_min_code_freq is not None: - feature_freqs = get_feature_freqs(self.cfg.input_code_metadata) - min_frequency_set = { - key for key, value in feature_freqs.items() if value >= self.cfg.modeling_min_code_freq - } - frequency_set = {feature_dict[code] for code in min_frequency_set if code in feature_dict} - - if allowed_codes is not None and self.cfg.modeling_min_code_freq is not None: - codes_set = codes_set.intersection(frequency_set) - elif allowed_codes is not None: - codes_set = codes_set - elif self.cfg.modeling_min_code_freq is not 
None: - codes_set = frequency_set - else: - codes_set = None # set(feature_columns) - if codes_set == set(feature_columns): - codes_set = None + allowed_codes = set(self.cfg.tabularization._resolved_codes) + codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes} return ( codes_set, diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 3703801..d12eb3a 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -9,7 +9,7 @@ import polars as pl from omegaconf import DictConfig -from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns +from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns, get_feature_freqs, convert_to_df, filter_to_codes from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap @@ -71,9 +71,36 @@ def main( iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() - # Produce ts representation + + # Step 1: Cache the filtered features that will be used in the tabularization process and modeling + # import pdb; pdb.set_trace() + def read_fn(_): + return _ + def compute_fn(_): + filtered_feature_columns = filter_to_codes( + cfg.tabularization.allowed_codes, + cfg.tabularization.min_code_inclusion_frequency, + cfg.input_code_metadata_fp, + ) + feature_freqs = get_feature_freqs(cfg.input_code_metadata_fp) + filtered_feeature_freqs = {code: count for code, count in feature_freqs.items() if code in filtered_feature_columns} + return convert_to_df(filtered_feeature_freqs) + def write_fn(data, out_fp): + data.write_parquet(out_fp) + in_fp = Path(cfg.input_code_metadata_fp) + out_fp = Path(cfg.tabularization.filtered_code_metadata_fp) + rwlock_wrap( + in_fp, + out_fp, + read_fn, + write_fn, + compute_fn, + do_overwrite=cfg.do_overwrite, + do_return=False, + ) + # Step 2: Produce static data representation meds_shard_fps = list_subdir_files(cfg.input_dir, "parquet") - feature_columns = get_feature_columns(cfg.input_code_metadata) + feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) # shuffle tasks aggs = cfg.tabularization.aggs diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 5bb1f57..a4d51db 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -64,7 +64,7 @@ def main( hydra_loguru_init() # Produce ts representation meds_shard_fps = list_subdir_files(cfg.input_dir, "parquet") - feature_columns = get_feature_columns(cfg.input_code_metadata) + feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) # shuffle tasks aggs = [ diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index bd2929f..bdc99b3 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -268,17 +268,6 @@ def test_tabularize(): ): # path to config.yaml overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml - allowed_codes = cfg.tabularization._resolved_codes - num_allowed_codes = len(allowed_codes) - # num_codes = ( - # pl.scan_parquet(list_subdir_files(Path(cfg.input_dir), "parquet")) - # 
.select(pl.col("code")) - # .collect() - # .n_unique() - # ) - assert num_allowed_codes == len( - feature_columns - ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" tabularize_static.main(cfg) output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] @@ -304,6 +293,12 @@ def test_tabularize(): f"Static Data matrix Should have {expected_num_rows}" f" rows but has {static_matrix.shape[0]}!" ) + allowed_codes = cfg.tabularization._resolved_codes + num_allowed_codes = len(allowed_codes) + feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) + assert num_allowed_codes == len( + feature_columns + ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" tabularize_time_series.main(cfg) From 3e5ebae76735918d2d6436952b102228716f97d5 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 00:19:43 +0000 Subject: [PATCH 096/106] updated cli to take min code frequency --- hf_cohort/hf_cohort_cli.sh | 10 +++++----- src/MEDS_tabular_automl/describe_codes.py | 4 ++-- src/MEDS_tabular_automl/utils.py | 3 --- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/hf_cohort/hf_cohort_cli.sh b/hf_cohort/hf_cohort_cli.sh index ddb8c74..15453a4 100644 --- a/hf_cohort/hf_cohort_cli.sh +++ b/hf_cohort/hf_cohort_cli.sh @@ -9,6 +9,7 @@ ID=$1 N_PARALLEL_WORKERS="$2" WINDOW_SIZES="tabularization.window_sizes=[1d,7d,30d,365d,full]" AGGS="tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" +MIN_CODE_FREQ=10 echo "Running identify_columns.py: Caching feature names and frequencies." rm -rf $OUTPUT_DIR @@ -17,7 +18,7 @@ meds-tab-describe MEDS_cohort_dir=$MEDS_DIR echo "Running tabularize_static.py: tabularizing static data" meds_tab-tabularize-static \ MEDS_cohort_dir=$MEDS_DIR \ - tabularization.min_code_inclusion_frequency=10 "$WINDOW_SIZES" do_overwrite=False "$AGGS" + tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" POLARS_MAX_THREADS=1 @@ -30,7 +31,7 @@ mkdir -p $LOG_DIR worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ MEDS_cohort_dir=$MEDS_DIR \ - tabularization.min_code_inclusion_frequency=10 do_overwrite=False \ + tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" do_overwrite=False \ "$WINDOW_SIZES" "$AGGS" \ 2> $LOG_DIR/cmd.stderr } 2> $LOG_DIR/timings.txt @@ -50,13 +51,12 @@ mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt echo "Running task_specific_caching.py: tabularizing static data" meds_tab-cache-task \ MEDS_cohort_dir=$MEDS_DIR \ - tabularization.min_code_inclusion_frequency=10 "$WINDOW_SIZES" do_overwrite=False "$AGGS" + tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" echo "Running xgboost: tabularizing static data" meds_tab-xgboost \ MEDS_cohort_dir=$MEDS_DIR \ - modeling_min_code_freq=10 - tabularization.min_code_inclusion_frequency=10 "$WINDOW_SIZES" do_overwrite=False "$AGGS" + tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index a92730f..5008f2d 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -105,7 +105,7 @@ def filter_to_codes( min_code_inclusion_frequency: int, code_metadata_fp: 
Path, ): - """Returns allowed codes if they are specified, otherwise filters to codes based on inclusion + """Returns intersection of allowed codes if they are specified, and filters to codes based on inclusion frequency.""" if allowed_codes is None: allowed_codes = get_feature_columns(code_metadata_fp) @@ -113,7 +113,7 @@ def filter_to_codes( code_freqs = { code: freq for code, freq in feature_freqs.items() if ( - freq >= min_code_inclusion_frequency and code in allowed_codes + freq >= min_code_inclusion_frequency and code in set(allowed_codes) ) } return sorted([code for code, freq in code_freqs.items() if freq >= min_code_inclusion_frequency]) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index a77fba9..dc0ebed 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -417,9 +417,6 @@ def store_config_yaml(config_fp: Path, cfg: DictConfig): ... print("FileExistsError Error Triggered") FileExistsError Error Triggered """ - if config_fp.exists(): - if not cfg.do_overwrite: - raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {config_fp} exists!") OmegaConf.save(cfg, config_fp) From 9f6b0d36765d9ca03c9e032101197228edc278d8 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 01:57:17 +0000 Subject: [PATCH 097/106] [wip] debugging config path resolutions --- .../configs/launch_xgboost.yaml | 18 ++++++++---- src/MEDS_tabular_automl/scripts/cache_task.py | 8 +++++- .../scripts/describe_codes.py | 7 ++++- .../scripts/launch_xgboost.py | 25 ++++++++++++++++- .../scripts/tabularize_static.py | 28 +++++++++++++++---- .../scripts/tabularize_time_series.py | 10 +++++-- 6 files changed, 80 insertions(+), 16 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml index 306a158..954e56a 100644 --- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -45,8 +45,16 @@ hydra: # Define search space for Optuna params: - window_sizes: - _target_: hydra.utils.call(${hydra.utils.cross_product}, - values=["1d", "7d", "30d", "365d", "full"], min_options=1) - -name: launch_xgboost \ No newline at end of file + window_sizes: ${generate_permutations:${tabularization.window_sizes}} + aggs: ${generate_permutations:${tabularization.aggs}} + min_code_freq: tag(log,range(10,1_000_000)) + model_params: + model: + +num_boost_round: range(10, 1000) + +max_depth: range(2, 16) + +eta: tag(log,interval(.001,1)) + +lambda: tag(log,interval(.001,1)) + +alpha: tag(log,interval(.001,1)) + +subsample: interval(0.5,1) + +min_child_weight: uniform(1e-2, 100) +name: launch_xgboost diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index f7eb5b0..5f0aff3 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -1,6 +1,7 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" +from importlib.resources import files from pathlib import Path import hydra @@ -23,6 +24,11 @@ write_df, ) +config_yaml = files("MEDS_tabular_automl").joinpath("configs/task_specific_caching.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") + + VALID_AGGREGATIONS = [ *VALUE_AGGREGATIONS, *CODE_AGGREGATIONS, @@ -44,7 +50,7 @@ def generate_row_cached_matrix(matrix, label_df): return sp.coo_array(csr) 
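# A note on the packaged-config pattern introduced above: `files("MEDS_tabular_automl").joinpath(...)`
# resolves the YAML that ships inside the installed package and fails fast with a clear error when it
# is missing; the decorator below then receives the resolved parent directory as `config_path` and the
# file stem as `config_name`, replacing the source-tree-relative "../configs" path. The intent,
# presumably, is that the console-script entry points still find their configs when run from
# site-packages rather than from a repository checkout.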
-@hydra.main(version_base=None, config_path="../configs", config_name="task_specific_caching") +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main( cfg: DictConfig, ): diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index fa40573..034244a 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -1,6 +1,7 @@ #!/usr/bin/env python """This Python script, stores the configuration parameters and feature columns used in the output.""" from collections import defaultdict +from importlib.resources import files from pathlib import Path import hydra @@ -24,8 +25,12 @@ write_df, ) +config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") -@hydra.main(version_base=None, config_path="../configs", config_name="describe_codes") + +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main( cfg: DictConfig, ): diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 8474a99..b48fbe9 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -1,4 +1,6 @@ from collections.abc import Callable, Mapping +from importlib.resources import files +from itertools import combinations from pathlib import Path import hydra @@ -15,6 +17,25 @@ from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init +config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") + + +def generate_permutations(list_of_options): + """Generate all possible permutations of a list of options. + + Args: + - list_of_options (list): List of options. + + Returns: + - list: List of all possible permutations of length > 1 + """ + permutations = [] + for i in range(1, len(list_of_options) + 1): + permutations.extend(list(combinations(list_of_options, r=i))) + return permutations + class Iterator(xgb.DataIter, TimeableMixin): """Iterator class for loading and processing data shards. @@ -50,6 +71,8 @@ def __init__(self, cfg: DictConfig, split: str = "train"): split: The data split to use, which can be one of "train", "tuning", or "held_out". This determines which subset of the data is loaded and processed. """ + # generate_permutations(cfg.tabularization.window_sizes) + # generate_permutations(cfg.tabularization.aggs) self.cfg = cfg self.split = split # Load shards for this split @@ -377,7 +400,7 @@ def evaluate(self) -> float: return roc_auc_score(y_true, y_pred) -@hydra.main(version_base=None, config_path="../configs", config_name="launch_xgboost") +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Optimize the model based on the provided configuration. 
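Despite its name, the `generate_permutations` helper added above returns every non-empty subset of its input, in input order: it is built on `itertools.combinations`, and the loop starts at subsets of size 1 rather than the "length > 1" its docstring claims. These subsets are what the `${generate_permutations:...}` entries in the Optuna search space of `launch_xgboost.yaml` are expected to expand to, once the function is registered as an OmegaConf resolver under that name. A quick sketch of the behaviour, using window sizes from this repo's configs as the example input:

    from itertools import combinations

    def generate_permutations(list_of_options):
        # Collect all combinations of size 1..len(list_of_options); order within each tuple
        # follows the input list, so these are subsets, not true permutations.
        permutations = []
        for i in range(1, len(list_of_options) + 1):
            permutations.extend(list(combinations(list_of_options, r=i)))
        return permutations

    generate_permutations(["1d", "7d", "30d"])
    # [('1d',), ('7d',), ('30d',),
    #  ('1d', '7d'), ('1d', '30d'), ('7d', '30d'),
    #  ('1d', '7d', '30d')]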
diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index d12eb3a..b8637b6 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -7,9 +7,20 @@ import hydra import numpy as np import polars as pl + +pl.enable_string_cache() + +from importlib.resources import files + from omegaconf import DictConfig -from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns, get_feature_freqs, convert_to_df, filter_to_codes +from MEDS_tabular_automl.describe_codes import ( + convert_to_df, + filter_parquet, + filter_to_codes, + get_feature_columns, + get_feature_freqs, +) from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.generate_static_features import get_flat_static_rep from MEDS_tabular_automl.mapper import wrap as rwlock_wrap @@ -22,10 +33,12 @@ write_df, ) -pl.enable_string_cache() +config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") -@hydra.main(version_base=None, config_path="../configs", config_name="tabularization") +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main( cfg: DictConfig, ): @@ -71,11 +84,12 @@ def main( iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() - + # Step 1: Cache the filtered features that will be used in the tabularization process and modeling # import pdb; pdb.set_trace() def read_fn(_): return _ + def compute_fn(_): filtered_feature_columns = filter_to_codes( cfg.tabularization.allowed_codes, @@ -83,10 +97,14 @@ def compute_fn(_): cfg.input_code_metadata_fp, ) feature_freqs = get_feature_freqs(cfg.input_code_metadata_fp) - filtered_feeature_freqs = {code: count for code, count in feature_freqs.items() if code in filtered_feature_columns} + filtered_feeature_freqs = { + code: count for code, count in feature_freqs.items() if code in filtered_feature_columns + } return convert_to_df(filtered_feeature_freqs) + def write_fn(data, out_fp): data.write_parquet(out_fp) + in_fp = Path(cfg.input_code_metadata_fp) out_fp = Path(cfg.tabularization.filtered_code_metadata_fp) rwlock_wrap( diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index a4d51db..9772797 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -2,17 +2,18 @@ """Aggregates time-series data for feature columns across different window sizes.""" import polars as pl + pl.enable_string_cache() +from importlib.resources import files from itertools import product from pathlib import Path + import hydra import numpy as np from loguru import logger from omegaconf import DictConfig - - from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files from MEDS_tabular_automl.generate_summarized_reps import generate_summary @@ -27,9 +28,12 @@ write_df, ) +config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") -@hydra.main(version_base=None, config_path="../configs", config_name="tabularization") +@hydra.main(version_base=None, 
config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main( cfg: DictConfig, ): From bdbe1fc0e484a43512cc0f2094e398edc296fa80 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 4 Jun 2024 21:59:06 -0400 Subject: [PATCH 098/106] Added setuptools scm plugin --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fdef251..bcbb32f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,3 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - [project] name = "MEDS_tabularization" version = "0.0.1" @@ -32,6 +28,10 @@ dev = ["pre-commit"] tests = ["pytest", "pytest-cov", "rootutils"] profiling = ["mprofile", "matplotlib"] +[build-system] +requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"] +build-backend = "setuptools.build_meta" + [project.urls] Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML" Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues" From 3f5a23db4d8f0e9fd3de74cd80cb2f4aabef5842 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 02:16:55 +0000 Subject: [PATCH 099/106] added integration test that reproduces the Could not override 'tabularization.min_code_inclusion_frequency'. error --- pyproject.toml | 8 +- tests/cli_bk.py | 199 -------------------------- tests/test_integration.py | 289 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 293 insertions(+), 203 deletions(-) delete mode 100644 tests/cli_bk.py create mode 100644 tests/test_integration.py diff --git a/pyproject.toml b/pyproject.toml index bcbb32f..3e12854 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,10 +18,10 @@ dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", " [project.scripts] meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" -meds_tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" -meds_tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main" -meds_tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main" -meds_tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main" +meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" +meds-tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main" +meds-tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main" +meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main" [project.optional-dependencies] dev = ["pre-commit"] diff --git a/tests/cli_bk.py b/tests/cli_bk.py deleted file mode 100644 index a0d437e..0000000 --- a/tests/cli_bk.py +++ /dev/null @@ -1,199 +0,0 @@ -import rootutils - -root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) - -import json -import subprocess -import tempfile -from io import StringIO -from pathlib import Path - -import polars as pl -from loguru import logger -from omegaconf import DictConfig -from scripts.identify_columns import store_columns -from scripts.tabularize_static import tabularize_static_data -from test_tabularize import ( - CODE_COLS, - EXPECTED_STATIC_FILES, - MEDS_OUTPUTS, - SPLITS_JSON, - STATIC_FIRST_COLS, - STATIC_PRESENT_COLS, - SUMMARIZE_EXPECTED_FILES, - VALUE_COLS, -) - -from MEDS_tabular_automl.file_name import FileNameResolver -from MEDS_tabular_automl.utils import ( - VALUE_AGGREGATIONS, - get_events_df, - get_feature_names, - load_matrix, -) - - -def run_command(script: str, args: 
list[str], hydra_kwargs: dict[str, str], test_name: str): - command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] - command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) - stderr = command_out.stderr.decode() - stdout = command_out.stdout.decode() - if command_out.returncode != 0: - raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") - return stderr, stdout - - -def test_tabularize(): - # Step 0: Setup Environment - with tempfile.TemporaryDirectory() as d: - MEDS_cohort_dir = Path(d) / "processed" - tabularized_data_dir = Path(d) / "processed" / "tabularize" - # Create the directories - (MEDS_cohort_dir / "final_cohort").mkdir(parents=True, exist_ok=True) - - # Store MEDS outputs - for split, data in MEDS_OUTPUTS.items(): - file_path = MEDS_cohort_dir / "final_cohort" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) - df = pl.read_csv(StringIO(data)) - df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")).write_parquet( - file_path - ) - - tabularize_config_kwargs = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "tabularized_data_dir": str(tabularized_data_dir.resolve()), - "min_code_inclusion_frequency": 1, - "model_dir": str(Path(d) / "save_model"), - "window_sizes": "[30d,365d,full]", - "aggs": "[code/count,value/sum,static/present,static/first]", - "codes": "null", - "n_patients_per_sub_shard": 2, - "do_overwrite": True, - "do_update": True, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "test": True, - "task_dir": str((tabularized_data_dir / "task").resolve()), - } - cfg = DictConfig(tabularize_config_kwargs) - f_name_resolver = FileNameResolver(cfg) - meds_files = f_name_resolver.list_meds_files() - assert len(meds_files) == 4, "MEDS Data Files Should be 4!" - for f in meds_files: - assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
- - split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = MEDS_cohort_dir / "splits.json" - json.dump(split_json, splits_fp.open("w")) - logger.info("caching flat representation of MEDS data") - - # Step 1: Run the describe_codes script - stderr, stdout = run_command( - "meds_tab describe_codes", - [], - tabularize_config_kwargs, - "describe_codes", - ) - - store_columns(cfg) - assert (tabularized_data_dir / "config.yaml").is_file() - assert (tabularized_data_dir / "feature_columns.json").is_file() - assert (tabularized_data_dir / "feature_freqs.json").is_file() - - feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) - assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) - assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) - assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) - for value_agg in VALUE_AGGREGATIONS: - assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) - - # Step 2: Run the tabularization script - n_workers = "1" - stderr, stdout = run_command( - "meds_tab tabularization", - [n_workers], - tabularize_config_kwargs, - "tabularization", - ) - # Check Static File Generation - tabularize_static_data(cfg) - actual_files = [str(Path(*f.parts[-5:])) for f in f_name_resolver.list_static_files()] - assert set(actual_files) == set(EXPECTED_STATIC_FILES) - # Check the files are not empty - for f in f_name_resolver.list_static_files(): - static_matrix = load_matrix(f) - assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" - expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) - logger.info((static_matrix.shape[1], expected_num_cols)) - logger.info(f_name_resolver.list_static_files()) - assert static_matrix.shape[1] == expected_num_cols, ( - f"Static Data Tabular Dataframe Should have {expected_num_cols}" - f"Columns but has {static_matrix.shape[1]}!" 
- ) - static_first_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/first") - static_present_fp = f_name_resolver.get_flat_static_rep("tuning", "0", "static/present") - assert ( - load_matrix(static_first_fp).shape[0] == load_matrix(static_present_fp).shape[0] - ), "static data first and present aggregations have different numbers of rows" - - # Check Time Series File Generation - output_files = f_name_resolver.list_ts_files() - f_name_resolver.list_ts_files() - actual_files = [str(Path(*f.parts[-5:])) for f in output_files] - - assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) - for f in output_files: - sparse_array = load_matrix(f) - assert sparse_array.shape[0] > 0 - assert sparse_array.shape[1] > 0 - ts_code_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "code/count") - ts_value_fp = f_name_resolver.get_flat_ts_rep("tuning", "0", "365d", "value/sum") - assert ( - load_matrix(ts_code_fp).shape[0] == load_matrix(ts_value_fp).shape[0] - ), "time series code and value have different numbers of rows" - assert ( - load_matrix(static_first_fp).shape[0] == load_matrix(ts_value_fp).shape[0] - ), "static data and time series have different numbers of rows" - - # Create Fake Labels - feature_columns = json.load(open(f_name_resolver.get_feature_columns_fp())) - for f in f_name_resolver.list_meds_files(): - df = pl.read_parquet(f) - df = get_events_df(df, feature_columns) - pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) - df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) - df = df.select(pl.col(["patient_id", "timestamp", "label"])) - df = df.unique(subset=["patient_id", "timestamp"]) - df = df.with_row_index("event_id") - - split = f.parent.stem - shard_num = f.stem - out_f = f_name_resolver.get_label(split, shard_num) - out_f.parent.mkdir(parents=True, exist_ok=True) - df.write_parquet(out_f) - - # Step 3: Run the task_specific_caching script - stderr, stdout = run_command( - "meds_tab task_specific_caching", - [], - tabularize_config_kwargs, - "task_specific_caching", - ) - # Check the files are not empty - - # Step 4: Run the xgboost script - xgboost_config_kwargs = { - "hydra.mode": "MULTIRUN", - } - xgboost_config_kwargs = {**tabularize_config_kwargs, **xgboost_config_kwargs} - stderr, stdout = run_command( - "meds_tab xgboost", - [], - xgboost_config_kwargs, - "xgboost", - ) - output_files = list(Path(cfg.model_dir).glob("*.json")) - assert len(output_files) == 1 - assert output_files[0] == Path(cfg.model_dir) / "model.json" diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..8f2cc92 --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,289 @@ +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import json +import tempfile +from io import StringIO +from pathlib import Path + +import polars as pl +from hydra import compose, initialize + +from MEDS_tabular_automl.describe_codes import get_feature_columns +from MEDS_tabular_automl.file_name import list_subdir_files +from MEDS_tabular_automl.scripts import ( + cache_task, + describe_codes, + launch_xgboost, + tabularize_static, + tabularize_time_series, +) +from MEDS_tabular_automl.utils import ( + VALUE_AGGREGATIONS, + get_events_df, + get_feature_names, + get_shard_prefix, + get_unique_time_events_df, + load_matrix, +) +from test_tabularize import ( + CODE_COLS, + EXPECTED_STATIC_FILES, + MEDS_OUTPUTS, + SPLITS_JSON, + STATIC_FIRST_COLS, + STATIC_PRESENT_COLS, + 
SUMMARIZE_EXPECTED_FILES, + VALUE_COLS, +) +import subprocess +from loguru import logger + + +def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): + command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] + command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) + stderr = command_out.stderr.decode() + stdout = command_out.stdout.decode() + if command_out.returncode != 0: + raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") + return stderr, stdout + + +def test_tabularize(): + # Step 0: Setup Environment + with tempfile.TemporaryDirectory() as d: + MEDS_cohort_dir = Path(d) / "processed" + + describe_codes_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] + cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + + # Create the directories + (MEDS_cohort_dir / "final_cohort").mkdir(parents=True, exist_ok=True) + + # Store MEDS outputs + for split, data in MEDS_OUTPUTS.items(): + file_path = MEDS_cohort_dir / "final_cohort" / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True) + df = pl.read_csv(StringIO(data)) + df.with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")).write_parquet( + file_path + ) + + # Check the files are not empty + meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") + assert ( + len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + ), "MEDS train split Data Files Should be 4!" + for f in meds_files: + assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
+ split_json = json.load(StringIO(SPLITS_JSON)) + splits_fp = MEDS_cohort_dir / "splits.json" + json.dump(split_json, splits_fp.open("w")) + + + # Step 1: Run the describe_codes script + stderr, stdout = run_command( + "meds-tab-describe", + [], + describe_codes_config, + "describe_codes", + ) + assert (Path(cfg.output_dir) / "config.yaml").is_file() + assert Path(cfg.output_filepath).is_file() + + feature_columns = get_feature_columns(cfg.output_filepath) + assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) + assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) + assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) + for value_agg in VALUE_AGGREGATIONS: + assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) + + # Step 2: Run the static data tabularization script + tabularize_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + stderr, stdout = run_command( + "meds-tab-tabularize-static", + [], + tabularize_config, + "tabularization", + ) + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in tabularize_config.items()] + cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + + output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] + assert set(actual_files) == set(EXPECTED_STATIC_FILES) + # Check the files are not empty + for f in output_files: + static_matrix = load_matrix(f) + assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + assert static_matrix.shape[1] == expected_num_cols, ( + f"Static Data Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {static_matrix.shape[1]}!" + ) + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert static_matrix.shape[0] == expected_num_rows, ( + f"Static Data matrix Should have {expected_num_rows}" + f" rows but has {static_matrix.shape[0]}!" 
+ ) + allowed_codes = cfg.tabularization._resolved_codes + num_allowed_codes = len(allowed_codes) + feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) + assert num_allowed_codes == len( + feature_columns + ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" + + # Step 3: Run the time series tabularization script + tabularize_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + + + stderr, stdout = run_command( + "meds-tab-tabularize-time-series", + ["--multirun", "worker=range(0,1)", "hydra/launcher=joblib"], + tabularize_config, + "tabularization", + ) + + # confirm summary files exist: + output_files = list_subdir_files(cfg.output_dir, "npz") + actual_files = [ + get_shard_prefix(Path(cfg.output_dir), each) + ".npz" + for each in output_files + if "none/static" not in str(each) + ] + assert set(actual_files) == set(SUMMARIZE_EXPECTED_FILES) + for f in output_files: + ts_matrix = load_matrix(f) + assert ts_matrix.shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"{f.parent.stem}/{f.stem}", feature_columns)) + assert ts_matrix.shape[1] == expected_num_cols, ( + f"Time-Series Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {ts_matrix.shape[1]}!" + ) + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert ts_matrix.shape[0] == expected_num_rows, ( + f"Time-Series Data matrix Should have {expected_num_rows}" + f" rows but has {ts_matrix.shape[0]}!" 
+ ) + + # Step 4: Run the task_specific_caching script + cache_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in cache_config.items()] + cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + # Create fake labels + for f in list_subdir_files(Path(cfg.MEDS_cohort_dir) / "final_cohort", "parquet"): + df = pl.scan_parquet(f) + df = get_unique_time_events_df(get_events_df(df, feature_columns)).collect() + pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + df = df.with_columns(pl.Series(name="label", values=pseudo_labels)) + df = df.select(pl.col(["patient_id", "timestamp", "label"])) + df = df.with_row_index("event_id") + + split = f.parent.stem + shard_num = f.stem + out_f = Path(cfg.input_label_dir) / Path( + get_shard_prefix(Path(cfg.MEDS_cohort_dir) / "final_cohort", f) + ).with_suffix(".parquet") + out_f.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(out_f) + + stderr, stdout = run_command( + "meds-tab-cache-task", + [], + cache_config, + "task_specific_caching", + ) + # Check the files are not empty + + # Step 5: Run the xgboost script + + xgboost_config_kwargs = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml + stderr, stdout = run_command( + "meds-tab-xgboost", + [], + xgboost_config_kwargs, + "xgboost", + ) + + output_files = list(Path(cfg.output_dir).glob("*.json")) + assert len(output_files) == 1 + assert output_files[0] == Path(cfg.output_dir) / "model.json" From f1e564781fc9c297464381c2d41563cdd3b238c8 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 02:22:34 +0000 Subject: [PATCH 100/106] Added init file so tabularize default config path is resolved --- .../configs/tabularization/__init__.py | 0 tests/test_integration.py | 32 +++++++------------ 2 files changed, 11 insertions(+), 21 deletions(-) create mode 100644 src/MEDS_tabular_automl/configs/tabularization/__init__.py diff --git a/src/MEDS_tabular_automl/configs/tabularization/__init__.py b/src/MEDS_tabular_automl/configs/tabularization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_integration.py b/tests/test_integration.py index 8f2cc92..74ac4fb 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,22 +3,26 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import json +import subprocess import tempfile from io import StringIO from pathlib import Path import polars as pl from hydra import compose, initialize +from test_tabularize import ( + CODE_COLS, + 
EXPECTED_STATIC_FILES, + MEDS_OUTPUTS, + SPLITS_JSON, + STATIC_FIRST_COLS, + STATIC_PRESENT_COLS, + SUMMARIZE_EXPECTED_FILES, + VALUE_COLS, +) from MEDS_tabular_automl.describe_codes import get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.scripts import ( - cache_task, - describe_codes, - launch_xgboost, - tabularize_static, - tabularize_time_series, -) from MEDS_tabular_automl.utils import ( VALUE_AGGREGATIONS, get_events_df, @@ -27,18 +31,6 @@ get_unique_time_events_df, load_matrix, ) -from test_tabularize import ( - CODE_COLS, - EXPECTED_STATIC_FILES, - MEDS_OUTPUTS, - SPLITS_JSON, - STATIC_FIRST_COLS, - STATIC_PRESENT_COLS, - SUMMARIZE_EXPECTED_FILES, - VALUE_COLS, -) -import subprocess -from loguru import logger def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): @@ -94,7 +86,6 @@ def test_tabularize(): splits_fp = MEDS_cohort_dir / "splits.json" json.dump(split_json, splits_fp.open("w")) - # Step 1: Run the describe_codes script stderr, stdout = run_command( "meds-tab-describe", @@ -180,7 +171,6 @@ def test_tabularize(): "tabularization.window_sizes": "[30d,365d,full]", } - stderr, stdout = run_command( "meds-tab-tabularize-time-series", ["--multirun", "worker=range(0,1)", "hydra/launcher=joblib"], From 16857682e0f9483b5fb0428d2b0a1f6be3d4f7e7 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 02:51:10 +0000 Subject: [PATCH 101/106] passing integration tests now --- hf_cohort/hf_cohort_cli.sh | 32 +++++++++---------- .../scripts/launch_xgboost.py | 3 +- .../scripts/tabularize_static.py | 2 +- .../scripts/tabularize_time_series.py | 2 +- tests/test_integration.py | 4 +-- tests/test_tabularize.py | 2 +- 6 files changed, 23 insertions(+), 22 deletions(-) diff --git a/hf_cohort/hf_cohort_cli.sh b/hf_cohort/hf_cohort_cli.sh index 15453a4..e4259f1 100644 --- a/hf_cohort/hf_cohort_cli.sh +++ b/hf_cohort/hf_cohort_cli.sh @@ -16,47 +16,47 @@ rm -rf $OUTPUT_DIR meds-tab-describe MEDS_cohort_dir=$MEDS_DIR echo "Running tabularize_static.py: tabularizing static data" -meds_tab-tabularize-static \ +meds-tab-tabularize-static \ MEDS_cohort_dir=$MEDS_DIR \ tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" -POLARS_MAX_THREADS=1 +export POLARS_MAX_THREADS=1 LOG_DIR="logs/$METHOD/$ID-logs" -mkdir -p $LOG_DIR +mkdir -p "${LOG_DIR}" { time \ - mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ - meds_tab-tabularize-time-series \ + mprof run --include-children --exit-code --output "${LOG_DIR}/mprofile.dat" \ + meds-tab-tabularize-time-series \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ MEDS_cohort_dir=$MEDS_DIR \ tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" do_overwrite=False \ "$WINDOW_SIZES" "$AGGS" \ - 2> $LOG_DIR/cmd.stderr -} 2> $LOG_DIR/timings.txt + 2> "${LOG_DIR}/cmd.stderr" +} 2> "${LOG_DIR}/timings.txt" cmd_exit_status=${PIPESTATUS[0]} # Check the exit status of the second command in the pipeline (mprof run ...) if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then echo "build_dataset.sh failed with status $cmd_exit_status." 
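    # Two shell details this script depends on: a bare `POLARS_MAX_THREADS=1` assignment on a line of
    # its own never reaches the worker processes, hence the `export` above, and `${LOG_DIR}` is kept
    # quoted ("${LOG_DIR}") so the log paths survive word splitting if they ever contain spaces.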
- echo "Stderr from build_dataset.sh (see $LOG_DIR/cmd.stderr):" - tail $LOG_DIR/cmd.stderr + echo "Stderr from build_dataset.sh (see ${LOG_DIR}/cmd.stderr):" + tail "${LOG_DIR}/cmd.stderr" exit "$cmd_exit_status" fi -mprof plot -o $LOG_DIR/mprofile.png $LOG_DIR/mprofile.dat -mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt +mprof plot -o "${LOG_DIR}/mprofile.png" "${LOG_DIR}/mprofile.dat" +mprof peak "${LOG_DIR}/mprofile.dat" > "${LOG_DIR}/peak_memory_usage.txt" echo "Running task_specific_caching.py: tabularizing static data" -meds_tab-cache-task \ +meds-tab-cache-task \ MEDS_cohort_dir=$MEDS_DIR \ tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" echo "Running xgboost: tabularizing static data" -meds_tab-xgboost \ +meds-tab-xgboost \ + --multirun \ MEDS_cohort_dir=$MEDS_DIR \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" - - - diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index b48fbe9..6b4ef7a 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -17,7 +17,7 @@ from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") @@ -430,6 +430,7 @@ def main(cfg: DictConfig) -> float: save_dir = Path(cfg.output_dir) save_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Saving the model to directory: {save_dir}") model.model.save_model(save_dir / "model.json") auc = model.evaluate() logger.info(f"AUC: {auc}") diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index b8637b6..d653ac2 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -33,7 +33,7 @@ write_df, ) -config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/tabularization.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 9772797..d3a653d 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -28,7 +28,7 @@ write_df, ) -config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/tabularization.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") diff --git a/tests/test_integration.py b/tests/test_integration.py index 74ac4fb..e0fb00c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -173,7 +173,7 @@ def test_tabularize(): stderr, stdout = run_command( "meds-tab-tabularize-time-series", - ["--multirun", "worker=range(0,1)", "hydra/launcher=joblib"], + ["--multirun", 
'worker="range(0,1)"', "hydra/launcher=joblib"], tabularize_config, "tabularization", ) @@ -274,6 +274,6 @@ def test_tabularize(): "xgboost", ) - output_files = list(Path(cfg.output_dir).glob("*.json")) + output_files = list(Path(cfg.output_dir).glob("**/*.json")) assert len(output_files) == 1 assert output_files[0] == Path(cfg.output_dir) / "model.json" diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index bdc99b3..a49790d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -388,6 +388,6 @@ def test_tabularize(): cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml launch_xgboost.main(cfg) - output_files = list(Path(cfg.output_dir).glob("*.json")) + output_files = list(Path(cfg.output_dir).glob("**/*.json")) assert len(output_files) == 1 assert output_files[0] == Path(cfg.output_dir) / "model.json" From b1f5db64485b2e59e0e25a17ef7038b28c6616d7 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 02:57:50 +0000 Subject: [PATCH 102/106] added fix for model.json saving check in integration test --- tests/test_integration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index e0fb00c..44cf54c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -273,7 +273,6 @@ def test_tabularize(): xgboost_config_kwargs, "xgboost", ) - - output_files = list(Path(cfg.output_dir).glob("**/*.json")) + output_files = list(Path(cfg.output_dir).parent.glob("**/*.json")) assert len(output_files) == 1 - assert output_files[0] == Path(cfg.output_dir) / "model.json" + assert output_files[0].stem == "model" From 19f7ce76c2e4d6da560fdec6e6cea4f07636161d Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 04:44:17 +0000 Subject: [PATCH 103/106] added xgboost sweeping support --- pyproject.toml | 1 + .../configs/launch_xgboost.yaml | 69 +++++++++----- src/MEDS_tabular_automl/configs/tmp.yaml.yaml | 89 +++++++++++++++++++ .../scripts/launch_xgboost.py | 16 ---- .../scripts/sweep_xgboost.py | 85 ++++++++++++++++++ tests/test_integration.py | 12 ++- tests/test_tabularize.py | 26 ++++++ 7 files changed, 257 insertions(+), 41 deletions(-) create mode 100644 src/MEDS_tabular_automl/configs/tmp.yaml.yaml create mode 100644 src/MEDS_tabular_automl/scripts/sweep_xgboost.py diff --git a/pyproject.toml b/pyproject.toml index 3e12854..8e53854 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main meds-tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main" meds-tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main" meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main" +meds-tab-xgboost-sweep = "MEDS_tabular_automl.scripts.sweep_xgboost:main" [project.optional-dependencies] dev = ["pre-commit"] diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml index 954e56a..123846f 100644 --- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -1,8 +1,6 @@ defaults: - default - tabularization: default - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - _self_ task_name: task @@ -32,29 +30,52 @@ model_params: keep_data_in_memory: True binarize_task: True -# Sweep parameters for Optuna -hydra: - # Optuna Sweeper - sweeper: - sampler: - seed: ${seed} - 
storage: null - study_name: tabularize_study_${now:%Y-%m-%d_%H-%M-%S} - direction: minimize - n_trials: 10 +# Define search space for Optuna +optuna: + study_name: xgboost_sweep_${now:%Y-%m-%d_%H-%M-%S} + storage: null + load_if_exists: False + direction: minimize + sampler: null + pruner: null - # Define search space for Optuna - params: + n_trials: 10 + n_jobs: 1 + show_progress_bar: False + + params: + suggest_categorical: window_sizes: ${generate_permutations:${tabularization.window_sizes}} aggs: ${generate_permutations:${tabularization.aggs}} - min_code_freq: tag(log,range(10,1_000_000)) - model_params: - model: - +num_boost_round: range(10, 1000) - +max_depth: range(2, 16) - +eta: tag(log,interval(.001,1)) - +lambda: tag(log,interval(.001,1)) - +alpha: tag(log,interval(.001,1)) - +subsample: interval(0.5,1) - +min_child_weight: uniform(1e-2, 100) + suggest_float: + eta: + low: .001 + high: 1 + log: True + lambda: + low: .001 + high: 1 + log: True + alpha: + low: .001 + high: 1 + log: True + subsample: + low: 0.5 + high: 1 + min_child_weight: + low: 1e-2 + high: 100 + suggest_int: + num_boost_round: + low: 10 + high: 1000 + max_depth: + low: 2 + high: 16 + min_code_inclusion_frequency: + low: 10 + high: 1_000_000 + log: True + name: launch_xgboost diff --git a/src/MEDS_tabular_automl/configs/tmp.yaml.yaml b/src/MEDS_tabular_automl/configs/tmp.yaml.yaml new file mode 100644 index 0000000..6312a45 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/tmp.yaml.yaml @@ -0,0 +1,89 @@ +# Raw data +MEDS_cohort_dir: /storage/shared/meds_tabular_ml/ebcl_dataset/processed +tabularized_data_dir: ${MEDS_cohort_dir}/tabularize +task_dir: ${tabularized_data_dir}/task +model_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +cache_dir: ${tabularized_data_dir}/.cache + +# Pre-processing +min_code_inclusion_frequency: 10 +window_sizes: [1d] +codes: null +aggs: + - "static/present" + # - "static/first" + - "code/count" + - "value/count" + - "value/sum" + - "value/sum_sqd" + - "value/min" + - "value/max" + +dynamic_threshold: 0.01 +numerical_value_threshold: 0.1 + +# Sharding +n_patients_per_sub_shard: null + +# Misc +do_overwrite: False +do_update: True +seed: 1 +tqdm: True +worker: 0 +test: False + +num_boost_round: 1000 +early_stopping_rounds: 5 +model: + booster: gbtree + device: cpu + tree_method: hist + objective: binary:logistic + +iterator: + keep_data_in_memory: True + binarize_task: True + +hydra: + verbose: False + run: + dir: ${model_dir}/.logs/ + +optuna: + storage: null + sampler: null + pruner: null + study_name: /home/teya/xgboost/tmp/xgboost_study_${now:%Y-%m-%d_%H-%M-%S} + direction: minimize + load_if_exists: True + show_progress_bar: False + n_trials: 10 + n_jobs: 3 + + params: + categorical: # choose single item from a list + window_sizes: + [ + [1d], + [7d], + [30d], + [365d], + [full], + [1d, 7d], + [1d, 7d, 30d], + [1d, 7d, 30d, 365d], + ] + # set: # choose any subset from a list + # window_sizes: [1d, 7d, 30d, 365d, full] # TODO: teya implement + # aggs: + # - "static/present" + # - "static/first" + # - "code/count" + # - "value/count" + # - "value/sum" + # - "value/sum_sqd" + # - "value/min" + # - "value/max" + integer: # choose integer value from a range [start, end, step] + min_code_inclusion_frequency: [10, 100, 10] diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 6b4ef7a..6babbcc 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ 
b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -1,6 +1,5 @@ from collections.abc import Callable, Mapping from importlib.resources import files -from itertools import combinations from pathlib import Path import hydra @@ -22,21 +21,6 @@ raise FileNotFoundError("Core configuration not successfully installed!") -def generate_permutations(list_of_options): - """Generate all possible permutations of a list of options. - - Args: - - list_of_options (list): List of options. - - Returns: - - list: List of all possible permutations of length > 1 - """ - permutations = [] - for i in range(1, len(list_of_options) + 1): - permutations.extend(list(combinations(list_of_options, r=i))) - return permutations - - class Iterator(xgb.DataIter, TimeableMixin): """Iterator class for loading and processing data shards. diff --git a/src/MEDS_tabular_automl/scripts/sweep_xgboost.py b/src/MEDS_tabular_automl/scripts/sweep_xgboost.py new file mode 100644 index 0000000..3e019b6 --- /dev/null +++ b/src/MEDS_tabular_automl/scripts/sweep_xgboost.py @@ -0,0 +1,85 @@ +import warnings +from copy import deepcopy +from importlib.resources import files +from itertools import combinations + +import hydra +import optuna +from loguru import logger +from omegaconf import DictConfig, OmegaConf, open_dict + +from MEDS_tabular_automl.scripts import launch_xgboost + +warnings.filterwarnings("ignore", category=UserWarning) + +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") + + +def generate_permutations(list_of_options): + """Generate all possible permutations of a list of options. + + Args: + - list_of_options (list): List of options. + + Returns: + - list: List of all possible permutations of length > 1 + """ + permutations = [] + for i in range(1, len(list_of_options) + 1): + permutations.extend(list(combinations(list_of_options, r=i))) + return permutations + + +OmegaConf.register_new_resolver("generate_permutations", generate_permutations) + + +def xgboost_singleton(trial: optuna.Trial, config: DictConfig) -> float: + for key, value in config.optuna.params.suggest_categorical.items(): + logger.info(f"Optimizing {key} with {value}") + config.tabularization[key] = trial.suggest_categorical(key, value) + for key, value in config.optuna.params.suggest_float.items(): + with open_dict(config): + config[key] = trial.suggest_float(key, **value) + for key, value in config.optuna.params.suggest_int.items(): + with open_dict(config): + config[key] = trial.suggest_int(key, **value) + return launch_xgboost.main(config) + + +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) +def main(cfg: DictConfig) -> None: + study = optuna.create_study( + study_name=cfg.optuna.study_name, + storage=cfg.optuna.storage, + load_if_exists=cfg.optuna.load_if_exists, + direction=cfg.optuna.direction, + sampler=cfg.optuna.sampler, + pruner=cfg.optuna.pruner, + ) + study.optimize( + lambda trial: xgboost_singleton(trial, deepcopy(cfg)), + n_trials=cfg.optuna.n_trials, + n_jobs=cfg.optuna.n_jobs, + show_progress_bar=cfg.optuna.show_progress_bar, + ) + print( + "Number of finished trials: ", + len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]), + ) + print( + "Number of pruned trials: ", + len([t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]), + ) + print("Sampler:", study.sampler) + print("Best trial:") + trial 
= study.best_trial + print(" Value: ", trial.value) + print(" Params: ") + for key, value in trial.params.items(): + print(f" {key}: {value}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_integration.py b/tests/test_integration.py index 44cf54c..0a751b6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -43,7 +43,7 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test return stderr, stdout -def test_tabularize(): +def test_integration(): # Step 0: Setup Environment with tempfile.TemporaryDirectory() as d: MEDS_cohort_dir = Path(d) / "processed" @@ -276,3 +276,13 @@ def test_tabularize(): output_files = list(Path(cfg.output_dir).parent.glob("**/*.json")) assert len(output_files) == 1 assert output_files[0].stem == "model" + + stderr, stdout = run_command( + "meds-tab-xgboost-sweep", + [], + xgboost_config_kwargs, + "xgboost-sweep", + ) + output_files = list(Path(cfg.output_dir).parent.glob("**/*.json")) + assert len(output_files) == 2 + assert output_files[0].stem == "model" diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index a49790d..ca67465 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -3,6 +3,7 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import json +import os import tempfile from io import StringIO from pathlib import Path @@ -16,6 +17,7 @@ cache_task, describe_codes, launch_xgboost, + sweep_xgboost, tabularize_static, tabularize_time_series, ) @@ -391,3 +393,27 @@ def test_tabularize(): output_files = list(Path(cfg.output_dir).glob("**/*.json")) assert len(output_files) == 1 assert output_files[0] == Path(cfg.output_dir) / "model.json" + os.remove(Path(cfg.output_dir) / "model.json") + + xgboost_config_kwargs = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.aggs": "[static/present,static/first,code/count,value/sum]", + "tabularization.window_sizes": "[30d,365d,full]", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml + + sweep_xgboost.main(cfg) + output_files = list(Path(cfg.output_dir).glob("**/*.json")) + assert len(output_files) == 1 + assert output_files[0] == Path(cfg.output_dir) / "model.json" From ed193f0a732f37daff16f7e02afd3acc7405fdd2 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 04:46:53 +0000 Subject: [PATCH 104/106] updated to mapper that is more robust to race conditions by using a lock with a timestamp --- src/MEDS_tabular_automl/mapper.py | 109 ++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 19 deletions(-) diff --git a/src/MEDS_tabular_automl/mapper.py b/src/MEDS_tabular_automl/mapper.py index deefd0d..34275b8 100644 --- a/src/MEDS_tabular_automl/mapper.py +++ b/src/MEDS_tabular_automl/mapper.py @@ -8,6 +8,79 @@ from loguru import logger +LOCK_TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f" + + +def get_earliest_lock(cache_directory: Path) -> datetime | None: + """Returns the earliest start time of any lock file present in a cache directory, or None if none exist. + + Args: + cache_directory: The cache directory to check for the presence of a lock file. 
+ + Examples: + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> empty_directory = root / "cache_empty" + >>> empty_directory.mkdir(exist_ok=True, parents=True) + >>> cache_directory = root / "cache_with_locks" + >>> locks_directory = cache_directory / "locks" + >>> locks_directory.mkdir(exist_ok=True, parents=True) + >>> time_1 = datetime(2021, 1, 1) + >>> time_1_str = time_1.strftime(LOCK_TIME_FMT) # "2021-01-01T00:00:00.000000" + >>> lock_fp_1 = locks_directory / f"{time_1_str}.json" + >>> _ = lock_fp_1.write_text(json.dumps({"start": time_1_str})) + >>> time_2 = datetime(2021, 1, 2, 3, 4, 5) + >>> time_2_str = time_2.strftime(LOCK_TIME_FMT) # "2021-01-02T03:04:05.000000" + >>> lock_fp_2 = locks_directory / f"{time_2_str}.json" + >>> _ = lock_fp_2.write_text(json.dumps({"start": time_2_str})) + >>> get_earliest_lock(cache_directory) + datetime.datetime(2021, 1, 1, 0, 0) + >>> get_earliest_lock(empty_directory) is None + True + >>> lock_fp_1.unlink() + >>> get_earliest_lock(cache_directory) + datetime.datetime(2021, 1, 2, 3, 4, 5) + >>> directory.cleanup() + """ + locks_directory = cache_directory / "locks" + + lock_times = [ + datetime.strptime(json.loads(lock_fp.read_text())["start"], LOCK_TIME_FMT) + for lock_fp in locks_directory.glob("*.json") + ] + + return min(lock_times) if lock_times else None + + +def register_lock(cache_directory: Path) -> tuple[datetime, Path]: + """Register a lock file in a cache directory. + + Args: + cache_directory: The cache directory to register a lock file in. + + Examples: + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> cache_directory = root / "cache_with_locks" + >>> lock_time, lock_fp = register_lock(cache_directory) + >>> assert (datetime.now() - lock_time).total_seconds() < 1, "Lock time should be ~ now." + >>> lock_fp.is_file() + True + >>> lock_fp.read_text() == f'{{"start": "{lock_time.strftime(LOCK_TIME_FMT)}"}}' + True + >>> directory.cleanup() + """ + + lock_directory = cache_directory / "locks" + lock_directory.mkdir(exist_ok=True, parents=True) + + lock_time = datetime.now() + lock_fp = lock_directory / f"{lock_time.strftime(LOCK_TIME_FMT)}.json" + lock_fp.write_text(json.dumps({"start": lock_time.strftime(LOCK_TIME_FMT)})) + return lock_time, lock_fp + def wrap[ DF_T @@ -108,15 +181,15 @@ def wrap[ │ 3 ┆ 5 ┆ 12 │ └─────┴─────┴─────┘ >>> shutil.rmtree(cache_directory) - >>> lock_fp = cache_directory / "lock.json" - >>> assert not lock_fp.is_file() - >>> def lock_fp_checker_fn(df: pl.DataFrame) -> pl.DataFrame: - ... print(f"Lock fp exists? {lock_fp.is_file()}") + >>> lock_dir = cache_directory / "locks" + >>> assert not lock_dir.exists() + >>> def lock_dir_checker_fn(df: pl.DataFrame) -> pl.DataFrame: + ... print(f"Lock dir exists? {lock_dir.exists()}") ... return df >>> result_computed, out_df = wrap( - ... in_fp, out_fp, read_fn, write_fn, lock_fp_checker_fn, do_return=True + ... in_fp, out_fp, read_fn, write_fn, lock_dir_checker_fn, do_return=True ... ) - Lock fp exists? True + Lock dir exists? True >>> assert result_computed >>> out_df shape: (3, 3) @@ -146,21 +219,19 @@ def wrap[ cache_directory = out_fp.parent / f".{out_fp.stem}_cache" cache_directory.mkdir(exist_ok=True, parents=True) - st_time = datetime.now() - runtime_info = {"start": str(st_time)} + earliest_lock_time = get_earliest_lock(cache_directory) + if earliest_lock_time is not None: + logger.info(f"{out_fp} is in progress as of {earliest_lock_time}. 
Returning.") + return False, None if do_return else False - lock_fp = cache_directory / "lock.json" - if lock_fp.is_file(): - started_at = json.loads(lock_fp.read_text())["start"] - logger.info( - f"{out_fp} is under construction as of {started_at} as {lock_fp} exists. " "Returning None." - ) - if do_return: - return False, None - else: - return False + st_time, lock_fp = register_lock(cache_directory) - lock_fp.write_text(json.dumps(runtime_info)) + logger.info(f"Registered lock at {st_time}. Double checking no earlier locks have been registered.") + earliest_lock_time = get_earliest_lock(cache_directory) + if earliest_lock_time < st_time: + logger.info(f"Earlier lock found at {earliest_lock_time}. Deleting current lock and returning.") + lock_fp.unlink() + return False, None if do_return else False logger.info(f"Reading input dataframe from {in_fp}") df = read_fn(in_fp) From 4047d7cef61f81a88735d0820bb194f099ce33b9 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 05:15:18 +0000 Subject: [PATCH 105/106] removed experiment scripts and old bash cli --- cli/describe_codes.sh | 4 - cli/profile_tabularization.sh | 26 --- cli/tabularization.sh | 15 -- cli/task_specific_caching.sh | 4 - cli/xgboost.sh | 4 - hf_cohort/aces_task.sh | 13 -- hf_cohort/aces_task_extraction.py | 76 ------ hf_cohort/cohort.yaml | 373 ------------------------------ hf_cohort/config.yaml | 21 -- hf_cohort/hf_cohort_cli.sh | 62 ----- hf_cohort/hf_cohort_e2e.sh | 61 ----- hf_cohort/hf_cohort_shard.sh | 42 ---- hf_cohort/task.yaml | 21 -- hf_cohort/xgboost.sh | 7 - 14 files changed, 729 deletions(-) delete mode 100755 cli/describe_codes.sh delete mode 100755 cli/profile_tabularization.sh delete mode 100755 cli/tabularization.sh delete mode 100755 cli/task_specific_caching.sh delete mode 100755 cli/xgboost.sh delete mode 100644 hf_cohort/aces_task.sh delete mode 100644 hf_cohort/aces_task_extraction.py delete mode 100644 hf_cohort/cohort.yaml delete mode 100644 hf_cohort/config.yaml delete mode 100644 hf_cohort/hf_cohort_cli.sh delete mode 100644 hf_cohort/hf_cohort_e2e.sh delete mode 100644 hf_cohort/hf_cohort_shard.sh delete mode 100644 hf_cohort/task.yaml delete mode 100644 hf_cohort/xgboost.sh diff --git a/cli/describe_codes.sh b/cli/describe_codes.sh deleted file mode 100755 index 79187f5..0000000 --- a/cli/describe_codes.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -echo "Describing Codes: Caching feature names and frequencies." -python scripts/identify_columns.py "$@" diff --git a/cli/profile_tabularization.sh b/cli/profile_tabularization.sh deleted file mode 100755 index 4b34366..0000000 --- a/cli/profile_tabularization.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -PROFILE_LOG_DIR="$1" - -shift 1 - -SCRIPT_DIR=$(dirname "$0") -SCRIPT_NAME=$(basename "$0") - -mkdir -p "$PROFILE_LOG_DIR" -{ time \ - mprof run --include-children --exit-code --output "${PROFILE_LOG_DIR}/mprofile.dat" \ - bash "${SCRIPT_DIR}/tabularization.sh" "$@" \ - 2> "${PROFILE_LOG_DIR}/cmd.stderr" -} 2> "${PROFILE_LOG_DIR}/timings.txt" - -cmd_exit_status=${PIPESTATUS[0]} -# Check the exit status of the second command in the pipeline (mprof run ...) -if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then - echo "${SCRIPT_NAME} failed with status $cmd_exit_status." 
- echo "Stderr from ${SCRIPT_NAME} (see ${PROFILE_LOG_DIR}/cmd.stderr):" - tail "${PROFILE_LOG_DIR}/cmd.stderr" - exit "$cmd_exit_status" -fi -mprof plot -o "${PROFILE_LOG_DIR}/mprofile.png" "${PROFILE_LOG_DIR}/mprofile.dat" -mprof peak "${PROFILE_LOG_DIR}/mprofile.dat" > "${PROFILE_LOG_DIR}/peak_memory_usage.txt" diff --git a/cli/tabularization.sh b/cli/tabularization.sh deleted file mode 100755 index ceb147f..0000000 --- a/cli/tabularization.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -N_PARALLEL_WORKERS="$1" -shift 1 - -echo "Tabularizing Static Data" -python scripts/tabularize_static.py "$@" - - -echo "Tabularizing Time-Series Data" -python scripts/summarize_over_windows.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - "$@" diff --git a/cli/task_specific_caching.sh b/cli/task_specific_caching.sh deleted file mode 100755 index 92d8273..0000000 --- a/cli/task_specific_caching.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -echo "Caching Training Data for Task" -python scripts/task_specific_caching.py "$@" diff --git a/cli/xgboost.sh b/cli/xgboost.sh deleted file mode 100755 index 9d9b286..0000000 --- a/cli/xgboost.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -echo "Running XGBoost" -python scripts/launch_xgboost.py "$@" diff --git a/hf_cohort/aces_task.sh b/hf_cohort/aces_task.sh deleted file mode 100644 index c8dcf5d..0000000 --- a/hf_cohort/aces_task.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort -OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -# N_PARALLEL_WORKERS="$1" -WINDOW_SIZES="window_sizes=[1d]" -AGGS="aggs=[code/count,value/sum]" - -python /home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/aces_task_extraction.py \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 do_overwrite=False \ - "$WINDOW_SIZES" "$AGGS" diff --git a/hf_cohort/aces_task_extraction.py b/hf_cohort/aces_task_extraction.py deleted file mode 100644 index 851c0c1..0000000 --- a/hf_cohort/aces_task_extraction.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -Setup Conda environment as described here: https://github.com/justin13601/ACES -""" -import json -from pathlib import Path - -import hydra -import polars as pl -from aces import config, predicates, query -from tqdm import tqdm - - -def get_events_df(shard_df: pl.DataFrame, feature_columns) -> pl.DataFrame: - """Extracts Events DataFrame with one row per observation (timestamps can be duplicated)""" - # Filter out feature_columns that were not present in the training set - raw_feature_columns = ["/".join(c.split("/")[:-1]) for c in feature_columns] - shard_df = shard_df.filter(pl.col("code").is_in(raw_feature_columns)) - # Drop rows with missing timestamp or code to get events - ts_shard_df = shard_df.drop_nulls(subset=["timestamp", "code"]) - return ts_shard_df - - -def get_unique_time_events_df(events_df: pl.DataFrame): - """Updates Events DataFrame to have unique timestamps and sorted by patient_id and timestamp.""" - assert events_df.select(pl.col("timestamp")).null_count().collect().item() == 0 - # Check events_df is sorted - so it aligns with the ts_matrix we generate later in the pipeline - events_df = ( - events_df.drop_nulls("timestamp") - .select(pl.col(["patient_id", "timestamp"])) - .unique(maintain_order=True) - ) - assert events_df.sort(by=["patient_id", "timestamp"]).collect().equals(events_df.collect()) - return 
events_df - - -@hydra.main(version_base=None, config_path="../configs", config_name="tabularize") -def main(cfg): - # create task configuration object - task_cfg = config.TaskExtractorConfig.load(config_path="hf_cohort/task.yaml") - - # setup directories - med_dir = Path(cfg.tabularized_data_dir) - - # location of MEDS format Data - cohort_dir = med_dir.parent / "final_cohort" - # output directory for tables with event_ids and labels - output_dir = med_dir / "task" - - shard_fps = list(cohort_dir.glob("*/*.parquet")) - - for in_fp in tqdm(shard_fps): - out_fp = output_dir / "/".join(in_fp.parts[-2:]) - out_fp.parent.mkdir(parents=True, exist_ok=True) - # one of the following - predicates_df = predicates.generate_predicates_df(task_cfg, in_fp, "meds") - - # execute query - df_result = query.query(task_cfg, predicates_df) - label_df = ( - df_result.select(pl.col(["subject_id", "trigger", "label"])) - .rename({"trigger": "timestamp", "subject_id": "patient_id"}) - .sort(by=["patient_id", "timestamp"]) - ) - feature_columns = json.load(open(Path(cfg.tabularized_data_dir) / "feature_columns.json")) - data_df = pl.scan_parquet(in_fp) - data_df = get_unique_time_events_df(get_events_df(data_df, feature_columns)) - data_df = data_df.drop(["code", "numerical_value"]) - data_df = data_df.with_row_index("event_id") - output_df = label_df.lazy().join_asof(other=data_df, by="patient_id", on="timestamp") - - # store it - output_df.collect().write_parquet(out_fp) - - -if __name__ == "__main__": - main() diff --git a/hf_cohort/cohort.yaml b/hf_cohort/cohort.yaml deleted file mode 100644 index 51eb868..0000000 --- a/hf_cohort/cohort.yaml +++ /dev/null @@ -1,373 +0,0 @@ -patient_id_col: "empi" - -demographic: - sex: - code: - - SEX - - col(sex) - timestamp: null - race: - code: - - RACE - - col(race) - timestamp: null - country: - code: - - COUNTRY - - col(country) - timestamp: null - zip_code: - code: - - ZIP_CODE - - col(zip_code) - timestamp: null - birth: - code: BIRTH - timestamp: col(date_of_birth) - timestamp_format: "%Y-%m-%d" - death: - code: DEATH - timestamp: col(date_of_death) - timestamp_format: "%Y-%m-%d" - -diagnosis: - diagnosis: - code: - - DIAGNOSIS - - col(diagnosis_name) - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - -encounters_modified: - admit_date: - code: ADMIT_DATE - timestamp: col(admit_date) - timestamp_format: "%Y-%m-%d" - discharge_date: - code: DISCHARGE_DATE - timestamp: col(discharge_date) - timestamp_format: "%Y-%m-%d" - -physical: - physical: - code: - - PHYSICAL - - col(physical_name) - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - numerical_value: col(result) - -ecg: - ecg: - code: ECG - timestamp: col(date) - timestamp_format: "%Y-%m-%d %H:%M:%S" - -echo: - echo_type: - code: - - ECHO_TYPE - - col(echo_type) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - lv_wall_thickness: - code: - - LV_WALL_THICKNESS - - col(lv_wall_thickness) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - lv_hypertrophy: - code: - - LV_HYPERTROPHY - - col(lv_hypertrophy) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - lv_ef: - code: - - LV_EF - - col(lv_ef) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - lv_ef_method: - code: - - LV_EF_METHOD - - col(lv_ef_method) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - rv_size: - code: - - RV_SIZE - - col(rv_size) - timestamp: col(date) - timestamp_format: - - 
"%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - tv_regurg: - code: - - TV_REGURG - - col(tv_regurg) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - tv_regurg_severity: - code: - - TV_REGURG_SEVERITY - - col(tv_regurg_severity) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - av_abnormal: - code: - - AV_ABNORMAL - - col(av_abnormal) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - av_stenosis: - code: - - AV_STENOSIS - - col(av_stenosis) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - av_calcified: - code: - - AV_CALCIFIED - - col(av_calcified) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - mv_regurg: - code: - - MV_REGURG - - col(mv_regurg) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - mv_regurg_severity: - code: - - MV_REGURG_SEVERITY - - col(mv_regurg_severity) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - pv_stenosis_severity: - code: - - PV_STENOSIS_SEVERITY - - col(pv_stenosis_severity) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - pv_regurg: - code: - - PV_REGURG - - col(pv_regurg) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - pv_regurg_severity: - code: - - PV_REGURG_SEVERITY - - col(pv_regurg_severity) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - echo_quality: - code: - - ECHO_QUALITY - - col(echo_quality) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - la_size: - code: - - LA_SIZE - - col(la_size) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - ra_size: - code: - - RA_SIZE - - col(ra_size) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - pericardium_normal: - code: - - PERICARDIUM_NORMAL - - col(pericardium_normal) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - pericardial_effusion: - code: - - PERICARDIAL_EFFUSION - - col(pericardial_effusion) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - pleural_effusion: - code: - - PLEURAL_EFFUSION - - col(pleural_effusion) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - av_morphology: - code: - - AV_MORPHOLOGY - - col(av_morphology) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - indication_category: - code: - - INDICATION_CATEGORY - - col(indication_category) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - indication_subcategory: - code: - - INDICATION_SUBCATEGORY - - col(indication_subcategory) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - echo_comprehensive: - code: - - ECHO_COMPREHENSIVE - - col(echo_comprehensive) - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - lv_ef_value: - code: LV_EF_VALUE - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - numerical_value: col(lv_ef_value) - tv_rv_systolic_pressure: - code: TV_RV_SYSTOLIC_PRESSURE - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - numerical_value: col(tv_rv_systolic_pressure) - tv_ra_estimated_pressure: - code: - - TV_RA_ESTIMATED_PRESSURE - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - 
"%Y-%m-%d" - numerical_value: col(tv_ra_estimated_pressure) - body_surface_area: - code: BODY_SURFACE_AREA - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - numerical_value: col(body_surface_area) - height_cm: - code: HEIGHT_CM - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - numerical_value: col(height_cm) - weight_kg: - code: WEIGHT_KG - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - numerical_value: col(weight_kg) - heart_rate: - code: HEART_RATE - timestamp: col(date) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - numerical_value: col(heart_rate) - -pressure: - mean_ra_pressure: - code: MEAN_RA_PRESSURE - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - numerical_value: col(mean_ra_pressure) - mean_pa_pressure: - code: MEAN_PA_PRESSURE - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - numerical_value: col(mean_pa_pressure) - mean_wedge_pressure: - code: MEAN_WEDGE_PRESSURE - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - numerical_value: col(mean_wedge_pressure) - rvedp: - code: RVEDP - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - numerical_value: col(rvedp) - -lab: - lab: - code: - - LAB - - col(group) - timestamp: col(date) - timestamp_format: "%Y-%m-%d %H:%M:%S" - numerical_value: col(result) - -medication_after_1960: - medication: - code: - - MEDICATION - - col(medication_name) - timestamp: col(date) - timestamp_format: "%m/%d/%Y" - numerical_value: quantity - -procedure: - procedure: - code: - - PROCEDURE - - col(procedure_name) - timestamp: col(date) - timestamp_format: "%Y-%m-%d" - numerical_value: quantity diff --git a/hf_cohort/config.yaml b/hf_cohort/config.yaml deleted file mode 100644 index a9911fb..0000000 --- a/hf_cohort/config.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Path to the task configuration file -config_path: task.yaml - -# Raw Data -data: - # Path to the data file or directory - path: /storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort/train/0.parquet - - # Data standard, one of (csv, meds, esgpt) - standard: meds - -# Output Directory (saves as .parquet file) -output_dir: results/ - -# Hydra -hydra: - job: - name: ACES_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${ACES_dir}/.logs/${hydra.job.name} -# aces-cli --config-dir='./' --config-name='config.yaml' diff --git a/hf_cohort/hf_cohort_cli.sh b/hf_cohort/hf_cohort_cli.sh deleted file mode 100644 index e4259f1..0000000 --- a/hf_cohort/hf_cohort_cli.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# bash hf_cohort/hf_cohort_e2e.sh hf_cohort 80 - -METHOD=meds - -MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed -OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -ID=$1 -N_PARALLEL_WORKERS="$2" -WINDOW_SIZES="tabularization.window_sizes=[1d,7d,30d,365d,full]" -AGGS="tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" -MIN_CODE_FREQ=10 - -echo "Running identify_columns.py: Caching feature names and frequencies." 
-rm -rf $OUTPUT_DIR -meds-tab-describe MEDS_cohort_dir=$MEDS_DIR - -echo "Running tabularize_static.py: tabularizing static data" -meds-tab-tabularize-static \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" - - -export POLARS_MAX_THREADS=1 -LOG_DIR="logs/$METHOD/$ID-logs" -mkdir -p "${LOG_DIR}" -{ time \ - mprof run --include-children --exit-code --output "${LOG_DIR}/mprofile.dat" \ - meds-tab-tabularize-time-series \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" do_overwrite=False \ - "$WINDOW_SIZES" "$AGGS" \ - 2> "${LOG_DIR}/cmd.stderr" -} 2> "${LOG_DIR}/timings.txt" - -cmd_exit_status=${PIPESTATUS[0]} -# Check the exit status of the second command in the pipeline (mprof run ...) -if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then - echo "build_dataset.sh failed with status $cmd_exit_status." - echo "Stderr from build_dataset.sh (see ${LOG_DIR}/cmd.stderr):" - tail "${LOG_DIR}/cmd.stderr" - exit "$cmd_exit_status" -fi -mprof plot -o "${LOG_DIR}/mprofile.png" "${LOG_DIR}/mprofile.dat" -mprof peak "${LOG_DIR}/mprofile.dat" > "${LOG_DIR}/peak_memory_usage.txt" - - -echo "Running task_specific_caching.py: tabularizing static data" -meds-tab-cache-task \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" - -echo "Running xgboost: tabularizing static data" -meds-tab-xgboost \ - --multirun \ - MEDS_cohort_dir=$MEDS_DIR \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - tabularization.min_code_inclusion_frequency="$MIN_CODE_FREQ" "$WINDOW_SIZES" do_overwrite=False "$AGGS" diff --git a/hf_cohort/hf_cohort_e2e.sh b/hf_cohort/hf_cohort_e2e.sh deleted file mode 100644 index 3d0963e..0000000 --- a/hf_cohort/hf_cohort_e2e.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash -# bash hf_cohort/hf_cohort_e2e.sh hf_cohort 80 - -METHOD=meds - -MEDS_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed -OUTPUT_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed/tabularize -ID=$1 -N_PARALLEL_WORKERS="$2" -WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" -AGGS="aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" -# WINDOW_SIZES="window_sizes=[1d,7d,30d,365d,full]" -# AGGS="aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" - -echo "Running identify_columns.py: Caching feature names and frequencies." 
-rm -rf $OUTPUT_DIR -POLARS_MAX_THREADS=32 python scripts/identify_columns.py \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" - -echo "Running tabularize_static.py: tabularizing static data" -POLARS_MAX_THREADS=32 python scripts/tabularize_static.py \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" - - -POLARS_MAX_THREADS=1 -LOG_DIR="logs/$METHOD/$ID-logs" -mkdir -p $LOG_DIR -{ time \ - mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ - python scripts/summarize_over_windows.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 do_overwrite=False \ - "$WINDOW_SIZES" "$AGGS" \ - 2> $LOG_DIR/cmd.stderr -} 2> $LOG_DIR/timings.txt - -cmd_exit_status=${PIPESTATUS[0]} -# Check the exit status of the second command in the pipeline (mprof run ...) -if [ -n "$cmd_exit_status" ] && [ "$cmd_exit_status" -ne 0 ]; then - echo "build_dataset.sh failed with status $cmd_exit_status." - echo "Stderr from build_dataset.sh (see $LOG_DIR/cmd.stderr):" - tail $LOG_DIR/cmd.stderr - exit "$cmd_exit_status" -fi -mprof plot -o $LOG_DIR/mprofile.png $LOG_DIR/mprofile.dat -mprof peak $LOG_DIR/mprofile.dat > $LOG_DIR/peak_memory_usage.txt - - -echo "Running task_specific_caching.py: tabularizing static data" -POLARS_MAX_THREADS=32 python scripts/task_specific_caching.py \ - MEDS_cohort_dir=$MEDS_DIR \ - tabularized_data_dir=$OUTPUT_DIR \ - min_code_inclusion_frequency=1 "$WINDOW_SIZES" do_overwrite=False "$AGGS" diff --git a/hf_cohort/hf_cohort_shard.sh b/hf_cohort/hf_cohort_shard.sh deleted file mode 100644 index 6b81051..0000000 --- a/hf_cohort/hf_cohort_shard.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -OUTPUT_DIR=/data/storage/shared/meds_tabular_ml/ebcl_dataset/processed -PATIENTS_PER_SHARD="5_000" -CHUNKSIZE="200_000_000" - -rm -rf $OUTPUT_DIR - -echo "Running shard_events.py" -POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/shard_events.py \ - raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ - MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ - split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ - split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ - n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True - -echo "Running split_and_shard_patients.py" -POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/split_and_shard_patients.py \ - raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ - MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ - split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ - split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ - n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True - -echo "Running convert_to_sharded_events.py" -POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/convert_to_sharded_events.py \ - raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ - MEDS_cohort_dir=$OUTPUT_DIR \ - 
event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ - split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ - split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ - n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True - -echo "Running merge_to_MEDS_cohort.py" -POLARS_MAX_THREADS=32 python /home/nassim/projects/MEDS_polars_functions/scripts/extraction/merge_to_MEDS_cohort.py \ - raw_cohort_dir=/data/storage/shared/meds_tabular_ml/ebcl_dataset \ - MEDS_cohort_dir=$OUTPUT_DIR \ - event_conversion_config_fp=/home/nassim/projects/MEDS_Tabular_AutoML/hf_cohort/cohort.yaml \ - split_fracs.train=0.6666666666666666 split_fracs.tuning=0.16666666666666666 \ - split_fracs.held_out=0.16666666666666666 row_chunksize=$CHUNKSIZE \ - n_patients_per_shard=$PATIENTS_PER_SHARD hydra.verbose=True diff --git a/hf_cohort/task.yaml b/hf_cohort/task.yaml deleted file mode 100644 index 19ff7f0..0000000 --- a/hf_cohort/task.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Task: 30-day Readmission Risk Prediction -predicates: - admission: - code: ADMIT_DATE - discharge: - code: DISCHARGE_DATE - -trigger: admission - -windows: - input: - start: trigger - end: start -> discharge - start_inclusive: False - end_inclusive: True - target: - start: input.end - end: start + 30 days - start_inclusive: False - end_inclusive: True - label: admission diff --git a/hf_cohort/xgboost.sh b/hf_cohort/xgboost.sh deleted file mode 100644 index d45793a..0000000 --- a/hf_cohort/xgboost.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -BASE_DIR=/storage/shared/meds_tabular_ml/ebcl_dataset/processed_bad_code_cohort -TAB_DIR=$BASE_DIR/tabularize -KEEP_IN_MEMORY=True - -python -m scripts.xgboost MEDS_cohort_dir=$BASE_DIR tabularized_data_dir=$TAB_DIR iterator.keep_data_in_memory=$KEEP_IN_MEMORY From a19ad3ecbb516621d147f2c91576459bb66190f1 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 5 Jun 2024 05:18:35 +0000 Subject: [PATCH 106/106] cleaned up formatting for codebase --- src/MEDS_tabular_automl/configs/describe_codes.yaml | 2 +- src/MEDS_tabular_automl/configs/tabularization.yaml | 2 +- src/MEDS_tabular_automl/configs/task_specific_caching.yaml | 2 +- src/MEDS_tabular_automl/describe_codes.py | 7 +++---- src/MEDS_tabular_automl/generate_summarized_reps.py | 1 + 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index 9aad365..d171513 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -11,4 +11,4 @@ cache_dir: ${MEDS_cohort_dir}/.cache output_dir: ${MEDS_cohort_dir} output_filepath: ${output_dir}/code_metadata.parquet -name: describe_codes \ No newline at end of file +name: describe_codes diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index 222b7af..dd40e3f 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -9,4 +9,4 @@ input_code_metadata_fp: ${MEDS_cohort_dir}/code_metadata.parquet input_dir: ${MEDS_cohort_dir}/final_cohort output_dir: ${MEDS_cohort_dir}/tabularize -name: tabularization \ No newline at end of file +name: tabularization diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 776f6a5..f1ca160 100644 --- 
a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -11,4 +11,4 @@ input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels # Where to output the task specific tabularized data output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache -name: task_specific_caching \ No newline at end of file +name: task_specific_caching diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 5008f2d..de70682 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -1,4 +1,3 @@ -from collections.abc import Mapping from pathlib import Path import polars as pl @@ -112,9 +111,9 @@ def filter_to_codes( feature_freqs = get_feature_freqs(code_metadata_fp) code_freqs = { - code: freq for code, freq in feature_freqs.items() if ( - freq >= min_code_inclusion_frequency and code in set(allowed_codes) - ) + code: freq + for code, freq in feature_freqs.items() + if (freq >= min_code_inclusion_frequency and code in set(allowed_codes)) } return sorted([code for code, freq in code_freqs.items() if freq >= min_code_inclusion_frequency]) diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index a917714..254a381 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import polars as pl + pl.enable_string_cache() from loguru import logger from scipy.sparse import coo_array, csr_array, sparray