diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 6aa8294..6b8286e 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -33,7 +33,7 @@ jobs: #---------------------------------------------- - name: Run tests run: | - pytest -v --doctest-modules --cov --ignore=hf_cohort/ + pytest -v --doctest-modules --cov --ignore=docs - name: Upload coverage to Codecov uses: codecov/codecov-action@v4.0.1 diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 40169ce..e50ea1d 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -6,9 +6,14 @@ from loguru import logger from scipy.sparse import coo_array, csr_array, sparray -from MEDS_tabular_automl.generate_ts_features import get_feature_names, get_flat_ts_rep from MEDS_tabular_automl.describe_codes import get_feature_columns -from MEDS_tabular_automl.utils import CODE_AGGREGATIONS, VALUE_AGGREGATIONS, load_tqdm, get_min_dtype +from MEDS_tabular_automl.generate_ts_features import get_feature_names, get_flat_ts_rep +from MEDS_tabular_automl.utils import ( + CODE_AGGREGATIONS, + VALUE_AGGREGATIONS, + get_min_dtype, + load_tqdm, +) def sparse_aggregate(sparse_matrix, agg): @@ -250,13 +255,11 @@ def generate_summary( if __name__ == "__main__": - import json from pathlib import Path - # feature_columns_fp = Path("/storage/shared/meds_tabular_ml/mimiciv_dataset/mimiciv_MEDS") / "tabularized_code_metadata.parquet" - # shard_fp = Path("/storage/shared/meds_tabular_ml/mimiciv_dataset/mimiciv_MEDS/final_cohort/train/0.parquet") - - feature_columns_fp = Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed") / "tabularized_code_metadata.parquet" + feature_columns_fp = ( + Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed") / "tabularized_code_metadata.parquet" + ) shard_fp = Path("/storage/shared/meds_tabular_ml/ebcl_dataset/processed/final_cohort/train/0.parquet") feature_columns = get_feature_columns(feature_columns_fp) diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 7449321..c48e59a 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -5,6 +5,7 @@ pl.enable_string_cache() +import gc from importlib.resources import files from itertools import product from pathlib import Path @@ -27,7 +28,6 @@ load_tqdm, write_df, ) -import gc config_yaml = files("MEDS_tabular_automl").joinpath("configs/tabularization.yaml") if not config_yaml.is_file(): diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index ebe895e..516a1da 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -73,11 +73,43 @@ def array_to_sparse_matrix(array: np.ndarray, shape: tuple[int, int]): return coo_array((data, (row, col)), shape=shape) -def get_min_dtype(array): - try: +def get_min_dtype(array: np.ndarray) -> np.dtype: + """Get the minimal dtype that can represent the array. + + Args: + array: The array to determine the minimal dtype for. + + Returns: + The minimal dtype that can represent the array, or the array's dtype if it is non-numeric. + + Examples: + >>> get_min_dtype(np.array([1, 2, 3])) + dtype('uint8') + >>> get_min_dtype(np.array([1, 2, 3, int(1e9)])) + dtype('uint32') + >>> get_min_dtype(np.array([1, 2, 3, int(1e18)])) + dtype('uint64') + >>> get_min_dtype(np.array([1, 2, 3, -128])) + dtype('int8') + >>> get_min_dtype(np.array([1.0, 2.0, 3.0])) + dtype('float32') + >>> get_min_dtype(np.array([1, 2, 3, np.nan])) + dtype('float32') + >>> get_min_dtype(np.array([1, 2, 3, "a"])) + dtype('