Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coding fix #32

Merged
merged 12 commits into from
Nov 12, 2024
22 changes: 11 additions & 11 deletions nbs/01_basic_plots.ipynb

Large diffs are not rendered by default.

18 changes: 15 additions & 3 deletions nbs/05_pheno_loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -284,9 +284,7 @@
" return None\n",
"\n",
" # load data\n",
" if load_func is not None:\n",
" warnings.warn(\"The 'load_func' is deprecated and will be removed in future versions.\")\n",
" else: \n",
" if load_func is None:\n",
" if 'field_type' not in self.dict:\n",
" field_type = None\n",
" else:\n",
Expand Down Expand Up @@ -740,6 +738,20 @@
" self.fields |= set(self.dfs[table_name].columns.tolist())\n",
" self.fields = sorted(list(self.fields))\n",
"\n",
" # Merge the data_codings dataframe with the dictionary dataframe\n",
" if 'data_coding' in self.dict.columns:\n",
" self.data_codings = self.dict\\\n",
" ['data_coding']\\\n",
" .reset_index()\\\n",
" .rename(columns={'data_coding': 'code_number'})\\\n",
" .astype({'code_number': 'str'})\\\n",
" .merge(\n",
" self.data_codings.astype({'code_number': 'str'}), \n",
" on='code_number',\n",
" how='inner'\n",
" )\\\n",
" .set_index('tabular_field_name')\n",
"\n",
" def __load_one_dataframe__(self, relative_location: str) -> pd.DataFrame:\n",
" \"\"\"\n",
" Load one dataframe.\n",
Expand Down
9 changes: 0 additions & 9 deletions nbs/12_cohort_selector.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,6 @@
"For example, the following query selects participants who have moderate obstructive sleep apnea (AHI > 15) based on recordings of at least 4 hours of sleep."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ml = MetaLoader()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
100 changes: 63 additions & 37 deletions nbs/13_questionnaire_handler.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"metadata": {},
"outputs": [],
"source": [
"#| default_exp questionnaires_handler\n"
"#| default_exp questionnaires_handler"
]
},
{
Expand Down Expand Up @@ -204,40 +204,55 @@
" else:\n",
" code_string = convert_to_string(dict_df.loc[tab_field_name][\"data_coding\"])\n",
" \n",
" #getting the data coding df from the large data coding csv\n",
" # Getting the data coding df from the large data coding csv\n",
" code_df = mapping_df[mapping_df[\"code_number\"] == code_string].copy()\n",
" \n",
" #Make sure no leading 0s for coding values\n",
" # Make sure no leading 0s for coding values\n",
" code_df[\"coding\"] = code_df[\"coding\"].apply(convert_to_string)\n",
" cat_ordered = code_df\\\n",
" .astype({'coding': 'int'})\\\n",
" .sort_values('coding')[code_to]\\\n",
" .astype('str')\\\n",
" .drop_duplicates()\n",
" \n",
" mapping_dict = dict(zip(code_df[code_from].astype(str), code_df[code_to]))\n",
" \n",
" \n",
" #adding fail safe incase older dictionaries don't have field type : TODO potentaily remove once older dictionaires are updated\n",
" if 'field_type' in dict_df.columns:\n",
" field_type = dict_df.loc[tab_field_name]['field_type']\n",
" if isinstance(field_type, pd.Series):\n",
" if field_type.nunique() == 1:\n",
" field_type = field_type.iloc[0]\n",
" else:\n",
" warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
" return orig_answer\n",
" \n",
" if field_type == 'Categorical (multiple)': \n",
" normalise_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalise_answer , code_df)\n",
" transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
" field_type = dict_df.loc[tab_field_name]['field_type']\n",
" if isinstance(field_type, pd.Series):\n",
" if field_type.nunique() == 1:\n",
" field_type = field_type.iloc[0]\n",
" else:\n",
" #if categorical single\n",
" normalized_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalized_answer, code_df)\n",
" transformed_answer = normalized_answer.replace(mapping_dict)\n",
" transformed_answer = transformed_answer.astype(\"category\")\n",
"\n",
" return transformed_answer\n",
" warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
" return orig_answer\n",
" \n",
" if field_type == 'Categorical (multiple)': \n",
" normalise_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalise_answer , code_df)\n",
" transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
" else:\n",
" return orig_answer\n",
" normalized_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalized_answer, code_df)\n",
" transformed_answer = normalized_answer.replace(mapping_dict)\n",
" transformed_answer = pd.Categorical(transformed_answer,\n",
" categories=cat_ordered, \n",
" ordered=True\n",
" )\n",
" # We update the dictionary to ensure that the categories are not reset later\n",
" dict_df.loc[tab_field_name, 'pandas_dtype'] = 'category_ordered'\n",
"\n",
" return transformed_answer\n",
"\n",
"def convert_codings_to_int(df: pd.Series, dict_df: pd.DataFrame) -> pd.Series:\n",
" tabular_field_name = df.name\n",
" field_array = dict_df.loc[tabular_field_name, 'array']\n",
" if isinstance(field_array, pd.Series):\n",
" field_array = field_array.iloc[0]\n",
" if field_array == 'Multiple':\n",
" return df\n",
" else: \n",
" dict_df.loc[tabular_field_name, 'pandas_dtype'] = 'Int16'\n",
" return df.astype('Int16', errors='ignore')\n",
"\n",
"def transform_dataframe(\n",
" df: pd.DataFrame,\n",
Expand All @@ -246,20 +261,31 @@
" dict_df: pd.DataFrame,\n",
" mapping_df: pd.DataFrame,\n",
") -> pd.DataFrame:\n",
" if 'data_coding' not in dict_df.columns or transform_from == transform_to:\n",
" if 'data_coding' not in dict_df.columns:\n",
" warnings.warn(\"data_coding column not found in dictionary, skipping transformation\")\n",
" return df\n",
" \n",
" # Validate input parameters\n",
" if transform_from not in valid_codings or transform_to not in valid_codings:\n",
" raise ValueError(f\"transform_from and transform_to must be one of {valid_codings}\")\n",
"\n",
" # Only fields with a code in data_coding property will be transformed\n",
" fields_for_translation = dict_df[pd.notna(dict_df.data_coding)].index.intersection(df.columns)\n",
" if len(fields_for_translation) == 0:\n",
" if len(fields_for_translation) == 0: # No fields with data_coding code\n",
" return df\n",
"\n",
" transformed_df = df.copy()\n",
" for column in fields_for_translation:\n",
" data_coding = dict_df.loc[column, 'data_coding']\n",
" # Handle the case where data_coding is a Series (multiple entries)\n",
" if isinstance(data_coding, pd.Series):\n",
" if data_coding.nunique() > 1:\n",
" warnings.warn(f\"Multiple different data_coding values found for column {column}. Using first value.\")\n",
" data_coding = data_coding.iloc[0]\n",
" \n",
" if pd.notna(data_coding):\n",
"\n",
" if pd.isna(data_coding):\n",
" continue\n",
"\n",
" if transform_from != transform_to:\n",
" transformed_df[column] = transform_answers(\n",
" column,\n",
" transformed_df[column],\n",
Expand All @@ -268,15 +294,15 @@
" dict_df,\n",
" mapping_df\n",
" )\n",
" \n",
" if transform_to == 'coding':\n",
" transformed_df[column] = convert_codings_to_int(\n",
" transformed_df[column], \n",
" dict_df=dict_df\n",
" )\n",
"\n",
" return transformed_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading
Loading