PhenoAI · alondmnt · Nov 12, 2024 · Aug 14, 2024 · Sep 3, 2024 · Nov 8, 2024
diff --git a/nbs/01_basic_plots.ipynb b/nbs/01_basic_plots.ipynb
diff --git a/nbs/05_pheno_loader.ipynb b/nbs/05_pheno_loader.ipynb
@@ -284,9 +284,7 @@
     "            return None\n",
     "\n",
     "        # load data\n",
-    "        if load_func is not None:\n",
-    "            warnings.warn(\"The 'load_func' is deprecated and will be removed in future versions.\")\n",
-    "        else: \n",
+    "        if load_func is None:\n",
     "            if 'field_type' not in self.dict:\n",
     "                field_type = None\n",
     "            else:\n",
@@ -740,6 +738,20 @@
     "            self.fields |= set(self.dfs[table_name].columns.tolist())\n",
     "        self.fields = sorted(list(self.fields))\n",
     "\n",
+    "        # Merge the data_codings dataframe with the dictionary dataframe\n",
+    "        if 'data_coding' in self.dict.columns:\n",
+    "            self.data_codings = self.dict\\\n",
+    "                ['data_coding']\\\n",
+    "                .reset_index()\\\n",
+    "                .rename(columns={'data_coding': 'code_number'})\\\n",
+    "                .astype({'code_number': 'str'})\\\n",
+    "                .merge(\n",
+    "                    self.data_codings.astype({'code_number': 'str'}), \n",
+    "                    on='code_number',\n",
+    "                    how='inner'\n",
+    "                )\\\n",
+    "                .set_index('tabular_field_name')\n",
+    "\n",
     "    def __load_one_dataframe__(self, relative_location: str) -> pd.DataFrame:\n",
     "        \"\"\"\n",
     "        Load one dataframe.\n",

diff --git a/nbs/12_cohort_selector.ipynb b/nbs/12_cohort_selector.ipynb
@@ -180,15 +180,6 @@
     "For example, the following query selects participants who have moderate obstructive sleep apnea (AHI > 15) based on recordings of at least 4 hours of sleep."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ml = MetaLoader()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/nbs/13_questionnaire_handler.ipynb b/nbs/13_questionnaire_handler.ipynb
@@ -18,7 +18,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| default_exp questionnaires_handler\n"
+    "#| default_exp questionnaires_handler"
    ]
   },
   {
@@ -204,40 +204,55 @@
     "    else:\n",
     "        code_string = convert_to_string(dict_df.loc[tab_field_name][\"data_coding\"])\n",
     "    \n",
-    "    #getting the data coding df from the large data coding csv\n",
+    "    # Getting the data coding df from the large data coding csv\n",
     "    code_df = mapping_df[mapping_df[\"code_number\"] == code_string].copy()\n",
     "    \n",
-    "    #Make sure no leading 0s for coding values\n",
+    "    # Make sure no leading 0s for coding values\n",
     "    code_df[\"coding\"] =  code_df[\"coding\"].apply(convert_to_string)\n",
+    "    cat_ordered = code_df\\\n",
+    "        .astype({'coding': 'int'})\\\n",
+    "        .sort_values('coding')[code_to]\\\n",
+    "        .astype('str')\\\n",
+    "        .drop_duplicates()\n",
     "    \n",
     "    mapping_dict = dict(zip(code_df[code_from].astype(str), code_df[code_to]))\n",
     "    \n",
     "  \n",
-    "  #adding fail safe incase older dictionaries don't have field type : TODO potentaily remove once older dictionaires are updated\n",
-    "    if 'field_type' in dict_df.columns:\n",
-    "        field_type =  dict_df.loc[tab_field_name]['field_type']\n",
-    "        if isinstance(field_type, pd.Series):\n",
-    "            if field_type.nunique() == 1:\n",
-    "                field_type = field_type.iloc[0]\n",
-    "            else:\n",
-    "                warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
-    "                return orig_answer\n",
-    "        \n",
-    "        if field_type == 'Categorical (multiple)': \n",
-    "            normalise_answer = normalize_answers(orig_answer, field_type)\n",
-    "            check_invalid_values(normalise_answer , code_df)\n",
-    "            transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
+    "    field_type =  dict_df.loc[tab_field_name]['field_type']\n",
+    "    if isinstance(field_type, pd.Series):\n",
+    "        if field_type.nunique() == 1:\n",
+    "            field_type = field_type.iloc[0]\n",
     "        else:\n",
-    "            #if categorical single\n",
-    "            normalized_answer = normalize_answers(orig_answer, field_type)\n",
-    "            check_invalid_values(normalized_answer, code_df)\n",
-    "            transformed_answer = normalized_answer.replace(mapping_dict)\n",
-    "            transformed_answer = transformed_answer.astype(\"category\")\n",
-    "\n",
-    "        return transformed_answer\n",
+    "            warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
+    "            return orig_answer\n",
+    "    \n",
+    "    if field_type == 'Categorical (multiple)': \n",
+    "        normalise_answer = normalize_answers(orig_answer, field_type)\n",
+    "        check_invalid_values(normalise_answer , code_df)\n",
+    "        transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
     "    else:\n",
-    "        return orig_answer\n",
+    "        normalized_answer = normalize_answers(orig_answer, field_type)\n",
+    "        check_invalid_values(normalized_answer, code_df)\n",
+    "        transformed_answer = normalized_answer.replace(mapping_dict)\n",
+    "        transformed_answer = pd.Categorical(transformed_answer,\n",
+    "            categories=cat_ordered, \n",
+    "            ordered=True\n",
+    "        )\n",
+    "        # We update the dictionary to ensure that the categories are not reset later\n",
+    "        dict_df.loc[tab_field_name, 'pandas_dtype'] = 'category_ordered'\n",
     "\n",
+    "    return transformed_answer\n",
+    "\n",
+    "def convert_codings_to_int(df: pd.Series, dict_df: pd.DataFrame) -> pd.Series:\n",
+    "    tabular_field_name = df.name\n",
+    "    field_array = dict_df.loc[tabular_field_name, 'array']\n",
+    "    if isinstance(field_array, pd.Series):\n",
+    "        field_array = field_array.iloc[0]\n",
+    "    if field_array == 'Multiple':\n",
+    "        return df\n",
+    "    else: \n",
+    "        dict_df.loc[tabular_field_name, 'pandas_dtype'] = 'Int16'\n",
+    "        return df.astype('Int16', errors='ignore')\n",
     "\n",
     "def transform_dataframe(\n",
     "    df: pd.DataFrame,\n",
@@ -246,20 +261,31 @@
     "    dict_df: pd.DataFrame,\n",
     "    mapping_df: pd.DataFrame,\n",
     ") -> pd.DataFrame:\n",
-    "    if 'data_coding' not in dict_df.columns or transform_from == transform_to:\n",
+    "    if 'data_coding' not in dict_df.columns:\n",
+    "        warnings.warn(\"data_coding column not found in dictionary, skipping transformation\")\n",
     "        return df\n",
-    "    \n",
+    "    # Validate input parameters\n",
+    "    if transform_from not in valid_codings or transform_to not in valid_codings:\n",
+    "        raise ValueError(f\"transform_from and transform_to must be one of {valid_codings}\")\n",
+    "\n",
+    "    # Only fields with a code in data_coding property will be transformed\n",
     "    fields_for_translation = dict_df[pd.notna(dict_df.data_coding)].index.intersection(df.columns)\n",
-    "    if len(fields_for_translation) == 0:\n",
+    "    if len(fields_for_translation) == 0: # No fields with data_coding code\n",
     "        return df\n",
+    "\n",
     "    transformed_df = df.copy()\n",
     "    for column in fields_for_translation:\n",
     "        data_coding = dict_df.loc[column, 'data_coding']\n",
     "        # Handle the case where data_coding is a Series (multiple entries)\n",
     "        if isinstance(data_coding, pd.Series):\n",
+    "            if data_coding.nunique() > 1:\n",
+    "                warnings.warn(f\"Multiple different data_coding values found for column {column}. Using first value.\")\n",
     "            data_coding = data_coding.iloc[0]\n",
-    "        \n",
-    "        if pd.notna(data_coding):\n",
+    "\n",
+    "        if pd.isna(data_coding):\n",
+    "            continue\n",
+    "\n",
+    "        if transform_from != transform_to:\n",
     "            transformed_df[column] = transform_answers(\n",
     "                    column,\n",
     "                    transformed_df[column],\n",
@@ -268,15 +294,15 @@
     "                    dict_df,\n",
     "                    mapping_df\n",
     "                )\n",
+    "    \n",
+    "        if transform_to == 'coding':\n",
+    "            transformed_df[column] = convert_codings_to_int(\n",
+    "                transformed_df[column], \n",
+    "                dict_df=dict_df\n",
+    "            )\n",
+    "\n",
     "    return transformed_df"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {