chore: Add Py03-Assignments.ipynb to gitignore

stawiskm · Aug 22, 2024 · c6a88f6 · c6a88f6
1 parent 571b965
commit c6a88f6
Show file tree

Hide file tree

Showing 2 changed files with 350 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@ Code_Examples/Day 4/Data/Pancreas
 .vscode
 Assignments/*.ipynb
 Assignments/StudAssignments
-!Assignments/Py02-Assignments.ipynb
+!Assignments/Py02-Assignments.ipynb
+!Assignments/Py03-Assignments.ipynb
diff --git a/Assignments/Py03-Assignments.ipynb b/Assignments/Py03-Assignments.ipynb
@@ -0,0 +1,348 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Missing values in each column:\n",
+      "Patient_ID                   0\n",
+      "Age                          0\n",
+      "Gender                       0\n",
+      "Primary_Diagnosis            0\n",
+      "Num_Comorbidities            0\n",
+      "Length_of_Stay               0\n",
+      "Discharge_Destination        0\n",
+      "Follow_Up_Appointment        0\n",
+      "Readmitted_Within_30_Days    0\n",
+      "dtype: int64\n",
+      "Readmission rate by Gender:\n",
+      "Gender\n",
+      "female    0.136364\n",
+      "male      0.160714\n",
+      "Name: Readmitted_Within_30_Days, dtype: float64\n",
+      "\n",
+      "Readmission rate by Primary Diagnosis:\n",
+      "Primary_Diagnosis\n",
+      "copd              0.153846\n",
+      "diabetes          0.192308\n",
+      "heart failure     0.200000\n",
+      "kidney disease    0.058824\n",
+      "pneumonia         0.125000\n",
+      "Name: Readmitted_Within_30_Days, dtype: float64\n",
+      "\n",
+      "Readmission rate by Length of Stay Category:\n",
+      "Length_of_Stay_Category\n",
+      "Short        0.6\n",
+      "Medium       0.0\n",
+      "Long         0.0\n",
+      "Very Long    0.0\n",
+      "Name: Readmitted_Within_30_Days, dtype: float64\n",
+      "\n",
+      "Readmission rate by Age Group:\n",
+      "Age_Group\n",
+      "Young          0.190476\n",
+      "Middle-aged    0.260870\n",
+      "Senior         0.103448\n",
+      "Elderly        0.074074\n",
+      "Name: Readmitted_Within_30_Days, dtype: float64\n",
+      "\n",
+      "Readmission rate by Discharge Destination:\n",
+      "Discharge_Destination\n",
+      "home                     0.205882\n",
+      "home with services       0.160000\n",
+      "nursing home             0.076923\n",
+      "rehabilitation center    0.133333\n",
+      "Name: Readmitted_Within_30_Days, dtype: float64\n",
+      "\n",
+      "Readmission rate by Comorbidity Burden:\n",
+      "Comorbidity_Burden\n",
+      "None      0.0\n",
+      "Low       0.0\n",
+      "Medium    NaN\n",
+      "High      0.6\n",
+      "Name: Readmitted_Within_30_Days, dtype: float64\n",
+      "\n",
+      "Summary statistics of cleaned data:\n",
+      "       Patient_ID         Age  Num_Comorbidities  Length_of_Stay  \\\n",
+      "count  100.000000  100.000000         100.000000      100.000000   \n",
+      "mean    50.500000   52.870000           1.630000        9.930000   \n",
+      "std     29.011492   21.901365           2.003305        6.267207   \n",
+      "min      1.000000   18.000000           0.000000        1.000000   \n",
+      "25%     25.750000   32.000000           0.000000        4.500000   \n",
+      "50%     50.500000   53.500000           1.000000        9.000000   \n",
+      "75%     75.250000   71.250000           2.000000       16.000000   \n",
+      "max    100.000000   89.000000           5.000000       20.000000   \n",
+      "\n",
+      "       Follow_Up_Appointment  Readmitted_Within_30_Days  \n",
+      "count                 100.00                  100.00000  \n",
+      "mean                    0.45                    0.15000  \n",
+      "std                     0.50                    0.35887  \n",
+      "min                     0.00                    0.00000  \n",
+      "25%                     0.00                    0.00000  \n",
+      "50%                     0.00                    0.00000  \n",
+      "75%                     1.00                    0.00000  \n",
+      "max                     1.00                    1.00000  \n",
+      "\n",
+      "Count of patients by Age Group:\n",
+      "Age_Group\n",
+      "Senior         29\n",
+      "Elderly        27\n",
+      "Middle-aged    23\n",
+      "Young          21\n",
+      "Name: count, dtype: int64\n",
+      "\n",
+      "Count of patients by Length of Stay Category:\n",
+      "Length_of_Stay_Category\n",
+      "Long         30\n",
+      "Very Long    28\n",
+      "Short        25\n",
+      "Medium       17\n",
+      "Name: count, dtype: int64\n",
+      "\n",
+      "Correlation matrix: of values abs(corr) > 0.8\n",
+      "Num_Comorbidities              Length_of_Stay_Category_Short    0.976123\n",
+      "                               Comorbidity_Burden_High          0.976123\n",
+      "Gender_female                  Gender_male                     -1.000000\n",
+      "Gender_male                    Gender_female                   -1.000000\n",
+      "Length_of_Stay_Category_Short  Num_Comorbidities                0.976123\n",
+      "Comorbidity_Burden_High        Num_Comorbidities                0.976123\n",
+      "dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Load the data\n",
+    "url= \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_hospital_data.csv\"\n",
+    "data = pd.read_csv(url)\n",
+    "\n",
+    "# 1. Data Cleaning\n",
+    "# -----------------\n",
+    "\n",
+    "# Report the number of missing values per column\n",
+    "missing_values = data.isnull().sum()\n",
+    "print(\"Missing values in each column:\")\n",
+    "print(missing_values)\n",
+    "\n",
+    "# Drop rows with missing values (if any)\n",
+    "data_cleaned = data.dropna()\n",
+    "\n",
+    "# Count exact duplicate rows (kept for inspection, not printed)\n",
+    "duplicate_rows = data_cleaned.duplicated().sum()\n",
+    "\n",
+    "# Drop duplicate rows (if any). The explicit copy makes data_cleaned an\n",
+    "# independent frame before new columns are assigned to it below.\n",
+    "data_cleaned = data_cleaned.drop_duplicates().copy()\n",
+    "\n",
+    "# Standardize categorical text columns to lower case\n",
+    "for column in ['Gender', 'Primary_Diagnosis', 'Discharge_Destination']:\n",
+    "    data_cleaned[column] = data_cleaned[column].str.lower()\n",
+    "\n",
+    "# 2. Feature Engineering\n",
+    "# ----------------------\n",
+    "\n",
+    "# Length of Stay Category: (0, 3] Short, (3, 7] Medium, (7, 14] Long, above 14 Very Long\n",
+    "bins = [0, 3, 7, 14, np.inf]\n",
+    "labels = ['Short', 'Medium', 'Long', 'Very Long']\n",
+    "data_cleaned['Length_of_Stay_Category'] = pd.cut(data_cleaned['Length_of_Stay'], bins=bins, labels=labels)\n",
+    "\n",
+    "# Age Group: (0, 30] Young, (30, 50] Middle-aged, (50, 70] Senior, above 70 Elderly\n",
+    "age_bins = [0, 30, 50, 70, np.inf]\n",
+    "age_labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']\n",
+    "data_cleaned['Age_Group'] = pd.cut(data_cleaned['Age'], bins=age_bins, labels=age_labels)\n",
+    "\n",
+    "# Comorbidity Burden: 0 -> None, 1-2 -> Low, 3-4 -> Medium, 5 or more -> High\n",
+    "# (the first edge is -1 so that a count of 0 falls inside the first bin)\n",
+    "data_cleaned['Comorbidity_Burden'] = pd.cut(data_cleaned['Num_Comorbidities'], bins=[-1, 0, 2, 4, np.inf], labels=['None', 'Low', 'Medium', 'High'])\n",
+    "\n",
+    "# 3. Data Analysis\n",
+    "# ----------------\n",
+    "\n",
+    "def readmission_rate_by(frame, column):\n",
+    "    \"\"\"Return the mean of Readmitted_Within_30_Days for each value of `column`.\n",
+    "\n",
+    "    observed=False is passed explicitly so that empty bins of a categorical\n",
+    "    column (as produced by pd.cut) stay in the result as NaN instead of\n",
+    "    depending on the pandas default, which is deprecated and due to change.\n",
+    "    \"\"\"\n",
+    "    return frame.groupby(column, observed=False)['Readmitted_Within_30_Days'].mean()\n",
+    "\n",
+    "# Readmission rate by Gender\n",
+    "readmission_by_gender = readmission_rate_by(data_cleaned, 'Gender')\n",
+    "print(\"Readmission rate by Gender:\")\n",
+    "print(readmission_by_gender)\n",
+    "\n",
+    "# Readmission rate by Primary Diagnosis\n",
+    "readmission_by_diagnosis = readmission_rate_by(data_cleaned, 'Primary_Diagnosis')\n",
+    "print(\"\\nReadmission rate by Primary Diagnosis:\")\n",
+    "print(readmission_by_diagnosis)\n",
+    "\n",
+    "# Readmission rate by Length of Stay Category\n",
+    "readmission_by_los = readmission_rate_by(data_cleaned, 'Length_of_Stay_Category')\n",
+    "print(\"\\nReadmission rate by Length of Stay Category:\")\n",
+    "print(readmission_by_los)\n",
+    "\n",
+    "# Readmission rate by Age Group\n",
+    "readmission_by_age_group = readmission_rate_by(data_cleaned, 'Age_Group')\n",
+    "print(\"\\nReadmission rate by Age Group:\")\n",
+    "print(readmission_by_age_group)\n",
+    "\n",
+    "# Readmission rate by Discharge Destination\n",
+    "readmission_by_discharge = readmission_rate_by(data_cleaned, 'Discharge_Destination')\n",
+    "print(\"\\nReadmission rate by Discharge Destination:\")\n",
+    "print(readmission_by_discharge)\n",
+    "\n",
+    "# Readmission rate by Comorbidity Burden\n",
+    "readmission_by_comorbidity = readmission_rate_by(data_cleaned, 'Comorbidity_Burden')\n",
+    "print(\"\\nReadmission rate by Comorbidity Burden:\")\n",
+    "print(readmission_by_comorbidity)\n",
+    "\n",
+    "# Summary statistics of the numeric columns\n",
+    "summary_stats = data_cleaned.describe()\n",
+    "print(\"\\nSummary statistics of cleaned data:\")\n",
+    "print(summary_stats)\n",
+    "\n",
+    "# Count of patients in each engineered group\n",
+    "age_group_counts = data_cleaned['Age_Group'].value_counts()\n",
+    "length_of_stay_counts = data_cleaned['Length_of_Stay_Category'].value_counts()\n",
+    "\n",
+    "print(\"\\nCount of patients by Age Group:\")\n",
+    "print(age_group_counts)\n",
+    "\n",
+    "print(\"\\nCount of patients by Length of Stay Category:\")\n",
+    "print(length_of_stay_counts)\n",
+    "\n",
+    "# Correlation Matrix\n",
+    "# Convert categorical features to numerical using one-hot encoding\n",
+    "# (note: from here on data_cleaned holds the one-hot encoded frame)\n",
+    "data_cleaned = pd.get_dummies(data_cleaned)\n",
+    "correlation_matrix = data_cleaned.corr()\n",
+    "print(\"\\nCorrelation matrix: of values abs(corr) > 0.8\")\n",
+    "# Keep only the cells strictly above the diagonal so that every feature pair\n",
+    "# is listed once. The mask works by position, not by value: filtering on\n",
+    "# corr < 1 would also hide distinct features that are perfectly correlated.\n",
+    "upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)\n",
+    "strong_correlations = correlation_matrix.where(upper_triangle).stack()\n",
+    "# NaN cells (masked or undefined correlations) fail the comparison and drop out\n",
+    "strong_correlations = strong_correlations[strong_correlations.abs() > 0.8]\n",
+    "print(strong_correlations)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Adherence_Level        Age  Num_Medications                Condition  \\\n",
+      "0            High  52.164444         4.333333             Hypertension   \n",
+      "1             Low  75.969697        10.166667             Hypertension   \n",
+      "2          Medium  72.222222        10.000000  Hypertension & Diabetes   \n",
+      "\n",
+      "   Patient_ID  \n",
+      "0          70  \n",
+      "1          24  \n",
+      "2           8  \n",
+      "Patients at high risk of non-adherence:\n",
+      "     Patient_ID  Age\n",
+      "1             2   84\n",
+      "5            13   85\n",
+      "26           73   68\n",
+      "28           17   67\n",
+      "115          11   75\n",
+      "125          37   76\n",
+      "128           5   72\n",
+      "131          83   72\n",
+      "140          89   80\n",
+      "162          97   73\n",
+      "171          23   70\n",
+      "179          53   73\n",
+      "185          67   72\n",
+      "188           7   71\n",
+      "191          29   76\n",
+      "230          31   81\n",
+      "232          47   71\n",
+      "254          43   83\n",
+      "273          41   67\n",
+      "287          59   76\n",
+      "289          71   86\n",
+      "294          61   79\n",
+      "298          19   85\n",
+      "299           3   79\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "# Locations of the two source CSV files\n",
+    "url1 = \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_patients.csv\"\n",
+    "url2 = \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_visits.csv\"\n",
+    "# Read the patient table and the visit table\n",
+    "patients_df = pd.read_csv(url1)\n",
+    "visits_df = pd.read_csv(url2)\n",
+    "\n",
+    "# 1. Data Merging: attach the patient columns to every visit row\n",
+    "# Both tables are assumed to share the key column 'Patient_ID'; the inner\n",
+    "# join keeps only visits whose Patient_ID also appears in the patient table\n",
+    "merged_df = visits_df.merge(patients_df, how='inner', on='Patient_ID')\n",
+    "\n",
+    "# 2. Adherence Calculation: percentage of prescribed doses reported as taken\n",
+    "# Adherence Rate = (Reported Doses Taken / Prescribed Doses) * 100\n",
+    "merged_df['Adherence_Rate'] = merged_df['Reported_Doses_Taken'].div(merged_df['Prescribed_Doses']).mul(100)\n",
+    "\n",
+    "# 3. Risk Stratification: Classify patients into risk categories based on adherence rate\n",
+    "def classify_adherence(rate):\n",
+    "    if rate >= 80:\n",
+    "        return 'High'\n",
+    "    elif 50 <= rate < 80:\n",
+    "        return 'Medium'\n",
+    "    else:\n",
+    "        return 'Low'\n",
+    "\n",
+    "# Label every visit row with the adherence level of its rate\n",
+    "merged_df['Adherence_Level'] = merged_df['Adherence_Rate'].apply(classify_adherence)\n",
+    "\n",
+    "# Profile each adherence level: mean age, mean number of medications,\n",
+    "# most frequent condition(s) and the number of distinct patients\n",
+    "grouped_df = (\n",
+    "    merged_df\n",
+    "    .groupby('Adherence_Level')\n",
+    "    .agg({'Age': 'mean', 'Num_Medications': 'mean', 'Condition': pd.Series.mode, 'Patient_ID': 'nunique'})\n",
+    "    .reset_index()\n",
+    ")\n",
+    "\n",
+    "# Show the per-level profile\n",
+    "print(grouped_df)\n",
+    "\n",
+    "# Example: list the patients that have at least one visit classified as 'Low'\n",
+    "is_low_adherence = merged_df['Adherence_Level'].eq('Low')\n",
+    "low_adherence_patients = merged_df.loc[is_low_adherence]\n",
+    "print(\"Patients at high risk of non-adherence:\")\n",
+    "print(low_adherence_patients.loc[:, ['Patient_ID', 'Age']].drop_duplicates())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "datascience",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}