-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: Add Py03-Assignments.ipynb to gitignore
- Loading branch information
marc
committed
Aug 22, 2024
1 parent
571b965
commit c6a88f6
Showing
2 changed files
with
350 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,348 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Missing values in each column:\n", | ||
"Patient_ID 0\n", | ||
"Age 0\n", | ||
"Gender 0\n", | ||
"Primary_Diagnosis 0\n", | ||
"Num_Comorbidities 0\n", | ||
"Length_of_Stay 0\n", | ||
"Discharge_Destination 0\n", | ||
"Follow_Up_Appointment 0\n", | ||
"Readmitted_Within_30_Days 0\n", | ||
"dtype: int64\n", | ||
"Readmission rate by Gender:\n", | ||
"Gender\n", | ||
"female 0.136364\n", | ||
"male 0.160714\n", | ||
"Name: Readmitted_Within_30_Days, dtype: float64\n", | ||
"\n", | ||
"Readmission rate by Primary Diagnosis:\n", | ||
"Primary_Diagnosis\n", | ||
"copd 0.153846\n", | ||
"diabetes 0.192308\n", | ||
"heart failure 0.200000\n", | ||
"kidney disease 0.058824\n", | ||
"pneumonia 0.125000\n", | ||
"Name: Readmitted_Within_30_Days, dtype: float64\n", | ||
"\n", | ||
"Readmission rate by Length of Stay Category:\n", | ||
"Length_of_Stay_Category\n", | ||
"Short 0.6\n", | ||
"Medium 0.0\n", | ||
"Long 0.0\n", | ||
"Very Long 0.0\n", | ||
"Name: Readmitted_Within_30_Days, dtype: float64\n", | ||
"\n", | ||
"Readmission rate by Age Group:\n", | ||
"Age_Group\n", | ||
"Young 0.190476\n", | ||
"Middle-aged 0.260870\n", | ||
"Senior 0.103448\n", | ||
"Elderly 0.074074\n", | ||
"Name: Readmitted_Within_30_Days, dtype: float64\n", | ||
"\n", | ||
"Readmission rate by Discharge Destination:\n", | ||
"Discharge_Destination\n", | ||
"home 0.205882\n", | ||
"home with services 0.160000\n", | ||
"nursing home 0.076923\n", | ||
"rehabilitation center 0.133333\n", | ||
"Name: Readmitted_Within_30_Days, dtype: float64\n", | ||
"\n", | ||
"Readmission rate by Comorbidity Burden:\n", | ||
"Comorbidity_Burden\n", | ||
"None 0.0\n", | ||
"Low 0.0\n", | ||
"Medium NaN\n", | ||
"High 0.6\n", | ||
"Name: Readmitted_Within_30_Days, dtype: float64\n", | ||
"\n", | ||
"Summary statistics of cleaned data:\n", | ||
" Patient_ID Age Num_Comorbidities Length_of_Stay \\\n", | ||
"count 100.000000 100.000000 100.000000 100.000000 \n", | ||
"mean 50.500000 52.870000 1.630000 9.930000 \n", | ||
"std 29.011492 21.901365 2.003305 6.267207 \n", | ||
"min 1.000000 18.000000 0.000000 1.000000 \n", | ||
"25% 25.750000 32.000000 0.000000 4.500000 \n", | ||
"50% 50.500000 53.500000 1.000000 9.000000 \n", | ||
"75% 75.250000 71.250000 2.000000 16.000000 \n", | ||
"max 100.000000 89.000000 5.000000 20.000000 \n", | ||
"\n", | ||
" Follow_Up_Appointment Readmitted_Within_30_Days \n", | ||
"count 100.00 100.00000 \n", | ||
"mean 0.45 0.15000 \n", | ||
"std 0.50 0.35887 \n", | ||
"min 0.00 0.00000 \n", | ||
"25% 0.00 0.00000 \n", | ||
"50% 0.00 0.00000 \n", | ||
"75% 1.00 0.00000 \n", | ||
"max 1.00 1.00000 \n", | ||
"\n", | ||
"Count of patients by Age Group:\n", | ||
"Age_Group\n", | ||
"Senior 29\n", | ||
"Elderly 27\n", | ||
"Middle-aged 23\n", | ||
"Young 21\n", | ||
"Name: count, dtype: int64\n", | ||
"\n", | ||
"Count of patients by Length of Stay Category:\n", | ||
"Length_of_Stay_Category\n", | ||
"Long 30\n", | ||
"Very Long 28\n", | ||
"Short 25\n", | ||
"Medium 17\n", | ||
"Name: count, dtype: int64\n", | ||
"\n", | ||
"Correlation matrix: of values abs(corr) > 0.8\n", | ||
"Num_Comorbidities Length_of_Stay_Category_Short 0.976123\n", | ||
" Comorbidity_Burden_High 0.976123\n", | ||
"Gender_female Gender_male -1.000000\n", | ||
"Gender_male Gender_female -1.000000\n", | ||
"Length_of_Stay_Category_Short Num_Comorbidities 0.976123\n", | ||
"Comorbidity_Burden_High Num_Comorbidities 0.976123\n", | ||
"dtype: float64\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"\n", | ||
"# Load the data\n", | ||
"url= \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_hospital_data.csv\"\n", | ||
"data = pd.read_csv(url)\n", | ||
"\n", | ||
"# 1. Data Cleaning\n", | ||
"# -----------------\n", | ||
"\n", | ||
"# Check for missing values\n", | ||
"missing_values = data.isnull().sum()\n", | ||
"print(\"Missing values in each column:\")\n", | ||
"print(missing_values)\n", | ||
"\n", | ||
"# Drop rows with missing values (if any)\n", | ||
"data_cleaned = data.dropna()\n", | ||
"\n", | ||
"# Check for duplicate rows\n", | ||
"duplicate_rows = data_cleaned.duplicated().sum()\n", | ||
"# print(f\"Number of duplicate rows: {duplicate_rows}\")\n", | ||
"\n", | ||
"# Drop duplicate rows (if any)\n", | ||
"data_cleaned = data_cleaned.drop_duplicates()\n", | ||
"\n", | ||
"# Standardize categorical data (convert to lower case or consistent format)\n", | ||
"data_cleaned['Gender'] = data_cleaned['Gender'].str.lower()\n", | ||
"data_cleaned['Primary_Diagnosis'] = data_cleaned['Primary_Diagnosis'].str.lower()\n", | ||
"data_cleaned['Discharge_Destination'] = data_cleaned['Discharge_Destination'].str.lower()\n", | ||
"\n", | ||
"# 2. Feature Engineering\n", | ||
"# ----------------------\n", | ||
"\n", | ||
"# Create a new feature: Length of Stay Category\n", | ||
"bins = [0, 3, 7, 14, np.inf]\n", | ||
"labels = ['Short', 'Medium', 'Long', 'Very Long']\n", | ||
"data_cleaned['Length_of_Stay_Category'] = pd.cut(data_cleaned['Length_of_Stay'], bins=bins, labels=labels)\n", | ||
"\n", | ||
"# Create a new feature: Age Group\n", | ||
"age_bins = [0, 30, 50, 70, np.inf]\n", | ||
"age_labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']\n", | ||
"data_cleaned['Age_Group'] = pd.cut(data_cleaned['Age'], bins=age_bins, labels=age_labels)\n", | ||
"\n", | ||
"# Create a new feature: Comorbidity Burden\n", | ||
"data_cleaned['Comorbidity_Burden'] = pd.cut(data_cleaned['Num_Comorbidities'], bins=[-1, 0, 2, 4, np.inf], labels=['None', 'Low', 'Medium', 'High'])\n", | ||
"\n", | ||
"# 3. Data Analysis\n", | ||
"# ----------------\n", | ||
"\n", | ||
"# Analyzing Readmission Rates by Various Factors\n", | ||
"\n", | ||
"# Readmission rate by Gender\n", | ||
"readmission_by_gender = data_cleaned.groupby('Gender')['Readmitted_Within_30_Days'].mean()\n", | ||
"print(\"Readmission rate by Gender:\")\n", | ||
"print(readmission_by_gender)\n", | ||
"\n", | ||
"# Readmission rate by Primary Diagnosis\n", | ||
"readmission_by_diagnosis = data_cleaned.groupby('Primary_Diagnosis')['Readmitted_Within_30_Days'].mean()\n", | ||
"print(\"\\nReadmission rate by Primary Diagnosis:\")\n", | ||
"print(readmission_by_diagnosis)\n", | ||
"\n", | ||
"# Readmission rate by Length of Stay Category\n", | ||
"readmission_by_los = data_cleaned.groupby('Length_of_Stay_Category')['Readmitted_Within_30_Days'].mean()\n", | ||
"print(\"\\nReadmission rate by Length of Stay Category:\")\n", | ||
"print(readmission_by_los)\n", | ||
"\n", | ||
"# Readmission rate by Age Group\n", | ||
"readmission_by_age_group = data_cleaned.groupby('Age_Group')['Readmitted_Within_30_Days'].mean()\n", | ||
"print(\"\\nReadmission rate by Age Group:\")\n", | ||
"print(readmission_by_age_group)\n", | ||
"\n", | ||
"# Readmission rate by Discharge Destination\n", | ||
"readmission_by_discharge = data_cleaned.groupby('Discharge_Destination')['Readmitted_Within_30_Days'].mean()\n", | ||
"print(\"\\nReadmission rate by Discharge Destination:\")\n", | ||
"print(readmission_by_discharge)\n", | ||
"\n", | ||
"# Readmission rate by Comorbidity Burden\n", | ||
"readmission_by_comorbidity = data_cleaned.groupby('Comorbidity_Burden')['Readmitted_Within_30_Days'].mean()\n", | ||
"print(\"\\nReadmission rate by Comorbidity Burden:\")\n", | ||
"print(readmission_by_comorbidity)\n", | ||
"\n", | ||
"# Summary Statistics for Data\n", | ||
"summary_stats = data_cleaned.describe()\n", | ||
"print(\"\\nSummary statistics of cleaned data:\")\n", | ||
"print(summary_stats)\n", | ||
"\n", | ||
"# Count of patients in each group (e.g., Age Group, Length of Stay Category)\n", | ||
"age_group_counts = data_cleaned['Age_Group'].value_counts()\n", | ||
"length_of_stay_counts = data_cleaned['Length_of_Stay_Category'].value_counts()\n", | ||
"\n", | ||
"print(\"\\nCount of patients by Age Group:\")\n", | ||
"print(age_group_counts)\n", | ||
"\n", | ||
"print(\"\\nCount of patients by Length of Stay Category:\")\n", | ||
"print(length_of_stay_counts)\n", | ||
"\n", | ||
"# Correlation Matrix\n", | ||
"# Convert categorical features to numerical using one-hot encoding\n", | ||
"data_cleaned = pd.get_dummies(data_cleaned)\n", | ||
"# Correlation matrix for numerical features\n", | ||
"correlation_matrix = data_cleaned.corr()\n", | ||
"print(\"\\nCorrelation matrix: of values abs(corr) > 0.8\")\n", | ||
"# format the output only show the values that are greater than 0.8 in absolute value\n", | ||
"# Output: feature 1 vs feature 2 : correlation value and only show the values that are greater than 0.8 in absolute value\n", | ||
"print(correlation_matrix[(correlation_matrix.abs() > 0.8) & (correlation_matrix < 1)].stack().dropna())\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" Adherence_Level Age Num_Medications Condition \\\n", | ||
"0 High 52.164444 4.333333 Hypertension \n", | ||
"1 Low 75.969697 10.166667 Hypertension \n", | ||
"2 Medium 72.222222 10.000000 Hypertension & Diabetes \n", | ||
"\n", | ||
" Patient_ID \n", | ||
"0 70 \n", | ||
"1 24 \n", | ||
"2 8 \n", | ||
"Patients at high risk of non-adherence:\n", | ||
" Patient_ID Age\n", | ||
"1 2 84\n", | ||
"5 13 85\n", | ||
"26 73 68\n", | ||
"28 17 67\n", | ||
"115 11 75\n", | ||
"125 37 76\n", | ||
"128 5 72\n", | ||
"131 83 72\n", | ||
"140 89 80\n", | ||
"162 97 73\n", | ||
"171 23 70\n", | ||
"179 53 73\n", | ||
"185 67 72\n", | ||
"188 7 71\n", | ||
"191 29 76\n", | ||
"230 31 81\n", | ||
"232 47 71\n", | ||
"254 43 83\n", | ||
"273 41 67\n", | ||
"287 59 76\n", | ||
"289 71 86\n", | ||
"294 61 79\n", | ||
"298 19 85\n", | ||
"299 3 79\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"# Url\n", | ||
"url1 = \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_patients.csv\"\n", | ||
"url2 = \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_visits.csv\"\n", | ||
"# Load the datasets\n", | ||
"patients_df = pd.read_csv(url1)\n", | ||
"visits_df = pd.read_csv(url2)\n", | ||
"\n", | ||
"# 1. Data Merging: Combine patients and visits data into a single DataFrame\n", | ||
"# Assume there's a common column 'Patient_ID' in both datasets\n", | ||
"merged_df = pd.merge(visits_df, patients_df, on='Patient_ID', how='inner')\n", | ||
"\n", | ||
"# 2. Adherence Calculation: Calculate medication adherence rates\n", | ||
"# Adherence Rate = (Reported Doses Taken / Prescribed Doses) * 100\n", | ||
"merged_df['Adherence_Rate'] = (merged_df['Reported_Doses_Taken'] / merged_df['Prescribed_Doses']) * 100\n", | ||
"\n", | ||
"# 3. Risk Stratification: Classify patients into risk categories based on adherence rate\n", | ||
"def classify_adherence(rate):\n", | ||
" if rate >= 80:\n", | ||
" return 'High'\n", | ||
" elif 50 <= rate < 80:\n", | ||
" return 'Medium'\n", | ||
" else:\n", | ||
" return 'Low'\n", | ||
"\n", | ||
"merged_df['Adherence_Level'] = merged_df['Adherence_Rate'].apply(classify_adherence)\n", | ||
"\n", | ||
"# Analyze characteristics of each group\n", | ||
"# Grouping by Adherence Level and aggregating other characteristics\n", | ||
"grouped_df = merged_df.groupby('Adherence_Level').agg({\n", | ||
" 'Age': 'mean',\n", | ||
" 'Num_Medications': 'mean',\n", | ||
" 'Condition': pd.Series.mode,\n", | ||
" 'Patient_ID': 'nunique' # Count of unique patients in each adherence group\n", | ||
"}).reset_index()\n", | ||
"\n", | ||
"# Output the results\n", | ||
"print(grouped_df)\n", | ||
"\n", | ||
"# Example: Flag patients with low adherence for additional support\n", | ||
"low_adherence_patients = merged_df[merged_df['Adherence_Level'] == 'Low']\n", | ||
"print(\"Patients at high risk of non-adherence:\")\n", | ||
"print(low_adherence_patients[['Patient_ID', 'Age']].drop_duplicates())\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "datascience", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.18" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |