Skip to content

Commit

Permalink
chore: Add Py03-Assignments.ipynb to gitignore
Browse files Browse the repository at this point in the history
  • Loading branch information
marc committed Aug 22, 2024
1 parent 571b965 commit c6a88f6
Show file tree
Hide file tree
Showing 2 changed files with 350 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ Code_Examples/Day 4/Data/Pancreas
.vscode
Assignments/*.ipynb
Assignments/StudAssignments
!Assignments/Py02-Assignments.ipynb
!Assignments/Py02-Assignments.ipynb
!Assignments/Py03-Assignments.ipynb
348 changes: 348 additions & 0 deletions Assignments/Py03-Assignments.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,348 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing values in each column:\n",
"Patient_ID 0\n",
"Age 0\n",
"Gender 0\n",
"Primary_Diagnosis 0\n",
"Num_Comorbidities 0\n",
"Length_of_Stay 0\n",
"Discharge_Destination 0\n",
"Follow_Up_Appointment 0\n",
"Readmitted_Within_30_Days 0\n",
"dtype: int64\n",
"Readmission rate by Gender:\n",
"Gender\n",
"female 0.136364\n",
"male 0.160714\n",
"Name: Readmitted_Within_30_Days, dtype: float64\n",
"\n",
"Readmission rate by Primary Diagnosis:\n",
"Primary_Diagnosis\n",
"copd 0.153846\n",
"diabetes 0.192308\n",
"heart failure 0.200000\n",
"kidney disease 0.058824\n",
"pneumonia 0.125000\n",
"Name: Readmitted_Within_30_Days, dtype: float64\n",
"\n",
"Readmission rate by Length of Stay Category:\n",
"Length_of_Stay_Category\n",
"Short 0.6\n",
"Medium 0.0\n",
"Long 0.0\n",
"Very Long 0.0\n",
"Name: Readmitted_Within_30_Days, dtype: float64\n",
"\n",
"Readmission rate by Age Group:\n",
"Age_Group\n",
"Young 0.190476\n",
"Middle-aged 0.260870\n",
"Senior 0.103448\n",
"Elderly 0.074074\n",
"Name: Readmitted_Within_30_Days, dtype: float64\n",
"\n",
"Readmission rate by Discharge Destination:\n",
"Discharge_Destination\n",
"home 0.205882\n",
"home with services 0.160000\n",
"nursing home 0.076923\n",
"rehabilitation center 0.133333\n",
"Name: Readmitted_Within_30_Days, dtype: float64\n",
"\n",
"Readmission rate by Comorbidity Burden:\n",
"Comorbidity_Burden\n",
"None 0.0\n",
"Low 0.0\n",
"Medium NaN\n",
"High 0.6\n",
"Name: Readmitted_Within_30_Days, dtype: float64\n",
"\n",
"Summary statistics of cleaned data:\n",
" Patient_ID Age Num_Comorbidities Length_of_Stay \\\n",
"count 100.000000 100.000000 100.000000 100.000000 \n",
"mean 50.500000 52.870000 1.630000 9.930000 \n",
"std 29.011492 21.901365 2.003305 6.267207 \n",
"min 1.000000 18.000000 0.000000 1.000000 \n",
"25% 25.750000 32.000000 0.000000 4.500000 \n",
"50% 50.500000 53.500000 1.000000 9.000000 \n",
"75% 75.250000 71.250000 2.000000 16.000000 \n",
"max 100.000000 89.000000 5.000000 20.000000 \n",
"\n",
" Follow_Up_Appointment Readmitted_Within_30_Days \n",
"count 100.00 100.00000 \n",
"mean 0.45 0.15000 \n",
"std 0.50 0.35887 \n",
"min 0.00 0.00000 \n",
"25% 0.00 0.00000 \n",
"50% 0.00 0.00000 \n",
"75% 1.00 0.00000 \n",
"max 1.00 1.00000 \n",
"\n",
"Count of patients by Age Group:\n",
"Age_Group\n",
"Senior 29\n",
"Elderly 27\n",
"Middle-aged 23\n",
"Young 21\n",
"Name: count, dtype: int64\n",
"\n",
"Count of patients by Length of Stay Category:\n",
"Length_of_Stay_Category\n",
"Long 30\n",
"Very Long 28\n",
"Short 25\n",
"Medium 17\n",
"Name: count, dtype: int64\n",
"\n",
"Correlation matrix: of values abs(corr) > 0.8\n",
"Num_Comorbidities Length_of_Stay_Category_Short 0.976123\n",
" Comorbidity_Burden_High 0.976123\n",
"Gender_female Gender_male -1.000000\n",
"Gender_male Gender_female -1.000000\n",
"Length_of_Stay_Category_Short Num_Comorbidities 0.976123\n",
"Comorbidity_Burden_High Num_Comorbidities 0.976123\n",
"dtype: float64\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Load the data\n",
"url= \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_hospital_data.csv\"\n",
"data = pd.read_csv(url)\n",
"\n",
"# 1. Data Cleaning\n",
"# -----------------\n",
"\n",
"# Check for missing values\n",
"missing_values = data.isnull().sum()\n",
"print(\"Missing values in each column:\")\n",
"print(missing_values)\n",
"\n",
"# Drop rows with missing values (if any)\n",
"data_cleaned = data.dropna()\n",
"\n",
"# Check for duplicate rows\n",
"duplicate_rows = data_cleaned.duplicated().sum()\n",
"# print(f\"Number of duplicate rows: {duplicate_rows}\")\n",
"\n",
"# Drop duplicate rows (if any)\n",
"data_cleaned = data_cleaned.drop_duplicates()\n",
"\n",
"# Standardize categorical data (convert to lower case or consistent format)\n",
"data_cleaned['Gender'] = data_cleaned['Gender'].str.lower()\n",
"data_cleaned['Primary_Diagnosis'] = data_cleaned['Primary_Diagnosis'].str.lower()\n",
"data_cleaned['Discharge_Destination'] = data_cleaned['Discharge_Destination'].str.lower()\n",
"\n",
"# 2. Feature Engineering\n",
"# ----------------------\n",
"\n",
"# Create a new feature: Length of Stay Category\n",
"bins = [0, 3, 7, 14, np.inf]\n",
"labels = ['Short', 'Medium', 'Long', 'Very Long']\n",
"data_cleaned['Length_of_Stay_Category'] = pd.cut(data_cleaned['Length_of_Stay'], bins=bins, labels=labels)\n",
"\n",
"# Create a new feature: Age Group\n",
"age_bins = [0, 30, 50, 70, np.inf]\n",
"age_labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']\n",
"data_cleaned['Age_Group'] = pd.cut(data_cleaned['Age'], bins=age_bins, labels=age_labels)\n",
"\n",
"# Create a new feature: Comorbidity Burden\n",
"data_cleaned['Comorbidity_Burden'] = pd.cut(data_cleaned['Num_Comorbidities'], bins=[-1, 0, 2, 4, np.inf], labels=['None', 'Low', 'Medium', 'High'])\n",
"\n",
"# 3. Data Analysis\n",
"# ----------------\n",
"\n",
"# Analyzing Readmission Rates by Various Factors\n",
"\n",
"# Readmission rate by Gender\n",
"readmission_by_gender = data_cleaned.groupby('Gender')['Readmitted_Within_30_Days'].mean()\n",
"print(\"Readmission rate by Gender:\")\n",
"print(readmission_by_gender)\n",
"\n",
"# Readmission rate by Primary Diagnosis\n",
"readmission_by_diagnosis = data_cleaned.groupby('Primary_Diagnosis')['Readmitted_Within_30_Days'].mean()\n",
"print(\"\\nReadmission rate by Primary Diagnosis:\")\n",
"print(readmission_by_diagnosis)\n",
"\n",
"# Readmission rate by Length of Stay Category\n",
"readmission_by_los = data_cleaned.groupby('Length_of_Stay_Category')['Readmitted_Within_30_Days'].mean()\n",
"print(\"\\nReadmission rate by Length of Stay Category:\")\n",
"print(readmission_by_los)\n",
"\n",
"# Readmission rate by Age Group\n",
"readmission_by_age_group = data_cleaned.groupby('Age_Group')['Readmitted_Within_30_Days'].mean()\n",
"print(\"\\nReadmission rate by Age Group:\")\n",
"print(readmission_by_age_group)\n",
"\n",
"# Readmission rate by Discharge Destination\n",
"readmission_by_discharge = data_cleaned.groupby('Discharge_Destination')['Readmitted_Within_30_Days'].mean()\n",
"print(\"\\nReadmission rate by Discharge Destination:\")\n",
"print(readmission_by_discharge)\n",
"\n",
"# Readmission rate by Comorbidity Burden\n",
"readmission_by_comorbidity = data_cleaned.groupby('Comorbidity_Burden')['Readmitted_Within_30_Days'].mean()\n",
"print(\"\\nReadmission rate by Comorbidity Burden:\")\n",
"print(readmission_by_comorbidity)\n",
"\n",
"# Summary Statistics for Data\n",
"summary_stats = data_cleaned.describe()\n",
"print(\"\\nSummary statistics of cleaned data:\")\n",
"print(summary_stats)\n",
"\n",
"# Count of patients in each group (e.g., Age Group, Length of Stay Category)\n",
"age_group_counts = data_cleaned['Age_Group'].value_counts()\n",
"length_of_stay_counts = data_cleaned['Length_of_Stay_Category'].value_counts()\n",
"\n",
"print(\"\\nCount of patients by Age Group:\")\n",
"print(age_group_counts)\n",
"\n",
"print(\"\\nCount of patients by Length of Stay Category:\")\n",
"print(length_of_stay_counts)\n",
"\n",
"# Correlation Matrix\n",
"# Convert categorical features to numerical using one-hot encoding\n",
"data_cleaned = pd.get_dummies(data_cleaned)\n",
"# Correlation matrix for numerical features\n",
"correlation_matrix = data_cleaned.corr()\n",
"print(\"\\nCorrelation matrix: of values abs(corr) > 0.8\")\n",
"# format the output only show the values that are greater than 0.8 in absolute value\n",
"# Output: feature 1 vs feature 2 : correlation value and only show the values that are greater than 0.8 in absolute value\n",
"print(correlation_matrix[(correlation_matrix.abs() > 0.8) & (correlation_matrix < 1)].stack().dropna())\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Adherence_Level Age Num_Medications Condition \\\n",
"0 High 52.164444 4.333333 Hypertension \n",
"1 Low 75.969697 10.166667 Hypertension \n",
"2 Medium 72.222222 10.000000 Hypertension & Diabetes \n",
"\n",
" Patient_ID \n",
"0 70 \n",
"1 24 \n",
"2 8 \n",
"Patients at high risk of non-adherence:\n",
" Patient_ID Age\n",
"1 2 84\n",
"5 13 85\n",
"26 73 68\n",
"28 17 67\n",
"115 11 75\n",
"125 37 76\n",
"128 5 72\n",
"131 83 72\n",
"140 89 80\n",
"162 97 73\n",
"171 23 70\n",
"179 53 73\n",
"185 67 72\n",
"188 7 71\n",
"191 29 76\n",
"230 31 81\n",
"232 47 71\n",
"254 43 83\n",
"273 41 67\n",
"287 59 76\n",
"289 71 86\n",
"294 61 79\n",
"298 19 85\n",
"299 3 79\n"
]
}
],
"source": [
"import pandas as pd\n",
"# Url\n",
"url1 = \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_patients.csv\"\n",
"url2 = \"https://raw.githubusercontent.com/stawiskm/pythoncourse/student/Assignments/Data/fictitious_visits.csv\"\n",
"# Load the datasets\n",
"patients_df = pd.read_csv(url1)\n",
"visits_df = pd.read_csv(url2)\n",
"\n",
"# 1. Data Merging: Combine patients and visits data into a single DataFrame\n",
"# Assume there's a common column 'Patient_ID' in both datasets\n",
"merged_df = pd.merge(visits_df, patients_df, on='Patient_ID', how='inner')\n",
"\n",
"# 2. Adherence Calculation: Calculate medication adherence rates\n",
"# Adherence Rate = (Reported Doses Taken / Prescribed Doses) * 100\n",
"merged_df['Adherence_Rate'] = (merged_df['Reported_Doses_Taken'] / merged_df['Prescribed_Doses']) * 100\n",
"\n",
"# 3. Risk Stratification: Classify patients into risk categories based on adherence rate\n",
"def classify_adherence(rate):\n",
" if rate >= 80:\n",
" return 'High'\n",
" elif 50 <= rate < 80:\n",
" return 'Medium'\n",
" else:\n",
" return 'Low'\n",
"\n",
"merged_df['Adherence_Level'] = merged_df['Adherence_Rate'].apply(classify_adherence)\n",
"\n",
"# Analyze characteristics of each group\n",
"# Grouping by Adherence Level and aggregating other characteristics\n",
"grouped_df = merged_df.groupby('Adherence_Level').agg({\n",
" 'Age': 'mean',\n",
" 'Num_Medications': 'mean',\n",
" 'Condition': pd.Series.mode,\n",
" 'Patient_ID': 'nunique' # Count of unique patients in each adherence group\n",
"}).reset_index()\n",
"\n",
"# Output the results\n",
"print(grouped_df)\n",
"\n",
"# Example: Flag patients with low adherence for additional support\n",
"low_adherence_patients = merged_df[merged_df['Adherence_Level'] == 'Low']\n",
"print(\"Patients at high risk of non-adherence:\")\n",
"print(low_adherence_patients[['Patient_ID', 'Age']].drop_duplicates())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "datascience",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit c6a88f6

Please sign in to comment.