-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathU3 Data Project.py
99 lines (79 loc) · 2.98 KB
/
U3 Data Project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
# Load the study spreadsheet into a dataframe.
df = pd.read_excel('Unit_three_takehome_data.xlsx', engine='openpyxl')
# Shuffle the rows: give every row a uniform random key in [0, 1) and
# order the whole frame by that key.
df['random'] = np.random.uniform(low=0, high=1, size=len(df))
df = df.sort_values(by='random')
# The first half of the shuffled rows becomes group A and the rest becomes
# group B (group B receives the extra row when the row count is odd).
midpoint = len(df) // 2
df_a = df.iloc[:midpoint]
df_b = df.iloc[midpoint:]
# Histogram of each drug's improvement within its own treatment group:
# group A rows for drug A, group B rows for drug B. The column name is also
# used as the plot title.
for group, column in ((df_a, 'A improvement'), (df_b, 'B improvement')):
    plt.hist(group[column])
    plt.title(column)
    plt.xlabel('Improvement')
    plt.ylabel('Count')
    plt.show()
# Print summary statistics for each group. Group B is summarised on its own
# 'B improvement' column so the numbers describe the same data as the group B
# histogram above and the boxplot below.
print(f'A Improvement \n {df_a["A improvement"].describe()}')
print(f'B Improvement \n {df_b["B improvement"].describe()}')
# Side-by-side boxplot comparing the two treatment groups.
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# confirm the installed version before switching.
plt.boxplot([df_a['A improvement'], df_b['B improvement']], labels=['A', 'B'])
plt.title('A improvement vs. B improvement')
plt.ylabel('Improvement')
plt.show()
# Split df based on gender.
# str.contains is a case-sensitive substring match, so "Male" does not match
# inside "Female" and the two selections do not overlap for those spellings.
# .copy() makes each selection an independent frame, so adding/overwriting
# the 'random' column below does not write into a filtered selection of df
# (which raises SettingWithCopyWarning on pandas without copy-on-write).
df_m = df[df['Gender'].str.contains("Male")].copy()
df_f = df[df['Gender'].str.contains("Female")].copy()
# Shuffle each gender independently with a fresh uniform random sort key.
df_m['random'] = np.random.uniform(0, 1, len(df_m))
df_m = df_m.sort_values('random')
df_f['random'] = np.random.uniform(0, 1, len(df_f))
df_f = df_f.sort_values('random')
# Within each gender, the first half of the shuffled rows is assigned to
# drug A and the remainder to drug B.
df_m_a = df_m.iloc[:len(df_m)//2]
df_m_b = df_m.iloc[len(df_m)//2:]
df_f_a = df_f.iloc[:len(df_f)//2]
df_f_b = df_f.iloc[len(df_f)//2:]
# One histogram per (gender, drug) subgroup, each shown as its own figure.
subgroup_histograms = [
    (df_m_a, 'A improvement', 'Improvement in Cholesterol Levels in Males with Drug A'),
    (df_m_b, 'B improvement', 'Improvement in Cholesterol Levels in Males with Drug B'),
    (df_f_a, 'A improvement', 'Improvement in Cholesterol Levels in Females with Drug A'),
    (df_f_b, 'B improvement', 'Improvement in Cholesterol Levels in Females with Drug B'),
]
for frame, column, title in subgroup_histograms:
    plt.hist(frame[column])
    plt.title(title)
    plt.xlabel('Improvement')
    plt.ylabel('Count')
    plt.show()
# Summary statistics for the same four subgroups, printed in the same order.
subgroup_summaries = [
    ('A Improvement Males', df_m_a, 'A improvement'),
    ('B Improvement Males', df_m_b, 'B improvement'),
    ('A Improvement Females', df_f_a, 'A improvement'),
    ('B Improvement Females', df_f_b, 'B improvement'),
]
for heading, frame, column in subgroup_summaries:
    print(f'{heading} \n {frame[column].describe()}')
# Drug A vs. drug B boxplot for each gender: males first, then females.
gender_boxplots = [
    (df_m_a, df_m_b, 'Improvements in Cholesterol Levels in Males'),
    (df_f_a, df_f_b, 'Improvements in Cholesterol Levels in Females'),
]
for drug_a_rows, drug_b_rows, title in gender_boxplots:
    plt.boxplot(
        [drug_a_rows['A improvement'], drug_b_rows['B improvement']],
        labels=['Drug A', 'Drug B'],
    )
    plt.title(title)
    plt.ylabel('Improvement')
    plt.show()