diff --git a/junaid_raza_lhr_r_2/junaidrazalhr.R b/junaid_raza_lhr_r_2/junaidrazalhr.R
new file mode 100644
index 0000000..32b2fb3
--- /dev/null
+++ b/junaid_raza_lhr_r_2/junaidrazalhr.R
@@ -0,0 +1,206 @@
+# Hospital visit analysis, part 1: load the data and answer questions 1-6.
+# Reads hospitaldata.csv from the current working directory.
+#getwd()
+getwd()
+
+
+#setwd('C:/Users/junaid.raza/junaidraza_lhr_r_assignment2',header=TRUE)
+
+library(dplyr)
+library(tidyr)
+#install.packages("lubridate")
+library(lubridate)
+
+#Working with two data sets
+#Loading Data into df
+df <- read.csv("hospitaldata.csv", header = TRUE)
+# Same file again, but with empty cells read as NA.
+genpro <- read.csv("hospitaldata.csv", header = TRUE, na.strings = c("", "NA"))
+#Loading Data into test
+test <- read.csv("hospitaldata.csv", header = TRUE)
+
+#Loading Data into test
+#rlab <- read.csv('hospitaldata.csv', header = TRUE)
+
+
+#1
+#Removing the dots from the column names
+names(df) <- gsub("\\.", "", names(df))
+names(df)
+
+#2
+#Which weekday has the most visits
+#Separate the Date column into three more columns:
+#day, month and year
+test <- test %>% separate(Date, c("day", "month", "year"), sep = ",")
+df <- df %>% separate(Date, c("day", "month", "year"), sep = ",")
+weekdays <- table(df$day)
+weekdays
+# Name the busiest day(s) directly; ties are all reported.
+names(weekdays)[weekdays == max(weekdays)]
+#Monday has 51 visits
+
+
+#3
+#Average age of patients
+class(df$Age)
+# Convert through as.character() first: calling as.numeric() directly on a
+# factor returns its internal level codes, not the ages that were recorded.
+# Entries that are not plain numbers become NA (with a coercion warning).
+df$Age <- as.numeric(as.character(df$Age))
+mean(df$Age, na.rm = TRUE)
+# NOTE(review): the previously recorded mean (23.33) was computed on factor
+# level codes; re-run to obtain the real figure.
+
+
+#4
+#Children entertained (age 1 - 12)
+# Both ends of the bracket are inclusive; rows with a missing age are skipped.
+lengthofage <- sum(df$Age >= 1 & df$Age <= 12, na.rm = TRUE)
+lengthofage
+# NOTE(review): the previously recorded count (58) used Age < 12 on factor
+# level codes; re-run to obtain the real figure.
+
+
+
+#5
+#Which gender has which procedures, and how often
+class(df$Sex)
+class(df$Procedure)
+# Lower-case the Sex column, then turn the blank / "-" placeholders into NA.
+df$Sex <- tolower(df$Sex)
+df$Sex[df$Sex %in% c("", "-")] <- NA
+df$Sex
+genproc <- df %>%
+  group_by(Sex, Procedure) %>%
+  summarize(count = n()) %>%
+  ungroup()
+# View() needs an interactive session; fall back to printing otherwise.
+if (interactive()) {
+  View(genproc)
+} else {
+  print(genproc)
+}
+
+
+#6
+#Highest earning doctor
+class(df$TotalCharges)
+# Non-numeric charge entries become NA here (with a coercion warning).
+df$TotalCharges <- as.numeric(as.character(df$TotalCharges))
+class(df$TotalCharges)
+class(df$ConsultingDoctor)
+df$ConsultingDoctor <- as.character(df$ConsultingDoctor)
+class(df$ConsultingDoctor)
+
+# Total charges per consulting doctor, ignoring missing charges.
+docmaxpay <- df %>%
+  group_by(ConsultingDoctor) %>%
+  summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+# Doctor(s) with the highest total charges; every tie is returned.
+top_doctor_total <- max(docmaxpay$total, na.rm = TRUE)
+docmaxpay$ConsultingDoctor[which(docmaxpay$total == top_doctor_total)]
+#Dr Alaf Khan
+
+
+#7
+#Procedure type that earns the most money
+class(df$Procedure)
+df$Procedure <- as.character(df$Procedure)
+maxprocd <- df %>%
+  group_by(Procedure) %>%
+  summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+top_procedure_total <- max(maxprocd$total, na.rm = TRUE)
+maxprocd$Procedure[which(maxprocd$total == top_procedure_total)]
+#Orthodontics 240,000
+
+
+#8
+#Highest frequency of visits by hour.
+class(df$Time)
+df$Time <- as.character(df$Time)
+# "-" and empty strings are placeholders for a missing time.
+df$Time[df$Time %in% c("-", "")] <- NA
+unique(df$Time)
+
+# Most entries are 12-hour clock times with an AM/PM suffix.
+get_time <- strptime(df$Time, "%I:%M %p", tz = "UTC")
+
+# Entries without a suffix (e.g. "11:00") fail the 12-hour parse and would be
+# silently dropped from the hourly table; read those as 24-hour times instead.
+no_suffix <- is.na(get_time) & !is.na(df$Time)
+get_time[no_suffix] <- strptime(df$Time[no_suffix], "%H:%M", tz = "UTC")
+get_hours <- hour(get_time)
+
+actual_mode <- table(get_hours)
+names(actual_mode)[actual_mode == max(actual_mode)]
+#Previously reported: hour 13:00 is the most visited hour of the day
+# NOTE(review): that figure excluded the entries without AM/PM; re-run to confirm.
+
+
+
+#9
+#Create Bracket of Time.
+
+
+
+
+
+#10
+#Patients who are repeated visitors
+class(df$id)
+vis_num <- df %>%
+  group_by(id) %>%
+  summarize(count = n())
+# View() needs an interactive session; fall back to printing otherwise.
+if (interactive()) {
+  View(vis_num)
+} else {
+  print(vis_num)
+}
+vis_num$id[which(vis_num$count > 1)]
+
+#11
+#Number of repeated visitors
+length(vis_num$id[which(vis_num$count > 1)])
+#37
+
+
+#12
+#Patients who visited again for the same problem
+visits_per_procedure <- df %>%
+  group_by(id, Procedure) %>%
+  summarize(count = n()) %>%
+  ungroup()
+visits_per_procedure
+# A patient with several repeated procedures is listed once, not once per
+# procedure.
+vis_two <- unique(visits_per_procedure$id[which(visits_per_procedure$count > 1)])
+vis_two
+
+
+
+
+
+#13
+#Median age for males and females
+# NOTE(review): these medians are only meaningful if df$Age holds real ages
+# rather than factor level codes - check the conversion in section 3.
+fmed <- which(df$Sex == "f")
+median(as.numeric(as.character(df$Age[fmed])), na.rm = TRUE)
+#22
+mmed <- which(df$Sex == "m")
+median(as.numeric(as.character(df$Age[mmed])), na.rm = TRUE)
+#21
+
+
+
+#14
+#Total Amount Balance
+
+
+
+#15
+#Money earned from the Consultation procedure
+consult_money <- which(df$Procedure == "Consultation")
+sum(df$TotalCharges[consult_money], na.rm = TRUE)
+#83950
+
+
+
+
+#16
+#Relation between age and total charges
+class(df$Age)
+class(df$TotalCharges)
+# TotalCharges was already converted to numeric in section 6.
+cor.test(df$Age, df$TotalCharges)
+#p-value is greater than 0.05
+
+
+
+#17
+#Age with the highest number of visits
+
+
+
+
+
+
+#18
+#Total cost of the X Ray and Scalling procedures
+# Flag the rows of either procedure, then add up their charges.
+is_xray_or_scalling <- df$Procedure %in% c("X Ray", "Scalling")
+total_xray_cost <- which(is_xray_or_scalling)
+sum(df$TotalCharges[total_xray_cost], na.rm = TRUE)
+#22300
+
+
+
+
+
+
+
diff --git a/junaid_raza_lhr_r_2/junaidrazalhr.html b/junaid_raza_lhr_r_2/junaidrazalhr.html
new file mode 100644
index 0000000..39c8510
--- /dev/null
+++ b/junaid_raza_lhr_r_2/junaidrazalhr.html
@@ -0,0 +1,449 @@
+
+
+
+
+
+ + + + + + + + + +#getwd()
+getwd()
+## [1] "C:/Users/junaid.raza/junaidraza_lhr_r_assignment2"
+#setwd('C:/Users/junaid.raza/junaidraza_lhr_r_assignment2',header=TRUE)
+
+library(dplyr)
+##
+## Attaching package: 'dplyr'
+## The following objects are masked from 'package:stats':
+##
+## filter, lag
+## The following objects are masked from 'package:base':
+##
+## intersect, setdiff, setequal, union
+library(tidyr)
+#install.packages("lubridate")
+library(lubridate)
+##
+## Attaching package: 'lubridate'
+## The following object is masked from 'package:base':
+##
+## date
+#Working with two data sets
+#Loading Data into df
+df <- read.csv('hospitaldata.csv', header = TRUE)
+genpro <- read.csv('hospitaldata.csv', header = TRUE, na.strings=c("","NA"))
+#Loading Data into test
+test <- read.csv('hospitaldata.csv', header = TRUE)
+
+#Loading Data into test
+#rlab <- read.csv('hospitaldata.csv', header = TRUE)
+
+
+#1
+#Removing the Dots
+names(df) <- gsub("\\.", "", names(df))
+names(df)
+## [1] "Date" "id" "Time"
+## [4] "Age" "Sex" "ConsultingDoctor"
+## [7] "Specialty" "Procedure" "TotalCharges"
+## [10] "AmountReceived" "AmountBalance" "AmountReceivedBy"
+## [13] "AmountinHospital" "ReceptionistName" "NextApt"
+#2
+#Which weekday has the most visits
+#Separate the Date column to extract three more columns:
+#day, month and year
+test <- test %>% separate(Date,c('day','month','year'),sep=',')
+df <- df %>% separate(Date,c('day','month','year'),sep=',')
+weekdays<-table(df$day)
+unique(weekdays)
+## [1] 26 51 20 7 33 42 43
+weekdays
+##
+## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
+## 26 51 20 7 33 42 43
+#Monday has 51 visits
+
+
+#3
+#Average age of patients
+class(df$Age)
+## [1] "factor"
+df$Age<-as.numeric(df$Age)
+mean(df$Age)
+## [1] 23.33784
+#Mean is 23.33
+
+
+#4
+#Children entertained (age 1 - 12)
+lengthofage<-length(which(df$Age<12))
+lengthofage
+## [1] 58
+#58 is length of records
+
+
+
+#5
+#Which gender have what max procedures
+class(df$Sex)
+## [1] "factor"
+class(df$Procedure)
+## [1] "factor"
+#Lower-case the Sex column (blank and "-" entries are left as they are)
+df$Sex<-tolower(df$Sex)
+df$Sex
+## [1] "f" "m" "f" "m" "m" "f" "m" "f" "f" "m" "f" "f" "f" "f" "f" "f" "f"
+## [18] "f" "f" "f" "f" "m" "m" "m" "m" "m" "f" "" "f" "f" "m" "f" "m" "f"
+## [35] "m" "f" "" "f" "f" "m" "f" "m" "m" "m" "f" "m" "m" "f" "f" "f" "m"
+## [52] "m" "f" "f" "m" "m" "m" "m" "m" "m" "f" "f" "m" "m" "f" "" "" ""
+## [69] "f" "f" "m" "m" "f" "f" "m" "m" "m" "f" "m" "m" "" "m" "m" "f" ""
+## [86] "" "" "m" "f" "m" "" "f" "m" "f" "m" "m" "f" "m" "f" "f" "f" "m"
+## [103] "f" "m" "f" "f" "f" "f" "m" "f" "m" "f" "f" "m" "f" "m" "m" "f" "m"
+## [120] "f" "m" "f" "m" "m" "f" "f" "f" "f" "m" "m" "m" "m" "m" "f" "f" "f"
+## [137] "m" "f" "m" "m" "f" "-" "m" "m" "f" "f" "m" "m" "m" "m" "f" "m" "f"
+## [154] "m" "m" "m" "m" "f" "f" "m" "m" "f" "f" "m" "" "f" "m" "m" "m" "f"
+## [171] "f" "f" "f" "f" "f" "f" "f" "m" "m" "m" "f" "f" "m" "f" "f" "m" "f"
+## [188] "m" "f" "m" "f" "-" "f" "f" "m" "f" "m" "m" "m" "m" "f" "m" "m" "m"
+## [205] "m" "m" "f" "m" "f" "m" "f" "m" "f" "f" "m" "f" "m" "" "m" "m" "m"
+## [222] "f"
+genproc <- df %>%
+ group_by(Sex,Procedure) %>%
+ summarize(count=n())
+View(genproc)
+
+
+#6
+#Highest Earning
+class(df$TotalCharges)
+## [1] "factor"
+df$TotalCharges<- as.numeric(as.character(df$TotalCharges))
+## Warning: NAs introduced by coercion
+class(df$TotalCharges)
+## [1] "numeric"
+class(df$ConsultingDoctor)
+## [1] "factor"
+df$ConsultingDoctor<-as.character(df$ConsultingDoctor)
+class(df$ConsultingDoctor)
+## [1] "character"
+docmaxpay <- df %>%
+ group_by(ConsultingDoctor) %>%
+ summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+docmaxpay$ConsultingDoctor[which(docmaxpay$total==max(docmaxpay$total, na.rm = TRUE),arr.ind=TRUE)]
+## [1] "Dr Alaf Khan"
+#Dr Alaf Khan
+
+
+#7
+#Procedure type earn max money
+class(df$Procedure)
+## [1] "factor"
+df$Procedure <- as.character(df$Procedure)
+maxprocd <- df %>%
+ group_by(Procedure) %>%
+ summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+maxprocd$Procedure[which(maxprocd$total==max(maxprocd$total, na.rm = TRUE),arr.ind=TRUE)]
+## [1] "Orthodontics"
+#Orthodontics 240,000
+
+
+#8
+#Highest freq of Visits by Hour.
+class(df$Time)
+## [1] "factor"
+df$Time<-as.character(df$Time)
+df$Time[df$Time=="-"]<-""
+df$Time[df$Time==""]<-NA
+unique(df$Time)
+## [1] "11:00" "10:45AM" "12:38PM" "1:00PM" "2:45PM" "3:00PM" "3:28PM"
+## [8] "3:45PM" "5:00PM" "5:30PM" "3:25PM" "6:10PM" "11:45PM" "12:40PM"
+## [15] "8:10PM" "8:30PM" "2:00PM" "12:30PM" "1:30PM" NA "8:15PM"
+## [22] "12:36PM" "2:30PM" "3:15PM" "5:20PM" "3:50PM" "6:00PM" "4:30PM"
+## [29] "02:00PM" "11:20AM" "8:00PM" "6:30PM" "9:00PM" "6:20PM" "11:25AM"
+## [36] "11:15AM" "1:10PM" "3:30PM" "6:15PM" "9:40PM" "12:00PM" "11:00AM"
+## [43] "10:15AM" "1:20PM" "12:15PM" "1:15PM" "4:50PM" "2:10PM" "12:50PM"
+## [50] "5:40PM" "6:45PM" "9:45PM" "5:35PM" "6:50PM" "1:40PM" "1:25PM"
+## [57] "4:45PM" "4:00PM" "7:30PM" "7:45PM" "2:15PM" "10:13AM" "2:40PM"
+## [64] "10:00AM" "9:30AM" "7:00PM" "4:20PM" "5:57PM" "7:15PM" "3:40PM"
+## [71] "7:02PM" "11:40AM" "4:10PM" "11:30AM" "10:15PM" "7:11PM" "10:10PM"
+## [78] "1:55PM" "1:50PM" "9:30PM" "12:20PM" "10:30PM" "7:10PM" "12:48PM"
+## [85] "7:05PM" "5:45PM" "7:40PM" "9:35PM" "10:00PM" "6:55PM" "9:00AM"
+## [92] "10:20AM" "11:20PM"
+get_time<-df$Time
+
+get_time<-strptime(get_time,"%I:%M %p")
+get_hours<-hour(get_time)
+
+actual_mode <- table(get_hours)
+names(actual_mode)[actual_mode == max(actual_mode)]
+## [1] "13"
+#Hence the hour 13:00 is the most visited hour of the day
+
+
+
+#9
+#Create Bracket of Time.
+
+
+
+
+
+#10
+#patients are Repeated visitors
+class(df$id)
+## [1] "integer"
+vis_num <- df %>%
+ group_by(id) %>%
+ summarize(count=n())
+View(vis_num)
+vis_num$id[(which(vis_num$count >1))]
+## [1] 1 4 12 13 17 20 25 40 45 46 59 63 64 80 88 94 96
+## [18] 97 100 101 107 109 112 114 116 118 120 122 130 132 133 140 145 149
+## [35] 150 151 153
+#11
+#Info Repeated Visitor
+length(vis_num$id[(which(vis_num$count >1))])
+## [1] 37
+#37
+
+
+#12
+#Patient again visit for same problem
+vis_two <- df %>%
+ group_by(id,Procedure) %>%
+ summarize(count=n())
+vis_two
+## Source: local data frame [182 x 3]
+## Groups: id [?]
+##
+## id Procedure count
+## <int> <chr> <int>
+## 1 1 Cancelled 1
+## 2 1 Injection 1
+## 3 1 Pharmacy 10
+## 4 2 Consultation 1
+## 5 3 Dressing 1
+## 6 4 Consultation + Dressing 1
+## 7 4 Er Retain 1
+## 8 5 X Ray 1
+## 9 6 USG 1
+## 10 7 Laboratory Test 1
+## # ... with 172 more rows
+vis_two <- vis_two$id[(which(vis_two$count >1))]
+vis_two
+## [1] 1 12 13 17 17 20 25 45 46 63 80 94 97 101 109 112 114
+## [18] 116 122 130 140 145 151 153
+#13
+#Median age for male and females
+fmed<-which(df$Sex=='f',arr.ind =TRUE)
+median(as.numeric(as.character(df$Age[fmed])),na.rm = TRUE)
+## [1] 22
+#22
+mmed<-which(df$Sex=='m',arr.ind =TRUE)
+median(as.numeric(as.character(df$Age[mmed])),na.rm = TRUE)
+## [1] 21
+#21
+
+
+
+#14
+#Total Amount Balance
+
+
+
+#15
+#Procedure consultation Money
+consult_money<- which(df$Procedure=='Consultation', arr.ind=TRUE)
+sum(df$TotalCharges[consult_money], na.rm=TRUE)
+## [1] 83950
+#83950
+
+
+
+
+#16
+#Relation age and Total Charges
+class(df$Age)
+## [1] "numeric"
+class(df$TotalCharges)
+## [1] "numeric"
+df$TotalCharges<-as.numeric(as.character(df$TotalCharges))
+cor.test(df$Age, df$TotalCharges)
+##
+## Pearson's product-moment correlation
+##
+## data: df$Age and df$TotalCharges
+## t = -0.066519, df = 202, p-value = 0.947
+## alternative hypothesis: true correlation is not equal to 0
+## 95 percent confidence interval:
+## -0.1419601 0.1327763
+## sample estimates:
+## cor
+## -0.004680223
+#p-value (0.947) is greater than 0.05
+
+
+
+#17
+#Age with max high visits
+
+
+
+
+
+
+#18
+#Total cost of the X-Ray and Scalling procedures
+total_xray_cost<- which(df$Procedure=='X Ray' | df$Procedure=='Scalling' , arr.ind=TRUE)
+sum(df$TotalCharges[total_xray_cost], na.rm=TRUE)
+## [1] 22300
+#22300
+
+
+
+
+