diff --git a/junaid_raza_lhr_r_2/junaidrazalhr.R b/junaid_raza_lhr_r_2/junaidrazalhr.R
new file mode 100644
index 0000000..32b2fb3
--- /dev/null
+++ b/junaid_raza_lhr_r_2/junaidrazalhr.R
@@ -0,0 +1,206 @@
+# Exploratory analysis of hospitaldata.csv: setup, data loading and
+# questions 1-6. The CSV is read from the current working directory.
+getwd()
+
+#setwd('C:/Users/junaid.raza/junaidraza_lhr_r_assignment2',header=TRUE)
+
+library(dplyr)
+library(tidyr)
+#install.packages("lubridate")
+library(lubridate)
+
+# Load the data. `df` is the working copy used by every question below.
+df <- read.csv("hospitaldata.csv", header = TRUE)
+# `genpro` (blank strings read as NA) and `test` are extra copies of the same
+# file. They are kept so the objects created by this script stay the same.
+genpro <- read.csv("hospitaldata.csv", header = TRUE, na.strings = c("", "NA"))
+test <- read.csv("hospitaldata.csv", header = TRUE)
+
+#rlab <- read.csv('hospitaldata.csv', header = TRUE)
+
+
+# 1. Remove the dots from the column names
+names(df) <- gsub(".", "", names(df), fixed = TRUE)
+names(df)
+
+# 2. Which weekday has the most visits?
+# Split Date on commas into separate day, month and year columns.
+test <- test %>% separate(Date, c("day", "month", "year"), sep = ",")
+df <- df %>% separate(Date, c("day", "month", "year"), sep = ",")
+weekdays <- table(df$day)
+weekdays
+# Pick the busiest day(s) from the table instead of reading it by eye.
+names(weekdays)[weekdays == max(weekdays)]
+# Recorded result: Monday, with 51 visits
+
+
+# 3. Average age of patients
+# Convert through character first: as.numeric() applied directly to a factor
+# returns the internal level codes, not the ages written in the file.
+# Entries that are not numbers become NA (with a coercion warning).
+df$Age <- as.numeric(as.character(df$Age))
+mean(df$Age, na.rm = TRUE)
+# NOTE(review): the previously recorded mean (23.33) was computed with
+# as.numeric(df$Age); if Age was read as a factor that figure is the mean of
+# the level codes, so re-check it.
+
+
+# 4. Children entertained (ages 1 to 12, both ends included)
+lengthofage <- sum(df$Age >= 1 & df$Age <= 12, na.rm = TRUE)
+lengthofage
+# NOTE(review): the previously recorded count (58) used Age < 12; re-check it.
+
+
+# 5. Number of procedures of each type per gender
+# Normalise the case, then turn the blank and "-" placeholders into NA.
+df$Sex <- tolower(df$Sex)
+df$Sex[df$Sex %in% c("", "-")] <- NA
+df$Sex
+genproc <- df %>%
+  group_by(Sex, Procedure) %>%
+  summarize(count = n())
+# View() opens a data viewer window, so only call it in interactive sessions.
+if (interactive()) {
+  View(genproc)
+}
+
+
+# 6. Highest-earning doctor
+# Charge entries that are not numbers become NA (with a coercion warning) and
+# are ignored by the sum below.
+df$TotalCharges <- as.numeric(as.character(df$TotalCharges))
+df$ConsultingDoctor <- as.character(df$ConsultingDoctor)
+
+# Total charges per consulting doctor.
+docmaxpay <- df %>%
+  group_by(ConsultingDoctor) %>%
+  summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+# Questions 6 (lookup) to 16. Uses `df` and `docmaxpay` built earlier in the
+# script.
+# Doctor(s) with the largest summed TotalCharges; ties are all returned.
+top_total <- max(docmaxpay$total, na.rm = TRUE)
+docmaxpay$ConsultingDoctor[which(docmaxpay$total == top_total)]
+# Recorded result: Dr Alaf Khan
+
+
+# 7. Procedure type that earns the most money
+df$Procedure <- as.character(df$Procedure)
+maxprocd <- df %>%
+  group_by(Procedure) %>%
+  summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+maxprocd$Procedure[which(maxprocd$total == max(maxprocd$total, na.rm = TRUE))]
+# Recorded result: Orthodontics, 240,000
+
+
+# 8. Hour of the day with the most visits
+df$Time <- as.character(df$Time)
+# Treat "-" and blank entries as missing.
+df$Time[df$Time %in% c("-", "")] <- NA
+unique(df$Time)
+
+# Whitespace in a strptime() format matches zero or more blanks, so "1:00PM"
+# parses with "%I:%M %p".
+# NOTE(review): a time without an AM/PM marker cannot match %p and should
+# come out as NA, which drops it from the hourly counts; confirm.
+get_time <- strptime(df$Time, "%I:%M %p")
+get_hours <- hour(get_time)
+
+actual_mode <- table(get_hours)
+names(actual_mode)[actual_mode == max(actual_mode)]
+# Recorded result: hour 13 (1 PM) is the most visited hour of the day
+
+
+
+# 9. Create brackets of time
+# TODO: not implemented yet.
+
+
+# 10. Which patients are repeat visitors? (ids that occur more than once)
+vis_num <- df %>%
+  group_by(id) %>%
+  summarize(count = n())
+# View() opens a data viewer window, so only call it in interactive sessions.
+if (interactive()) {
+  View(vis_num)
+}
+repeat_ids <- vis_num$id[which(vis_num$count > 1)]
+repeat_ids
+
+# 11. How many patients are repeat visitors?
+length(repeat_ids)
+# Recorded result: 37
+
+
+# 12. Patients who visited again for the same procedure
+vis_two <- df %>%
+  group_by(id, Procedure) %>%
+  summarize(count = n())
+vis_two
+# unique(): a patient with more than one repeated procedure would otherwise
+# be listed once per procedure.
+vis_two <- unique(vis_two$id[which(vis_two$count > 1)])
+vis_two
+
+
+# 13. Median age of female and male patients
+# df$Age is already numeric at this point, so no further conversion is needed.
+fmed <- which(df$Sex == "f")
+median(df$Age[fmed], na.rm = TRUE)
+# Recorded result: 22
+mmed <- which(df$Sex == "m")
+median(df$Age[mmed], na.rm = TRUE)
+# Recorded result: 21
+
+
+
+# 14. Total amount balance
+# TODO: not implemented yet.
+
+
+
+# 15. Money earned from the "Consultation" procedure
+consult_money <- which(df$Procedure == "Consultation")
+sum(df$TotalCharges[consult_money], na.rm = TRUE)
+# Recorded result: 83950
+
+
+
+
+# 16. Relation between age and total charges
+# TotalCharges was converted to numeric for question 6, so it is used as is.
+cor.test(df$Age, df$TotalCharges)
+# Recorded result: the p-value is greater than 0.05, i.e. no statistically
+# significant linear correlation between age and total charges.
+
+
+
+#17
+#Age with max high visits
+
+
+
+
+
+
+#18
+# Combined total charges of the "X Ray" and "Scalling" procedures
+total_xray_cost <- which(df$Procedure %in% c("X Ray", "Scalling"))
+sum(df$TotalCharges[total_xray_cost], na.rm = TRUE)
+#22300
+
+
+
+
+
+
+
diff --git a/junaid_raza_lhr_r_2/junaidrazalhr.html b/junaid_raza_lhr_r_2/junaidrazalhr.html
new file mode 100644
index 0000000..39c8510
--- /dev/null
+++ b/junaid_raza_lhr_r_2/junaidrazalhr.html
@@ -0,0 +1,449 @@
+ + + + + + + + + + + + + + +junaidrazalhr.R + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + +
#getwd()
+getwd()
+
## [1] "C:/Users/junaid.raza/junaidraza_lhr_r_assignment2"
+
#setwd('C:/Users/junaid.raza/junaidraza_lhr_r_assignment2',header=TRUE)
+
+library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(tidyr)
+#install.packages("lubridate")
+library(lubridate)
+
## 
+## Attaching package: 'lubridate'
+
## The following object is masked from 'package:base':
+## 
+##     date
+
#Working with two data sets
+#Loading Data into df
+df <- read.csv('hospitaldata.csv', header = TRUE)
+genpro <- read.csv('hospitaldata.csv', header = TRUE, na.strings=c("","NA"))
+#Loading Data into test
+test <- read.csv('hospitaldata.csv', header = TRUE)
+
+#Loading Data into test
+#rlab <- read.csv('hospitaldata.csv', header = TRUE)
+
+
+#1
+#Removing the Dots 
+names(df) <- gsub("\\.", "", names(df))
+names(df)
+
##  [1] "Date"             "id"               "Time"            
+##  [4] "Age"              "Sex"              "ConsultingDoctor"
+##  [7] "Specialty"        "Procedure"        "TotalCharges"    
+## [10] "AmountReceived"   "AmountBalance"    "AmountReceivedBy"
+## [13] "AmountinHospital" "ReceptionistName" "NextApt"
+
#2
+#Which week day have max visits
+#Separate the Date column to extract three more columns:
+#Day Month and Year
+test <- test %>% separate(Date,c('day','month','year'),sep=',')
+df <- df %>%  separate(Date,c('day','month','year'),sep=',')
+weekdays<-table(df$day)
+unique(weekdays)
+
## [1] 26 51 20  7 33 42 43
+
weekdays
+
## 
+##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
+##        26        51        20         7        33        42        43
+
#Monday has the most visits (51)
+
+
+#3
+#Average age of patients
+class(df$Age)
+
## [1] "factor"
+
df$Age<-as.numeric(df$Age)
+mean(df$Age)
+
## [1] 23.33784
+
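#Mean is 23.33 -- NOTE(review): Age is a factor here, so as.numeric(df$Age) averages the factor level codes rather than the actual ages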
+
+
+#4
+#Children entertained (age 1 - 12)
+lengthofage<-length(which(df$Age<12))
+lengthofage
+
## [1] 58
+
#58 is length of records
+
+
+
+#5
+#Which gender have what max procedures
+class(df$Sex)
+
## [1] "factor"
+
class(df$Procedure)
+
## [1] "factor"
+
#Lower-case the Sex values (blank and "-" entries are left as they are, not replaced with NA)
+df$Sex<-tolower(df$Sex)
+df$Sex
+
##   [1] "f" "m" "f" "m" "m" "f" "m" "f" "f" "m" "f" "f" "f" "f" "f" "f" "f"
+##  [18] "f" "f" "f" "f" "m" "m" "m" "m" "m" "f" ""  "f" "f" "m" "f" "m" "f"
+##  [35] "m" "f" ""  "f" "f" "m" "f" "m" "m" "m" "f" "m" "m" "f" "f" "f" "m"
+##  [52] "m" "f" "f" "m" "m" "m" "m" "m" "m" "f" "f" "m" "m" "f" ""  ""  "" 
+##  [69] "f" "f" "m" "m" "f" "f" "m" "m" "m" "f" "m" "m" ""  "m" "m" "f" "" 
+##  [86] ""  ""  "m" "f" "m" ""  "f" "m" "f" "m" "m" "f" "m" "f" "f" "f" "m"
+## [103] "f" "m" "f" "f" "f" "f" "m" "f" "m" "f" "f" "m" "f" "m" "m" "f" "m"
+## [120] "f" "m" "f" "m" "m" "f" "f" "f" "f" "m" "m" "m" "m" "m" "f" "f" "f"
+## [137] "m" "f" "m" "m" "f" "-" "m" "m" "f" "f" "m" "m" "m" "m" "f" "m" "f"
+## [154] "m" "m" "m" "m" "f" "f" "m" "m" "f" "f" "m" ""  "f" "m" "m" "m" "f"
+## [171] "f" "f" "f" "f" "f" "f" "f" "m" "m" "m" "f" "f" "m" "f" "f" "m" "f"
+## [188] "m" "f" "m" "f" "-" "f" "f" "m" "f" "m" "m" "m" "m" "f" "m" "m" "m"
+## [205] "m" "m" "f" "m" "f" "m" "f" "m" "f" "f" "m" "f" "m" ""  "m" "m" "m"
+## [222] "f"
+
genproc <- df %>%
+  group_by(Sex,Procedure) %>%
+  summarize(count=n())
+View(genproc)
+
+
+#6
+#Highest Earning
+class(df$TotalCharges)
+
## [1] "factor"
+
df$TotalCharges<- as.numeric(as.character(df$TotalCharges))
+
## Warning: NAs introduced by coercion
+
class(df$TotalCharges)
+
## [1] "numeric"
+
class(df$ConsultingDoctor)
+
## [1] "factor"
+
df$ConsultingDoctor<-as.character(df$ConsultingDoctor)
+class(df$ConsultingDoctor)
+
## [1] "character"
+
docmaxpay <- df %>%
+  group_by(ConsultingDoctor) %>%
+  summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+docmaxpay$ConsultingDoctor[which(docmaxpay$total==max(docmaxpay$total, na.rm = TRUE),arr.ind=TRUE)]
+
## [1] "Dr Alaf Khan"
+
#Dr Alaf Khan
+
+
+#7
+#Procedure type earn max money
+class(df$Procedure)
+
## [1] "factor"
+
df$Procedure <- as.character(df$Procedure)
+maxprocd <- df %>%
+  group_by(Procedure) %>%
+  summarise(total = sum(TotalCharges, na.rm = TRUE))
+
+maxprocd$Procedure[which(maxprocd$total==max(maxprocd$total, na.rm = TRUE),arr.ind=TRUE)]
+
## [1] "Orthodontics"
+
#Orthodontics 240,000
+
+
+#8
+#Highest freq of Visits by Hour.
+class(df$Time)
+
## [1] "factor"
+
df$Time<-as.character(df$Time)
+df$Time[df$Time=="-"]<-""
+df$Time[df$Time==""]<-NA
+unique(df$Time)
+
##  [1] "11:00"   "10:45AM" "12:38PM" "1:00PM"  "2:45PM"  "3:00PM"  "3:28PM" 
+##  [8] "3:45PM"  "5:00PM"  "5:30PM"  "3:25PM"  "6:10PM"  "11:45PM" "12:40PM"
+## [15] "8:10PM"  "8:30PM"  "2:00PM"  "12:30PM" "1:30PM"  NA        "8:15PM" 
+## [22] "12:36PM" "2:30PM"  "3:15PM"  "5:20PM"  "3:50PM"  "6:00PM"  "4:30PM" 
+## [29] "02:00PM" "11:20AM" "8:00PM"  "6:30PM"  "9:00PM"  "6:20PM"  "11:25AM"
+## [36] "11:15AM" "1:10PM"  "3:30PM"  "6:15PM"  "9:40PM"  "12:00PM" "11:00AM"
+## [43] "10:15AM" "1:20PM"  "12:15PM" "1:15PM"  "4:50PM"  "2:10PM"  "12:50PM"
+## [50] "5:40PM"  "6:45PM"  "9:45PM"  "5:35PM"  "6:50PM"  "1:40PM"  "1:25PM" 
+## [57] "4:45PM"  "4:00PM"  "7:30PM"  "7:45PM"  "2:15PM"  "10:13AM" "2:40PM" 
+## [64] "10:00AM" "9:30AM"  "7:00PM"  "4:20PM"  "5:57PM"  "7:15PM"  "3:40PM" 
+## [71] "7:02PM"  "11:40AM" "4:10PM"  "11:30AM" "10:15PM" "7:11PM"  "10:10PM"
+## [78] "1:55PM"  "1:50PM"  "9:30PM"  "12:20PM" "10:30PM" "7:10PM"  "12:48PM"
+## [85] "7:05PM"  "5:45PM"  "7:40PM"  "9:35PM"  "10:00PM" "6:55PM"  "9:00AM" 
+## [92] "10:20AM" "11:20PM"
+
get_time<-df$Time
+
+get_time<-strptime(get_time,"%I:%M %p")
+get_hours<-hour(get_time)
+
+actual_mode <- table(get_hours)
+names(actual_mode)[actual_mode == max(actual_mode)]
+
## [1] "13"
+
#Hence the hour 13:00 is the most visited hour of the day
+
+
+
+#9
+#Create Bracket of Time.
+
+
+
+
+
+#10
+#patients are Repeated visitors
+class(df$id)
+
## [1] "integer"
+
vis_num <- df %>%
+  group_by(id) %>%
+  summarize(count=n())
+View(vis_num)
+vis_num$id[(which(vis_num$count >1))]
+
##  [1]   1   4  12  13  17  20  25  40  45  46  59  63  64  80  88  94  96
+## [18]  97 100 101 107 109 112 114 116 118 120 122 130 132 133 140 145 149
+## [35] 150 151 153
+
#11
+#Info Repeated Visitor
+length(vis_num$id[(which(vis_num$count >1))])
+
## [1] 37
+
#37
+
+
+#12
+#Patients who visited again for the same procedure
+vis_two <- df %>%
+  group_by(id,Procedure) %>%
+  summarize(count=n())
+vis_two
+
## Source: local data frame [182 x 3]
+## Groups: id [?]
+## 
+##       id               Procedure count
+##    <int>                   <chr> <int>
+## 1      1               Cancelled     1
+## 2      1               Injection     1
+## 3      1                Pharmacy    10
+## 4      2            Consultation     1
+## 5      3                Dressing     1
+## 6      4 Consultation + Dressing     1
+## 7      4               Er Retain     1
+## 8      5                   X Ray     1
+## 9      6                     USG     1
+## 10     7         Laboratory Test     1
+## # ... with 172 more rows
+
vis_two <- vis_two$id[(which(vis_two$count >1))]
+vis_two
+
##  [1]   1  12  13  17  17  20  25  45  46  63  80  94  97 101 109 112 114
+## [18] 116 122 130 140 145 151 153
+
#13
+#Median age for male and females
+fmed<-which(df$Sex=='f',arr.ind =TRUE) 
+median(as.numeric(as.character(df$Age[fmed])),na.rm = TRUE)
+
## [1] 22
+
#22
+mmed<-which(df$Sex=='m',arr.ind =TRUE) 
+median(as.numeric(as.character(df$Age[mmed])),na.rm = TRUE)
+
## [1] 21
+
#21
+
+
+
+#14
+#Total Amount Balance
+
+
+
+#15
+#Procedure consultation Money
+consult_money<- which(df$Procedure=='Consultation', arr.ind=TRUE)
+sum(df$TotalCharges[consult_money], na.rm=TRUE)
+
## [1] 83950
+
#83950
+
+
+
+
+#16
+#Relation age and Total Charges
+class(df$Age)
+
## [1] "numeric"
+
class(df$TotalCharges)
+
## [1] "numeric"
+
df$TotalCharges<-as.numeric(as.character(df$TotalCharges))
+cor.test(df$Age, df$TotalCharges)
+
## 
+##  Pearson's product-moment correlation
+## 
+## data:  df$Age and df$TotalCharges
+## t = -0.066519, df = 202, p-value = 0.947
+## alternative hypothesis: true correlation is not equal to 0
+## 95 percent confidence interval:
+##  -0.1419601  0.1327763
+## sample estimates:
+##          cor 
+## -0.004680223
+
#The p-value (0.947) is greater than 0.05, so the correlation is not statistically significant
+
+
+
+#17
+#Age with max high visits
+
+
+
+
+
+
+#18
+#Total cost of the X Ray and Scalling procedures
+total_xray_cost<- which(df$Procedure=='X Ray' | df$Procedure=='Scalling' , arr.ind=TRUE)
+sum(df$TotalCharges[total_xray_cost], na.rm=TRUE)
+
## [1] 22300
+
#22300
+ + + + +
+ + + + + + + +