Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .RData
Binary file not shown.
512 changes: 512 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

223 changes: 223 additions & 0 deletions CleanHospitalData.csv

Large diffs are not rendered by default.

177 changes: 177 additions & 0 deletions R_Markdown_HAK.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
---
title: "Assignment_2"
author: "HAK"
date: "25 March 2017"
output: pdf_document
---

```{r Importing Libraries, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#importing library
library(dplyr)
library(tidyr)
library(lubridate)
```

## R ASSIGNMENT 2

Assignment 2 of a hospital data.

```{r Importing CSV, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#loading csv from directory
bufferedDataFrame <- read.csv("hospitaldata.csv", strip.white = T, na.strings = c("-",""," ","\t","\n",NA), stringsAsFactors = F)
```

```{r Cleaning Data and Proper Conversion to Variables, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#CLEANING DATA
#converting to tbl format
df <- tbl_df(bufferedDataFrame)

#1. Removing dots(.) from column names
names(df) <- gsub("\\.", "", names(df))

#Removing character from Age Like M
df$Age <- as.numeric(gsub("[^0-9]",'',df$Age))

#Cleaning Date Column
#Converting date factor format to Date class
df$Date <- as.Date(strptime(df$Date, "%a, %B %d, %Y"))

#Cleaning Time Column with formatting
df$Time <- format(strptime(df$Time, format='%I:%M %p'), '%I:%M %p')

#Changing case of Sex to upper case
df$Sex <- toupper(df$Sex)

#Remove Cancelled to NA as no payment is not given
df$TotalCharges <- as.numeric(gsub("cancelled", NA, ignore.case = T,df$TotalCharges))

#Remove Cancelled to NA as no consultation is not given
df$Procedure <- (gsub("cancelled", NA, ignore.case = T,df$Procedure))

#Coverting AmountBalance to numeric
df$AmountBalance <- as.numeric(gsub(",",'',df$AmountBalance))
class(df$AmountBalance)

#Exporting Clean CSV
write.csv(df, "CleanHospitalData.csv")

#View clean data
glimpse(df)
View(df)
```

```{r Questions}
#QUESTIONS
#2. Which day of the week is expected to have most visits?
print(paste("Average age of a person is",weekdays(df$Date[which(table(df$Date) == max(table(df$Date)))])))

#3. What is the average age of patients?
print(paste("Average age of a person is ",mean(df$Age, na.rm = T)))

#4. How many children were entertained? (Make a Bracket of Age from 1-12)
total_entertained <- df %>%
select(Age) %>%
filter(Age >= 1 & Age <= 12, !is.na(.)) %>%
count()
paste("Children that were entertained are", total_entertained)

#5. Which gender type had what kind of procedure in abundance? i.e. Female visit mostly because of Gynae Problem
df %>%
group_by(Sex, Procedure) %>%
tally(sort = T) %>%
filter(!is.na(Sex)) %>%
View()

#6. Which Doctor is earning highest?
highest_earning_doctor <- df %>%
group_by(ConsultingDoctor) %>%
summarise_each(funs(sum(TotalCharges, na.rm = T))) %>%
select(ConsultingDoctor, TotalCharges) %>%
arrange(desc(TotalCharges))

paste("Highest paid doctor is",(highest_earning_doctor$ConsultingDoctor[which(highest_earning_doctor$TotalCharges == max(highest_earning_doctor$TotalCharges))]), "with pay of",max(highest_earning_doctor$TotalCharges))

#7. Which procedure type earns more money?
highest_earning_procedure <- df %>%
group_by(Procedure) %>%
summarise_each(funs(sum(TotalCharges, na.rm = T))) %>%
select(Procedure, TotalCharges) %>%
filter(!is.na(Procedure)) %>%
arrange(desc(TotalCharges))
View(highest_earning_procedure)
paste("Highest earning procedure is",(highest_earning_procedure$Procedure[which(highest_earning_procedure$TotalCharges == max(highest_earning_procedure$TotalCharges))]), "worth of",max(highest_earning_procedure$TotalCharges))

#8. Which time of the day has highest frequency of visits by hour?
high_frequency_time <- df %>%
filter(!is.na(Date), !is.na(Time)) %>%
group_by(Date,Time) %>%
tally() %>%
arrange(desc(n))
View(high_frequency_time)
indexOfMax <- which(high_frequency_time$n == max(high_frequency_time$n))
paste("Highest number of visits were on",weekdays(high_frequency_time$Date[indexOfMax]),"at",format(high_frequency_time$Time[indexOfMax]))

#10. How many patients are repeated visitors?
visitor <- df %>%
group_by(id) %>%
tally() %>%
filter(n > 1) %>%
arrange(desc(n))
View(visitor)
paste("Number of repeated visitors are", count(visitor))

#11. Give us the id of repeated visitors.
#Data is take from above
View(visitor)
print(visitor$id)

#12. Which patients visited again for the same problem?
patients_repeated <- df %>%
group_by(id, Procedure) %>%
tally() %>%
filter(!is.na(Procedure), n > 1) %>%
arrange(desc(n))
View(patients_repeated)
#This data show ignoring NA for procedure variable
paste("Patient repeated most for is",(patients_repeated$Procedure[which(patients_repeated$n == max(patients_repeated$n))]))

#13. What is the median age for Females and Males?
df %>%
group_by(Sex) %>%
summarise(median(Age, na.rm = T))

#14. What is the total amount in balance?
paste("Total amount in balance is",sum(df$AmountBalance, na.rm = T))

#15. How much money was made by Procedure Type "Consultation"?
df %>%
group_by(Procedure) %>%
summarise(Total_Amount = sum(TotalCharges, na.rm = T)) %>%
filter(Procedure %in% c("Consultation"))

#16.Is there a relation between Age and Total Charges paid
data <- df %>%
filter(!is.na(Age), !is.na(TotalCharges))
paste(cor(data$Age, data$TotalCharges),"shows positive linearly directly propotional.")

#17. Which Age group had highest number of visits?
# HIGHEST NUMBER OF VISIT WAS BY AGE UNASIGNED 'NA' THAT IS 28 BUT IT WAS IGNORED
highest_age_visit <- df %>%
group_by(Age) %>%
tally() %>%
filter(!is.na(Age)) %>%
arrange(desc(n))
View(highest_age_visit)
paste("Highest number of visit was by age group of",(highest_age_visit$Age[which(highest_age_visit$n == max(highest_age_visit$n))]), "with number of visits of",max(highest_age_visit$n))

#18. What is the total cost earned by Procedure Type X Ray and Scalling together?
TotalEarning <- df %>%
filter(Procedure %in% c("X Ray", "Scalling")) %>%
summarise(Total = sum(TotalCharges, na.rm = T))
View(TotalEarning)
paste("Total earning by both X Ray and Scalling is worth",TotalEarning)

```
281 changes: 281 additions & 0 deletions R_Markdown_HAK.html

Large diffs are not rendered by default.

178 changes: 178 additions & 0 deletions hammad_R_assignment2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
library(dplyr)
library(tidyr)
library(lubridate)

#loading csv from directory
bufferedDataFrame <- read.csv("hospitaldata.csv", strip.white = T, na.strings = c("-",""," ","\t","\n",NA), stringsAsFactors = F)

#CLEANING DATA
#converting to tbl format
df <- tbl_df(bufferedDataFrame)

#1. Removing dots(.) from column names
names(df) <- gsub("\\.", "", names(df))

#Removing character from Age Like M
df$Age <- as.numeric(gsub("[^0-9]",'',df$Age))

#Cleaning Date Column
#Converting date factor format to Date class
df$Date <- as.Date(strptime(df$Date, "%a, %B %d, %Y"))

#Cleaning Time Column with formatting
df$Time <- format(strptime(df$Time, format='%I:%M %p'), '%I:%M %p')

#Changing case of Sex to upper case
df$Sex <- toupper(df$Sex)

#Remove Cancelled to NA as no payment is not given
df$TotalCharges <- as.numeric(gsub("cancelled", NA, ignore.case = T,df$TotalCharges))

#Remove Cancelled to NA as no consultation is not given
df$Procedure <- (gsub("cancelled", NA, ignore.case = T,df$Procedure))

#Coverting AmountBalance to numeric
df$AmountBalance <- as.numeric(gsub(",",'',df$AmountBalance))
class(df$AmountBalance)

#Exporting Clean CSV
write.csv(df, "CleanHospitalData.csv")

#View clean data
glimpse(df)
View(df)

#QUESTIONS
#2. Which day of the week is expected to have most visits?
print(paste("Average age of a person is",weekdays(df$Date[which(table(df$Date) == max(table(df$Date)))])))

#3. What is the average age of patients?
print(paste("Average age of a person is ",mean(df$Age, na.rm = T)))

#4. How many children were entertained? (Make a Bracket of Age from 1-12)
total_entertained <- df %>%
select(Age) %>%
filter(Age >= 1 & Age <= 12, !is.na(.)) %>%
count()
paste("Children that were entertained are", total_entertained)

#5. Which gender type had what kind of procedure in abundance? i.e. Female visit mostly because of Gynae Problem
df %>%
group_by(Sex, Procedure) %>%
tally(sort = T) %>%
filter(!is.na(Sex)) %>%
View()

#6. Which Doctor is earning highest?
highest_earning_doctor <- df %>%
group_by(ConsultingDoctor) %>%
summarise_each(funs(sum(TotalCharges, na.rm = T))) %>%
select(ConsultingDoctor, TotalCharges) %>%
arrange(desc(TotalCharges))

paste("Highest paid doctor is",(highest_earning_doctor$ConsultingDoctor[which(highest_earning_doctor$TotalCharges == max(highest_earning_doctor$TotalCharges))]), "with pay of",max(highest_earning_doctor$TotalCharges))

#7. Which procedure type earns more money?
highest_earning_procedure <- df %>%
group_by(Procedure) %>%
summarise_each(funs(sum(TotalCharges, na.rm = T))) %>%
select(Procedure, TotalCharges) %>%
filter(!is.na(Procedure)) %>%
arrange(desc(TotalCharges))
View(highest_earning_procedure)
paste("Highest earning procedure is",(highest_earning_procedure$Procedure[which(highest_earning_procedure$TotalCharges == max(highest_earning_procedure$TotalCharges))]), "worth of",max(highest_earning_procedure$TotalCharges))

#8. Which time of the day has highest frequency of visits by hour?
#It its only about number of highes frequency
#"01:00 PM" "01:30 PM" "03:00 PM" "12:00 PM"
high_time_repeated <- df %>%
filter(!is.na(Time)) %>%
group_by(Time) %>%
tally() %>%
arrange(desc(n))
View(high_time_repeated)
high_time_repeated$Time[high_time_repeated$n == max(high_time_repeated$n)]

#If your are talking about particular day
high_frequency_time <- df %>%
filter(!is.na(Date), !is.na(Time)) %>%
group_by(Date,Time) %>%
tally() %>%
arrange(desc(n))
View(high_frequency_time)
indexOfMax <- which(high_frequency_time$n == max(high_frequency_time$n))
paste("Highest number of visits were on",weekdays(high_frequency_time$Date[indexOfMax]),"at",format(high_frequency_time$Time[indexOfMax]))


#9. Create a bracket of time by Morning, Afternoon, Evening, Night (6am - 12pm - Morning, 12 pm- 4 pm, Afternoon, 4 pm- 7pm, Evening, 7pm - 6 am, Night).

hour(strptime(df$Time, "%I:%M %p"))
df %>%
mutate(TimeBracket = ifelse(hour(strptime(df$Time, "%I:%M %p")) >= 6 & hour(strptime(df$Time, "%I:%M %p")) <= 12, "Morning",
ifelse(hour(strptime(df$Time, "%I:%M %p")) > 12 & hour(strptime(df$Time, "%I:%M %p")) <= 16, "Afternoon",
ifelse(hour(strptime(df$Time, "%I:%M %p")) > 16 & hour(strptime(df$Time, "%I:%M %p")) <= 19, "Evening",
ifelse(hour(strptime(df$Time, "%I:%M %p")) > 19, "Night", NA))))) %>%
View()



#10. How many patients are repeated visitors?
visitor <- df %>%
group_by(id) %>%
tally() %>%
filter(n > 1) %>%
arrange(desc(n))
View(visitor)
paste("Number of repeated visitors are", count(visitor))


#11. Give us the id of repeated visitors.
#Data is take from above
View(visitor)
print(visitor$id)

#12. Which patients visited again for the same problem?
patients_repeated <- df %>%
group_by(id, Procedure) %>%
tally() %>%
filter(!is.na(Procedure), n > 1) %>%
arrange(desc(n))
View(patients_repeated)
#This data show ignoring NA for procedure variable
paste("Patient repeated most for is",(patients_repeated$Procedure[which(patients_repeated$n == max(patients_repeated$n))]))

#13. What is the median age for Females and Males?
df %>%
group_by(Sex) %>%
summarise(median(Age, na.rm = T))

#14. What is the total amount in balance?
paste("Total amount in balance is",sum(df$AmountBalance, na.rm = T))

#15. How much money was made by Procedure Type "Consultation"?
df %>%
group_by(Procedure) %>%
summarise(Total_Amount = sum(TotalCharges, na.rm = T)) %>%
filter(Procedure %in% c("Consultation"))

#16.Is there a relation between Age and Total Charges paid
data <- df %>%
filter(!is.na(Age), !is.na(TotalCharges))
paste(cor(data$Age, data$TotalCharges),"shows positive linearly directly propotional.")

#17. Which Age group had highest number of visits?
# HIGHEST NUMBER OF VISIT WAS BY AGE UNASIGNED 'NA' THAT IS 28 BUT IT WAS IGNORED
highest_age_visit <- df %>%
group_by(Age) %>%
tally() %>%
filter(!is.na(Age)) %>%
arrange(desc(n))
View(highest_age_visit)
paste("Highest number of visit was by age group of",(highest_age_visit$Age[which(highest_age_visit$n == max(highest_age_visit$n))]), "with number of visits of",max(highest_age_visit$n))

#18. What is the total cost earned by Procedure Type X Ray and Scalling together?
TotalEarning <- df %>%
filter(Procedure %in% c("X Ray", "Scalling")) %>%
summarise(Total = sum(TotalCharges, na.rm = T))
View(TotalEarning)
paste("Total earning by both X Ray and Scalling is worth",TotalEarning)
382 changes: 382 additions & 0 deletions hammad_R_assignment2.html

Large diffs are not rendered by default.

Loading