Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 70 additions & 3 deletions week1/citibike.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ library(lubridate)
# READ AND TRANSFORM THE DATA
########################################




# Read one month of Citi Bike trip data.
# The file is read once, from the path relative to the working directory;
# a second read from a different location would only be discarded (or fail
# if the file is not present there).
trips <- read_csv("./coursework/week1/201402-citibike-tripdata.csv")

# Open the data in the viewer for a quick look
view(trips)

# Swap every space in the column names for an underscore
colnames(trips) <- gsub(" ", "_", colnames(trips))
Expand All @@ -23,26 +28,88 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
########################################

# Number of trips: each row of the data frame is one trip
nrow(trips)

# Earliest and latest birth years.
# birth_year is coerced to numeric first; na.rm = TRUE makes max()/min()
# skip the values that end up as NA.
trips <- mutate(trips, birth_year = as.numeric(birth_year))
max(trips[["birth_year"]], na.rm = TRUE)
min(trips[["birth_year"]], na.rm = TRUE)

# Trips that touch Broadway at either end: the start OR the end station
# name contains "Broadway"
filtered_df <- trips %>%
  filter(grepl("Broadway", start_station_name) | grepl("Broadway", end_station_name))
view(filtered_df)

# Trips that both start AND end on Broadway
# (comma-separated conditions in filter() are combined with AND)
filtered_both_df <- trips %>%
  filter(grepl("Broadway", start_station_name), grepl("Broadway", end_station_name))
nrow(filtered_both_df)
# All distinct station names, pooling start and end stations.
# (Adding the distinct counts of the two columns separately would double
# count every station that appears in both.)
unique(c(trips$start_station_name, trips$end_station_name))

# Per-gender summary computed in a single summarize() call:
# number of trips, mean trip duration and standard deviation of trip duration.
# Durations are divided by 60 (presumably seconds -> minutes; confirm the
# unit of tripduration).
grouped_df <- summarize(
  group_by(trips, gender),
  num_trips = n(),
  average_trip = mean(tripduration) / 60,
  standard_deviation = sd(tripduration) / 60
)
grouped_df




# The 10 most frequent station-to-station trips:
# count trips per (start, end) pair, then order from most to least frequent
frequent_df <- arrange(
  summarize(
    group_by(trips, start_station_name, end_station_name),
    num_trips = n()
  ),
  desc(num_trips)
)

head(frequent_df, n = 10)

# Top 3 end stations for trips starting from each start station.
# The result is assigned so that it can be printed below; without the
# assignment, referencing top3_end_stations fails with "object not found".
top3_end_stations <- trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(num_trips = n(), .groups = "drop") %>%
  group_by(start_station_name) %>%
  # .by_group = TRUE sorts within each start station, so slice() picks the
  # three most frequent destinations of that station
  arrange(desc(num_trips), .by_group = TRUE) %>%
  slice(1:3) %>%
  ungroup()

top3_end_stations
# Top 3 most common station-to-station trips within each gender:
# count trips per (start, end, gender), regroup by gender, order by count
# and keep the first three rows of every gender
trips %>%
  group_by(start_station_name, end_station_name, gender) %>%
  summarise(num_trips = n()) %>%
  group_by(gender) %>%
  arrange(desc(num_trips)) %>%
  slice_head(n = 3)




# Day with the most trips.
# count() derives the calendar date (start time without time of day) and
# tallies the trips per date in one step; the top row after sorting is the
# busiest day.
trips %>%
  count(date_only = as.Date(starttime)) %>%
  arrange(desc(n)) %>%
  slice_head(n = 1)

# Average number of trips taken during each of the 24 hours of the day
# across the entire month, and the peak hour.
view(trips)

# One row per hour: total trips in that hour divided by the number of days
# in the month. days_in_month() is applied to a single timestamp per group
# so that summarise() yields one value per hour rather than one per trip;
# as.integer() drops the month name that days_in_month() attaches.
trips %>%
  mutate(hours = hour(starttime)) %>%
  group_by(hours) %>%
  summarise(
    count = n(),
    day_in_month_count = as.integer(days_in_month(first(starttime))),
    avg = count / day_in_month_count
  )

# Peak hour: the hour of the day with the largest number of trips
trips %>%
  mutate(hours = hour(starttime)) %>%
  group_by(hours) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1)




40 changes: 39 additions & 1 deletion week1/citibike.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,58 @@
#

# count the number of unique stations
#$ cut -d , -f4 201402-citibike-tripdata.csv | sort | uniq | wc -l
#330


# count the number of unique bikes
#$ cut -d , -f12 201402-citibike-tripdata.csv | sort | uniq | wc -l
#5700

# count the number of trips per day
# $ cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2
# 1 starttime
# 876 2014-02-13

# find the day with the most rides
#$cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1
# 13816 2014-02-02


# find the day with the fewest rides
#$cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2

# find the id of the bike with the most rides
#$cut -d , -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n1

# count the number of rides by gender and birth year
#$ cut -d, -f15,14 201402-citibike-tripdata.csv | sort | uniq -c | sort -r

# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)

# $ cut -d, -f5 201402-citibike-tripdata.csv | tail -n +2 | grep -E '.*[0-9].*&.*[0-9].*' | wc -l
# 90549

# compute the average trip duration
# awk -F, '{sum += $1; count++} END {print sum/count}' 201402-citibike-tripdata.csv
# 874.516


#running average script for first 1000 lines
# $ head -n 1000 201402-citibike-tripdata.csv | awk '{
# window[NR % 3] = $1
# if (NR >= 3) {
# print (window[(NR-1)%3] + window[(NR-2)%3] + window[(NR-3)%3]) / 3
# }
# }'

#MUSICAL CHAIR in python
# import random

# names = ["Alou", "Srijana", "Sara", "Drishya", "Dereck", "Ahmed",
# "Aisha", "Vaishnavi", "Naomi", "Sofia", "Ye", "Vanessa"]

# random.shuffle(names)

# for i in range(0, 12, 2):
# print(f"{names[i]} & {names[i+1]}")

157 changes: 155 additions & 2 deletions week1/plot_trips.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,98 @@ theme_set(theme_bw())

# Load the RData file output by load_trips.R and preview the first rows of trips
load("trips.RData")

head(trips, n = 6)

########################################
# plot trip data
########################################
######################################

# Distribution of trip times across all rides: histogram vs. density plot.
# tripduration is divided by 60 so the x axis is in minutes (as the titles
# state); the x axis is log-scaled.
ggplot(trips, aes(tripduration / 60)) +
  geom_histogram(fill = 'blue', bins = 50) +
  scale_x_log10(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(x = 'Trip duration (minutes)',
       y = 'Frequency',
       title = 'Histogram of trip duration in minutes')

# Same data as a density estimate; the y axis is a density, not a count
ggplot(trips, aes(tripduration / 60)) +
  geom_density(fill = 'red') +
  scale_x_log10(labels = comma) +
  labs(x = 'Trip duration (minutes)',
       y = 'Density',
       title = 'Density plot in minutes')

head(trips)

# Distribution of trip times by rider type: usertype drives both the fill
# colour and the facet column. Histogram first, density plot second.
ggplot(trips, aes(tripduration, fill = usertype)) +
  geom_histogram(bins = 40) +
  facet_grid(cols = vars(usertype)) +
  scale_x_log10(labels = comma) +
  labs(x = 'Trip duration')

ggplot(trips, aes(tripduration, fill = usertype)) +
  geom_density() +
  facet_grid(cols = vars(usertype)) +
  scale_x_log10(labels = comma) +
  labs(x = 'Trip duration')

# Total number of trips on each day in the dataset.
# Trips are counted per calendar date first, so every point on the line is
# exactly one day (a fixed-bin histogram over dates would merge several days
# into one bar whenever the data spans more days than bins).
trips %>%
  count(date = as.Date(starttime), name = "num_trips") %>%
  ggplot(aes(x = date, y = num_trips)) +
  geom_line() +
  scale_y_continuous(labels = comma) +
  scale_x_date(labels = date_format("%Y-%m-%d")) +
  labs(x = "Date", y = "Number of trips")

# Number of trips (y axis) by rider age (x axis), with gender as fill colour.
# Age is the year of the trip date (ymd) minus the rider's birth year.
trips %>%
  mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>%
  ggplot(aes(x = age, fill = gender)) +
  geom_histogram(alpha = 0.8, bins = 40) +
  scale_y_continuous(labels = comma)

# Ratio of male to female trips (y axis) by age (x axis).
# Trips are counted per (age, gender), then pivot_wider() spreads the gender
# levels into columns so the ratio is a plain column division.
trips %>%
  mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>%
  count(age, gender, name = "num_trips") %>%
  pivot_wider(names_from = gender, values_from = num_trips) %>%
  mutate(male_to_female = Male / Female) %>%
  ggplot(aes(x = age, y = male_to_female)) +
  geom_line(color = "steelblue", size = 1) +
  geom_smooth(color = "red", linetype = "dashed") +
  scale_x_log10() +
  labs(
    x = "AGE",
    y = "Male to Female Ratio",
    title = "Male/Female trip ratio by Age"
  ) +
  theme_minimal()

########################################
# plot weather data
########################################
# Minimum temperature (y axis) for each day (x axis); points are coloured on
# a blue-to-red gradient by the same minimum temperature
view(weather)
ggplot(weather, aes(x = ymd, y = tmin, color = tmin)) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    x = "Day",
    y = "Minimum temperature",
    title = "Scatterplot of min temp over each day"
  )


# Minimum and maximum temperature (y axis, one colour each) for each day.
# pivot_longer() stacks tmin and tmax into a single temperature column with
# a temp_type label, which is what the colour aesthetic maps to.
view(weather)

weather %>%
  pivot_longer(
    cols = c(tmin, tmax),
    names_to = "temp_type",
    values_to = "temperature"
  ) %>%
  ggplot(aes(x = ymd, y = temperature, color = temp_type)) +
  geom_line() +
  scale_x_date() +
  labs(
    title = "Daily Min and Max Temperatures",
    x = "Date",
    y = "Temperature",
    color = "Temperature Type"
  )



########################################
# plot trip and weather data
Expand All @@ -45,18 +111,105 @@ load('trips.RData')
# Join trips and weather on the shared ymd (date) column
trips_with_weather <- trips %>%
  inner_join(weather, by = "ymd")

head(trips_with_weather)

# Number of trips as a function of the minimum temperature, one point per day.
# Counting by (ymd, tmin) gives one row per day and carries that day's tmin.
str(weather)
head(trips)
trips_with_weather %>%
  count(ymd, tmin, name = "num_trips") %>%
  ggplot(aes(x = tmin, y = num_trips)) +
  geom_point()

#works only for this data frame since tmin is not different
# trips_by_day <- trips %>%
# mutate(date = as.Date(starttime)) %>%
# group_by(date)%>%
# summarise(num_trips = n())
# weather %>%
# mutate(date = as.Date(date))%>%
# inner_join(trips_by_day,weather_by_day, by ='date') %>%
# ggplot(aes(x= mean(tmin),y= num_trips))+
# geom_point() +
# labs(x = "Minimum Temperature", y = "Number of Trips",
# title = "Trips vs. Min Temperature")

# Trips vs. minimum temperature (one point per day), split by whether the
# day had substantial precipitation.
# "Substantial" here means prcp above the mean of prcp over the joined rows;
# precptf holds 'T'/'F' accordingly and drives the facets.
trips_with_weather %>%
  mutate(precptf = ifelse(prcp > mean(prcp), 'T', 'F')) %>%
  group_by(ymd, tmin, precptf) %>%
  # .groups = "drop" already returns an ungrouped result
  summarise(num_trips = n(), .groups = "drop") %>%
  ggplot(aes(tmin, num_trips)) +
  geom_point() +
  # x is tmin and y is num_trips, so the labels follow that order
  labs(x = "minimum temp",
       y = "number of trip",
       title = "Substantial precipitation on num of trips") +
  facet_wrap(~precptf)



# Trips vs. minimum temperature split by substantial precipitation, with a
# smoothed fit from geom_smooth() drawn on top of the daily points.
# precptf is 'T' when prcp exceeds the mean of prcp over the joined rows.
trips_with_weather %>%
  mutate(precptf = ifelse(prcp > mean(prcp), 'T', 'F')) %>%
  group_by(ymd, tmin, precptf) %>%
  # .groups = "drop" already returns an ungrouped result
  summarise(num_trips = n(), .groups = "drop") %>%
  ggplot(aes(tmin, num_trips)) +
  geom_point() +
  geom_smooth() +
  # x is tmin and y is num_trips, so the labels follow that order
  labs(x = "minimum temp",
       y = "number of trip",
       title = "Substantial precipitation on num of trips") +
  facet_wrap(~precptf)

# Average and standard deviation of the number of trips by hour of the day.
# Step 1 counts trips per (hour, date); step 2 summarises those daily counts
# across dates for every hour. hour() comes from lubridate.
library(lubridate)

trips_with_weather %>%
  count(
    hour = hour(starttime),
    date = as.Date(starttime),
    name = "num_trips"
  ) %>%
  group_by(hour) %>%
  summarise(
    average_trips = mean(num_trips),
    sd_trips = sd(num_trips)
  )

# Plot the hourly mean number of trips with a +/- 1 standard deviation band.
# Trips are counted per (hour, date) first, then averaged across dates.
trips_with_weather %>%
  mutate(hour = hour(starttime),
         date = as.Date(starttime)) %>%
  group_by(hour, date) %>%
  summarise(num_trips = n(), .groups = "drop") %>%
  group_by(hour) %>%
  summarise(
    average_trips = mean(num_trips),
    sd_trips = sd(num_trips)) %>%
  ggplot(aes(hour, average_trips)) +
  geom_line(color = "red") +
  # fill is set explicitly so the ribbon is blue, as the subtitle says
  geom_ribbon(aes(ymin = average_trips - sd_trips,
                  ymax = average_trips + sd_trips),
              fill = "blue", alpha = 0.25) +
  labs(
    x = "Hour of Day",
    y = "Average Number of Trips",
    title = "Average Number of Trips by Hour with ±1 SD Ribbon",
    subtitle = "Red line: Mean trips per hour; Blue ribbon: ±1 standard deviation"
  ) +
  theme_minimal()


# Hourly mean and standard deviation of trips, split by day of the week.
# Trips are counted per (hour, weekday, calendar day), then summarised across
# calendar days for each (hour, weekday); one facet per weekday.
trips %>%
  count(
    hour = hour(starttime),
    weekday = wday(starttime, label = TRUE),
    day = as.Date(starttime),
    name = "trip_count"
  ) %>%
  group_by(hour, weekday) %>%
  summarise(
    average = mean(trip_count),
    standarddev = sd(trip_count),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = hour, y = average)) +
  geom_line(color = "red") +
  geom_ribbon(
    aes(ymin = average - standarddev, ymax = average + standarddev),
    alpha = 0.25
  ) +
  facet_wrap(vars(weekday))
Loading