Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 47 additions & 10 deletions week1/citibike.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,80 @@ library(lubridate)
########################################

# read one month of data
trips <- read_csv('201402-citibike-tripdata.csv')

# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))

# convert dates strings to dates
# trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime))

# recode gender as a factor 0->"Unknown", 1->"Male", 2->"Female"
trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unknown","Male","Female")))


########################################
# YOUR SOLUTIONS BELOW
########################################

# count the number of trips (= rows in the data frame)
summarize(trips, count = n())

# find the earliest and latest birth years (see help for max and min to deal with NAs)
max(trips$birth_year, na.rm = FALSE)
#had issue with min that data model was not formatted with NA, instead it was \\N so figured out how to change it to NA so min could work
#my partner had a better solution as.numeric(trips$birthyear) which will automatically cast all non numerics as NA so the following is silly and overdramatic and pointless
trips <-
mutate(trips, birth_year
= if_else(!str_detect(birth_year,
"^[0-9]+$"), "NA", birth_year))
min(trips$birth_year, na.rm = FALSE)

# use filter and grepl to find all trips that either start or end on broadway
filter(trips, grepl("Broadway", start_station_name)| grepl("Broadway", end_station_name))

# do the same, but find all trips that both start and end on broadway
filter(trips, grepl("Broadway", start_station_name) & grepl("Broadway", end_station_name))

# find all unique station names
trips |> distinct(start_station_name)

# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments
summarize(group_by(trips, gender),
count = n(), mean = mean(tripduration),
sd = sd(tripduration))

# find the 10 most frequent station-to-station trips
trips |> group_by(start_station_name, end_station_name) |>
summarize(count = n()) |>
arrange(desc(count)) |>
head(n=10)

# find the top 3 end stations for trips starting from each start station
trips |> group_by(start_station_name, end_station_name) |>
summarize(count = n()) |>
group_by(start_station_name) |>
arrange(desc(count)) |>
mutate(rank = row_number()) |>
filter(rank <= 3) |>
arrange(start_station_name)
#Jake's solution: trips |>
#count(start_station_name, end_station_name) |>
#group_by(start_station_name) |>
#arrange(desc(n)) |>
#slice(1:3)



# find the top 3 most common station-to-station trips by gender
trips |> group_by(start_station_name, end_station_name, gender) |>
summarize(count = n()) |>
arrange(desc(count))|>
group_by(gender) |>
mutate(rank = row_number()) |>
filter(rank <=3) |>
arrange(gender)


# find the day with the most trips
trips |> group_by(as_date(starttime)) |> summarize(count = n()) |> arrange(desc(count)) |> head(n=1)

# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)


# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?
trips |> group_by(hour(starttime)) |> summarize(count =n(), average = count/n_distinct(as_date(starttime)))

trips |> group_by(hour(starttime)) |> summarize(count =n(), average = count/n_distinct(as_date(starttime))) |> arrange(average) |> tail(n=1)
22 changes: 11 additions & 11 deletions week1/citibike.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
#!/bin/bash
#
# add your solution after each of the 10 comments below
# add your solution aftegr each of the 10 comments below
#

# count the number of unique stations

cut -d, -f4 201402-citibike-tripdata.csv | sort | uniq -c | wc -l
# count the number of unique bikes

cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l
# count the number of trips per day

cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c
# find the day with the most rides

cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -1
# find the day with the fewest rides

cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | tail -n +2 | head -1
# find the id of the bike with the most rides

# count the number of rides by gender and birth year

cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -1
# count the number f rides by gender and birth year
cut -d, -f14,15 201402-citibike-tripdata.csv | tail -2 +2 | sort | uniq -c
# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)


cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].*&.*[0-9].*' | sort | uniq -c | awk '{sum += $1} END {print sum}'
# compute the average trip duration
awk -F, '{sum += $1; count++} END {print sum/count}' 201402-citibike-tripdata.csv
Loading