Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 31 additions & 3 deletions week1/citibike.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ library(lubridate)
########################################

# read one month of data
trips <- read_csv('201402-citibike-tripdata.csv')
trips <- read_csv('./week1/201402-citibike-tripdata.csv')

# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))
Expand All @@ -23,26 +23,54 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
########################################

# count the number of trips (= rows in the data frame)
print(nrow(trips))


# find the earliest and latest birth years (see help for max and min to deal with NAs)
birth_year_col <- as.numeric(birth_year_col)
birth_year_col <- na.omit(birth_year_col)
min(birth_year_col)
max(birth_year_col)

# use filter and grepl to find all trips that either start or end on broadway
filter(trips, grepl("Broadway", start_station_name) | grepl("Broadway", end_station_name))

# do the same, but find all trips that both start and end on broadway
filter(trips, grepl("Broadway", start_station_name) & grepl("Broadway", end_station_name))

# find all unique station names

stations <- paste(trips$start_station_name, trips$end_station_name)
unique_stations <- unique(stations)

# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments

trips %>% group_by(gender) %>% summarize(count =n(), mean = mean(tripduration), std = sd(tripduration))


# find the 10 most frequent station-to-station trips
trips %>%
group_by(start_station_name, end_station_name)
%>% summarize(frequency = n()) %>% arrange(desc(frequency)) %>% slice_head(n = 10)

# find the top 3 end stations for trips starting from each start station

trips %>% group_by(start_station_name, end_station_name) %>% summarize(count = n()) %>% group_by(start_station_name) %>% arrange(desc(count)) %>% slice_head(n=3)
# find the top 3 most common station-to-station trips by gender

trips %>% group_by(start_station_name, end_station_name) %>%
group_by(gender) %>% summarize(frequency = n()) %>% arrange(desc(frequency)) %>%
slice_head(n =3)
# find the day with the most trips
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)

trips %>% group_by(as.Date(starttime)) %>% summarize(frequency = n()) %>% ar$
slice_head(n =1)

# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?

trips %>% mutate( hours = hour(starttime)) %>%
group_by(hours) %>% summarise(count = n(), day_in_month_count = days_in_month(starttime), avg = count / day_in_month_count)

trips %>% mutate( hours = hour(starttime)) %>%
group_by(hours) %>% summarise(count = n()) %>% arrange(desc(count)) %>% slice(1)
193 changes: 190 additions & 3 deletions week1/citibike.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,207 @@
#

# count the number of unique stations

# $ cut -d, -f4 201402-citibike-tripdata.csv | sort -u | wc -l
# 330
# count the number of unique bikes

# $ cut -d, -f12 201402-citibike-tripdata.csv | sort -u | wc -l
#5700
# count the number of trips per day

# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1 | sort | uniq -c
# 12771 2014-02-01
# 13816 2014-02-02
# 2600 2014-02-03
# 8709 2014-02-04
# 2746 2014-02-05
# 7196 2014-02-06
# 8495 2014-02-07
# 5986 2014-02-08
# 4996 2014-02-09
# 6846 2014-02-10
# 8343 2014-02-11
# 8580 2014-02-12
# 876 2014-02-13
# 3609 2014-02-14
# 2261 2014-02-15
# 3003 2014-02-16
# 4854 2014-02-17
# 5140 2014-02-18
# 8506 2014-02-19
# 11792 2014-02-20
# 8680 2014-02-21
# 13044 2014-02-22
# 13324 2014-02-23
# 12922 2014-02-24
# 12830 2014-02-25
# 11188 2014-02-26
# 12036 2014-02-27
# 9587 2014-02-28

# find the day with the most rides

# find the day with the fewest rides
# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1 | sort | uniq -c| sort -r | head -n1
# 13816 2014-02-02

# find the day with the fewest rides
# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1| sort | uniq -c | sort | head -n2
# 1 starttime
# 876 2014-02-13
# find the id of the bike with the most rides
# $ cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | head -n1
# 10 14529

# count the number of rides by gender and birth year
# $ cut -d, -f15,14 201402-citibike-tripdata.csv | sort | uniq -c
# 6717 \N,0
# 9 1899,1
# 68 1900,1
# 11 1901,1
# 5 1907,1
# 4 1910,1
# 1 1913,1
# 3 1917,1
# 1 1921,1
# 32 1922,1
# 5 1926,2
# 2 1927,1
# 1 1932,1
# 7 1932,2
# 10 1933,1
# 21 1934,1
# 14 1935,1
# 31 1936,1
# 24 1937,1
# 70 1938,1
# 5 1938,2
# 24 1939,1
# 19 1939,2
# 83 1940,1
# 1 1940,2
# 148 1941,1
# 16 1941,2
# 173 1942,1
# 9 1942,2
# 108 1943,1
# 22 1943,2
# 277 1944,1
# 34 1944,2
# 171 1945,1
# 43 1945,2
# 424 1946,1
# 30 1946,2
# 391 1947,1
# 60 1947,2
# 664 1948,1
# 143 1948,2
# 624 1949,1
# 101 1949,2
# 738 1950,1
# 152 1950,2
# 6 1951,0
# 1006 1951,1
# 146 1951,2
# 1040 1952,1
# 143 1952,2
# 1474 1953,1
# 301 1953,2
# 1636 1954,1
# 306 1954,2
# 1568 1955,1
# 349 1955,2
# 1777 1956,1
# 542 1956,2
# 1676 1957,1
# 562 1957,2
# 2333 1958,1
# 643 1958,2
# 2281 1959,1
# 539 1959,2
# 2679 1960,1
# 776 1960,2
# 2315 1961,1
# 432 1961,2
# 2808 1962,1
# 833 1962,2
# 3514 1963,1
# 715 1963,2
# 3679 1964,1
# 570 1964,2
# 2957 1965,1
# 687 1965,2
# 3440 1966,1
# 565 1966,2
# 4016 1967,1
# 634 1967,2
# 3931 1968,1
# 545 1968,2
# 4557 1969,1
# 898 1969,2
# 4657 1970,1
# 1079 1970,2
# 4132 1971,1
# 791 1971,2
# 4066 1972,1
# 962 1972,2
# 4097 1973,1
# 877 1973,2
# 4957 1974,1
# 891 1974,2
# 4185 1975,1
# 699 1975,2
# 4557 1976,1
# 1022 1976,2
# 4817 1977,1
# 1140 1977,2
# 5645 1978,1
# 1231 1978,2
# 6433 1979,1
# 1338 1979,2
# 6173 1980,1
# 1488 1980,2
# 6620 1981,1
# 1588 1981,2
# 6244 1982,1
# 1724 1982,2
# 6890 1983,1
# 1889 1983,2
# 7348 1984,1
# 1791 1984,2
# 7043 1985,1
# 2262 1985,2
# 6147 1986,1
# 1962 1986,2
# 5776 1987,1
# 1696 1987,2
# 6449 1988,1
# 1599 1988,2
# 5408 1989,1
# 1435 1989,2
# 4541 1990,1
# 1156 1990,2
# 8 1991,0
# 2377 1991,1
# 689 1991,2
# 1758 1992,1
# 410 1992,2
# 1398 1993,1
# 289 1993,2
# 927 1994,1
# 288 1994,2
# 664 1995,1
# 163 1995,2
# 234 1996,1
# 100 1996,2
# 164 1997,1
# 87 1997,2
# 1 birth year,gender

# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)

# $ cut -d, -f5 201402-citibike-tripdata.csv | grep '[1-9].*[1-9]'| wc -l
# 122406

# compute the average trip duration

# $ awk -F, '{sum+=$1; count++} END {print sum/count}' 201402-citibike-tripdata.csv
# 874.516
8 changes: 4 additions & 4 deletions week1/intro_command_line.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"metadata": {
"collapsed": false
},
Expand All @@ -937,7 +937,7 @@
],
"source": [
"# count trips by gender\n",
"cut -d, -f15 201402-citibike-tripdata.csv | sort | uniq -c"
"cut -d, -f15 201402-citibike-tripdata.ccsv | sort | uniq -c"
]
},
{
Expand Down Expand Up @@ -1012,7 +1012,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": null,
"metadata": {
"collapsed": false
},
Expand All @@ -1030,7 +1030,7 @@
],
"source": [
"# use awk to count trips by gender without having to sort\n",
"awk -F, '{counts[$15]++} END {for (k in counts) print counts[k]\"\\t\" k }' 201402-citibike-tripdata.csv"
"awk -F, '{counts[$15]++} END git 201402-citibike-tripdata.csv"
]
},
{
Expand Down
Loading