########################################
# week1/citibike.R
# Load one month of Citibike trip data and answer the exercise questions.
# Requires: tidyverse, lubridate (loaded at the top of the file).
########################################

# read one month of data
trips <- read_csv('./week1/201402-citibike-tripdata.csv')

# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))

########################################
# answer exercise questions
########################################

# count the number of trips (= rows in the data frame)
print(nrow(trips))

# find the earliest and latest birth years
# birth_year contains "\N" for missing values, so coercing to numeric
# produces NAs (with a warning) which we then drop
birth_years <- na.omit(as.numeric(trips$birth_year))
min(birth_years)
max(birth_years)

# all trips that either start or end on Broadway
filter(trips,
       grepl("Broadway", start_station_name) |
         grepl("Broadway", end_station_name))

# all trips that both start and end on Broadway
filter(trips,
       grepl("Broadway", start_station_name) &
         grepl("Broadway", end_station_name))

# find all unique station names
# use c() to take the union of the two columns; paste() would glue each
# start/end pair into a single combined string, which is not a station name
unique_stations <- unique(c(trips$start_station_name, trips$end_station_name))
unique_stations

# count of trips, mean trip time, and sd of trip time, by gender,
# all in one summarize() call
trips %>%
  group_by(gender) %>%
  summarize(count = n(),
            mean = mean(tripduration),
            std = sd(tripduration))

# the 10 most frequent station-to-station trips
# note: %>% must end a line, never begin one, or the pipeline breaks
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(frequency = n(), .groups = "drop") %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 10)

# top 3 end stations for trips starting from each start station
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(start_station_name) %>%
  arrange(desc(count)) %>%
  slice_head(n = 3)

# top 3 most common station-to-station trips by gender
# count by all three columns at once; a second group_by() call would
# silently discard the station grouping
trips %>%
  count(gender, start_station_name, end_station_name, name = "frequency") %>%
  group_by(gender) %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 3)

# find the day with the most trips
trips %>%
  group_by(date = as.Date(starttime)) %>%
  summarize(frequency = n()) %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 1)

# average number of trips taken during each of the 24 hours of the day:
# total trips in that hour divided by the number of distinct days observed
trips %>%
  mutate(hour = hour(starttime), date = as.Date(starttime)) %>%
  group_by(hour) %>%
  summarize(avg = n() / n_distinct(date)) %>%
  arrange(desc(avg))

# peak hour(s): the hour with the most trips overall
trips %>%
  mutate(hour = hour(starttime)) %>%
  group_by(hour) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1)
+# 11188 2014-02-26 +# 12036 2014-02-27 +# 9587 2014-02-28 + # find the day with the most rides -# find the day with the fewest rides +# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1 | sort | uniq -c| sort -r | head -n1 +# 13816 2014-02-02 +# find the day with the fewest rides +# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1| sort | uniq -c | sort | head -n2 +# 1 starttime +# 876 2014-02-13 # find the id of the bike with the most rides +# $ cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | head -n1 +# 10 14529 # count the number of rides by gender and birth year +# $ cut -d, -f15,14 201402-citibike-tripdata.csv | sort | uniq -c +# 6717 \N,0 +# 9 1899,1 +# 68 1900,1 +# 11 1901,1 +# 5 1907,1 +# 4 1910,1 +# 1 1913,1 +# 3 1917,1 +# 1 1921,1 +# 32 1922,1 +# 5 1926,2 +# 2 1927,1 +# 1 1932,1 +# 7 1932,2 +# 10 1933,1 +# 21 1934,1 +# 14 1935,1 +# 31 1936,1 +# 24 1937,1 +# 70 1938,1 +# 5 1938,2 +# 24 1939,1 +# 19 1939,2 +# 83 1940,1 +# 1 1940,2 +# 148 1941,1 +# 16 1941,2 +# 173 1942,1 +# 9 1942,2 +# 108 1943,1 +# 22 1943,2 +# 277 1944,1 +# 34 1944,2 +# 171 1945,1 +# 43 1945,2 +# 424 1946,1 +# 30 1946,2 +# 391 1947,1 +# 60 1947,2 +# 664 1948,1 +# 143 1948,2 +# 624 1949,1 +# 101 1949,2 +# 738 1950,1 +# 152 1950,2 +# 6 1951,0 +# 1006 1951,1 +# 146 1951,2 +# 1040 1952,1 +# 143 1952,2 +# 1474 1953,1 +# 301 1953,2 +# 1636 1954,1 +# 306 1954,2 +# 1568 1955,1 +# 349 1955,2 +# 1777 1956,1 +# 542 1956,2 +# 1676 1957,1 +# 562 1957,2 +# 2333 1958,1 +# 643 1958,2 +# 2281 1959,1 +# 539 1959,2 +# 2679 1960,1 +# 776 1960,2 +# 2315 1961,1 +# 432 1961,2 +# 2808 1962,1 +# 833 1962,2 +# 3514 1963,1 +# 715 1963,2 +# 3679 1964,1 +# 570 1964,2 +# 2957 1965,1 +# 687 1965,2 +# 3440 1966,1 +# 565 1966,2 +# 4016 1967,1 +# 634 1967,2 +# 3931 1968,1 +# 545 1968,2 +# 4557 1969,1 +# 898 1969,2 +# 4657 1970,1 +# 1079 1970,2 +# 4132 1971,1 +# 791 1971,2 +# 4066 1972,1 +# 962 1972,2 +# 4097 1973,1 +# 877 1973,2 +# 4957 1974,1 +# 891 1974,2 +# 4185 1975,1 +# 699 1975,2 +# 
4557 1976,1 +# 1022 1976,2 +# 4817 1977,1 +# 1140 1977,2 +# 5645 1978,1 +# 1231 1978,2 +# 6433 1979,1 +# 1338 1979,2 +# 6173 1980,1 +# 1488 1980,2 +# 6620 1981,1 +# 1588 1981,2 +# 6244 1982,1 +# 1724 1982,2 +# 6890 1983,1 +# 1889 1983,2 +# 7348 1984,1 +# 1791 1984,2 +# 7043 1985,1 +# 2262 1985,2 +# 6147 1986,1 +# 1962 1986,2 +# 5776 1987,1 +# 1696 1987,2 +# 6449 1988,1 +# 1599 1988,2 +# 5408 1989,1 +# 1435 1989,2 +# 4541 1990,1 +# 1156 1990,2 +# 8 1991,0 +# 2377 1991,1 +# 689 1991,2 +# 1758 1992,1 +# 410 1992,2 +# 1398 1993,1 +# 289 1993,2 +# 927 1994,1 +# 288 1994,2 +# 664 1995,1 +# 163 1995,2 +# 234 1996,1 +# 100 1996,2 +# 164 1997,1 +# 87 1997,2 +# 1 birth year,gender # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +# $ cut -d, -f5 201402-citibike-tripdata.csv | grep '[1-9].*[1-9]'| wc -l +# 122406 # compute the average trip duration + +# $ awk -F, '{sum+=$1; count++} END {print sum/count}' 201402-citibike-tripdata.csv +# 874.516 \ No newline at end of file diff --git a/week1/intro_command_line.ipynb b/week1/intro_command_line.ipynb index 816197c9..f3750b64 100644 --- a/week1/intro_command_line.ipynb +++ b/week1/intro_command_line.ipynb @@ -919,7 +919,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "collapsed": false }, @@ -937,7 +937,7 @@ ], "source": [ "# count trips by gender\n", - "cut -d, -f15 201402-citibike-tripdata.csv | sort | uniq -c" + "cut -d, -f15 201402-citibike-tripdata.ccsv | sort | uniq -c" ] }, { @@ -1012,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": { "collapsed": false }, @@ -1030,7 +1030,7 @@ ], "source": [ "# use awk to count trips by gender without having to sort\n", - "awk -F, '{counts[$15]++} END {for (k in counts) print counts[k]\"\\t\" k }' 201402-citibike-tripdata.csv" + "awk -F, '{counts[$15]++} END git 201402-citibike-tripdata.csv" ] }, { diff 
########################################
# week1/plot_trips.R
# Plot Citibike trip data joined with daily weather.
# Requires: tidyverse, lubridate, scales (loaded at the top of the file).
########################################

# load RData file output by load_trips.R
# assumes it defines `trips` and `weather` data frames -- TODO confirm
load('trips.RData')

########################################
# plot trip data
########################################

# distribution of trip times across all rides: histogram vs. density
ggplot(trips, aes(x = tripduration)) +
  geom_histogram(fill = 'blue', bins = 30) +
  scale_x_log10(labels = comma) +
  ylab('Number of trips')

ggplot(trips, aes(x = tripduration)) +
  geom_density(fill = 'grey') +
  scale_x_log10(labels = comma)

# distribution of trip times by rider type, indicated with color and fill
ggplot(trips, aes(x = tripduration, color = usertype, fill = usertype)) +
  geom_histogram(bins = 30) +
  scale_x_log10(labels = comma) +
  ylab('Number of trips') +
  facet_grid(~ usertype)

# use alpha instead of a hard-coded fill so the per-usertype fill
# aesthetic is not silently overridden
ggplot(trips, aes(x = tripduration, color = usertype, fill = usertype)) +
  geom_density(alpha = 0.5) +
  scale_x_log10(labels = comma) +
  facet_grid(~ usertype)

# total number of trips on each day in the dataset
# count exact daily totals rather than binning dates into a histogram
trips %>%
  mutate(date = as.Date(starttime)) %>%
  count(date) %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  scale_y_continuous(labels = comma) +
  scale_x_date(date_labels = "%Y-%m-%d") +
  ylab('Number of trips')

# total number of trips by age and gender
# fill must be mapped inside aes(); a quoted 'gender' outside aes()
# is just a literal color name and produces no grouping
trips %>%
  mutate(age = year(ymd) - as.numeric(birth_year)) %>%
  ggplot(aes(x = age, fill = gender)) +
  geom_histogram(bins = 30)

# ratio of male to female trips by age, via pivot_wider()
trips %>%
  mutate(age = year(ymd) - as.numeric(birth_year)) %>%
  group_by(age, gender) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = gender, values_from = count) %>%
  ggplot(aes(x = age, y = Male / Female)) +
  geom_point() +
  scale_y_log10()

########################################
# plot weather data
########################################

# minimum temperature over each day
ggplot(weather, aes(x = date, y = tmin)) +
  geom_point()

# minimum and maximum temperature over each day, distinguished by color,
# after reshaping tmin/tmax into long format
weather %>%
  pivot_longer(names_to = "min_max", values_to = "temp",
               -c(date, prcp, snwd, snow, ymd)) %>%
  ggplot(aes(x = date, y = temp, color = min_max)) +
  geom_point()

########################################
# plot trip and weather data
########################################

# join trips and weather on the shared ymd column
trips_with_weather <- inner_join(trips, weather, by = "ymd")

# number of trips vs. minimum temperature, one point per day
trips_with_weather %>%
  group_by(ymd, tmin) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = tmin, y = count)) +
  geom_point() +
  xlab("minimum temperature") +
  ylab("number of trips")

# split by whether there was substantial precipitation,
# defined here as prcp above its mean
trips_with_weather %>%
  mutate(substantial_prcp = prcp > mean(prcp)) %>%
  group_by(ymd, tmin, substantial_prcp) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = tmin, y = count)) +
  geom_point() +
  xlab("minimum temperature") +
  ylab("number of trips") +
  facet_wrap(~ substantial_prcp)

# same plot with a smoothed fit on top
# (axis labels follow the aesthetics: tmin on x, trip count on y)
trips_with_weather %>%
  mutate(substantial_prcp = prcp > mean(prcp)) %>%
  group_by(ymd, tmin, substantial_prcp) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = tmin, y = count)) +
  geom_point() +
  geom_smooth() +
  xlab("minimum temperature") +
  ylab("number of trips") +
  facet_wrap(~ substantial_prcp)

# average and standard deviation of trips by hour of the day
trips %>%
  group_by(hour = hour(starttime), ymd) %>%
  summarise(n_trips = n(), .groups = "drop") %>%
  group_by(hour) %>%
  summarise(avg = mean(n_trips), std = sd(n_trips))

# plot the above; alpha is a fixed value, so it belongs outside aes()
trips %>%
  group_by(hour = hour(starttime), ymd) %>%
  summarise(n_trips = n(), .groups = "drop") %>%
  group_by(hour) %>%
  summarise(avg = mean(n_trips), std = sd(n_trips)) %>%
  ggplot(aes(x = hour, y = avg)) +
  geom_ribbon(aes(ymin = avg - std, ymax = avg + std), alpha = 0.25) +
  geom_line(color = "red")
# repeat the hourly averages, split by day of the week
# (wday() from lubridate labels each date Mon..Sun)
trips %>%
  mutate(day_week = wday(ymd, label = TRUE)) %>%
  group_by(day_week, hour = hour(starttime), ymd) %>%
  summarise(n_trips = n(), .groups = "drop") %>%
  group_by(day_week, hour) %>%
  summarise(avg = mean(n_trips), std = sd(n_trips), .groups = "drop") %>%
  ggplot(aes(x = hour, y = avg)) +
  geom_ribbon(aes(ymin = avg - std, ymax = avg + std), alpha = 0.5) +
  geom_line() +
  facet_wrap(~ day_week)

---
title: "trips_model"
date: '`r Sys.time()`'
output:
  html_document:
    #code_folding: hide
    number_sections: yes
    toc: yes
    toc_depth: 3
---

```{r setup, include=FALSE}
library(scales)
library(tidyverse)
library(knitr)

# set plot theme
theme_set(theme_bw())
```

```{r load-data, include=FALSE}
# chunk labels must be unique within an Rmd file or knitting aborts,
# so each chunk below gets its own descriptive label
set.seed(42)

# read relative to the document's directory so this knits on any machine
# (avoid hard-coded absolute paths)
data <- read_tsv("trips_per_day.tsv")
n <- nrow(data)
shuffled_indices <- sample(n)

# hold out 10% of the days as a final test set
n_test <- floor(0.1 * n)
test_idx <- shuffled_indices[1:n_test]
test_set <- data[test_idx, ]

remaining_idx <- shuffled_indices[(n_test + 1):n]
remaining_data <- data[remaining_idx, ]

# assign the remaining days round-robin to 5 cross-validation folds
num_folds <- 5
trips_per_day <- remaining_data %>%
  mutate(fold = ((row_number() - 1) %% num_folds) + 1)

head(trips_per_day)
```

```{r cross-validate, include=FALSE}
# fit a polynomial in tmin for each degree k, estimating generalization
# error with 5-fold cross-validation
K <- 1:8
avg_validate_err <- c()
se_validate_err <- c()
for (k in K) {
  validate_err <- c()
  for (f in 1:num_folds) {
    # fit on the training folds
    trips_per_day_train <- filter(trips_per_day, fold != f)
    model <- lm(num_trips ~ poly(tmin, k, raw = TRUE),
                data = trips_per_day_train)

    # evaluate RMSE on the held-out fold
    trips_per_day_validate <- filter(trips_per_day, fold == f)
    validate_err[f] <- sqrt(mean((predict(model, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
  }

  # average validation error across folds and its standard error
  avg_validate_err[k] <- mean(validate_err)
  se_validate_err[k] <- sd(validate_err) / sqrt(num_folds)
}
```

```{r plot-cv-error, include=FALSE}
# plot the validation error, highlighting the degree with the lowest
# average error; x-axis breaks match the degrees actually tried
plot_data <- data.frame(K, avg_validate_err, se_validate_err)
ggplot(plot_data, aes(x = K, y = avg_validate_err)) +
  geom_pointrange(aes(ymin = avg_validate_err - se_validate_err,
                      ymax = avg_validate_err + se_validate_err,
                      color = avg_validate_err == min(avg_validate_err))) +
  geom_line(color = "red") +
  scale_x_continuous(breaks = K) +
  theme(legend.position = "none") +
  xlab('Polynomial Degree') +
  ylab('RMSE on validation data')
```

```{r train-validate-split, include=FALSE}
# simple 80/20 train/validation split for the feature-selection models
n <- nrow(trips_per_day)
train_size <- floor(0.8 * n)

# randomly shuffle row indices
shuffled_indices <- sample(n)

# index split
train_indices <- shuffled_indices[1:train_size]
validation_indices <- shuffled_indices[(train_size + 1):n]

# create the two datasets
trips_per_day_train <- trips_per_day[train_indices, ]
trips_per_day_validate <- trips_per_day[validation_indices, ]
```

```{r model-all-features, include=FALSE}
# model with main effects only: prcp snwd snow tmax tmin
model_with_all_features <- trips_per_day_train |> select(-ymd, -date, -fold)

model_with_all <- lm(num_trips ~ ., data = model_with_all_features)
model_with_all

validate_err_with_all <- sqrt(mean((predict(model_with_all, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
validate_err_with_all
```

```{r model-all-interactions, include=FALSE}
# model with all pairwise interactions between the features
model_with_all_interactions <- lm(num_trips ~ .^2,
                                  data = model_with_all_features)
model_with_all_interactions

validate_err_with_interactions_all <- sqrt(mean((predict(model_with_all_interactions, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
validate_err_with_interactions_all
```

```{r model-tmin-interactions, include=FALSE}
# main effects plus interactions with tmin only
# (snwd main effect included so the model stays hierarchical alongside
# its snwd:tmin interaction; single assignment -- the stray
# `<- model <-` double assignment is gone)
model_with_all_interactions_with_tmin <- lm(
  num_trips ~ prcp + snwd + snow + tmax + tmin +
    prcp:tmin + snwd:tmin + snow:tmin + tmax:tmin,
  data = model_with_all_features
)
model_with_all_interactions_with_tmin

validate_err_with_interactions_with_min_all <- sqrt(mean((predict(model_with_all_interactions_with_tmin, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
validate_err_with_interactions_with_min_all
```

```{r model-poly4-tmin-interactions, include=FALSE}
# degree-4 polynomial in tmin plus linear interactions with tmin,
# completing the formula that was left unfinished
model_with_poly4_and_linear_interactions <- lm(
  num_trips ~ poly(tmin, 4, raw = TRUE) + prcp + snwd + snow + tmax +
    prcp:tmin + snwd:tmin + snow:tmin + tmax:tmin,
  data = model_with_all_features
)

# predict on the validation set
predictions <- predict(model_with_poly4_and_linear_interactions, trips_per_day_validate)

# compute validation RMSE
validate_err_poly4_with_interactions <- sqrt(
  mean((predictions - trips_per_day_validate$num_trips)^2)
)

# print validation error
validate_err_poly4_with_interactions
```