diff --git a/week1/citibike.R b/week1/citibike.R
index ad01de1d3..2dead7071 100644
--- a/week1/citibike.R
+++ b/week1/citibike.R
@@ -6,16 +6,6 @@ library(lubridate)
########################################
# read one month of data
-trips <- read_csv('201402-citibike-tripdata.csv')
-
-# replace spaces in column names with underscores
-names(trips) <- gsub(' ', '_', names(trips))
-
-# convert dates strings to dates
-# trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime))
-
-# recode gender as a factor 0->"Unknown", 1->"Male", 2->"Female"
-trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unknown","Male","Female")))
########################################
@@ -23,26 +13,73 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
########################################
# count the number of trips (= rows in the data frame)
+ summarize(trips, count = n())
# find the earliest and latest birth years (see help for max and min to deal with NAs)
+max(trips$birth_year, na.rm = TRUE)
+# had an issue where min failed because missing values were encoded as \\N rather than NA, so converted them so min could work
+# a cleaner solution is as.numeric(trips$birth_year), which automatically casts all non-numeric values to NA, making the recode below unnecessary
+trips <-
+  mutate(trips, birth_year
+         = if_else(!str_detect(birth_year,
+                               "^[0-9]+$"), NA_character_, birth_year))
+min(trips$birth_year, na.rm = TRUE)
# use filter and grepl to find all trips that either start or end on broadway
+filter(trips, grepl("Broadway", start_station_name)| grepl("Broadway", end_station_name))
# do the same, but find all trips that both start and end on broadway
+filter(trips, grepl("Broadway", start_station_name) & grepl("Broadway", end_station_name))
# find all unique station names
+union(trips$start_station_name, trips$end_station_name)
# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments
+summarize(group_by(trips, gender),
+ count = n(), mean = mean(tripduration),
+ sd = sd(tripduration))
# find the 10 most frequent station-to-station trips
+trips |> group_by(start_station_name, end_station_name) |>
+ summarize(count = n()) |>
+ arrange(desc(count)) |>
+ head(n=10)
# find the top 3 end stations for trips starting from each start station
+trips |> group_by(start_station_name, end_station_name) |>
+ summarize(count = n()) |>
+ group_by(start_station_name) |>
+ arrange(desc(count)) |>
+ mutate(rank = row_number()) |>
+ filter(rank <= 3) |>
+ arrange(start_station_name)
+#Jake's solution: trips |>
+ #count(start_station_name, end_station_name) |>
+ #group_by(start_station_name) |>
+ #arrange(desc(n)) |>
+ #slice(1:3)
+
+
# find the top 3 most common station-to-station trips by gender
+trips |> group_by(start_station_name, end_station_name, gender) |>
+ summarize(count = n()) |>
+ arrange(desc(count))|>
+ group_by(gender) |>
+ mutate(rank = row_number()) |>
+ filter(rank <=3) |>
+ arrange(gender)
+
# find the day with the most trips
+trips |> group_by(as_date(starttime)) |> summarize(count = n()) |> arrange(desc(count)) |> head(n=1)
+
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
+
# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?
+trips |> group_by(hour(starttime)) |> summarize(count =n(), average = count/n_distinct(as_date(starttime)))
+
+trips |> group_by(hour(starttime)) |> summarize(count =n(), average = count/n_distinct(as_date(starttime))) |> arrange(average) |> tail(n=1)
diff --git a/week1/citibike.sh b/week1/citibike.sh
index 25604f545..65d9272be 100755
--- a/week1/citibike.sh
+++ b/week1/citibike.sh
@@ -1,23 +1,23 @@
#!/bin/bash
#
-# add your solution after each of the 10 comments below
+# add your solution after each of the 10 comments below
#
# count the number of unique stations
-
+cut -d, -f4 201402-citibike-tripdata.csv | tail -n +2 | sort | uniq | wc -l
# count the number of unique bikes
-
+cut -d, -f12 201402-citibike-tripdata.csv | tail -n +2 | sort | uniq | wc -l
# count the number of trips per day
-
+cut -d, -f2 201402-citibike-tripdata.csv | tail -n +2 | cut -d' ' -f1 | sort | uniq -c
# find the day with the most rides
-
+cut -d, -f2 201402-citibike-tripdata.csv | tail -n +2 | cut -d' ' -f1 | sort | uniq -c | sort -rn | head -1
# find the day with the fewest rides
-
+cut -d, -f2 201402-citibike-tripdata.csv | tail -n +2 | cut -d' ' -f1 | sort | uniq -c | sort -n | head -1
# find the id of the bike with the most rides
-
-# count the number of rides by gender and birth year
-
+cut -d, -f12 201402-citibike-tripdata.csv | tail -n +2 | sort | uniq -c | sort -rn | head -1
+# count the number of rides by gender and birth year
+cut -d, -f14,15 201402-citibike-tripdata.csv | tail -n +2 | sort | uniq -c
# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)
-
-
+cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].*&.*[0-9].*' | sort | uniq -c | awk '{sum += $1} END {print sum}'
# compute the average trip duration
+awk -F, 'NR > 1 {gsub(/"/, "", $1); sum += $1; count++} END {print sum / count}' 201402-citibike-tripdata.csv
\ No newline at end of file
diff --git a/week1/intro_to_r.ipynb b/week1/intro_to_r.ipynb
index c2fabd69b..20871c509 100644
--- a/week1/intro_to_r.ipynb
+++ b/week1/intro_to_r.ipynb
@@ -22,7 +22,11 @@
{
"cell_type": "code",
"execution_count": 2,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [],
"source": [
"# load all packages in the tidyverse, although we'll use only dplyr here\n",
@@ -38,8 +42,12 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {},
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -75,13 +83,17 @@
"source": [
"# vectors hold values of the same type\n",
"# c() is the \"concatenation\" operator\n",
- "c(1,2,3)"
+ "c(1,2,3)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -122,7 +134,11 @@
{
"cell_type": "code",
"execution_count": 4,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -169,7 +185,11 @@
{
"cell_type": "code",
"execution_count": 5,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -263,7 +283,11 @@
{
"cell_type": "code",
"execution_count": 6,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -301,7 +325,11 @@
{
"cell_type": "code",
"execution_count": 7,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -322,7 +350,11 @@
{
"cell_type": "code",
"execution_count": 8,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -371,7 +403,11 @@
{
"cell_type": "code",
"execution_count": 9,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -488,7 +524,11 @@
{
"cell_type": "code",
"execution_count": 10,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -542,7 +582,11 @@
{
"cell_type": "code",
"execution_count": 11,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -565,7 +609,11 @@
{
"cell_type": "code",
"execution_count": 12,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -592,7 +640,11 @@
{
"cell_type": "code",
"execution_count": 13,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -628,7 +680,11 @@
{
"cell_type": "code",
"execution_count": 14,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -778,7 +834,11 @@
{
"cell_type": "code",
"execution_count": 15,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -820,7 +880,11 @@
{
"cell_type": "code",
"execution_count": 16,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -868,8 +932,12 @@
},
{
"cell_type": "code",
- "execution_count": 17,
- "metadata": {},
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -906,13 +974,17 @@
"source": [
"# nicer syntax for the same thing\n",
"# note: the second entry of filter is a logical vector\n",
- "filter(df, Sepal.Length >= 5)"
+ "filter(df, Sepal.Length >= 5)\n"
]
},
{
"cell_type": "code",
- "execution_count": 18,
- "metadata": {},
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -945,13 +1017,17 @@
],
"source": [
"# filter ANDs conditions when given multiple arguments\n",
- "filter(df, Sepal.Length >= 5, Petal.Length <= 1.4)"
+ "filter(df, Sepal.Length >= 5, Petal.Length <= 1.4)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1000,7 +1076,11 @@
{
"cell_type": "code",
"execution_count": 20,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1043,7 +1123,11 @@
{
"cell_type": "code",
"execution_count": 21,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1100,7 +1184,11 @@
{
"cell_type": "code",
"execution_count": 22,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1165,7 +1253,11 @@
{
"cell_type": "code",
"execution_count": 23,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1216,7 +1308,11 @@
{
"cell_type": "code",
"execution_count": 24,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1274,7 +1370,11 @@
{
"cell_type": "code",
"execution_count": 25,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1325,7 +1425,11 @@
{
"cell_type": "code",
"execution_count": 26,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1390,7 +1494,10 @@
"cell_type": "code",
"execution_count": 27,
"metadata": {
- "collapsed": true
+ "collapsed": true,
+ "vscode": {
+ "languageId": "r"
+ }
},
"outputs": [],
"source": [
@@ -1408,7 +1515,11 @@
{
"cell_type": "code",
"execution_count": 28,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1474,7 +1585,11 @@
{
"cell_type": "code",
"execution_count": 29,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1510,7 +1625,11 @@
{
"cell_type": "code",
"execution_count": 30,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1562,7 +1681,11 @@
{
"cell_type": "code",
"execution_count": 31,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -1602,7 +1725,11 @@
{
"cell_type": "code",
"execution_count": 32,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1648,7 +1775,11 @@
{
"cell_type": "code",
"execution_count": 33,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1694,7 +1825,11 @@
{
"cell_type": "code",
"execution_count": 34,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1743,7 +1878,11 @@
{
"cell_type": "code",
"execution_count": 35,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1788,7 +1927,11 @@
{
"cell_type": "code",
"execution_count": 36,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -1845,7 +1988,10 @@
"cell_type": "code",
"execution_count": 37,
"metadata": {
- "scrolled": true
+ "scrolled": true,
+ "vscode": {
+ "languageId": "r"
+ }
},
"outputs": [
{
@@ -1938,7 +2084,10 @@
"cell_type": "code",
"execution_count": 38,
"metadata": {
- "scrolled": false
+ "scrolled": false,
+ "vscode": {
+ "languageId": "r"
+ }
},
"outputs": [
{
@@ -1996,7 +2145,11 @@
{
"cell_type": "code",
"execution_count": 39,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2018,7 +2171,11 @@
{
"cell_type": "code",
"execution_count": 40,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2049,7 +2206,11 @@
{
"cell_type": "code",
"execution_count": 41,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2115,7 +2276,11 @@
{
"cell_type": "code",
"execution_count": 42,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2181,7 +2346,11 @@
{
"cell_type": "code",
"execution_count": 43,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2246,7 +2415,11 @@
{
"cell_type": "code",
"execution_count": 44,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2277,7 +2450,11 @@
{
"cell_type": "code",
"execution_count": 45,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2351,7 +2528,11 @@
{
"cell_type": "code",
"execution_count": 46,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2387,7 +2568,11 @@
{
"cell_type": "code",
"execution_count": 47,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2455,7 +2640,10 @@
"cell_type": "code",
"execution_count": 48,
"metadata": {
- "scrolled": true
+ "scrolled": true,
+ "vscode": {
+ "languageId": "r"
+ }
},
"outputs": [
{
@@ -2531,7 +2719,11 @@
{
"cell_type": "code",
"execution_count": 49,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2569,7 +2761,11 @@
{
"cell_type": "code",
"execution_count": 50,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2609,7 +2805,11 @@
{
"cell_type": "code",
"execution_count": 51,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2645,7 +2845,10 @@
"cell_type": "code",
"execution_count": 52,
"metadata": {
- "scrolled": false
+ "scrolled": false,
+ "vscode": {
+ "languageId": "r"
+ }
},
"outputs": [
{
@@ -2713,7 +2916,11 @@
{
"cell_type": "code",
"execution_count": 53,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2760,7 +2967,11 @@
{
"cell_type": "code",
"execution_count": 54,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2841,7 +3052,11 @@
{
"cell_type": "code",
"execution_count": 55,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2906,7 +3121,11 @@
{
"cell_type": "code",
"execution_count": 56,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2935,7 +3154,11 @@
{
"cell_type": "code",
"execution_count": 57,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -2982,7 +3205,11 @@
{
"cell_type": "code",
"execution_count": 58,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -3046,7 +3273,11 @@
{
"cell_type": "code",
"execution_count": 59,
- "metadata": {},
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
"outputs": [
{
"data": {
@@ -3090,7 +3321,10 @@
"cell_type": "code",
"execution_count": 60,
"metadata": {
- "scrolled": true
+ "scrolled": true,
+ "vscode": {
+ "languageId": "r"
+ }
},
"outputs": [
{
diff --git a/week1/load_trips.R b/week1/load_trips.R
index 6333d786e..cbe76def7 100644
--- a/week1/load_trips.R
+++ b/week1/load_trips.R
@@ -11,7 +11,7 @@ parse_datetime <- function(s, format="%Y-%m-%d %H:%M:%S") {
########################################
# load each month of the trip data into one big data frame
-csvs <- Sys.glob('*-tripdata.csv')
+csvs <- Sys.glob('./week1/*-tripdata.csv')
trips <- data.frame()
for (csv in csvs) {
print(csv)
@@ -44,7 +44,7 @@ trips <- mutate(trips, gender=factor(gender, levels=c(0,1,2), labels=c("Unknown"
# https://www.ncei.noaa.gov/orders/cdo/2992179.csv
# ordered from
# http://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USW00094728/detail
-weather <- read.table('weather.csv', header=T, sep=',')
+weather <- read.table('./week1/weather.csv', header=T, sep=',')
# extract just a few columns, lowercase column names, and parse dates
weather <- select(weather, DATE, PRCP, SNWD, SNOW, TMAX, TMIN)
diff --git a/week1/plot_trips.R b/week1/plot_trips.R
index 4f25437ba..f87c7a656 100644
--- a/week1/plot_trips.R
+++ b/week1/plot_trips.R
@@ -10,22 +10,31 @@ library(scales)
theme_set(theme_bw())
# load RData file output by load_trips.R
-load('trips.RData')
+load('./week1/trips.RData')
########################################
# plot trip data
########################################
-# plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
+# (compare a histogram vs. a density plot)
+ggplot(trips, aes(x = tripduration)) +
+ geom_histogram(bins = 30) + scale_x_log10(label = comma)
+
+ggplot(trips, aes(x=tripduration)) + geom_density(fill = "grey") + scale_x_log10(label = comma)
# plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)
+ggplot(trips, aes(x = tripduration, fill = gender)) +
+ geom_histogram(bins = 30) + scale_x_log10(label = comma)
-# plot the total number of trips on each day in the dataset
+ggplot(trips, aes(x=tripduration, fill = gender)) + geom_density() + scale_x_log10(label = comma)
+# plot the total number of trips on each day in the dataset
+trips |> mutate(date = as_date(starttime))|> ggplot(aes(x = date)) + geom_histogram()
# plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
-
-# plot the ratio of male to female trips (on the y axis) by age (on the x axis)
+trips |> group_by(birth_year) |> ggplot(aes(x = year(as_date(starttime)) - birth_year, fill = gender)) + geom_histogram() # nolint: line_length_linter.
+# plot the ratio of male to female trip (on the y axis) by age (on the x axis)
+trips |> mutate(age = year(as_date(starttime)) - birth_year) |> count(age, gender) |> pivot_wider(names_from = gender, values_from = n) |> mutate(ratio = Male / Female) |> ggplot(aes(x = age, y = ratio)) + geom_line() # nolint: line_length_linter.
# hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
# (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
@@ -44,19 +53,40 @@ load('trips.RData')
# join trips and weather
trips_with_weather <- inner_join(trips, weather, by="ymd")
-
+trips_with_weather |> head(n=30) |> View()
# plot the number of trips as a function of the minimum temperature, where each point represents a day
# you'll need to summarize the trips and join to the weather data to do this
-
+trips_with_weather |> group_by(date) |> summarize(weather = mean(tmin), count = n()) |> ggplot(aes(x=weather, y=count)) + geom_point()
# repeat this, splitting results by whether there was substantial precipitation or not
# you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this
-
+trips_with_weather |> mutate(subprec = if_else(prcp > mean(prcp), TRUE, FALSE)) |> group_by(date, subprec) |> summarize(weather = mean(tmin), count = n()) |> ggplot(aes(x=weather, y=count, color = subprec)) + geom_point()
# add a smoothed fit on top of the previous plot, using geom_smooth
-
+trips_with_weather |>
+ mutate(subprec = if_else(prcp > mean(prcp), TRUE, FALSE)) |>
+ group_by(date, subprec) |>
+ summarize(weather = mean(tmin), count = n()) |>
+ ggplot(aes(x=weather, y=count, color = subprec)) + geom_point() +
+ geom_smooth(method = "lm", se = FALSE)
# compute the average number of trips and standard deviation in number of trips by hour of the day
# hint: use the hour() function from the lubridate package
-
+trips |> mutate(date = as_date(starttime), hour = hour(starttime)) |>
+ count(date, hour) |> group_by(hour) |>
+ summarize(average =mean(n), std = sd(n)) |> ggplot(aes(x=hour, y=average)) +
+ geom_line() +
+ geom_ribbon(aes(ymin = average -std, ymax = average + std), alpha=0.25)
# plot the above
# repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
# hint: use the wday() function from the lubridate package
+trips |> mutate(date = as_date(starttime), hour = hour(starttime), wday = wday(date)) |>
+  count(date, hour, wday) |> group_by(hour, wday) |>
+  summarize(average = mean(n), std = sd(n)) |> ggplot(aes(x = hour, y = average)) +
+  geom_line() +
+  geom_ribbon(aes(ymin = average - std, ymax = average + std), alpha = 0.25) +
+  facet_wrap(~wday)
+
+trips |> mutate(date = as_date(starttime), hour = hour(starttime), weekend = wday(date) %in% c(1, 7)) |>
+  count(date, hour, weekend) |> group_by(hour, weekend) |>
+  summarize(average = mean(n), std = sd(n)) |> ggplot(aes(x = hour, y = average)) +
+  geom_line() +
+  geom_ribbon(aes(ymin = average - std, ymax = average + std), alpha = 0.25) + facet_wrap(~weekend)
diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html
new file mode 100644
index 000000000..da3dab5fa
--- /dev/null
+++ b/week2/diamond-sizes.html
@@ -0,0 +1,408 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Diamond sizes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
We have data about 53940 diamonds. Only 126 are larger than 2.5
+carats. The distribution of the remainder is shown below:
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/week3/README.html b/week3/README.html
new file mode 100644
index 000000000..7fa99884d
--- /dev/null
+++ b/week3/README.html
@@ -0,0 +1,334 @@
+
+
+
+
+
+
+ MovieLens 10M/100k Data Set README
+
+
+
+ Summary
+
+
+ This data set contains 10000054 ratings and 95580 tags
+ applied to 10681 movies by 71567 users of the
+ online movie recommender service MovieLens.
+
+
+ Users were selected at random for inclusion. All users selected had rated
+ at least 20 movies. Unlike previous MovieLens data sets, no demographic
+ information is included. Each user is represented by an id, and no other
+ information is provided.
+
+
+
+ The data are contained in three files, movies.dat,
+ ratings.dat and tags.dat.
+ Also included are scripts for generating subsets of the data to support five-fold
+ cross-validation of rating predictions. More details about the contents and use
+ of all these files follows.
+
+
+
+ This and other GroupLens data sets are publicly available for download at
+ GroupLens Data Sets.
+
+
+ Usage License
+
+
+ Neither the University of Minnesota nor any of the researchers
+ involved can guarantee the correctness of the data, its suitability
+ for any particular purpose, or the validity of results based on the
+ use of the data set. The data set may be used for any research
+ purposes under the following conditions:
+
+
+ - The user may not state or imply any endorsement from the
+ University of Minnesota or the GroupLens Research Group.
+
+ - The user must acknowledge the use of the data set in
+ publications resulting from the use of the data set (see below
+ for citation information).
+
+ - The user may not redistribute the data without separate
+ permission.
+
+ - The user may not use this information for any commercial or
+ revenue-bearing purposes without first obtaining permission
+ from a faculty member of the GroupLens Research Project at the
+ University of Minnesota.
+
+
+ The executable software scripts are provided "as is" without warranty
+ of any kind, either expressed or implied, including, but not limited to,
+ the implied warranties of merchantability and fitness for a particular purpose.
+ The entire risk as to the quality and performance of them is with you.
+ Should the program prove defective, you assume the cost of all
+ necessary servicing, repair or correction.
+
+
+ In no event shall the University of Minnesota, its affiliates or employees
+ be liable to you for any damages arising out of the use or inability to use
+ these programs (including but not limited to loss of data or data being
+ rendered inaccurate).
+
+
+
+ If you have any further questions or comments, please email grouplens-info
+
+
+
+ Citation
+
+
+ To acknowledge use of the dataset in publications, please cite the
+ following paper:
+
+
+ F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
+ History and Context. ACM Transactions on Interactive Intelligent
+ Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
+ DOI=http://dx.doi.org/10.1145/2827872
+
+
+
+ Acknowledgements
+
+
+ Thanks to Rich Davies for generating the data set.
+
+
+
+ Further Information About GroupLens
+
+
+ GroupLens is a research group in the
+ Department of Computer Science and Engineering
+ at the University of Minnesota. Since its
+ inception in 1992, GroupLens' research projects have explored a variety of fields
+ including:
+
+
+ - Information Filtering
+ - Recommender Systems
+ - Online Communities
+ - Mobile and Ubiquitious Technologies
+ - Digital Libraries
+ - Local Geographic Information Systems.
+
+
+ GroupLens Research operates a movie recommender based on
+ collaborative filtering, MovieLens,
+ which is the source of these data.
+
+
+
+ Content and Use of Files
+
+
+
+ Character Encoding
+
+
+ The three data files are encoded as
+ UTF-8. This is a departure
+ from previous MovieLens data sets, which used different character encodings.
+ If accented characters in movie titles or tag values (e.g. Misérables, Les (1995))
+ display incorrectly, make sure that any program reading the data, such as a
+ text editor, terminal, or script, is configured for UTF-8.
+
+
+
+ User Ids
+
+
+ Movielens users were selected at random for inclusion. Their ids have been
+ anonymized.
+
+
+ Users were selected separately for inclusion
+ in the ratings and tags data sets, which implies that user ids may appear in
+ one set but not the other.
+
+
+ The anonymized values are consistent between the ratings and tags data files.
+ That is, user id n, if it appears in both files, refers to the same
+ real MovieLens user.
+
+
+
+ Ratings Data File Structure
+
+
+ All ratings are contained in the file ratings.dat. Each line of this
+ file represents one rating of one movie by one user, and has the following format:
+
+
+ UserID::MovieID::Rating::Timestamp
+
+
+ The lines within this file are ordered first by UserID, then, within user,
+ by MovieID.
+
+
+ Ratings are made on a 5-star scale, with half-star increments.
+
+
+ Timestamps represent
+ seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
+
+
+
+ Tags Data File Structure
+
+
+ All tags are contained in the file tags.dat. Each line of this
+ file represents one tag applied to one movie by one user, and has
+ the following format:
+
+
+ UserID::MovieID::Tag::Timestamp
+
+
+ The lines within this file are ordered first by UserID, then, within user,
+ by MovieID.
+
+
+ Tags are user
+ generated metadata about movies. Each tag is typically a single word, or
+ short phrase. The meaning, value and purpose of a particular tag is
+ determined by each user.
+
+
+ Timestamps represent
+ seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
+
+
+
+ Movies Data File Structure
+
+
+ Movie information is contained in the file movies.dat.
+ Each line of this file represents one movie, and has the following format:
+
+
+ MovieID::Title::Genres
+
+
+ MovieID is the real MovieLens id.
+
+
+ Movie titles, by policy, should be entered identically to those
+ found in IMDB, including year of release.
+ However, they are entered manually, so errors and inconsistencies may exist.
+
+
+ Genres are a pipe-separated list, and are selected from the following:
+
+
+ - Action
+ - Adventure
+ - Animation
+ - Children's
+ - Comedy
+ - Crime
+ - Documentary
+ - Drama
+ - Fantasy
+ - Film-Noir
+ - Horror
+ - Musical
+ - Mystery
+ - Romance
+ - Sci-Fi
+ - Thriller
+ - War
+ - Western
+
+
+
+ Cross-Validation Subset Generation Scripts
+
+
+ A Unix shell script, split_ratings.sh, is provided that, if desired,
+ can be used to split the ratings data for five-fold cross-validation
+ of rating predictions. It depends on a second script, allbut.pl, which
+ is also included and is written in Perl. They should run without modification
+ under Linux, Mac OS X, Cygwin or other Unix like systems.
+
+
+ Running split_ratings.sh will use ratings.dat
+ as input, and produce the fourteen output files described below. Multiple
+ runs of the script will produce identical results.
+
+
+
+ | File Names |
+ Description |
+
+
+
+ r1.train, r2.train, r3.train, r4.train, r5.train
+ r1.test, r2.test, r3.test, r4.test, r5.test
+ |
+
+ The data sets r1.train and r1.test through r5.train and r5.test
+ are 80%/20% splits of the ratings data into training and test data.
+ Each of r1, ..., r5 have disjoint test sets; this if for
+ 5 fold cross validation (where you repeat your experiment
+ with each training and test set and average the results).
+ |
+
+
+
+ ra.train, rb.train
+ ra.test, rb.test
+ |
+
+ The data sets ra.train, ra.test, rb.train, and rb.test
+ split the ratings data into a training set and a test set with
+ exactly 10 ratings per user in the test set. The sets
+ ra.test and rb.test are disjoint.
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/week3/allbut.pl b/week3/allbut.pl
new file mode 100644
index 000000000..a22afad08
--- /dev/null
+++ b/week3/allbut.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/env perl
+
+# get args
+if (@ARGV < 3) {
+ print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n";
+ exit 1;
+}
+$basename = shift;
+$start = shift;
+$stop = shift;
+$maxtest = shift;
+
+# open files
+open( TESTFILE, ">$basename.test" ) or die "Cannot open $basename.test for writing\n";
+open( BASEFILE, ">$basename.train" ) or die "Cannot open $basename.train for writing\n";
+
+# init variables
+$testcnt = 0;
+
+while (<>) {
+ ($user) = split /::/, $_, 2;
+ if (! defined $ratingcnt{$user}) {
+ $ratingcnt{$user} = 1;
+ } else {
+ ++$ratingcnt{$user};
+ }
+ if (($testcnt < $maxtest || $maxtest <= 0)
+ && $ratingcnt{$user} >= $start && $ratingcnt{$user} <= $stop) {
+ ++$testcnt;
+ print TESTFILE;
+ }
+ else {
+ print BASEFILE;
+ }
+}
diff --git a/week3/babyweights.txt b/week3/babyweights.txt
index 876e90517..cd4636b38 100644
--- a/week3/babyweights.txt
+++ b/week3/babyweights.txt
@@ -1,4 +1,4 @@
-"bwt" "gestation" "parity" "age" "height" "weight" "smoke"
+"colname" "bwt" "gestation" "parity" "age" "height" "weight" "smoke"
"1" 120 284 0 27 62 100 0
"2" 113 282 0 33 64 135 0
"3" 128 279 0 28 64 115 1
diff --git a/week3/download_movielens.sh b/week3/download_movielens.sh
index 1fc6caba8..f949f4f4f 100755
--- a/week3/download_movielens.sh
+++ b/week3/download_movielens.sh
@@ -20,3 +20,5 @@ fi
[ -f ratings.csv ] || cat ratings.dat | sed 's/::/,/g' > ratings.csv
[ -f movies.tsv ] || cat movies.dat | sed 's/::/ /g' > movies.tsv
+
+
diff --git a/week3/movielens.Rmd b/week3/movielens.Rmd
index 78a442d9c..c2462b103 100644
--- a/week3/movielens.Rmd
+++ b/week3/movielens.Rmd
@@ -35,6 +35,7 @@ head(ratings) %>% kable()
```{r dist-ratings}
# plot the distribution of rating values https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=26
+ggplot(ratings, aes(x = rating)) + geom_histogram()  # grouping/mutating first computed an unused column over 10M rows
```
## Per-movie stats
@@ -42,16 +43,19 @@ head(ratings) %>% kable()
```{r aggregate-by-movie}
# aggregate ratings by movie, computing mean rating and number of ratings
# hint: use the n() function for easy counting within a group
+ratings |> group_by(movie_id) |> summarize(num_of_ratings = n(), mean_rating = mean(rating)) |> head()
```
```{r dist-movie-popularity}
# plot distribution of movie popularity (= number of ratings the movie received)
# hint: try scale_x_log10() for a logarithmic x axis
+ratings |> group_by(movie_id) |> summarize(num_of_ratings = n()) |> ggplot(aes(x=num_of_ratings)) + geom_histogram() + scale_x_log10()
```
```{r dist-mean-ratings-by-movie}
# plot distribution of mean ratings by movie https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=28
# hint: try geom_histogram and geom_density
+ ratings |> group_by(movie_id) |> summarize(mean_of_ratings = mean(rating)) |> ggplot(aes(x = mean_of_ratings)) + geom_density(fill="black")
```
```{r cdf-movie-pop}
@@ -60,6 +64,9 @@ head(ratings) %>% kable()
# store the result in a new data frame so you can use it in creating figure 2 from the paper below
# plot the CDF of movie popularity
+ratings |> group_by(movie_id) |> summarize(count = n()) |> mutate(rank = rank(-count)) |>
+arrange(rank) |> mutate(cdf = cumsum(count)/sum(count)) |>
+ggplot(aes(x=rank, y=cdf)) + geom_line() + scale_y_continuous(labels=percent_format()) + labs(x="Movie Rank", y="CDF")
```
@@ -67,11 +74,14 @@ head(ratings) %>% kable()
```{r aggregate-by-user}
# aggregate ratings by user, computing mean and number of ratings
+ratings |> group_by(user_id) |> summarize(num_of_ratings = n(), mean_rating = mean(rating))
```
```{r dist-user-activity}
# plot distribution of user activity (= number of ratings the user made)
# hint: try a log scale here
+ratings |> group_by(user_id) |> summarize(num_of_ratings = n()) |> ggplot(aes(x=num_of_ratings)) + geom_histogram() + scale_x_log10()
+
```
# Anatomy of the long tail
@@ -91,4 +101,30 @@ head(ratings) %>% kable()
# paper, produce one curve for the 100% user satisfaction level and
# another for 90%---do not, however, bother implementing the null
# model (shown in the dashed lines).
+users_with_ten <- ratings |> group_by(user_id) |> summarize(num_of_ratings = n()) |>
+filter(num_of_ratings >= 10)
+
+ranked_movies <- ratings |> group_by(movie_id) |> summarize(count = n()) |> mutate(rank = rank(-count)) |> arrange(rank)
+user_movies <- ratings |> filter(user_id %in% users_with_ten$user_id) |> group_by(user_id)
+lowest_rated <- user_movies |> left_join(ranked_movies, by = "movie_id") |> group_by(user_id) |> summarize(lowest_rate = max(rank))
+lowest_rated90 <- user_movies |> left_join(ranked_movies, by = "movie_id") |> group_by(user_id) |> summarize(lowest_rate = quantile(rank, 0.9))
+
+lowest_rated <- lowest_rated |> arrange(lowest_rate)
+lowest_rated90 <- lowest_rated90 |> arrange(lowest_rate)
+num_of_users <- lowest_rated |> group_by(lowest_rate) |> summarize(count = n())
+num_of_users90 <- lowest_rated90 |> group_by(lowest_rate) |> summarize(count = n())
+
+num_of_users <- mutate(num_of_users, cumulated = cumsum(count), satisfied = cumulated/nrow(users_with_ten))
+num_of_users90 <- mutate(num_of_users90, cumulated = cumsum(count), satisfied = cumulated/nrow(users_with_ten))
+
+combined <- bind_rows(
+ "100%" = num_of_users,
+ "90%" = num_of_users90,
+ .id = "satisfaction"
+)
+
+ggplot(combined, aes(x = lowest_rate, y = satisfied, color = satisfaction)) +
+  geom_line() + labs(x="Inventory Size", y="Percent of Users Satisfied") + geom_vline(xintercept = 3000, color = "black", linetype = 5)
+#ggplot(num_of_users, aes(x=lowest_rate, y=cumsum(count)/69678)) + geom_line()
+#ggplot(num_of_users90, aes(x=lowest_rate, y=cumsum(count)/69678)) + geom_line()
```
diff --git a/week3/movielens.html b/week3/movielens.html
new file mode 100644
index 000000000..7cf28c12e
--- /dev/null
+++ b/week3/movielens.html
@@ -0,0 +1,523 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+Movielens
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Load and preview
+data
+
Read data from the ratings.csv file
+
ratings <- read_csv('ratings.csv',
+ col_names = c('user_id','movie_id','rating','timestamp'))
+
## Rows: 10000054 Columns: 4
+## ── Column specification ────────────────────────────────────────────────────────
+## Delimiter: ","
+## dbl (4): user_id, movie_id, rating, timestamp
+##
+## ℹ Use `spec()` to retrieve the full column specification for this data.
+## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
+
Loaded 305.2 Mb of ratings data, containing 10,000,054 ratings.
+Here’s a preview:
+
head(ratings) %>% kable()
+
+
+
+
+
+
+| 1 |
+122 |
+5 |
+838985046 |
+
+
+| 1 |
+185 |
+5 |
+838983525 |
+
+
+| 1 |
+231 |
+5 |
+838983392 |
+
+
+| 1 |
+292 |
+5 |
+838983421 |
+
+
+| 1 |
+316 |
+5 |
+838983392 |
+
+
+| 1 |
+329 |
+5 |
+838983392 |
+
+
+
+
+
+
Summary statistics
+
# plot the distribution of rating values https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=26
+
+
Per-movie stats
+
# aggregate ratings by movie, computing mean rating and number of ratings
+# hint: use the n() function for easy counting within a group
+
# plot distribution of movie popularity (= number of ratings the movie received)
+# hint: try scale_x_log10() for a logarithmic x axis
+
# plot distribution of mean ratings by movie https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=28
+# hint: try geom_histogram and geom_density
+
# rank movies by popularity (number of ratings) and compute the cdf, or fraction of all views covered by the top-k movies https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=30
+# hint: use dplyr's rank and arrange functions, and the base R sum and cumsum functions
+# store the result in a new data frame so you can use it in creating figure 2 from the paper below
+
+# plot the CDF of movie popularity
+
+
+
+
Per-user stats
+
# aggregate ratings by user, computing mean and number of ratings
+
# plot distribution of user activity (= number of ratings the user made)
+# hint: try a log scale here
+
+
+
Anatomy of the long
+tail
+
# generate the equivalent of figure 2a of this paper:
+# note: don't worry about the "null model" lines
+# just do the solid lines and dotted line (optional)
+# https://5harad.com/papers/long_tail.pdf
+
+# Specifically, for the subset of users who rated at least 10 movies,
+# produce a plot that shows the fraction of users satisfied (vertical
+# axis) as a function of inventory size (horizontal axis). We will
+# define "satisfied" as follows: an individual user is satisfied p% of
+# the time at inventory of size k if at least p% of the movies they
+# rated are contained in the top k most popular movies. As in the
+# paper, produce one curve for the 100% user satisfaction level and
+# another for 90%---do not, however, bother implementing the null
+# model (shown in the dashed lines).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/week3/ngrams/01_download_1grams.sh b/week3/ngrams/01_download_1grams.sh
index 1d6d5bf10..c8e7646d0 100644
--- a/week3/ngrams/01_download_1grams.sh
+++ b/week3/ngrams/01_download_1grams.sh
@@ -1,6 +1,8 @@
#!/bin/bash
# use curl or wget to download the version 2 1gram file with all terms starting with "1", googlebooks-eng-all-1gram-20120701-1.gz
+curl -O http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-1.gz
+
# update the timestamp on the resulting file using touch
# do not remove, this will keep make happy and avoid re-downloading of the data once you have it
diff --git a/week3/ngrams/02_filter_1grams.sh b/week3/ngrams/02_filter_1grams.sh
index 3b8e9ec29..8cbf46087 100644
--- a/week3/ngrams/02_filter_1grams.sh
+++ b/week3/ngrams/02_filter_1grams.sh
@@ -2,5 +2,10 @@
# filter original 1gram file googlebooks-eng-all-1gram-20120701-1.gz to only lines where the ngram exactly matches a year (18xx, 19xx, or 20xx, where x is a digit)
# decompress the first using gunzip, zless, zcat or similar
+gunzip googlebooks-eng-all-1gram-20120701-1.gz
# then filter out rows that match using grep -E, egrep, awk, or similar
+ grep -E '^(18|19|20)[0-9]{2}[[:space:]]' googlebooks-eng-all-1gram-20120701-1 > year_counts.tsv
+
# write results to year_counts.tsv
+
+
diff --git a/week3/ngrams/03_download_totals.sh b/week3/ngrams/03_download_totals.sh
index f53381e8e..4acceee46 100644
--- a/week3/ngrams/03_download_totals.sh
+++ b/week3/ngrams/03_download_totals.sh
@@ -1,7 +1,7 @@
#!/bin/bash
# use curl or wget to download the version 2 of the total counts file, googlebooks-eng-all-totalcounts-20120701.txt
-
+curl -O http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-totalcounts-20120701.txt
# update the timestamp on the resulting file using touch
# do not remove, this will keep make happy and avoid re-downloading of the data once you have it
touch googlebooks-eng-all-totalcounts-20120701.txt
diff --git a/week3/ngrams/04_reformat_totals.sh b/week3/ngrams/04_reformat_totals.sh
index 0445c1ff6..1cc13754c 100644
--- a/week3/ngrams/04_reformat_totals.sh
+++ b/week3/ngrams/04_reformat_totals.sh
@@ -3,3 +3,4 @@
# reformat total counts in googlebooks-eng-all-totalcounts-20120701.txt to a valid csv
# use tr, awk, or sed to convert tabs to newlines
# write results to total_counts.csv
+
diff --git a/week3/ngrams/05_final_report.Rmd b/week3/ngrams/05_final_report.Rmd
index a7c90c1fb..d6d328737 100644
--- a/week3/ngrams/05_final_report.Rmd
+++ b/week3/ngrams/05_final_report.Rmd
@@ -52,14 +52,18 @@ Then edit the `03_download_totals.sh` file to down the `googlebooks-eng-all-tota
Load in the `year_counts.tsv` and `total_counts.csv` files. Use the `here()` function around the filename to keep things portable.Give the columns of `year_counts.tsv` the names `term`, `year`, `volume`, and `book_count`. Give the columns of `total_counts.csv` the names `year`, `total_volume`, `page_count`, and `book_count`. Note that column order in these files may not match the examples in the documentation.
```{r load-counts}
-
-
+# use here() for portable paths; setwd() is chunk-local in knitr and breaks rendering from other directories
+year_counts <- read_tsv(here('year_counts.tsv'),
+ col_names = c('term','year','volume','book_count'))
+total_counts <- read_csv(here('total_counts.csv'),
+ col_names = c('year','total_volume','page_count','book_count'))
```
## Your written answer
Add a line below using Rmarkdown's inline syntax to print the total number of lines in each dataframe you've created.
-
+year_counts has 76,336 lines
+total_counts has 425 lines
# Part B
> Recreate the main part of figure 3a of Michel et al. (2011). To recreate this figure, you will need two files: the one you downloaded in part (a) and the “total counts” file, which you can use to convert the raw counts into proportions. Note that the total counts file has a structure that may make it a bit hard to read in. Does version 2 of the NGram data produce similar results to those presented in Michel et al. (2011), which are based on version 1 data?
@@ -69,8 +73,8 @@ Add a line below using Rmarkdown's inline syntax to print the total number of li
Join the raw year term counts with the total counts and divide to get a proportion of mentions for each term normalized by the total counts for each year.
```{r join-years-and-totals}
-
-
+combined_data <- left_join(year_counts, total_counts, by = "year")
+combined_data <- mutate(combined_data, proportion = volume/total_volume)
```
## Plot the main figure 3a
@@ -78,12 +82,14 @@ Join the raw year term counts with the total counts and divide to get a proporti
Plot the proportion of mentions for the terms "1883", "1910", and "1950" over time from 1850 to 2012, as in the main figure 3a of the original paper. Use the `percent` function from the `scales` package for a readable y axis. Each term should have a different color, it's nice if these match the original paper but not strictly necessary.
```{r plot-proportion-over-time}
-
+year_data <- filter(combined_data, term == "1883" | term == "1910" | term == "1950") |> filter(year >= 1850 & year <= 2012)
+ggplot(year_data, aes(x=year, y = (proportion*10000), color = term)) + geom_line() + scale_color_manual(values = c("1883" = "blue", "1910" = "green", "1950" = "red")) + labs(x = "Year", y="Frequency")
```
## Your written answer
Write up your answer to Part B here.
+Yes, they produce similar results, except that the year 1910 peaks slightly lower and the year 1950 peaks slightly higher. Also, all of the curves are much steeper and drop off faster, especially 1883.
# Part C
@@ -96,6 +102,8 @@ Go to the ngram viewer, enter the terms "1883", "1910", and "1950" and take a sc
## Your written answer
Add your screenshot for Part C below this line using the `` syntax and comment on similarities / differences.
+
+They seem similar, but are much less steep and fall off much faster. They also peak much lower, according to my reading of the ngram viewer's y-axis, which differs from my own plot's.
# Part D
@@ -107,12 +115,17 @@ Add your screenshot for Part C below this line using the `) + geom_line() + scale_y_continuous(labels = comma) + scale_color_manual(values = c("1883" = "blue", "1910" = "green", "1950" = "red")) + labs(x = "Year", y="Volume")
```
# Part E
> Does the difference between (b) and (d) lead you to reevaluate any of the results of Michel et al. (2011). Why or why not?
+Yes! It's not that we necessarily "forgot" about these years, but rather, in the influx of new books, other books and terms crowded out
+these in the overall proportion, but actually there's more literature written about 1883 now, than there was in 1883 (which makes sense
+intuitively, considering the information age)
+
+
As part of answering this question, make an additional plot.
@@ -121,13 +134,15 @@ As part of answering this question, make an additional plot.
Plot the total counts for each year over time, from 1850 to 2012. Use the `comma` function from the `scales` package for a readable y axis. There should be only one line on this plot (not three).
```{r plot-totals}
-
+subset_total_counts <- filter(total_counts, year >= 1850 & year <= 2012)
+ggplot(subset_total_counts, aes(x=year, y=total_volume)) + geom_line() + scale_y_continuous(labels = comma)
```
## Your written answer
Write up your answer to Part E here.
-
+This strengthens my previous answer, because one can see that the number of books (specifically, the number of books
+scanned in this project) has risen almost exponentially in recent years.
# Part F
> Now, using the proportion of mentions, replicate the inset of figure 3a. That is, for each year between 1875 and 1975, calculate the half-life of that year. The half-life is defined to be the number of years that pass before the proportion of mentions reaches half its peak value. Note that Michel et al. (2011) do something more complicated to estimate the half-life—see section III.6 of the Supporting Online Information—but they claim that both approaches produce similar results. Does version 2 of the NGram data produce similar results to those presented in Michel et al. (2011), which are based on version 1 data? (Hint: Don’t be surprised if it doesn’t.)
@@ -137,7 +152,14 @@ Write up your answer to Part E here.
For each year term, find the year where its proportion of mentions peaks (hits its highest value). Store this in an intermediate dataframe.
```{r compute-peaks}
-
+library(stringr)
+year_peaks <- combined_data |>
+filter(str_detect(term, '^187[5-9]$|^18[8-9][0-9]$|^19[0-6][0-9]$|^197[0-5]$')) |>
+filter(year >= 1850 & year <= 2012) |>
+ group_by(term) |>
+ mutate(max_prop= max(proportion))
+year_peaks <- filter(year_peaks, proportion==max_prop) |> select(term, year_peak = year, proportion)
+#View(head(year_peaks, 200))
```
## Compute half-lifes
@@ -145,7 +167,16 @@ For each year term, find the year where its proportion of mentions peaks (hits i
Now, for each year term, find the minimum number of years it takes for the proportion of mentions to decline from its peak value to half its peak value. Store this in an intermediate data frame.
```{r compute-half-lifes}
-
+year_peaks <- year_peaks |>
+ rename(max_prop = proportion)
+half_life <- combined_data |>
+ filter(term %in% year_peaks$term, year >= 1850 & year <= 2012) |>
+ left_join(year_peaks, by = "term") |>
+ filter(year > year_peak) |>
+ filter(proportion <= (max_prop / 2)) |>
+ group_by(term) |>
+ summarize(half_life = min(year - year_peak), .groups = "drop")
+View(head(half_life, 200))
```
## Plot the inset of figure 3a
@@ -154,21 +185,43 @@ Plot the half-life of each term over time from 1850 to 2012. Each point should r
```{r plot-half-lifes}
-
+half_life$term <- as.numeric(half_life$term)
+half_life$highlight <- ifelse(
+ half_life$term %in% c(1883, 1910, 1950),
+ as.character(half_life$term),
+ "normal"
+)
+ggplot(half_life, aes(x=term, y=half_life, color = highlight)) + geom_point(size=3) + geom_smooth(
+ data = subset(half_life, highlight != "normal"),
+ aes(x = term, y = half_life),
+ color = "black"
+ ) +
+ scale_color_manual(values = c(
+ "1883" = "red",
+ "1910" = "blue",
+ "1950" = "green",
+ "normal" = "black"
+ ),
+ breaks = c("1883", "1910", "1950")
+ ) +
+ labs(x="Year", y="Half-life (yrs)")
```
## Your written answer
Write up your answer to Part F here.
+I feel like I did something wrong, but it looks completely different, with the year 1910 having a much higher half-life
+and the year 1883 having the smallest.
# Part G
> Were there any years that were outliers such as years that were forgotten particularly quickly or particularly slowly? Briefly speculate about possible reasons for that pattern and explain how you identified the outliers.
-
+If my data is correct, then years like 1877 and ~1940 have very short half-lives (they were forgotten quickly), while the highest is 1900, which makes sense, but then I don't know why 1910 is also so high.
## Your written answer
Write up your answer to Part G here. Include code that shows the years with the smallest and largest half-lifes.
# Makefile
-
+half_life |> filter(half_life == max(half_life))
+half_life |> filter(half_life == min(half_life))
Edit the `Makefile` in this directory to execute the full set of scripts that download the data, clean it, and produce this report. This must be turned in with your assignment such that running `make` on the command line produces the final report as a pdf file.
diff --git a/week3/ngrams/ngramscreenshot.png b/week3/ngrams/ngramscreenshot.png
new file mode 100644
index 000000000..531569982
Binary files /dev/null and b/week3/ngrams/ngramscreenshot.png differ
diff --git a/week3/split_ratings.sh b/week3/split_ratings.sh
new file mode 100644
index 000000000..34af4a9b4
--- /dev/null
+++ b/week3/split_ratings.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+RATINGS_COUNT=`wc -l ratings.dat | xargs | cut -d ' ' -f 1`
+echo "ratings count: $RATINGS_COUNT"
+SET_SIZE=`expr $RATINGS_COUNT / 5`
+echo "set size: $SET_SIZE"
+REMAINDER=`expr $RATINGS_COUNT % 5`
+echo "remainder: $REMAINDER"
+
+for i in 1 2 3 4 5
+ do
+ head -`expr $i \* $SET_SIZE` ratings.dat | tail -$SET_SIZE > r$i.test
+
+ # XXX: OSX users will see the message "head: illegal line count -- 0" here,
+ # but this is just a warning; the script still works as intended.
+ head -`expr \( $i - 1 \) \* $SET_SIZE` ratings.dat > r$i.train
+ tail -`expr \( 5 - $i \) \* $SET_SIZE` ratings.dat >> r$i.train
+
+ if [ $i -eq 5 ]; then
+ tail -$REMAINDER ratings.dat >> r5.test
+ else
+ tail -$REMAINDER ratings.dat >> r$i.train
+ fi
+
+ echo "r$i.test created. `wc -l r$i.test | xargs | cut -d " " -f 1` lines."
+ echo "r$i.train created. `wc -l r$i.train | xargs | cut -d " " -f 1` lines."
+done
+
+./allbut.pl ra 1 10 0 ratings.dat
+echo "ra.test created. `wc -l ra.test | xargs | cut -d " " -f 1` lines."
+echo "ra.train created. `wc -l ra.train | xargs | cut -d " " -f 1` lines."
+
+./allbut.pl rb 11 20 0 ratings.dat
+echo "rb.test created. `wc -l rb.test | xargs | cut -d " " -f 1` lines."
+echo "rb.train created. `wc -l rb.train | xargs | cut -d " " -f 1` lines."
+
diff --git a/week3/trips_per_day.tsv b/week3/trips_per_day.tsv
deleted file mode 100644
index 41a5887df..000000000
--- a/week3/trips_per_day.tsv
+++ /dev/null
@@ -1,366 +0,0 @@
-ymd num_trips date prcp snwd snow tmax tmin
-2014-01-01 6059 20140101 0 0 0 3.3 2.4
-2014-01-02 8600 20140102 0.33 0 3.1 3.3 1.8
-2014-01-03 1144 20140103 0.29 5.9 3.3 1.8 0.9
-2014-01-04 2292 20140104 0 5.9 0 2.9 0.8
-2014-01-05 2678 20140105 0.14 3.9 0 4 2.7
-2014-01-06 9510 20140106 0.36 1.2 0 5.5 1.9
-2014-01-07 6267 20140107 0 0 0 1.9 0.4
-2014-01-08 9246 20140108 0 0 0 2.2 0.9
-2014-01-09 13354 20140109 0 0 0 3.2 2.2
-2014-01-10 9847 20140110 0.11 0 0 3.7 3
-2014-01-11 7695 20140111 0.5 0 0 5.8 3.7
-2014-01-12 12515 20140112 0.05 0 0 5.4 3.8
-2014-01-13 20633 20140113 0 0 0 5.1 3.7
-2014-01-14 9999 20140114 0.38 0 0 5.2 4.4
-2014-01-15 21630 20140115 0 0 0 4.7 3.9
-2014-01-16 19953 20140116 0 0 0 4.2 3.6
-2014-01-17 20301 20140117 0 0 0 4.4 3.3
-2014-01-18 9438 20140118 0.07 0 0 4.1 2.9
-2014-01-19 9028 20140119 0 0 0 3.8 2.4
-2014-01-20 13407 20140120 0 0 0 4.6 3.1
-2014-01-21 3804 20140121 0.46 0 11 3.1 1.1
-2014-01-22 2451 20140122 0.02 11 0.5 1.7 0.5
-2014-01-23 5071 20140123 0 7.1 0 2 0.7
-2014-01-24 6176 20140124 0 5.9 0 2 1
-2014-01-25 4338 20140125 0.04 3.9 1 2.8 1.9
-2014-01-26 4980 20140126 0 3.9 0 3.4 1.7
-2014-01-27 13119 20140127 0 3.1 0 4.4 2.1
-2014-01-28 10033 20140128 0 1.2 0 2.1 1.2
-2014-01-29 10021 20140129 0.04 2 0.8 2.3 1.4
-2014-01-30 12158 20140130 0 1 0 3 1.6
-2014-01-31 14653 20140131 0 1.2 0 3.9 2.5
-2014-02-01 12771 20140201 0 1.2 0 4.5 3.6
-2014-02-02 13816 20140202 0 1.2 0 5.6 3.9
-2014-02-03 2600 20140203 1.17 1.2 8 4.3 2.7
-2014-02-04 8709 20140204 0 7.9 0 3.5 2.2
-2014-02-05 2746 20140205 1.43 9.1 4 3.4 2.9
-2014-02-06 7196 20140206 0 9.1 0 3.2 2.1
-2014-02-07 8495 20140207 0 9.1 0 3.2 2.4
-2014-02-08 5986 20140208 0 9.1 0 2.9 2.1
-2014-02-09 4996 20140209 0.1 7.9 1.2 3.1 2.1
-2014-02-10 6846 20140210 0 9.8 0 2.9 2.1
-2014-02-11 8343 20140211 0 9.8 0 2.6 1.6
-2014-02-12 8580 20140212 0 9.8 0 2.5 1.3
-2014-02-13 876 20140213 1.78 11.8 9.5 3.6 2.4
-2014-02-14 3609 20140214 0.3 18.1 3 4 3.1
-2014-02-15 2261 20140215 0.13 18.1 1.6 3.7 2.7
-2014-02-16 3003 20140216 0 18.1 0 3 2.1
-2014-02-17 4854 20140217 0 16.9 0 3.2 1.8
-2014-02-18 5140 20140218 0.16 18.1 1.5 3.9 2.6
-2014-02-19 8506 20140219 0.26 16.9 0 4.5 3.4
-2014-02-20 11792 20140220 0.03 15 0 5.1 3.7
-2014-02-21 8680 20140221 0.09 13 0 4.9 3.6
-2014-02-22 13044 20140222 0 9.8 0 5.4 4
-2014-02-23 13324 20140223 0 9.1 0 5.4 4.3
-2014-02-24 12922 20140224 0 5.9 0 4.4 2.7
-2014-02-25 12830 20140225 0 5.9 0 3.3 2.4
-2014-02-26 11188 20140226 0.03 5.9 0.2 3.1 2
-2014-02-27 12036 20140227 0 5.9 0 3.4 1.4
-2014-02-28 9587 20140228 0 5.9 0 2.4 0.9
-2014-03-01 9202 20140301 0 5.9 0 3.7 2
-2014-03-02 8195 20140302 0 5.1 0 4 3.2
-2014-03-03 7708 20140303 0.04 5.1 0.1 3.2 1.7
-2014-03-04 10398 20140304 0 5.1 0 2.9 1.3
-2014-03-05 13801 20140305 0 3.9 0 3.9 2.6
-2014-03-06 12773 20140306 0 3.9 0 3.1 1.6
-2014-03-07 13304 20140307 0 3.9 0 3.7 2.6
-2014-03-08 16350 20140308 0 3.1 0 5.7 3.5
-2014-03-09 12262 20140309 0 2 0 4.4 3.6
-2014-03-10 17231 20140310 0 1.2 0 5.1 3.6
-2014-03-11 24356 20140311 0 0 0 6.6 4.5
-2014-03-12 15101 20140312 0.35 0 0 5.6 3.2
-2014-03-13 10979 20140313 0 0 0 3.2 1.8
-2014-03-14 16299 20140314 0 0 0 4.6 2.2
-2014-03-15 18053 20140315 0 0 0 5.8 4.2
-2014-03-16 9851 20140316 0 0 0 4.2 3
-2014-03-17 12503 20140317 0 0 0 3.5 2.3
-2014-03-18 16679 20140318 0 0 0 4.3 2.8
-2014-03-19 13341 20140319 0.92 0 0 4.6 3.2
-2014-03-20 19695 20140320 0 0 0 5.4 4
-2014-03-21 20718 20140321 0 0 0 5.1 3.9
-2014-03-22 19880 20140322 0 0 0 6.3 4.1
-2014-03-23 11740 20140323 0 0 0 4.2 2.7
-2014-03-24 13598 20140324 0 0 0 3.5 2.1
-2014-03-25 16037 20140325 0 0 0 3.9 2.6
-2014-03-26 14500 20140326 0 0 0 3.6 2.4
-2014-03-27 16666 20140327 0 0 0 4.4 2.2
-2014-03-28 18425 20140328 0.04 0 0 6.2 3.7
-2014-03-29 4480 20140329 1.81 0 0 5.9 4.3
-2014-03-30 7907 20140330 0.35 0 0 4.9 3.9
-2014-03-31 17085 20140331 0.16 0 0 5.6 3.6
-2014-04-01 23908 20140401 0 0 0 6 3.9
-2014-04-02 22515 20140402 0 0 0 5.4 4.2
-2014-04-03 26321 20140403 0.07 0 0 6.7 4.6
-2014-04-04 12566 20140404 0.21 0 0 4.7 4
-2014-04-05 18655 20140405 0 0 0 5.4 4
-2014-04-06 20750 20140406 0 0 0 6.1 3.6
-2014-04-07 14677 20140407 0.52 0 0 5.3 4.3
-2014-04-08 21884 20140408 0.34 0 0 6.4 4.6
-2014-04-09 26931 20140409 0 0 0 6.1 4.5
-2014-04-10 27463 20140410 0 0 0 5.8 4.2
-2014-04-11 26798 20140411 0 0 0 7.5 5.6
-2014-04-12 27108 20140412 0 0 0 7.3 5
-2014-04-13 26871 20140413 0 0 0 7.7 5.4
-2014-04-14 28832 20140414 0 0 0 7.5 5.9
-2014-04-15 10523 20140415 0.71 0 0 6.3 3.3
-2014-04-16 20082 20140416 0.05 0 0 4.9 3.1
-2014-04-17 22282 20140417 0 0 0 4.8 3.6
-2014-04-18 20770 20140418 0 0 0 4.9 3.5
-2014-04-19 24369 20140419 0 0 0 6.8 4.1
-2014-04-20 18464 20140420 0 0 0 6 4.5
-2014-04-21 27124 20140421 0 0 0 6.7 4.1
-2014-04-22 27259 20140422 0.01 0 0 7.1 5.1
-2014-04-23 26949 20140423 0 0 0 6.1 4.4
-2014-04-24 28107 20140424 0 0 0 6.2 4.2
-2014-04-25 27919 20140425 0.02 0 0 6.3 4.4
-2014-04-26 19899 20140426 0.92 0 0 6.7 4.6
-2014-04-27 21926 20140427 0 0 0 5.9 4.7
-2014-04-28 28557 20140428 0 0 0 6.7 4.5
-2014-04-29 18404 20140429 0.03 0 0 5.2 4.3
-2014-04-30 2867 20140430 4.97 0 0 5.2 4.1
-2014-05-01 26762 20140501 0.12 0 0 7.8 5.1
-2014-05-02 32377 20140502 0 0 0 7 5.6
-2014-05-03 25501 20140503 0.08 0 0 7.1 5.4
-2014-05-04 21277 20140504 0.02 0 0 6.5 5.4
-2014-05-05 30288 20140505 0 0 0 7 5
-2014-05-06 32233 20140506 0 0 0 7.1 5.2
-2014-05-07 32882 20140507 0 0 0 6.9 5
-2014-05-08 17619 20140508 0.41 0 0 5.9 5.3
-2014-05-09 22641 20140509 0.04 0 0 6.3 5.5
-2014-05-10 21832 20140510 0.37 0 0 8.3 5.7
-2014-05-11 28326 20140511 0.03 0 0 8.2 6
-2014-05-12 31644 20140512 0 0 0 8.5 6.4
-2014-05-13 32847 20140513 0 0 0 7.3 5.3
-2014-05-14 30044 20140514 0.01 0 0 7.1 5.2
-2014-05-15 26057 20140515 0.15 0 0 7.1 5.8
-2014-05-16 17784 20140516 1.54 0 0 6.8 5.8
-2014-05-17 29912 20140517 0 0 0 7 5.3
-2014-05-18 27436 20140518 0 0 0 6.7 5.1
-2014-05-19 31692 20140519 0 0 0 7.2 4.9
-2014-05-20 35567 20140520 0 0 0 7.8 5.3
-2014-05-21 34420 20140521 0 0 0 7.4 6.3
-2014-05-22 24535 20140522 0.24 0 0 6.7 5.9
-2014-05-23 24484 20140523 0.91 0 0 7.1 5.6
-2014-05-24 16798 20140524 0.4 0 0 7 5.7
-2014-05-25 25637 20140525 0 0 0 8 5.5
-2014-05-26 26930 20140526 0 0 0 8.6 6.6
-2014-05-27 32936 20140527 0 0 0 8.6 6.4
-2014-05-28 29690 20140528 0 0 0 6.4 5.4
-2014-05-29 33889 20140529 0 0 0 6.6 5.1
-2014-05-30 34343 20140530 0 0 0 7.5 5.5
-2014-05-31 27734 20140531 0.05 0 0 7.3 5.7
-2014-06-01 31333 20140601 0 0 0 7.7 5.5
-2014-06-02 34519 20140602 0 0 0 8 5.9
-2014-06-03 29501 20140603 0.12 0 0 8.7 6.5
-2014-06-04 36068 20140604 0 0 0 8.2 6.3
-2014-06-05 24619 20140605 0.87 0 0 7.6 6.1
-2014-06-06 37719 20140606 0 0 0 7.6 6.1
-2014-06-07 31204 20140607 0 0 0 8.2 6
-2014-06-08 30162 20140608 0 0 0 8.6 6.6
-2014-06-09 18611 20140609 1.6 0 0 7.3 6.3
-2014-06-10 33435 20140610 0 0 0 7.7 6.5
-2014-06-11 30548 20140611 0.02 0 0 7 6.1
-2014-06-12 29084 20140612 0.07 0 0 7.3 6
-2014-06-13 18308 20140613 1.28 0 0 7.9 6.5
-2014-06-14 28417 20140614 0 0 0 7.4 6
-2014-06-15 28382 20140615 0 0 0 8 5.9
-2014-06-16 34932 20140616 0 0 0 8.1 6.3
-2014-06-17 35097 20140617 0 0 0 8.9 7.1
-2014-06-18 35531 20140618 0 0 0 8.9 7.6
-2014-06-19 31724 20140619 0.15 0 0 7.7 6.8
-2014-06-20 34405 20140620 0 0 0 7.9 6.4
-2014-06-21 30288 20140621 0 0 0 7.8 6.2
-2014-06-22 29417 20140622 0 0 0 7.9 6.4
-2014-06-23 32580 20140623 0 0 0 8.1 6.5
-2014-06-24 35897 20140624 0 0 0 8.1 6.8
-2014-06-25 34340 20140625 0.08 0 0 8.5 7
-2014-06-26 34846 20140626 0.07 0 0 8.5 7
-2014-06-27 35823 20140627 0 0 0 8.3 6.7
-2014-06-28 28507 20140628 0 0 0 8.7 6.6
-2014-06-29 27986 20140629 0 0 0 8.3 6.8
-2014-06-30 33597 20140630 0 0 0 8.4 6.9
-2014-07-01 34854 20140701 0 0 0 8.9 7.2
-2014-07-02 26582 20140702 0.96 0 0 9.1 7.2
-2014-07-03 27587 20140703 1.78 0 0 8.7 6.9
-2014-07-04 13612 20140704 0.14 0 0 7.4 6.5
-2014-07-05 22913 20140705 0 0 0 8.1 6.3
-2014-07-06 23822 20140706 0 0 0 8.4 6.6
-2014-07-07 31863 20140707 0.04 0 0 9 7.2
-2014-07-08 32713 20140708 0.39 0 0 9.1 7.1
-2014-07-09 34426 20140709 0.09 0 0 8.8 7.1
-2014-07-10 36288 20140710 0 0 0 8.3 7.2
-2014-07-11 35000 20140711 0 0 0 8.6 7.1
-2014-07-12 28718 20140712 0 0 0 8.5 7.1
-2014-07-13 24869 20140713 0.03 0 0 8.3 7.2
-2014-07-14 25825 20140714 0.46 0 0 8.4 7.2
-2014-07-15 22963 20140715 1.3 0 0 8.6 7.2
-2014-07-16 35391 20140716 0 0 0 8.1 6.8
-2014-07-17 38619 20140717 0 0 0 8.1 6.7
-2014-07-18 36105 20140718 0 0 0 8.1 6.4
-2014-07-19 29291 20140719 0 0 0 7.6 6.8
-2014-07-20 30296 20140720 0 0 0 8 6.6
-2014-07-21 35670 20140721 0 0 0 8.5 6.7
-2014-07-22 36886 20140722 0 0 0 8.6 7.1
-2014-07-23 34173 20140723 0.19 0 0 8.8 7.2
-2014-07-24 36693 20140724 0 0 0 8 7
-2014-07-25 36384 20140725 0 0 0 8.2 6.6
-2014-07-26 27179 20140726 0 0 0 8.1 6.9
-2014-07-27 27004 20140727 0.02 0 0 8.5 7.1
-2014-07-28 33189 20140728 0.19 0 0 8.2 6.8
-2014-07-29 37092 20140729 0 0 0 7.6 6.4
-2014-07-30 37377 20140730 0 0 0 8 6.3
-2014-07-31 35458 20140731 0 0 0 8.2 6.8
-2014-08-01 32654 20140801 0 0 0 8.4 7.1
-2014-08-02 26784 20140802 0.41 0 0 7.4 6.3
-2014-08-03 25276 20140803 0.07 0 0 7.6 6.6
-2014-08-04 33822 20140804 0 0 0 8.4 7
-2014-08-05 34392 20140805 0 0 0 9 7.1
-2014-08-06 36336 20140806 0 0 0 8.3 7
-2014-08-07 36362 20140807 0 0 0 8.3 6.6
-2014-08-08 34073 20140808 0 0 0 8.3 6.5
-2014-08-09 31636 20140809 0 0 0 8.7 6.6
-2014-08-10 26749 20140810 0 0 0 8.8 6.8
-2014-08-11 33664 20140811 0 0 0 8.7 7.1
-2014-08-12 26261 20140812 0.19 0 0 7.9 7
-2014-08-13 27977 20140813 0.53 0 0 8.2 6.8
-2014-08-14 35154 20140814 0 0 0 7.7 6.3
-2014-08-15 32480 20140815 0 0 0 7.3 6.1
-2014-08-16 32017 20140816 0 0 0 7.8 6.3
-2014-08-17 27417 20140817 0.01 0 0 8.2 6.6
-2014-08-18 34339 20140818 0 0 0 8.1 6.3
-2014-08-19 35367 20140819 0 0 0 8.3 6.3
-2014-08-20 35532 20140820 0 0 0 8.4 7
-2014-08-21 33804 20140821 0.35 0 0 8.3 6.5
-2014-08-22 30406 20140822 0.06 0 0 7.9 6.5
-2014-08-23 26429 20140823 0.01 0 0 7.7 6.7
-2014-08-24 27493 20140824 0 0 0 8 6.4
-2014-08-25 32892 20140825 0 0 0 8.8 6.4
-2014-08-26 34519 20140826 0 0 0 8.9 7
-2014-08-27 33721 20140827 0 0 0 9 7
-2014-08-28 33927 20140828 0 0 0 8.2 6.6
-2014-08-29 31264 20140829 0 0 0 8 6.1
-2014-08-30 22689 20140830 0 0 0 8 6.5
-2014-08-31 18053 20140831 0.62 0 0 9 7.3
-2014-09-01 20725 20140901 0 0 0 8.8 7.5
-2014-09-02 29657 20140902 0 0 0 9.2 7.7
-2014-09-03 34843 20140903 0 0 0 8.6 7.2
-2014-09-04 36392 20140904 0 0 0 8.7 6.9
-2014-09-05 35579 20140905 0 0 0 8.7 7.2
-2014-09-06 26808 20140906 0.11 0 0 9.1 6.7
-2014-09-07 28901 20140907 0 0 0 8.1 6.5
-2014-09-08 33979 20140908 0 0 0 7.5 6.5
-2014-09-09 34166 20140909 0 0 0 7.3 6.3
-2014-09-10 37418 20140910 0 0 0 8 6.3
-2014-09-11 36668 20140911 0 0 0 8.3 6.9
-2014-09-12 38481 20140912 0 0 0 7.8 6.2
-2014-09-13 19499 20140913 0.26 0 0 6.9 5.8
-2014-09-14 27187 20140914 0 0 0 7.1 5.3
-2014-09-15 34258 20140915 0 0 0 7.1 5.5
-2014-09-16 25579 20140916 0.37 0 0 7 5.8
-2014-09-17 36791 20140917 0 0 0 7.3 5.5
-2014-09-18 37300 20140918 0 0 0 7.6 5.7
-2014-09-19 35674 20140919 0 0 0 6.6 5.4
-2014-09-20 29999 20140920 0 0 0 7.5 5.7
-2014-09-21 26650 20140921 0.15 0 0 7.5 6.7
-2014-09-22 32937 20140922 0 0 0 7.1 5.5
-2014-09-23 35599 20140923 0 0 0 7.1 5.2
-2014-09-24 35838 20140924 0 0 0 7.1 5.8
-2014-09-25 17165 20140925 0.32 0 0 6.4 5.7
-2014-09-26 34500 20140926 0 0 0 7.7 5.8
-2014-09-27 30463 20140927 0 0 0 8.3 6
-2014-09-28 29491 20140928 0 0 0 8.4 6.4
-2014-09-29 32385 20140929 0 0 0 7.9 6.7
-2014-09-30 34901 20140930 0 0 0 7.1 6.2
-2014-10-01 28053 20141001 0.02 0 0 6.5 6.1
-2014-10-02 34154 20141002 0 0 0 7 6.1
-2014-10-03 35966 20141003 0 0 0 7.1 5.6
-2014-10-04 14173 20141004 1.18 0 0 6.9 5.2
-2014-10-05 23578 20141005 0 0 0 6.1 4.6
-2014-10-06 30628 20141006 0 0 0 6.9 5
-2014-10-07 32756 20141007 0.06 0 0 7.1 6.3
-2014-10-08 33437 20141008 0.04 0 0 7.3 6.2
-2014-10-09 32322 20141009 0 0 0 6.8 5.5
-2014-10-10 31616 20141010 0 0 0 6.4 5.2
-2014-10-11 13807 20141011 0.33 0 0 6 5
-2014-10-12 23079 20141012 0 0 0 6.3 4.8
-2014-10-13 23668 20141013 0.05 0 0 6.5 5.2
-2014-10-14 30252 20141014 0 0 0 7.6 6.3
-2014-10-15 30477 20141015 0.69 0 0 7.7 6.9
-2014-10-16 24829 20141016 1.11 0 0 7.1 6.1
-2014-10-17 34755 20141017 0 0 0 7.1 5.9
-2014-10-18 25854 20141018 0 0 0 7 5.6
-2014-10-19 19646 20141019 0 0 0 5.6 4.4
-2014-10-20 26440 20141020 0 0 0 6 4.2
-2014-10-21 31087 20141021 0.11 0 0 6.7 5.5
-2014-10-22 12023 20141022 1.51 0 0 5.8 5
-2014-10-23 11194 20141023 0.61 0 0 5.3 5
-2014-10-24 29606 20141024 0 0 0 6.3 5.1
-2014-10-25 25232 20141025 0.01 0 0 6.7 5
-2014-10-26 21488 20141026 0 0 0 6.3 5.3
-2014-10-27 27532 20141027 0 0 0 6.3 4.8
-2014-10-28 32313 20141028 0 0 0 7.2 5.3
-2014-10-29 30651 20141029 0 0 0 7.2 5.1
-2014-10-30 31079 20141030 0 0 0 5.9 4.7
-2014-10-31 28843 20141031 0.05 0 0 5.5 4.5
-2014-11-01 7484 20141101 0.35 0 0 4.7 4.2
-2014-11-02 12990 20141102 0 0 0 4.8 4.1
-2014-11-03 24019 20141103 0 0 0 6.1 3.9
-2014-11-04 30181 20141104 0 0 0 6.8 5.3
-2014-11-05 30766 20141105 0 0 0 6.4 5.6
-2014-11-06 13949 20141106 0.37 0 0 5.7 4.8
-2014-11-07 25648 20141107 0 0 0 5.3 4
-2014-11-08 18211 20141108 0 0 0 4.8 3.6
-2014-11-09 18128 20141109 0 0 0 5.7 4.6
-2014-11-10 25573 20141110 0 0 0 6.1 4.4
-2014-11-11 28787 20141111 0 0 0 6.4 4.9
-2014-11-12 28164 20141112 0 0 0 6.5 4.7
-2014-11-13 23972 20141113 0.2 0 0 4.8 3.6
-2014-11-14 19709 20141114 0.06 0 0 4.2 3.5
-2014-11-15 14856 20141115 0 0 0 4.2 3.3
-2014-11-16 13445 20141116 0.03 0 0 4.5 3.5
-2014-11-17 7346 20141117 1.54 0 0 5.2 4
-2014-11-18 17010 20141118 0 0 0 4.5 2.4
-2014-11-19 16270 20141119 0 0 0 3.6 2.2
-2014-11-20 19987 20141120 0 0 0 4.5 3.1
-2014-11-21 18837 20141121 0 0 0 3.7 2.8
-2014-11-22 13154 20141122 0 0 0 4.4 2.8
-2014-11-23 15218 20141123 0 0 0 5.7 4.3
-2014-11-24 20794 20141124 0.7 0 0 6.9 5.3
-2014-11-25 26064 20141125 0 0 0 6.8 5.1
-2014-11-26 7479 20141126 1.24 0 0.2 5.1 3.4
-2014-11-27 3757 20141127 0.02 0 0 3.8 3.4
-2014-11-28 7839 20141128 0 0 0 3.7 2.9
-2014-11-29 7869 20141129 0 0 0 4.5 2.7
-2014-11-30 11772 20141130 0 0 0 5.5 4.5
-2014-12-01 18569 20141201 0.09 0 0 6.5 4.2
-2014-12-02 15175 20141202 0.08 0 0 4.3 3.5
-2014-12-03 12177 20141203 0.06 0 0 4.6 4.1
-2014-12-04 21055 20141204 0 0 0 4.5 3.7
-2014-12-05 18920 20141205 0.51 0 0 4.4 3.4
-2014-12-06 4441 20141206 1.22 0 0 5 3.9
-2014-12-07 9319 20141207 0.04 0 0 4.2 3
-2014-12-08 14283 20141208 0 0 0 3.7 2.4
-2014-12-09 6912 20141209 2.54 0 0 4.2 3.6
-2014-12-10 11098 20141210 0.08 0 1 4 3.2
-2014-12-11 16413 20141211 0.01 1.2 0 3.8 3.1
-2014-12-12 18850 20141212 0 0 0 3.8 3.2
-2014-12-13 13173 20141213 0 0 0 4.4 3.4
-2014-12-14 12096 20141214 0 0 0 4.6 3.8
-2014-12-15 17761 20141215 0 0 0 4.8 3.7
-2014-12-16 18941 20141216 0.2 0 0 4.9 3.8
-2014-12-17 18196 20141217 0.02 0 0 5.4 4.2
-2014-12-18 19206 20141218 0 0 0 4.2 3.7
-2014-12-19 18256 20141219 0 0 0 3.8 3.1
-2014-12-20 10421 20141220 0 0 0 3.3 3
-2014-12-21 8854 20141221 0 0 0 3.6 3.1
-2014-12-22 13120 20141222 0.04 0 0 4.4 3.5
-2014-12-23 9849 20141223 0.16 0 0 4.6 4.3
-2014-12-24 5049 20141224 0.8 0 0 5.8 4.4
-2014-12-25 4620 20141225 0.09 0 0 6.2 4.4
-2014-12-26 9360 20141226 0 0 0 5 4
-2014-12-27 10070 20141227 0 0 0 5.5 4.4
-2014-12-28 8055 20141228 0.1 0 0 5.4 4.3
-2014-12-29 13055 20141229 0 0 0 4.4 3.4
-2014-12-30 12483 20141230 0 0 0 3.4 2.8
-2014-12-31 10493 20141231 0 0 0 3.2 2.7
diff --git a/week4/final_model.RData b/week4/final_model.RData
new file mode 100644
index 000000000..d76dbc58d
Binary files /dev/null and b/week4/final_model.RData differ
diff --git a/week4/predict_citibike.Rmd b/week4/predict_citibike.Rmd
new file mode 100644
index 000000000..14b110b4a
--- /dev/null
+++ b/week4/predict_citibike.Rmd
@@ -0,0 +1,345 @@
+```{r setup}
+library(tidyverse)
+library(scales)
+library(modelr)
+
+trips_per_day_with_test <- read_tsv("trips_per_day.tsv")
+```
+
+```{r separating test data}
+set.seed(24)
+
+num_days <- nrow(trips_per_day_with_test)
+frac_test <- 0.1
+num_test <- floor(num_days * frac_test)
+
+# randomly sample rows for the test set
+ndx <- sample(1:num_days, num_test, replace=F)
+
+# used to fit the model
+trips_per_day <- trips_per_day_with_test[-ndx, ]
+test_set <- trips_per_day_with_test[ndx, ]
+```
+
+```{r k-fold cross-validation polynomials}
+#k-fold cross-validation
+set.seed(42)
+num_folds <- 5
+num_days_train <- nrow(trips_per_day)
+
+trips_per_day <- trips_per_day %>%
+ mutate(fold = (row_number() %% num_folds) + 1)
+
+# fit a model for each polynomial degree
+K <- 1:8
+avg_validate_err <- c()
+se_validate_err <- c()
+for (k in K) {
+ # do 5-fold cross-validation within each value of k
+ validate_err <- c()
+ for (f in 1:num_folds) {
+ # fit on the training data
+ trips_per_day_train <- filter(trips_per_day, fold != f)
+ model <- lm(num_trips ~ poly(tmin, k, raw = T), data=trips_per_day_train)
+
+ # evaluate on the validation data
+ trips_per_day_validate <- filter(trips_per_day, fold == f)
+ validate_err[f] <- sqrt(mean((predict(model, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+ }
+
+ # compute the average validation error across folds
+ # and the standard error on this estimate
+ avg_validate_err[k] <- mean(validate_err)
+ se_validate_err[k] <- sd(validate_err) / sqrt(num_folds)
+}
+# plot the validate error, highlighting the value of k with the lowest average error
+plot_data <- data.frame(K, avg_validate_err, se_validate_err)
+ggplot(plot_data, aes(x=K, y=avg_validate_err)) +
+ geom_pointrange(aes(ymin=avg_validate_err - se_validate_err,
+ ymax=avg_validate_err + se_validate_err,
+ color=avg_validate_err == min(avg_validate_err))) +
+ geom_line(color = "red") +
+ scale_x_continuous(breaks=1:12) +
+ theme(legend.position="none") +
+ xlab('Polynomial Degree') +
+ ylab('RMSE on validation data')
+```
+
+```{r dividing into training and validating}
+#delete fold
+trips_per_day <- select(trips_per_day, -fold)
+#cross-validate with checking with all features through adding
+set.seed(42)
+
+num_days <- nrow(trips_per_day)
+frac_train <- 0.8
+num_train <- floor(num_days * frac_train)
+
+# randomly sample rows for the training set
+ndx <- sample(1:num_days, num_train, replace=F)
+
+# used to fit the model
+trips_per_day_train <- trips_per_day[ndx, ]
+
+# used to evaluate the fit
+trips_per_day_validate <- trips_per_day[-ndx, ]
+```
+
+```{r model experimenting}
+# fit on the training data
+model_with_all_additive <- lm(num_trips ~ . - date - ymd, data = trips_per_day_train)
+
+# evaluate on the training data
+train_err_all_additive <- sqrt(mean((predict(model_with_all_additive, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_all_additive <- sqrt(mean((predict(model_with_all_additive, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+#training error
+train_err_all_additive
+#[1] 4703.371
+#validation error
+validate_err_all_additive
+#5197.574
+#high error bad model
+
+#interactions
+# fit on the training data
+model_with_all_interactive <- lm(num_trips ~ (snwd + tmax + tmin + snow + prcp)^2, data = trips_per_day_train)
+
+# evaluate on the training data
+train_err_all_interactive <- sqrt(mean((predict(model_with_all_interactive, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_all_interactive <- sqrt(mean((predict(model_with_all_interactive, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_all_interactive
+#4497.649
+validate_err_all_interactive
+#7601.197
+
+model_with_rain_interact <- lm(num_trips ~ . - date - ymd + prcp:tmin , data = trips_per_day_train)
+
+train_err_rain<- sqrt(mean((predict(model_with_rain_interact, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_rain <- sqrt(mean((predict(model_with_rain_interact, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_rain
+#4497.467
+validate_err_rain
+#5992.959
+
+model_with_tmin_interact <- lm(num_trips ~ . - date - ymd + tmin:. - date - ymd, data = trips_per_day_train)
+
+train_err_tmin<- sqrt(mean((predict(model_with_tmin_interact, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_tmin <- sqrt(mean((predict(model_with_tmin_interact, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_tmin
+#4652.297
+validate_err_tmin
+#5647
+
+#model with only snwd and prcp interact with tmin
+model_tmin_snwd_prcp <- lm(num_trips ~ prcp + snwd + snow + tmax + tmin
+ + snwd:tmin + prcp:tmin, data = trips_per_day_train)
+
+train_err_tsp<- sqrt(mean((predict(model_tmin_snwd_prcp, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_tsp <- sqrt(mean((predict(model_tmin_snwd_prcp, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_tsp
+#4669.297
+validate_err_tsp
+#5490.993
+
+trips_per_day_train$month <- as.numeric(format(trips_per_day_train$ymd, "%m"))
+trips_per_day_validate$month <- as.numeric(format(trips_per_day_validate$ymd, "%m"))
+
+model_with_month <- lm(num_trips ~ prcp + snwd + snow + tmax + tmin + month + prcp:tmin + snwd:tmin, data = trips_per_day_train)
+
+train_err_month<- sqrt(mean((predict(model_with_month, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_month <- sqrt(mean((predict(model_with_month, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_month
+#4669
+validate_err_month
+#5490
+
+trips_per_day_train <- select(trips_per_day_train, -month)
+trips_per_day_validate <- select(trips_per_day_validate, -month)
+
+model_prcp_snwd_tmax <- lm(num_trips ~ prcp + snwd + tmax - ymd - date, data = trips_per_day_train)
+
+train_err_pst<- sqrt(mean((predict(model_prcp_snwd_tmax, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_pst <- sqrt(mean((predict(model_prcp_snwd_tmax, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_pst
+#4731
+validate_err_pst
+#4975
+summary(trips_per_day_train)
+summary(model_prcp_snwd_tmax)
+
+model_poly2 <- lm(num_trips ~ poly(tmax, 2, raw=TRUE) + poly(tmin, 2, raw=TRUE) +
+ poly(prcp, 2, raw=TRUE) + poly(snow, 2, raw=TRUE) +
+ poly(snwd, 2, raw=TRUE), data = trips_per_day_train)
+
+train_err_poly2<- sqrt(mean((predict(model_poly2, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_poly2 <- sqrt(mean((predict(model_poly2, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_poly2
+#4515
+validate_err_poly2
+#15306
+
+model_prcp_snwd_tmax <- lm(num_trips ~ prcp + snwd + tmax - ymd - date, data = trips_per_day_train)
+
+train_err_pst<- sqrt(mean((predict(model_prcp_snwd_tmax, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_pst <- sqrt(mean((predict(model_prcp_snwd_tmax, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_pst
+#4731
+validate_err_pst
+
+interact_prcp_snwd_tmax <- lm(num_trips ~ (prcp + snwd + tmax)^2 - ymd - date, data = trips_per_day_train)
+
+train_err_ipst<- sqrt(mean((predict(interact_prcp_snwd_tmax, trips_per_day_train) - trips_per_day_train$num_trips)^2))
+
+# evaluate on the validate data
+validate_err_ipst <- sqrt(mean((predict(interact_prcp_snwd_tmax, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+
+train_err_ipst
+#4648
+validate_err_ipst
+#5913
+```
+
+```{r adding features}
+mean_precip <- mean(trips_per_day$prcp, na.rm = TRUE)
+trips_per_day$high_precip <- as.factor(trips_per_day$prcp > mean_precip)
+
+get_season <- function(date) {
+  month <- as.numeric(format(date, "%m"))
+  if (month %in% c(12, 1, 2)) {
+    return("Winter")
+  } else if (month %in% c(3, 4, 5)) {
+    return("Spring")
+  } else if (month %in% c(6, 7, 8)) {
+    return("Summer")
+  } else {
+    return("Fall")
+  }
+}
+# create season and weekday first, then convert to factors
+# (converting before the columns exist errors on a zero-length replacement)
+trips_per_day$season <- as.factor(sapply(trips_per_day$ymd, get_season))
+trips_per_day$weekday <- as.factor(!(weekdays(trips_per_day$ymd) %in% c("Saturday", "Sunday")))
+```
+
+
+```{r k-fold testing model}
+set.seed(42)
+num_folds <- 5
+num_days <- nrow(trips_per_day)
+
+# K-fold cross validation
+trips_per_day <- trips_per_day %>%
+ mutate(fold = sample(rep(1:num_folds, length.out = num_days)))
+
+
+validate_rmse <- c()
+train_rmse <- c()
+
+# Cross-validation k-fold
+for (f in 1:num_folds) {
+ # Split into training and validation
+ train_data <- filter(trips_per_day, fold != f)
+ validate_data <- filter(trips_per_day, fold == f)
+
+ model <- lm(num_trips ~ tmax * high_precip + snwd + tmax:snow + weekday * season, data = train_data)
+
+ # Predict and compute RMSE on validation set
+ predictions <- predict(model, newdata = validate_data)
+  t_rmse <- sqrt(mean((predict(model, newdata = train_data) - train_data$num_trips)^2))
+ rmse <- sqrt(mean((predictions - validate_data$num_trips)^2))
+ validate_rmse[f] <- rmse
+ train_rmse[f] <- t_rmse
+}
+
+# Results
+average_rmse <- mean(validate_rmse)
+average_train_rmse <- mean(train_rmse)
+se_rmse <- sd(validate_rmse) / sqrt(num_folds)
+
+cat("RMSE for each fold:", round(validate_rmse, 2), "\n")
+cat("Train RMSE for each fold:", round(train_rmse, 2), "\n")
+cat("Average RMSE:", round(average_rmse, 2), "\n")
+cat("Train Average RMSE:", round(average_train_rmse, 2), "\n")
+cat("Standard Error:", round(se_rmse, 2), "\n")
+```
+
+
+```{r plotting-model}
+# rebuild the validation frame from trips_per_day so it carries the engineered
+# features (high_precip, weekday, season) the model needs; use the last fold,
+# which matches the `model` left over from the final cross-validation iteration
+trips_per_day_validate <- filter(trips_per_day, fold == num_folds)
+trips_per_day_validate$predicted_trips <- predict(model, newdata = trips_per_day_validate)
+
+ggplot(trips_per_day_validate, aes(x = date)) +
+ geom_point(aes(y = num_trips), color = "blue", alpha = 0.6, size = 2) +
+ geom_line(aes(y = predicted_trips), color = "red", size = 1) +
+ labs(title = "Actual vs Predicted Trips Over Time",
+ x = "Date", y = "Number of Trips")
+
+
+ggplot(trips_per_day_validate, aes(x = predicted_trips, y = num_trips)) +
+ geom_point() + geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
+ labs(title = "Predicted vs Actual Trips",
+ x = "Predicted Trips", y = "Actual Trips")
+
+```
+
+```{r save-model}
+final_model <- model
+save(final_model, file = "final_model.RData")
+```
+```{r test!}
+# refit the chosen model on all training data
+model <- lm(num_trips ~ tmax * high_precip + snwd + tmax:snow + weekday * season, data = trips_per_day)
+
+# engineer the same features on the TEST set (not trips_per_day) so predict()
+# can find the model's columns
+mean_precip <- mean(test_set$prcp, na.rm = TRUE)
+test_set$high_precip <- as.factor(test_set$prcp > mean_precip)
+
+get_season <- function(date) {
+  month <- as.numeric(format(date, "%m"))
+  if (month %in% c(12, 1, 2)) {
+    return("Winter")
+  } else if (month %in% c(3, 4, 5)) {
+    return("Spring")
+  } else if (month %in% c(6, 7, 8)) {
+    return("Summer")
+  } else {
+    return("Fall")
+  }
+}
+test_set$season <- as.factor(sapply(test_set$ymd, get_season))
+test_set$weekday <- as.factor(!(weekdays(test_set$ymd) %in% c("Saturday", "Sunday")))
+
+predictions <- predict(model, newdata = test_set)
+# train RMSE compares train predictions to train outcomes; test RMSE likewise
+t_rmse <- sqrt(mean((predict(model, newdata = trips_per_day) - trips_per_day$num_trips)^2))
+rmse <- sqrt(mean((predictions - test_set$num_trips)^2))
+t_rmse
+rmse
+```
\ No newline at end of file
diff --git a/week4/test_citibike_predictions.Rmd b/week4/test_citibike_predictions.Rmd
new file mode 100644
index 000000000..e69de29bb