diff --git a/week1/citibike.R b/week1/citibike.R
index ad01de1d3..7ef06e3c1 100644
--- a/week1/citibike.R
+++ b/week1/citibike.R
@@ -5,8 +5,13 @@ library(lubridate)
# READ AND TRANSFORM THE DATA
########################################
+
+
+
# read one month of data
-trips <- read_csv('201402-citibike-tripdata.csv')
+trips <- read_csv('./coursework/week1/201402-citibike-tripdata.csv')
+##viewing the data
+view(trips)
# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))
@@ -23,26 +28,88 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
########################################
# count the number of trips (= rows in the data frame)
+nrow(trips)
+
# find the earliest and latest birth years (see help for max and min to deal with NAs)
-# use filter and grepl to find all trips that either start or end on broadway
+#converting birth_year into numbers
+trips$birth_year <- as.numeric(trips$birth_year)
+max(trips$birth_year, na.rm = TRUE)
+min(trips$birth_year, na.rm = TRUE)
+# use filter and grepl to find all trips that either start or end on broadway
+filtered_df <- filter(trips, grepl("Broadway", start_station_name) | grepl("Broadway",end_station_name))
+view(filtered_df)
# do the same, but find all trips that both start and end on broadway
-
+filtered_both_df <- filter(trips, grepl("Broadway", start_station_name) & grepl("Broadway",end_station_name))
+nrow(filtered_both_df)
# find all unique station names
+# rejected approach: nrow(distinct(trips[ ,"start_station_name"])) + nrow(distinct(trips[ ,"end_station_name"])) double-counts stations that appear as both a start and an end
+
+union(trips$start_station_name, trips$end_station_name)
# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments
+grouped_df <- trips %>%
+ group_by(gender)%>%
+ summarize(num_trips = n(),
+ average_trip = mean(tripduration)/60,
+ standard_deviation = sd(tripduration)/60)
+grouped_df
+
+
+
# find the 10 most frequent station-to-station trips
+frequent_df <- trips %>%
+ group_by(start_station_name, end_station_name) %>%
+ summarize(num_trips = n()) %>%
+ arrange(desc(num_trips))
+
+head(frequent_df,10)
+
# find the top 3 end stations for trips starting from each start station
+trips %>%
+ group_by(start_station_name, end_station_name) %>%
+ summarize(num_trips = n()) %>%
+ group_by(start_station_name) %>%
+ arrange(desc(num_trips))%>%
+ slice(1:3)
+
+
+
+top3_end_stations
# find the top 3 most common station-to-station trips by gender
+trips %>%
+ group_by(start_station_name, end_station_name, gender) %>%
+ summarize(num_trips= n())%>%
+ group_by(gender)%>%
+ arrange(desc(num_trips))%>%
+ slice(1:3)
+
+
+
# find the day with the most trips
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
+trips %>%
+ mutate(date_only = as.Date(starttime)) %>%
+ count(date_only)%>%
+ arrange(desc(n)) %>%
+ head(1)
# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?
+view(trips)
+trips %>% mutate( hours = hour(starttime)) %>%
+ group_by(hours) %>% summarise(count = n(), day_in_month_count = days_in_month(starttime), avg = count / day_in_month_count)
+
+trips %>% mutate( hours = hour(starttime)) %>%
+ group_by(hours) %>% summarise(count = n()) %>% arrange(desc(count)) %>% slice(1)
+
+
+
+
diff --git a/week1/citibike.sh b/week1/citibike.sh
index 25604f545..6b44fffbf 100755
--- a/week1/citibike.sh
+++ b/week1/citibike.sh
@@ -4,20 +4,58 @@
#
# count the number of unique stations
+#$ cut -d , -f4 201402-citibike-tripdata.csv | sort | uniq | wc -l
+#330
+
# count the number of unique bikes
+#$ cut -d , -f12 201402-citibike-tripdata.csv | sort | uniq | wc -l
+#5700
# count the number of trips per day
+# $ cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2
+# 1 starttime
+# 876 2014-02-13
# find the day with the most rides
+ #$cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1
+ # 13816 2014-02-02
+
# find the day with the fewest rides
+#$cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2
# find the id of the bike with the most rides
+#$cut -d , -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n1
# count the number of rides by gender and birth year
+#$ cut -d, -f15,14 201402-citibike-tripdata.csv | sort | uniq -c | sort -r
# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)
-
+# $ cut -d, -f5 201402-citibike-tripdata.csv | tail -n +2 | grep -E '.*[0-9].*&.*[0-9].*' | wc -l
+# 90549
# compute the average trip duration
+# awk -F, '{sum += $1; count++} END {print sum/count}' 201402-citibike-tripdata.csv
+# 874.516
+
+
+#running average script for first 1000 lines
+# $ head -n 1000 201402-citibike-tripdata.csv | awk '{
+# window[NR % 3] = $1
+# if (NR >= 3) {
+# print (window[(NR-1)%3] + window[(NR-2)%3] + window[(NR-3)%3]) / 3
+# }
+# }'
+
+#MUSICAL CHAIR in python
+# import random
+
+# names = ["Alou", "Srijana", "Sara", "Drishya", "Dereck", "Ahmed",
+# "Aisha", "Vaishnavi", "Naomi", "Sofia", "Ye", "Vanessa"]
+
+# random.shuffle(names)
+
+# for i in range(0, 12, 2):
+# print(f"{names[i]} & {names[i+1]}")
+
diff --git a/week1/plot_trips.R b/week1/plot_trips.R
index 4f25437ba..eae1e1826 100644
--- a/week1/plot_trips.R
+++ b/week1/plot_trips.R
@@ -11,32 +11,98 @@ theme_set(theme_bw())
# load RData file output by load_trips.R
load('trips.RData')
-
+head(trips)
########################################
# plot trip data
-########################################
+######################################
# plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
+ggplot(trips,aes(tripduration/60))+
+ geom_histogram(fill= 'blue', bins = 50)+
+ scale_x_log10(labels=comma)+
+ scale_y_continuous(labels = comma)+
+ labs(x='Trip duration',
+ y= 'Frequency',
+ title = 'Histogram of trip duration in minutes')
+
+ggplot(trips,aes(tripduration/60))+
+ geom_density(fill= 'red')+
+ scale_x_log10(labels=comma)+
+ labs(x='Trip duration ',
+ y= 'Frequency',
+ title = 'Density plot in minutes')
+
+head(trips)
# plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)
+ggplot(trips, aes(x = tripduration, fill=usertype))+
+ geom_histogram(bins = 40)+
+ labs(x='Trip duration')+
+ scale_x_log10(labels=comma)+
+ facet_grid(~usertype)
+
+ggplot(trips, aes(x = tripduration, fill=usertype))+
+ geom_density()+
+ labs(x='Trip duration')+
+ scale_x_log10(labels=comma)+
+ facet_grid(~usertype)
# plot the total number of trips on each day in the dataset
+trips%>%
+ mutate(date = as.Date(starttime))%>%
+ ggplot(aes(x =date))+
+ geom_histogram(bins=30)+
+ scale_y_continuous(labels=comma)+
+ scale_x_date(labels = date_format("%Y-%m-%d"))
# plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
+trips%>%
+ mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>%
+ ggplot(aes(age, fill = gender))+
+ geom_histogram(bins=40,alpha = 0.8)+
+ scale_y_continuous(labels = comma)
# plot the ratio of male to female trips (on the y axis) by age (on the x axis)
# hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
# (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
+trips %>% mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>%
+ group_by(age, gender) %>% summarise(num_trips = n(), .groups = "drop")%>%
+ pivot_wider(names_from = gender, values_from = num_trips) %>% mutate(male_to_female = Male/ Female) %>%
+ ggplot( aes(age, male_to_female))+ geom_line(color = "steelblue", size = 1) +
+ geom_smooth( color = "red", linetype = "dashed") +
+ scale_x_log10()+
+ labs(x = "AGE", y = "Male to Female Ratio", title = "Male/Female trip ratio by Age") +
+ theme_minimal()
########################################
# plot weather data
########################################
# plot the minimum temperature (on the y axis) over each day (on the x axis)
+view(weather)
+weather%>%
+ ggplot(aes(x= ymd, y= tmin, color=tmin))+
+ geom_point()+
+ scale_color_gradient(low = "blue", high = "red") +
+ labs(
+ title="Scatterplot of min temp over each day",
+ x= "Day",
+ y= "Minimum temperature")
+
# plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis)
# hint: try using the pivot_longer() function for this to reshape things before plotting
# (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)
+view(weather)
+
+weather %>% pivot_longer(names_to = "temp_type", values_to = "temperature", cols = c(tmin,tmax)) %>%
+ ggplot(aes(ymd, temperature, color= temp_type))+ geom_line() + scale_x_date() + labs(
+ x = "Date",
+ y = "Temperature",
+ color = "Temperature Type",
+ title = "Daily Min and Max Temperatures")
+
+
########################################
# plot trip and weather data
@@ -45,18 +111,105 @@ load('trips.RData')
# join trips and weather
trips_with_weather <- inner_join(trips, weather, by="ymd")
+head(trips_with_weather)
+
# plot the number of trips as a function of the minimum temperature, where each point represents a day
# you'll need to summarize the trips and join to the weather data to do this
+str(weather)
+head(trips)
+trips_with_weather %>%
+ group_by(ymd,tmin)%>%
+ summarise(num_trips = n(), .groups = "drop")%>%
+ ggplot(aes(tmin, num_trips))+geom_point()
+
+#works only for this data frame since tmin is not different
+# trips_by_day <- trips %>%
+# mutate(date = as.Date(starttime)) %>%
+# group_by(date)%>%
+# summarise(num_trips = n())
+# weather %>%
+# mutate(date = as.Date(date))%>%
+# inner_join(trips_by_day,weather_by_day, by ='date') %>%
+# ggplot(aes(x= mean(tmin),y= num_trips))+
+# geom_point() +
+# labs(x = "Minimum Temperature", y = "Number of Trips",
+# title = "Trips vs. Min Temperature")
+
# repeat this, splitting results by whether there was substantial precipitation or not
# you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this
+trips_with_weather %>%
+ mutate(precptf = ifelse(prcp > mean(prcp), 'T', 'F'))%>%
+ group_by(ymd,tmin, precptf)%>%
+ summarise(num_trips = n(), .groups = "drop")%>%
+ ungroup()%>%
+ ggplot(aes(tmin, num_trips))+geom_point() +
+ labs( x = "number of trip",
+ y = "minimum temp",
+ title = "Substantial precipitation on num of trips") +
+ facet_wrap(~precptf)
+
+
# add a smoothed fit on top of the previous plot, using geom_smooth
+trips_with_weather %>%
+ mutate(precptf = ifelse(prcp > mean(prcp), 'T', 'F'))%>%
+ group_by(ymd,tmin, precptf)%>%
+ summarise(num_trips = n(), .groups = "drop")%>%
+ ungroup()%>%
+ ggplot(aes(tmin, num_trips))+geom_point() + geom_smooth()+
+ labs( x = "number of trip",
+ y = "minimum temp",
+ title = "Substantial precipitation on num of trips") +
+ facet_wrap(~precptf)
# compute the average number of trips and standard deviation in number of trips by hour of the day
# hint: use the hour() function from the lubridate package
+library(lubridate)
+
+trips_with_weather %>%
+ mutate(hour = hour(starttime),
+ date = as.Date(starttime))%>%
+ group_by(hour,date)%>%
+ summarise(num_trips = n(), .groups = "drop")%>%
+ group_by(hour)%>%
+ summarise(
+ average_trips = mean(num_trips),
+ sd_trips = sd(num_trips))
# plot the above
+trips_with_weather %>%
+ mutate(hour = hour(starttime),
+ date = as.Date(starttime))%>%
+ group_by(hour,date)%>%
+ summarise(num_trips = n(), .groups = "drop")%>%
+ group_by(hour)%>%
+ summarise(
+ average_trips = mean(num_trips),
+ sd_trips = sd(num_trips)) %>%
+ ggplot( aes(hour, average_trips))+
+ geom_line(color = "red") + geom_ribbon(aes(ymin = average_trips - sd_trips, ymax = average_trips + sd_trips), alpha = 0.25)+
+ labs(
+ x = "Hour of Day",
+ y = "Average Number of Trips",
+ title = "Average Number of Trips by Hour with ±1 SD Ribbon",
+ subtitle = "Red line: Mean trips per hour; Blue ribbon: ±1 standard deviation"
+ ) +
+ theme_minimal()
+
# repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
# hint: use the wday() function from the lubridate package
+trips %>%
+ mutate(
+ hour = hour(starttime),
+ day = as.Date(starttime),
+ weekday = wday(starttime, label =TRUE) )%>%
+ group_by(hour, weekday, day)%>%
+ summarise(trip_count = n(), .groups = "drop")%>%
+ group_by(hour,weekday)%>%
+ summarise(average = mean(trip_count),
+ standarddev = sd(trip_count),
+ .groups = "drop") %>%
+ ggplot(aes(hour, average)) + geom_line(color = "red")+ geom_ribbon(aes(ymin = average - standarddev, ymax = average + standarddev), alpha = 0.25)+
+ facet_wrap(~weekday)
diff --git a/week2/#DAY 5 Exercises.r b/week2/#DAY 5 Exercises.r
new file mode 100644
index 000000000..f92679f99
--- /dev/null
+++ b/week2/#DAY 5 Exercises.r
@@ -0,0 +1,53 @@
+#DAY 5 Exercises
+
+## Combining and reshaping exercises
+
+#12.2.1 Exercises
+# Compute the rate for table2, and table4a + table4b. You will need to perform four operations:
+
+# Extract the number of TB cases per country per year.
+# Extract the matching population per country per year.
+# Divide cases by population, and multiply by 10000.
+# Store back in the appropriate place.
+# Which representation is easiest to work with? Which is hardest? Why?
+
+
+
+
+
+
+# 12.3.3 Exercises
+
+# Why are pivot_longer() and pivot_wider() not perfectly symmetrical?
+# Carefully consider the following example:
+
+# stocks <- tibble(
+# year = c(2015, 2015, 2016, 2016),
+# half = c( 1, 2, 1, 2),
+# return = c(1.88, 0.59, 0.92, 0.17)
+# )
+# stocks %>%
+# pivot_wider(names_from = year, values_from = return) %>%
+# pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
+# (Hint: look at the variable types and think about column names.)
+
+# pivot_longer() has a names_ptypes argument, e.g. names_ptypes = list(year = double()). What does it do?
+
+
+
+
+
+
+
+
+#What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?
+
+people <- tribble(
+ ~name, ~names, ~values,
+ #-----------------|--------|------
+ "Phillip Woods", "age", 45,
+ "Phillip Woods", "height", 186,
+ "Phillip Woods", "age", 50,
+ "Jessica Cordero", "age", 37,
+ "Jessica Cordero", "height", 156
+)
\ No newline at end of file
diff --git a/week2/Week 2 exercises.R b/week2/Week 2 exercises.R
new file mode 100644
index 000000000..cf445ae0c
--- /dev/null
+++ b/week2/Week 2 exercises.R
@@ -0,0 +1,52 @@
+library(tidyverse)
+library(lubridate)
+
+magnets <- read.csv('magnets.csv')
+
+
+
+
+
+
+
+#Machine learning Lab: Linear Regression
+
+library(MASS)
+install.packages("ISLR2")
+library(ISLR2)
+head(Boston)
+str(Boston)
+
+lm.fit <- lm(medv ~ lstat, data = Boston)
+attach(Boston)
+lm.fit <- lm(medv~lstat)
+summary(lm.fit)
+names(lm.fit)
+coef(lm.fit)
+confint(lm.fit)
+predict(lm.fit, data.frame(lstat = (c(5,10,15))),
+ interval = "confidence")
+
+plot(lstat, medv)
+abline(lm.fit)
+abline(lm.fit, lwd = 3, col = "red")
+plot(lstat, medv, col = "red")
+plot(lstat, medv, pch = 20)
+plot(lstat, medv, pch = "+")
+plot(1:20, 1:20, pch = 1:20)
+
+par(mfrow = c(1,1))
+plot(lm.fit)
+plot(predict(lm.fit), residuals(lm.fit))
+plot(predict(lm.fit), rstudent(lm.fit))
+plot(hatvalues(lm.fit))
+which.max(hatvalues(lm.fit))
+
+
+#Multiple Linear regression
+lm.fit <- lm(medv~lstat + age, data = Boston)
+summary(lm.fit)
+
+#USE all variables to perform regression line
+lm.fit <- lm(medv ~ ., data = Boston)
+summary(lm.fit)
diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html
new file mode 100644
index 000000000..da3dab5fa
--- /dev/null
+++ b/week2/diamond-sizes.html
@@ -0,0 +1,408 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Diamond sizes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
We have data about 53940 diamonds. Only 126 are larger than 2.5
+carats. The distribution of the remainder is shown below:
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/week2/week2exs.RMD b/week2/week2exs.RMD
new file mode 100644
index 000000000..e48ba80c8
--- /dev/null
+++ b/week2/week2exs.RMD
@@ -0,0 +1,189 @@
+---
+title: "Week 2 exercises "
+date: 2025-05-06
+output: html_document
+---
+```{r }
+library(tidyverse)
+library(lubridate)
+
+magnets <- read.csv('magnets.csv')
+str(magnets)
+summary(magnets)
+```
+
+Day 3
+
+
+# Question 9.1.
+
+# 1. What is the sample average of the change in score between the patients
+# rating before the application of the device and the rating after the application?
+```{r }
+head(magnets)
+
+mean(magnets$change)
+```
+
+# 2. Is the variable active a factor or a numeric variable?
+# 11. Vallbona, Carlos, Carlton F. Hazlewood, and Gabor Jurida. (1997). Response of pain to
+# static magnetic fields in postpolio patients: A double-blind pilot study. Archives of Physical
+# and Rehabilitation Medicine 78(11): 1200-1203.
+
+The variable `active` is a factor.
+
+# 9.5. SOLVED EXERCISES
+# 153
+# 3. Compute the average value of the variable change for the patients that
+# received an active magnet and the average value for those that received an
+# inactive placebo. (Hint: Notice that the first 29 patients received an active
+# magnet and the last 21 patients received an inactive placebo. The
+# subsequence of the first 29 values of the given variables can be obtained via
+# the expression change[1:29] and the last 21 values are obtained via the
+# expression change[30:50] .)
+
+
+
+# 4. Compute the sample standard deviation of the variable change for the
+# patients that received an active magnet and the sample standard deviation
+# for those that received an inactive placebo.
+# 5. Produce a boxplot of the variable change for the patients that received
+# an active magnet and for patients that received an inactive placebo.
+# What is the number of outliers in each subsequence?
+
+
+
+
+# Question 10.1.
+ In Subsection 10.3.2 we compare the average against the mid-range
+ as estimators of the expectation of the measurement. The goal of this
+ exercise is to repeat the analysis, but this time compare the average to the
+ median as estimators of the expectation in symmetric distributions.
+
+ 1. Simulate the sampling distribution of the average and the median of a sample
+ of size n = 100 from the Normal(3,2) distribution. Compute the expectation
+ and the variance of the sample average and of the sample median.
+ Which of the two estimators has a smaller mean square error?
+```{r }
+
+pnorm(100,3,2)
+
+
+
+
+```
+
+ 2. Simulate the sampling distribution of the average and the median of a sample
+ of size n = 100 from the Uniform(0.5, 5.5) distribution. Compute the
+ expectation and the variance of the sample average and of the sample
+ median. Which of the two estimators has a smaller mean square error?
+
+
+
+
+
+
+# Question 10.2.
+The goal in this exercise is to assess estimation of a proportion
+ in a population on the basis of the proportion in the sample.
+ The file pop2.csv was introduced in Exercise 7.1 of Chapter 7. This file
+ contains information associated to the blood pressure of an imaginary
+ population of size 100,000. The file can be found on the internet (http://pluto.
+ huji.ac.il/~msby/StatThink/Datasets/pop2.csv). One of the variables in
+ the file is a factor by the name group that identifies levels of blood pressure.
+ The levels of this variable are HIGH , LOW , and NORMAL .
+ The file ex2.csv contains a sample of size n = 150 taken from the given
+ population. This file can also be found on the internet (http://pluto.huji.
+ ac.il/~msby/StatThink/Datasets/ex2.csv). It contains the same variables
+ as in the file pop2.csv . The file ex2.csv corresponds in this exercise to
+ the observed sample and the file pop2.csv corresponds to the unobserved
+ population.
+ Download both files to your computer and answer the following questions:
+ 1. Compute the proportion in the sample of those with a high level of blood
+ pressure.
+ 2. Compute the proportion in the population of those with a high level of
+ blood pressure.
+ 3. Simulate the sampling distribution of the sample proportion and compute
+ its expectation.
+ 4. Compute the variance of the sample proportion.
+ 5. It is proposed in Section 10.5 that the variance of the sample proportion
+ is Var(P) = p(1 - p)/n, where p is the probability of the event (having
+ high blood pressure in our case) and n is the sample size (n = 150 in our
+ case). Examine this proposal in the current setting.
+
+
+```{r exercise10.2}
+
+pop2 <- read.csv("pop2.csv")
+str(pop2)
+
+ex2 <- read.csv("ex2.csv")
+summary(ex2)
+str(ex2)
+#ex2.csv contains a sample of size n = 150 taken from the given population of pop2 100,000
+
+
+#1
+sum(ex2$group == "HIGH") / nrow(ex2)
+#2
+mean(pop2$group == "HIGH")
+#3
+
+
+```
+
+
+
+## Day 4
+
+# Questions 12.1.
+
+Consider a medical condition that does not have a standard
+treatment. The recommended design of a clinical trial for a new treatment
+ for such a condition involves using a placebo treatment as a control. A placebo
+ treatment is a treatment that externally looks identical to the actual treatment
+ but, in reality, it does not have the active ingredients. The reason for using
+ placebo for control is the placebo effect. Patients tend to react to the fact that
+ they are being treated regardless of the actual beneficial effect of the treatment.
+ As an example, consider the trial for testing magnets as a treatment for pain
+ that was described in Question 9.1. The patients that were randomly assigned
+ to the control (the last 21 observations in the file magnets.csv ) were treated
+ with devices that looked like magnets but actually were not. The goal in this
+ exercise is to test for the presence of a placebo effect in the case study Magnets
+ and Pain Relief of Question 9.1 using the data in the file magnets.csv .
+
+
+ 1. Let X be the measurement of change, the difference between the score of
+ pain before the treatment and the score after the treatment, for patients
+ that were treated with the inactive placebo. Express, in terms of the
+ expected value of X, the null hypothesis and the alternative hypothesis
+ for a statistical test to determine the presence of a placebo effect. The null
+ hypothesis should reflect the situation that the placebo effect is absent.
+
+
+ 2. Identify the observations that can be used in order to test the hypotheses.
+
+
+ 3. Carry out the test and report your conclusion. (Use a significance level of
+ 5%.)
+
+
+
+
+
+
+
+
+
+ ```{r day4 }
+
+magnets <- read.csv('magnets.csv')
+str(magnets)
+summary(magnets)
+
+
+
+
+
+
+ ```
\ No newline at end of file
diff --git a/week3/README.html b/week3/README.html
new file mode 100644
index 000000000..7fa99884d
--- /dev/null
+++ b/week3/README.html
@@ -0,0 +1,334 @@
+
+
+
+
+
+
+ MovieLens 10M/100k Data Set README
+
+
+
+ Summary
+
+
+ This data set contains 10000054 ratings and 95580 tags
+ applied to 10681 movies by 71567 users of the
+ online movie recommender service MovieLens.
+
+
+ Users were selected at random for inclusion. All users selected had rated
+ at least 20 movies. Unlike previous MovieLens data sets, no demographic
+ information is included. Each user is represented by an id, and no other
+ information is provided.
+
+
+
+ The data are contained in three files, movies.dat,
+ ratings.dat and tags.dat.
+ Also included are scripts for generating subsets of the data to support five-fold
+ cross-validation of rating predictions. More details about the contents and use
+ of all these files follows.
+
+
+
+ This and other GroupLens data sets are publicly available for download at
+ GroupLens Data Sets.
+
+
+ Usage License
+
+
+ Neither the University of Minnesota nor any of the researchers
+ involved can guarantee the correctness of the data, its suitability
+ for any particular purpose, or the validity of results based on the
+ use of the data set. The data set may be used for any research
+ purposes under the following conditions:
+
+
+ - The user may not state or imply any endorsement from the
+ University of Minnesota or the GroupLens Research Group.
+
+ - The user must acknowledge the use of the data set in
+ publications resulting from the use of the data set (see below
+ for citation information).
+
+ - The user may not redistribute the data without separate
+ permission.
+
+ - The user may not use this information for any commercial or
+ revenue-bearing purposes without first obtaining permission
+ from a faculty member of the GroupLens Research Project at the
+ University of Minnesota.
+
+
+ The executable software scripts are provided "as is" without warranty
+ of any kind, either expressed or implied, including, but not limited to,
+ the implied warranties of merchantability and fitness for a particular purpose.
+ The entire risk as to the quality and performance of them is with you.
+ Should the program prove defective, you assume the cost of all
+ necessary servicing, repair or correction.
+
+
+ In no event shall the University of Minnesota, its affiliates or employees
+ be liable to you for any damages arising out of the use or inability to use
+ these programs (including but not limited to loss of data or data being
+ rendered inaccurate).
+
+
+
+ If you have any further questions or comments, please email grouplens-info
+
+
+
+ Citation
+
+
+ To acknowledge use of the dataset in publications, please cite the
+ following paper:
+
+
+ F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
+ History and Context. ACM Transactions on Interactive Intelligent
+ Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
+ DOI=http://dx.doi.org/10.1145/2827872
+
+
+
+ Acknowledgements
+
+
+ Thanks to Rich Davies for generating the data set.
+
+
+
+ Further Information About GroupLens
+
+
+ GroupLens is a research group in the
+ Department of Computer Science and Engineering
+ at the University of Minnesota. Since its
+ inception in 1992, GroupLens' research projects have explored a variety of fields
+ including:
+
+
+ - Information Filtering
+ - Recommender Systems
+ - Online Communities
+ - Mobile and Ubiquitous Technologies
+ - Digital Libraries
+ - Local Geographic Information Systems.
+
+
+ GroupLens Research operates a movie recommender based on
+ collaborative filtering, MovieLens,
+ which is the source of these data.
+
+
+
+ Content and Use of Files
+
+
+
+ Character Encoding
+
+
+ The three data files are encoded as
+ UTF-8. This is a departure
+ from previous MovieLens data sets, which used different character encodings.
+ If accented characters in movie titles or tag values (e.g. Misérables, Les (1995))
+ display incorrectly, make sure that any program reading the data, such as a
+ text editor, terminal, or script, is configured for UTF-8.
+
+
+
+ User Ids
+
+
+ Movielens users were selected at random for inclusion. Their ids have been
+ anonymized.
+
+
+ Users were selected separately for inclusion
+ in the ratings and tags data sets, which implies that user ids may appear in
+ one set but not the other.
+
+
+ The anonymized values are consistent between the ratings and tags data files.
+ That is, user id n, if it appears in both files, refers to the same
+ real MovieLens user.
+
+
+
+ Ratings Data File Structure
+
+
+ All ratings are contained in the file ratings.dat. Each line of this
+ file represents one rating of one movie by one user, and has the following format:
+
+
+ UserID::MovieID::Rating::Timestamp
+
+
+ The lines within this file are ordered first by UserID, then, within user,
+ by MovieID.
+
+
+ Ratings are made on a 5-star scale, with half-star increments.
+
+
+ Timestamps represent
+ seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
+
+
+
+ Tags Data File Structure
+
+
+ All tags are contained in the file tags.dat. Each line of this
+ file represents one tag applied to one movie by one user, and has
+ the following format:
+
+
+ UserID::MovieID::Tag::Timestamp
+
+
+ The lines within this file are ordered first by UserID, then, within user,
+ by MovieID.
+
+
+ Tags are user
+ generated metadata about movies. Each tag is typically a single word, or
+ short phrase. The meaning, value and purpose of a particular tag is
+ determined by each user.
+
+
+ Timestamps represent
+ seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
+
+
+
+ Movies Data File Structure
+
+
+ Movie information is contained in the file movies.dat.
+ Each line of this file represents one movie, and has the following format:
+
+
+ MovieID::Title::Genres
+
+
+ MovieID is the real MovieLens id.
+
+
+ Movie titles, by policy, should be entered identically to those
+ found in IMDB, including year of release.
+ However, they are entered manually, so errors and inconsistencies may exist.
+
+
+ Genres are a pipe-separated list, and are selected from the following:
+
+
+ - Action
+ - Adventure
+ - Animation
+ - Children's
+ - Comedy
+ - Crime
+ - Documentary
+ - Drama
+ - Fantasy
+ - Film-Noir
+ - Horror
+ - Musical
+ - Mystery
+ - Romance
+ - Sci-Fi
+ - Thriller
+ - War
+ - Western
+
+
+
+ Cross-Validation Subset Generation Scripts
+
+
+ A Unix shell script, split_ratings.sh, is provided that, if desired,
+ can be used to split the ratings data for five-fold cross-validation
+ of rating predictions. It depends on a second script, allbut.pl, which
+ is also included and is written in Perl. They should run without modification
+ under Linux, Mac OS X, Cygwin or other Unix like systems.
+
+
+ Running split_ratings.sh will use ratings.dat
+ as input, and produce the fourteen output files described below. Multiple
+ runs of the script will produce identical results.
+
+
+
+ | File Names |
+ Description |
+
+
+
+ r1.train, r2.train, r3.train, r4.train, r5.train
+ r1.test, r2.test, r3.test, r4.test, r5.test
+ |
+
+ The data sets r1.train and r1.test through r5.train and r5.test
+ are 80%/20% splits of the ratings data into training and test data.
+ Each of r1, ..., r5 have disjoint test sets; this is for
+ 5 fold cross validation (where you repeat your experiment
+ with each training and test set and average the results).
+ |
+
+
+
+ ra.train, rb.train
+ ra.test, rb.test
+ |
+
+ The data sets ra.train, ra.test, rb.train, and rb.test
+ split the ratings data into a training set and a test set with
+ exactly 10 ratings per user in the test set. The sets
+ ra.test and rb.test are disjoint.
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/week3/allbut.pl b/week3/allbut.pl
new file mode 100644
index 000000000..a22afad08
--- /dev/null
+++ b/week3/allbut.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/env perl
+
+# get args
+if (@ARGV < 3) {
+ print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n";
+ exit 1;
+}
+$basename = shift;
+$start = shift;
+$stop = shift;
+$maxtest = shift;
+
+# open files
+open( TESTFILE, ">$basename.test" ) or die "Cannot open $basename.test for writing\n";
+open( BASEFILE, ">$basename.train" ) or die "Cannot open $basename.train for writing\n";
+
+# init variables
+$testcnt = 0;
+
+while (<>) {
+ ($user) = split /::/, $_, 2;
+ if (! defined $ratingcnt{$user}) {
+ $ratingcnt{$user} = 1;
+ } else {
+ ++$ratingcnt{$user};
+ }
+ if (($testcnt < $maxtest || $maxtest <= 0)
+ && $ratingcnt{$user} >= $start && $ratingcnt{$user} <= $stop) {
+ ++$testcnt;
+ print TESTFILE;
+ }
+ else {
+ print BASEFILE;
+ }
+}
diff --git a/week3/ngrams/01_download_1grams.sh b/week3/ngrams/01_download_1grams.sh
index 1d6d5bf10..1ed69bf1c 100644
--- a/week3/ngrams/01_download_1grams.sh
+++ b/week3/ngrams/01_download_1grams.sh
@@ -2,6 +2,8 @@
# use curl or wget to download the version 2 1gram file with all terms starting with "1", googlebooks-eng-all-1gram-20120701-1.gz
+curl -o googlebooks-eng-all-1gram-20120701-1.gz http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-1.gz
+
# update the timestamp on the resulting file using touch
# do not remove, this will keep make happy and avoid re-downloading of the data once you have it
touch googlebooks-eng-all-1gram-20120701-1.gz
diff --git a/week3/ngrams/02_filter_1grams.sh b/week3/ngrams/02_filter_1grams.sh
index 3b8e9ec29..973564385 100644
--- a/week3/ngrams/02_filter_1grams.sh
+++ b/week3/ngrams/02_filter_1grams.sh
@@ -4,3 +4,6 @@
# decompress the first using gunzip, zless, zcat or similar
# then filter out rows that match using grep -E, egrep, awk, or similar
# write results to year_counts.tsv
+
+
+zcat googlebooks-eng-all-1gram-20120701-1.gz | grep -P '\t(18|19|20)[0-9]{2}\b' > year_counts.tsv
\ No newline at end of file
diff --git a/week3/ngrams/03_download_totals.sh b/week3/ngrams/03_download_totals.sh
index f53381e8e..902ced55f 100644
--- a/week3/ngrams/03_download_totals.sh
+++ b/week3/ngrams/03_download_totals.sh
@@ -2,6 +2,7 @@
# use curl or wget to download the version 2 of the total counts file, googlebooks-eng-all-totalcounts-20120701.txt
+curl -o googlebooks-eng-all-totalcounts-20120701.txt http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-totalcounts-20120701.txt
# update the timestamp on the resulting file using touch
# do not remove, this will keep make happy and avoid re-downloading of the data once you have it
touch googlebooks-eng-all-totalcounts-20120701.txt
diff --git a/week3/ngrams/04_reformat_totals.sh b/week3/ngrams/04_reformat_totals.sh
index 0445c1ff6..1af2b4a10 100644
--- a/week3/ngrams/04_reformat_totals.sh
+++ b/week3/ngrams/04_reformat_totals.sh
@@ -3,3 +3,5 @@
# reformat total counts in googlebooks-eng-all-totalcounts-20120701.txt to a valid csv
# use tr, awk, or sed to convert tabs to newlines
# write results to total_counts.csv
+
+tr '\t' '\n' < googlebooks-eng-all-totalcounts-20120701.txt > total_counts.csv
\ No newline at end of file
diff --git a/week3/ngrams/05_final_report.Rmd b/week3/ngrams/05_final_report.Rmd
index a7c90c1fb..0ab1495c5 100644
--- a/week3/ngrams/05_final_report.Rmd
+++ b/week3/ngrams/05_final_report.Rmd
@@ -12,6 +12,7 @@ output:
---
```{r setup, include=FALSE}
+setwd("C:/Users/ds3/Documents/ds3_final/coursework/week3/ngrams")
library(here)
library(scales)
library(tidyverse)
@@ -19,6 +20,7 @@ library(tidyverse)
theme_set(theme_bw())
knitr::opts_chunk$set(echo = TRUE)
+getwd()
```
# Description
@@ -49,11 +51,16 @@ Then edit the `03_download_totals.sh` file to down the `googlebooks-eng-all-tota
## Load the cleaned data
-Load in the `year_counts.tsv` and `total_counts.csv` files. Use the `here()` function around the filename to keep things portable.Give the columns of `year_counts.tsv` the names `term`, `year`, `volume`, and `book_count`. Give the columns of `total_counts.csv` the names `year`, `total_volume`, `page_count`, and `book_count`. Note that column order in these files may not match the examples in the documentation.
+Load in the `year_counts.tsv` and `total_counts.csv` files. Use the `here()` function around the filename to keep things portable. Give the columns of `year_counts.tsv` the names `term`, `year`, `volume`, and `book_count`.
+ Give the columns of `total_counts.csv` the names `year`, `total_volume`, `page_count`, and `book_count`. Note that column order in these files may not match the examples in the documentation.
```{r load-counts}
+years_counts <- read_tsv("year_counts.tsv", col_names = c('term', 'year', 'volume', 'book_count'))
+total_counts <- read_csv("total_counts.csv", col_names = c('year', 'total_volume', 'page_count', 'book_count'))
+summary(years_counts)
+summary(total_counts)
```
## Your written answer
@@ -69,8 +76,13 @@ Add a line below using Rmarkdown's inline syntax to print the total number of li
Join the raw year term counts with the total counts and divide to get a proportion of mentions for each term normalized by the total counts for each year.
```{r join-years-and-totals}
+joined_years_total <- years_counts %>% left_join(total_counts, by= "year")
+filter(joined_years_total, term == "1883" | term == "1910" | term == "1950")
+head(joined_years_total)
+view(head(years_counts,500))
+head(total_counts)
```
## Plot the main figure 3a
@@ -79,6 +91,17 @@ Plot the proportion of mentions for the terms "1883", "1910", and "1950" over ti
```{r plot-proportion-over-time}
+
+joined_w_proportion <- joined_years_total %>% mutate(proportion = (volume / total_volume)*10000)
+
+str(joined_w_proportion)
+
+joined_w_proportion %>% filter(term == "1883" | term == "1910" | term == "1950") %>% filter(year >= 1850 & year <= 2012) %>%
+ ggplot(aes(year, proportion, color = term)) + geom_line(size =1) + scale_y_continuous(labels = percent) +
+ labs(
+ x= "Year",
+ y= "Frequency"
+ )
```
## Your written answer
@@ -96,6 +119,8 @@ Go to the ngram viewer, enter the terms "1883", "1910", and "1950" and take a sc
## Your written answer
Add your screenshot for Part C below this line using the `` syntax and comment on similarities / differences.
+``
+
# Part D
@@ -107,7 +132,12 @@ Add your screenshot for Part C below this line using the ` %>% filter(year >= 1850 & year <= 2012) %>%
+ ggplot(aes(year, volume, color = term)) + geom_line(size =1) + scale_y_continuous(labels = comma) +
+ labs(
+ x= "Year",
+ y= "Frequency"
+ )
```
# Part E
@@ -116,11 +146,19 @@ Plot the raw counts for the terms "1883", "1910", and "1950" over time from 1850
As part of answering this question, make an additional plot.
+Yes, there is a major difference between the two because the raw count does not go down.
+
## Plot the totals
Plot the total counts for each year over time, from 1850 to 2012. Use the `comma` function from the `scales` package for a readable y axis. There should be only one line on this plot (not three).
```{r plot-totals}
+joined_w_proportion %>% filter(term == "1883" | term == "1910" | term == "1950") %>% filter(year >= 1850 & year <= 2012) %>%
+ ggplot(aes(year, total_volume)) + geom_line(size =1) + scale_y_continuous(labels = comma) +
+ labs(
+ x= "Year",
+ y= "Frequency"
+ )
```
@@ -134,9 +172,14 @@ Write up your answer to Part E here.
## Compute peak mentions
+
For each year term, find the year where its proportion of mentions peaks (hits its highest value). Store this in an intermediate dataframe.
```{r compute-peaks}
+peak_dataframe <- joined_w_proportion %>%
+ filter(term == "1883" | term == "1910" | term == "1950") %>%
+ filter(year >= 1850 & year <= 2012) %>%
+
```
diff --git a/week3/ngrams/plot.png b/week3/ngrams/plot.png
new file mode 100644
index 000000000..222565159
Binary files /dev/null and b/week3/ngrams/plot.png differ
diff --git a/week3/split_ratings.sh b/week3/split_ratings.sh
new file mode 100644
index 000000000..34af4a9b4
--- /dev/null
+++ b/week3/split_ratings.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+RATINGS_COUNT=`wc -l ratings.dat | xargs | cut -d ' ' -f 1`
+echo "ratings count: $RATINGS_COUNT"
+SET_SIZE=`expr $RATINGS_COUNT / 5`
+echo "set size: $SET_SIZE"
+REMAINDER=`expr $RATINGS_COUNT % 5`
+echo "remainder: $REMAINDER"
+
+for i in 1 2 3 4 5
+ do
+ head -`expr $i \* $SET_SIZE` ratings.dat | tail -$SET_SIZE > r$i.test
+
+ # XXX: OSX users will see the message "head: illegal line count -- 0" here,
+ # but this is just a warning; the script still works as intended.
+ head -`expr \( $i - 1 \) \* $SET_SIZE` ratings.dat > r$i.train
+ tail -`expr \( 5 - $i \) \* $SET_SIZE` ratings.dat >> r$i.train
+
+ if [ $i -eq 5 ]; then
+ tail -$REMAINDER ratings.dat >> r5.test
+ else
+ tail -$REMAINDER ratings.dat >> r$i.train
+ fi
+
+ echo "r$i.test created. `wc -l r$i.test | xargs | cut -d " " -f 1` lines."
+ echo "r$i.train created. `wc -l r$i.train | xargs | cut -d " " -f 1` lines."
+done
+
+./allbut.pl ra 1 10 0 ratings.dat
+echo "ra.test created. `wc -l ra.test | xargs | cut -d " " -f 1` lines."
+echo "ra.train created. `wc -l ra.train | xargs | cut -d " " -f 1` lines."
+
+./allbut.pl rb 11 20 0 ratings.dat
+echo "rb.test created. `wc -l rb.test | xargs | cut -d " " -f 1` lines."
+echo "rb.train created. `wc -l rb.train | xargs | cut -d " " -f 1` lines."
+
diff --git a/week3/trips_per_day.tsv b/week3/trips_per_day.tsv
deleted file mode 100644
index 41a5887df..000000000
--- a/week3/trips_per_day.tsv
+++ /dev/null
@@ -1,366 +0,0 @@
-ymd num_trips date prcp snwd snow tmax tmin
-2014-01-01 6059 20140101 0 0 0 3.3 2.4
-2014-01-02 8600 20140102 0.33 0 3.1 3.3 1.8
-2014-01-03 1144 20140103 0.29 5.9 3.3 1.8 0.9
-2014-01-04 2292 20140104 0 5.9 0 2.9 0.8
-2014-01-05 2678 20140105 0.14 3.9 0 4 2.7
-2014-01-06 9510 20140106 0.36 1.2 0 5.5 1.9
-2014-01-07 6267 20140107 0 0 0 1.9 0.4
-2014-01-08 9246 20140108 0 0 0 2.2 0.9
-2014-01-09 13354 20140109 0 0 0 3.2 2.2
-2014-01-10 9847 20140110 0.11 0 0 3.7 3
-2014-01-11 7695 20140111 0.5 0 0 5.8 3.7
-2014-01-12 12515 20140112 0.05 0 0 5.4 3.8
-2014-01-13 20633 20140113 0 0 0 5.1 3.7
-2014-01-14 9999 20140114 0.38 0 0 5.2 4.4
-2014-01-15 21630 20140115 0 0 0 4.7 3.9
-2014-01-16 19953 20140116 0 0 0 4.2 3.6
-2014-01-17 20301 20140117 0 0 0 4.4 3.3
-2014-01-18 9438 20140118 0.07 0 0 4.1 2.9
-2014-01-19 9028 20140119 0 0 0 3.8 2.4
-2014-01-20 13407 20140120 0 0 0 4.6 3.1
-2014-01-21 3804 20140121 0.46 0 11 3.1 1.1
-2014-01-22 2451 20140122 0.02 11 0.5 1.7 0.5
-2014-01-23 5071 20140123 0 7.1 0 2 0.7
-2014-01-24 6176 20140124 0 5.9 0 2 1
-2014-01-25 4338 20140125 0.04 3.9 1 2.8 1.9
-2014-01-26 4980 20140126 0 3.9 0 3.4 1.7
-2014-01-27 13119 20140127 0 3.1 0 4.4 2.1
-2014-01-28 10033 20140128 0 1.2 0 2.1 1.2
-2014-01-29 10021 20140129 0.04 2 0.8 2.3 1.4
-2014-01-30 12158 20140130 0 1 0 3 1.6
-2014-01-31 14653 20140131 0 1.2 0 3.9 2.5
-2014-02-01 12771 20140201 0 1.2 0 4.5 3.6
-2014-02-02 13816 20140202 0 1.2 0 5.6 3.9
-2014-02-03 2600 20140203 1.17 1.2 8 4.3 2.7
-2014-02-04 8709 20140204 0 7.9 0 3.5 2.2
-2014-02-05 2746 20140205 1.43 9.1 4 3.4 2.9
-2014-02-06 7196 20140206 0 9.1 0 3.2 2.1
-2014-02-07 8495 20140207 0 9.1 0 3.2 2.4
-2014-02-08 5986 20140208 0 9.1 0 2.9 2.1
-2014-02-09 4996 20140209 0.1 7.9 1.2 3.1 2.1
-2014-02-10 6846 20140210 0 9.8 0 2.9 2.1
-2014-02-11 8343 20140211 0 9.8 0 2.6 1.6
-2014-02-12 8580 20140212 0 9.8 0 2.5 1.3
-2014-02-13 876 20140213 1.78 11.8 9.5 3.6 2.4
-2014-02-14 3609 20140214 0.3 18.1 3 4 3.1
-2014-02-15 2261 20140215 0.13 18.1 1.6 3.7 2.7
-2014-02-16 3003 20140216 0 18.1 0 3 2.1
-2014-02-17 4854 20140217 0 16.9 0 3.2 1.8
-2014-02-18 5140 20140218 0.16 18.1 1.5 3.9 2.6
-2014-02-19 8506 20140219 0.26 16.9 0 4.5 3.4
-2014-02-20 11792 20140220 0.03 15 0 5.1 3.7
-2014-02-21 8680 20140221 0.09 13 0 4.9 3.6
-2014-02-22 13044 20140222 0 9.8 0 5.4 4
-2014-02-23 13324 20140223 0 9.1 0 5.4 4.3
-2014-02-24 12922 20140224 0 5.9 0 4.4 2.7
-2014-02-25 12830 20140225 0 5.9 0 3.3 2.4
-2014-02-26 11188 20140226 0.03 5.9 0.2 3.1 2
-2014-02-27 12036 20140227 0 5.9 0 3.4 1.4
-2014-02-28 9587 20140228 0 5.9 0 2.4 0.9
-2014-03-01 9202 20140301 0 5.9 0 3.7 2
-2014-03-02 8195 20140302 0 5.1 0 4 3.2
-2014-03-03 7708 20140303 0.04 5.1 0.1 3.2 1.7
-2014-03-04 10398 20140304 0 5.1 0 2.9 1.3
-2014-03-05 13801 20140305 0 3.9 0 3.9 2.6
-2014-03-06 12773 20140306 0 3.9 0 3.1 1.6
-2014-03-07 13304 20140307 0 3.9 0 3.7 2.6
-2014-03-08 16350 20140308 0 3.1 0 5.7 3.5
-2014-03-09 12262 20140309 0 2 0 4.4 3.6
-2014-03-10 17231 20140310 0 1.2 0 5.1 3.6
-2014-03-11 24356 20140311 0 0 0 6.6 4.5
-2014-03-12 15101 20140312 0.35 0 0 5.6 3.2
-2014-03-13 10979 20140313 0 0 0 3.2 1.8
-2014-03-14 16299 20140314 0 0 0 4.6 2.2
-2014-03-15 18053 20140315 0 0 0 5.8 4.2
-2014-03-16 9851 20140316 0 0 0 4.2 3
-2014-03-17 12503 20140317 0 0 0 3.5 2.3
-2014-03-18 16679 20140318 0 0 0 4.3 2.8
-2014-03-19 13341 20140319 0.92 0 0 4.6 3.2
-2014-03-20 19695 20140320 0 0 0 5.4 4
-2014-03-21 20718 20140321 0 0 0 5.1 3.9
-2014-03-22 19880 20140322 0 0 0 6.3 4.1
-2014-03-23 11740 20140323 0 0 0 4.2 2.7
-2014-03-24 13598 20140324 0 0 0 3.5 2.1
-2014-03-25 16037 20140325 0 0 0 3.9 2.6
-2014-03-26 14500 20140326 0 0 0 3.6 2.4
-2014-03-27 16666 20140327 0 0 0 4.4 2.2
-2014-03-28 18425 20140328 0.04 0 0 6.2 3.7
-2014-03-29 4480 20140329 1.81 0 0 5.9 4.3
-2014-03-30 7907 20140330 0.35 0 0 4.9 3.9
-2014-03-31 17085 20140331 0.16 0 0 5.6 3.6
-2014-04-01 23908 20140401 0 0 0 6 3.9
-2014-04-02 22515 20140402 0 0 0 5.4 4.2
-2014-04-03 26321 20140403 0.07 0 0 6.7 4.6
-2014-04-04 12566 20140404 0.21 0 0 4.7 4
-2014-04-05 18655 20140405 0 0 0 5.4 4
-2014-04-06 20750 20140406 0 0 0 6.1 3.6
-2014-04-07 14677 20140407 0.52 0 0 5.3 4.3
-2014-04-08 21884 20140408 0.34 0 0 6.4 4.6
-2014-04-09 26931 20140409 0 0 0 6.1 4.5
-2014-04-10 27463 20140410 0 0 0 5.8 4.2
-2014-04-11 26798 20140411 0 0 0 7.5 5.6
-2014-04-12 27108 20140412 0 0 0 7.3 5
-2014-04-13 26871 20140413 0 0 0 7.7 5.4
-2014-04-14 28832 20140414 0 0 0 7.5 5.9
-2014-04-15 10523 20140415 0.71 0 0 6.3 3.3
-2014-04-16 20082 20140416 0.05 0 0 4.9 3.1
-2014-04-17 22282 20140417 0 0 0 4.8 3.6
-2014-04-18 20770 20140418 0 0 0 4.9 3.5
-2014-04-19 24369 20140419 0 0 0 6.8 4.1
-2014-04-20 18464 20140420 0 0 0 6 4.5
-2014-04-21 27124 20140421 0 0 0 6.7 4.1
-2014-04-22 27259 20140422 0.01 0 0 7.1 5.1
-2014-04-23 26949 20140423 0 0 0 6.1 4.4
-2014-04-24 28107 20140424 0 0 0 6.2 4.2
-2014-04-25 27919 20140425 0.02 0 0 6.3 4.4
-2014-04-26 19899 20140426 0.92 0 0 6.7 4.6
-2014-04-27 21926 20140427 0 0 0 5.9 4.7
-2014-04-28 28557 20140428 0 0 0 6.7 4.5
-2014-04-29 18404 20140429 0.03 0 0 5.2 4.3
-2014-04-30 2867 20140430 4.97 0 0 5.2 4.1
-2014-05-01 26762 20140501 0.12 0 0 7.8 5.1
-2014-05-02 32377 20140502 0 0 0 7 5.6
-2014-05-03 25501 20140503 0.08 0 0 7.1 5.4
-2014-05-04 21277 20140504 0.02 0 0 6.5 5.4
-2014-05-05 30288 20140505 0 0 0 7 5
-2014-05-06 32233 20140506 0 0 0 7.1 5.2
-2014-05-07 32882 20140507 0 0 0 6.9 5
-2014-05-08 17619 20140508 0.41 0 0 5.9 5.3
-2014-05-09 22641 20140509 0.04 0 0 6.3 5.5
-2014-05-10 21832 20140510 0.37 0 0 8.3 5.7
-2014-05-11 28326 20140511 0.03 0 0 8.2 6
-2014-05-12 31644 20140512 0 0 0 8.5 6.4
-2014-05-13 32847 20140513 0 0 0 7.3 5.3
-2014-05-14 30044 20140514 0.01 0 0 7.1 5.2
-2014-05-15 26057 20140515 0.15 0 0 7.1 5.8
-2014-05-16 17784 20140516 1.54 0 0 6.8 5.8
-2014-05-17 29912 20140517 0 0 0 7 5.3
-2014-05-18 27436 20140518 0 0 0 6.7 5.1
-2014-05-19 31692 20140519 0 0 0 7.2 4.9
-2014-05-20 35567 20140520 0 0 0 7.8 5.3
-2014-05-21 34420 20140521 0 0 0 7.4 6.3
-2014-05-22 24535 20140522 0.24 0 0 6.7 5.9
-2014-05-23 24484 20140523 0.91 0 0 7.1 5.6
-2014-05-24 16798 20140524 0.4 0 0 7 5.7
-2014-05-25 25637 20140525 0 0 0 8 5.5
-2014-05-26 26930 20140526 0 0 0 8.6 6.6
-2014-05-27 32936 20140527 0 0 0 8.6 6.4
-2014-05-28 29690 20140528 0 0 0 6.4 5.4
-2014-05-29 33889 20140529 0 0 0 6.6 5.1
-2014-05-30 34343 20140530 0 0 0 7.5 5.5
-2014-05-31 27734 20140531 0.05 0 0 7.3 5.7
-2014-06-01 31333 20140601 0 0 0 7.7 5.5
-2014-06-02 34519 20140602 0 0 0 8 5.9
-2014-06-03 29501 20140603 0.12 0 0 8.7 6.5
-2014-06-04 36068 20140604 0 0 0 8.2 6.3
-2014-06-05 24619 20140605 0.87 0 0 7.6 6.1
-2014-06-06 37719 20140606 0 0 0 7.6 6.1
-2014-06-07 31204 20140607 0 0 0 8.2 6
-2014-06-08 30162 20140608 0 0 0 8.6 6.6
-2014-06-09 18611 20140609 1.6 0 0 7.3 6.3
-2014-06-10 33435 20140610 0 0 0 7.7 6.5
-2014-06-11 30548 20140611 0.02 0 0 7 6.1
-2014-06-12 29084 20140612 0.07 0 0 7.3 6
-2014-06-13 18308 20140613 1.28 0 0 7.9 6.5
-2014-06-14 28417 20140614 0 0 0 7.4 6
-2014-06-15 28382 20140615 0 0 0 8 5.9
-2014-06-16 34932 20140616 0 0 0 8.1 6.3
-2014-06-17 35097 20140617 0 0 0 8.9 7.1
-2014-06-18 35531 20140618 0 0 0 8.9 7.6
-2014-06-19 31724 20140619 0.15 0 0 7.7 6.8
-2014-06-20 34405 20140620 0 0 0 7.9 6.4
-2014-06-21 30288 20140621 0 0 0 7.8 6.2
-2014-06-22 29417 20140622 0 0 0 7.9 6.4
-2014-06-23 32580 20140623 0 0 0 8.1 6.5
-2014-06-24 35897 20140624 0 0 0 8.1 6.8
-2014-06-25 34340 20140625 0.08 0 0 8.5 7
-2014-06-26 34846 20140626 0.07 0 0 8.5 7
-2014-06-27 35823 20140627 0 0 0 8.3 6.7
-2014-06-28 28507 20140628 0 0 0 8.7 6.6
-2014-06-29 27986 20140629 0 0 0 8.3 6.8
-2014-06-30 33597 20140630 0 0 0 8.4 6.9
-2014-07-01 34854 20140701 0 0 0 8.9 7.2
-2014-07-02 26582 20140702 0.96 0 0 9.1 7.2
-2014-07-03 27587 20140703 1.78 0 0 8.7 6.9
-2014-07-04 13612 20140704 0.14 0 0 7.4 6.5
-2014-07-05 22913 20140705 0 0 0 8.1 6.3
-2014-07-06 23822 20140706 0 0 0 8.4 6.6
-2014-07-07 31863 20140707 0.04 0 0 9 7.2
-2014-07-08 32713 20140708 0.39 0 0 9.1 7.1
-2014-07-09 34426 20140709 0.09 0 0 8.8 7.1
-2014-07-10 36288 20140710 0 0 0 8.3 7.2
-2014-07-11 35000 20140711 0 0 0 8.6 7.1
-2014-07-12 28718 20140712 0 0 0 8.5 7.1
-2014-07-13 24869 20140713 0.03 0 0 8.3 7.2
-2014-07-14 25825 20140714 0.46 0 0 8.4 7.2
-2014-07-15 22963 20140715 1.3 0 0 8.6 7.2
-2014-07-16 35391 20140716 0 0 0 8.1 6.8
-2014-07-17 38619 20140717 0 0 0 8.1 6.7
-2014-07-18 36105 20140718 0 0 0 8.1 6.4
-2014-07-19 29291 20140719 0 0 0 7.6 6.8
-2014-07-20 30296 20140720 0 0 0 8 6.6
-2014-07-21 35670 20140721 0 0 0 8.5 6.7
-2014-07-22 36886 20140722 0 0 0 8.6 7.1
-2014-07-23 34173 20140723 0.19 0 0 8.8 7.2
-2014-07-24 36693 20140724 0 0 0 8 7
-2014-07-25 36384 20140725 0 0 0 8.2 6.6
-2014-07-26 27179 20140726 0 0 0 8.1 6.9
-2014-07-27 27004 20140727 0.02 0 0 8.5 7.1
-2014-07-28 33189 20140728 0.19 0 0 8.2 6.8
-2014-07-29 37092 20140729 0 0 0 7.6 6.4
-2014-07-30 37377 20140730 0 0 0 8 6.3
-2014-07-31 35458 20140731 0 0 0 8.2 6.8
-2014-08-01 32654 20140801 0 0 0 8.4 7.1
-2014-08-02 26784 20140802 0.41 0 0 7.4 6.3
-2014-08-03 25276 20140803 0.07 0 0 7.6 6.6
-2014-08-04 33822 20140804 0 0 0 8.4 7
-2014-08-05 34392 20140805 0 0 0 9 7.1
-2014-08-06 36336 20140806 0 0 0 8.3 7
-2014-08-07 36362 20140807 0 0 0 8.3 6.6
-2014-08-08 34073 20140808 0 0 0 8.3 6.5
-2014-08-09 31636 20140809 0 0 0 8.7 6.6
-2014-08-10 26749 20140810 0 0 0 8.8 6.8
-2014-08-11 33664 20140811 0 0 0 8.7 7.1
-2014-08-12 26261 20140812 0.19 0 0 7.9 7
-2014-08-13 27977 20140813 0.53 0 0 8.2 6.8
-2014-08-14 35154 20140814 0 0 0 7.7 6.3
-2014-08-15 32480 20140815 0 0 0 7.3 6.1
-2014-08-16 32017 20140816 0 0 0 7.8 6.3
-2014-08-17 27417 20140817 0.01 0 0 8.2 6.6
-2014-08-18 34339 20140818 0 0 0 8.1 6.3
-2014-08-19 35367 20140819 0 0 0 8.3 6.3
-2014-08-20 35532 20140820 0 0 0 8.4 7
-2014-08-21 33804 20140821 0.35 0 0 8.3 6.5
-2014-08-22 30406 20140822 0.06 0 0 7.9 6.5
-2014-08-23 26429 20140823 0.01 0 0 7.7 6.7
-2014-08-24 27493 20140824 0 0 0 8 6.4
-2014-08-25 32892 20140825 0 0 0 8.8 6.4
-2014-08-26 34519 20140826 0 0 0 8.9 7
-2014-08-27 33721 20140827 0 0 0 9 7
-2014-08-28 33927 20140828 0 0 0 8.2 6.6
-2014-08-29 31264 20140829 0 0 0 8 6.1
-2014-08-30 22689 20140830 0 0 0 8 6.5
-2014-08-31 18053 20140831 0.62 0 0 9 7.3
-2014-09-01 20725 20140901 0 0 0 8.8 7.5
-2014-09-02 29657 20140902 0 0 0 9.2 7.7
-2014-09-03 34843 20140903 0 0 0 8.6 7.2
-2014-09-04 36392 20140904 0 0 0 8.7 6.9
-2014-09-05 35579 20140905 0 0 0 8.7 7.2
-2014-09-06 26808 20140906 0.11 0 0 9.1 6.7
-2014-09-07 28901 20140907 0 0 0 8.1 6.5
-2014-09-08 33979 20140908 0 0 0 7.5 6.5
-2014-09-09 34166 20140909 0 0 0 7.3 6.3
-2014-09-10 37418 20140910 0 0 0 8 6.3
-2014-09-11 36668 20140911 0 0 0 8.3 6.9
-2014-09-12 38481 20140912 0 0 0 7.8 6.2
-2014-09-13 19499 20140913 0.26 0 0 6.9 5.8
-2014-09-14 27187 20140914 0 0 0 7.1 5.3
-2014-09-15 34258 20140915 0 0 0 7.1 5.5
-2014-09-16 25579 20140916 0.37 0 0 7 5.8
-2014-09-17 36791 20140917 0 0 0 7.3 5.5
-2014-09-18 37300 20140918 0 0 0 7.6 5.7
-2014-09-19 35674 20140919 0 0 0 6.6 5.4
-2014-09-20 29999 20140920 0 0 0 7.5 5.7
-2014-09-21 26650 20140921 0.15 0 0 7.5 6.7
-2014-09-22 32937 20140922 0 0 0 7.1 5.5
-2014-09-23 35599 20140923 0 0 0 7.1 5.2
-2014-09-24 35838 20140924 0 0 0 7.1 5.8
-2014-09-25 17165 20140925 0.32 0 0 6.4 5.7
-2014-09-26 34500 20140926 0 0 0 7.7 5.8
-2014-09-27 30463 20140927 0 0 0 8.3 6
-2014-09-28 29491 20140928 0 0 0 8.4 6.4
-2014-09-29 32385 20140929 0 0 0 7.9 6.7
-2014-09-30 34901 20140930 0 0 0 7.1 6.2
-2014-10-01 28053 20141001 0.02 0 0 6.5 6.1
-2014-10-02 34154 20141002 0 0 0 7 6.1
-2014-10-03 35966 20141003 0 0 0 7.1 5.6
-2014-10-04 14173 20141004 1.18 0 0 6.9 5.2
-2014-10-05 23578 20141005 0 0 0 6.1 4.6
-2014-10-06 30628 20141006 0 0 0 6.9 5
-2014-10-07 32756 20141007 0.06 0 0 7.1 6.3
-2014-10-08 33437 20141008 0.04 0 0 7.3 6.2
-2014-10-09 32322 20141009 0 0 0 6.8 5.5
-2014-10-10 31616 20141010 0 0 0 6.4 5.2
-2014-10-11 13807 20141011 0.33 0 0 6 5
-2014-10-12 23079 20141012 0 0 0 6.3 4.8
-2014-10-13 23668 20141013 0.05 0 0 6.5 5.2
-2014-10-14 30252 20141014 0 0 0 7.6 6.3
-2014-10-15 30477 20141015 0.69 0 0 7.7 6.9
-2014-10-16 24829 20141016 1.11 0 0 7.1 6.1
-2014-10-17 34755 20141017 0 0 0 7.1 5.9
-2014-10-18 25854 20141018 0 0 0 7 5.6
-2014-10-19 19646 20141019 0 0 0 5.6 4.4
-2014-10-20 26440 20141020 0 0 0 6 4.2
-2014-10-21 31087 20141021 0.11 0 0 6.7 5.5
-2014-10-22 12023 20141022 1.51 0 0 5.8 5
-2014-10-23 11194 20141023 0.61 0 0 5.3 5
-2014-10-24 29606 20141024 0 0 0 6.3 5.1
-2014-10-25 25232 20141025 0.01 0 0 6.7 5
-2014-10-26 21488 20141026 0 0 0 6.3 5.3
-2014-10-27 27532 20141027 0 0 0 6.3 4.8
-2014-10-28 32313 20141028 0 0 0 7.2 5.3
-2014-10-29 30651 20141029 0 0 0 7.2 5.1
-2014-10-30 31079 20141030 0 0 0 5.9 4.7
-2014-10-31 28843 20141031 0.05 0 0 5.5 4.5
-2014-11-01 7484 20141101 0.35 0 0 4.7 4.2
-2014-11-02 12990 20141102 0 0 0 4.8 4.1
-2014-11-03 24019 20141103 0 0 0 6.1 3.9
-2014-11-04 30181 20141104 0 0 0 6.8 5.3
-2014-11-05 30766 20141105 0 0 0 6.4 5.6
-2014-11-06 13949 20141106 0.37 0 0 5.7 4.8
-2014-11-07 25648 20141107 0 0 0 5.3 4
-2014-11-08 18211 20141108 0 0 0 4.8 3.6
-2014-11-09 18128 20141109 0 0 0 5.7 4.6
-2014-11-10 25573 20141110 0 0 0 6.1 4.4
-2014-11-11 28787 20141111 0 0 0 6.4 4.9
-2014-11-12 28164 20141112 0 0 0 6.5 4.7
-2014-11-13 23972 20141113 0.2 0 0 4.8 3.6
-2014-11-14 19709 20141114 0.06 0 0 4.2 3.5
-2014-11-15 14856 20141115 0 0 0 4.2 3.3
-2014-11-16 13445 20141116 0.03 0 0 4.5 3.5
-2014-11-17 7346 20141117 1.54 0 0 5.2 4
-2014-11-18 17010 20141118 0 0 0 4.5 2.4
-2014-11-19 16270 20141119 0 0 0 3.6 2.2
-2014-11-20 19987 20141120 0 0 0 4.5 3.1
-2014-11-21 18837 20141121 0 0 0 3.7 2.8
-2014-11-22 13154 20141122 0 0 0 4.4 2.8
-2014-11-23 15218 20141123 0 0 0 5.7 4.3
-2014-11-24 20794 20141124 0.7 0 0 6.9 5.3
-2014-11-25 26064 20141125 0 0 0 6.8 5.1
-2014-11-26 7479 20141126 1.24 0 0.2 5.1 3.4
-2014-11-27 3757 20141127 0.02 0 0 3.8 3.4
-2014-11-28 7839 20141128 0 0 0 3.7 2.9
-2014-11-29 7869 20141129 0 0 0 4.5 2.7
-2014-11-30 11772 20141130 0 0 0 5.5 4.5
-2014-12-01 18569 20141201 0.09 0 0 6.5 4.2
-2014-12-02 15175 20141202 0.08 0 0 4.3 3.5
-2014-12-03 12177 20141203 0.06 0 0 4.6 4.1
-2014-12-04 21055 20141204 0 0 0 4.5 3.7
-2014-12-05 18920 20141205 0.51 0 0 4.4 3.4
-2014-12-06 4441 20141206 1.22 0 0 5 3.9
-2014-12-07 9319 20141207 0.04 0 0 4.2 3
-2014-12-08 14283 20141208 0 0 0 3.7 2.4
-2014-12-09 6912 20141209 2.54 0 0 4.2 3.6
-2014-12-10 11098 20141210 0.08 0 1 4 3.2
-2014-12-11 16413 20141211 0.01 1.2 0 3.8 3.1
-2014-12-12 18850 20141212 0 0 0 3.8 3.2
-2014-12-13 13173 20141213 0 0 0 4.4 3.4
-2014-12-14 12096 20141214 0 0 0 4.6 3.8
-2014-12-15 17761 20141215 0 0 0 4.8 3.7
-2014-12-16 18941 20141216 0.2 0 0 4.9 3.8
-2014-12-17 18196 20141217 0.02 0 0 5.4 4.2
-2014-12-18 19206 20141218 0 0 0 4.2 3.7
-2014-12-19 18256 20141219 0 0 0 3.8 3.1
-2014-12-20 10421 20141220 0 0 0 3.3 3
-2014-12-21 8854 20141221 0 0 0 3.6 3.1
-2014-12-22 13120 20141222 0.04 0 0 4.4 3.5
-2014-12-23 9849 20141223 0.16 0 0 4.6 4.3
-2014-12-24 5049 20141224 0.8 0 0 5.8 4.4
-2014-12-25 4620 20141225 0.09 0 0 6.2 4.4
-2014-12-26 9360 20141226 0 0 0 5 4
-2014-12-27 10070 20141227 0 0 0 5.5 4.4
-2014-12-28 8055 20141228 0.1 0 0 5.4 4.3
-2014-12-29 13055 20141229 0 0 0 4.4 3.4
-2014-12-30 12483 20141230 0 0 0 3.4 2.8
-2014-12-31 10493 20141231 0 0 0 3.2 2.7
diff --git a/week3/week3_exercises.R b/week3/week3_exercises.R
new file mode 100644
index 000000000..a9878cdfa
--- /dev/null
+++ b/week3/week3_exercises.R
@@ -0,0 +1,73 @@
+library(ISLR2)
+
+df <- read.table("babyweights.txt")
+summary(df)
+str(df)
+head(df,100)
+
+lm.fit <- lm(bwt~smoke, data = df)
+summary(lm.fit)
+
+#5.29
+#Heights VS Weights Scatterplot
+
+# (a) Positive, linear relationship: taller people tend to weigh more.
+# (b) Equation: weight = -105.01 + 1.02 × height. Slope: +1.02 kg per cm. Intercept: not meaningful physically.
+# (c) H0: no association. H1: association. p-value ≈ 0. Strong evidence for an association between height and weight.
+# (d) R² ≈ 0.52. About 52% of weight variation is explained by height.
+
+#6.1
+# eqn is weight = 123.05 - 8.4 smoke
+# the slope of -8.4 indicates that babies of mothers who smoke weigh 8.4 ounces
+# less than babies born to non-smoking mothers.
+# Predicted birth weight for a non-smoker: y = 123.05 - 8.4*0 = 123.05 ounces
+# for a smoker: y = 123.05 - 8.4*1 = 114.65 ounces.
+
+# looking at p-value which is really small we reject the null so there is statistically significant
+#relationship.
+
+#6.2
+
+lm.fit <- lm(bwt~parity, data = df)
+summary(lm.fit)
+
+# baby's weight = 120.06 + -1.92 parity
+#this means the weight of baby is changed by 1.92 ounces less if it is first born
+#weight of baby if its not first born (parity =1) = 120.06 - 1.92 = 118.14 ounces
+#weight of baby if its first born (parity =0) = 120.06 ounces
+# it's not statistically significant since its p-value is 0.1 which is more than 0.05
+
+
+#6.3
+lm.fit <- lm(bwt~ . , data = df)
+summary(lm.fit)
+
+#a. baby weight (y) = -80.41 + 0.44gestation -3.32 parity - 0.0089 age + 1.15 height +
+# 0.05 weight - 8.40 smoke
+
+#b. here we can say since slope of gestation is 0.44 meaning the weight is increases by 0.44 ounces
+#for each additional day of pregnancy
+
+#for age we have slope of -0.01 meaning the weight decreases by 0.01 ounces for each additional year
+#of mother's age which is very small and not significant and its p value is 0.9
+
+
+#c. since for the previous model parity was only the predictor but for this we are taking in account
+# with many variables the coefficient of parity changes. this is called controlling for confounding
+# the effect of parity is adjusted for the effects of the other predictors.
+
+#d. residuals is
+head(df)
+pred_bwt = -80.41 + 0.44*284 - 3.33*0 - 0.01*27 + 1.15*62 + 0.05*100 - 8.40*0
+print(120 - pred_bwt)
+
+#e variance of residuals = 249.28 , variance of birth weights of all babies = 332.57
+summary(lm.fit)
+summary(lm.fit)$r.squared
+summary(lm.fit)$adj.r.squared
+
+
+
+
+
+
diff --git a/week4/model.RData b/week4/model.RData
new file mode 100644
index 000000000..d62e16830
Binary files /dev/null and b/week4/model.RData differ
diff --git a/week4/predict_citibikes.rmd b/week4/predict_citibikes.rmd
new file mode 100644
index 000000000..88134f111
--- /dev/null
+++ b/week4/predict_citibikes.rmd
@@ -0,0 +1,219 @@
+---
+title: "Citibike Prediction"
+author: "Drishya Shrestha"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Setup (loading packages and data)
+
+
+```{r setup}
+library(tidyverse)
+library(scales)
+library(modelr)
+library(readr)
+theme_set(theme_bw())
+
+
+```
+
+## Loading data
+
+
+```{r pressure, echo=FALSE}
+trips <- read_tsv("trips_per_day.tsv")
+str(trips)
+summary(trips)
+print(head(trips))
+```
+
+Now, we will divide the dataset into training, validation, and test sets. The training set will be used to fit the model, the validation set will be used to tune the model parameters, and the test set will be used to evaluate the model's performance.
+80% of the data will be used for training, 10% for validation, and 10% for testing.
+
+### Splitting into training, validation and test sets
+
+```{r, message = FALSE}
+set.seed(42)
+
+num_days <- nrow(trips_per_day)
+
+
+# 10% for test set
+num_test <- floor(num_days * 0.1)
+test_ndx <- sample(1:num_days, num_test, replace = FALSE)
+trips_test <- trips_per_day[test_ndx, ]
+# remove the test rows; the remaining 90% is used for training and validation
+trips_per_day <- trips_per_day[-test_ndx, ]
+```
+
+Now, we will fit linear regression models to the training data with different polynomial degrees and choose the one with the lowest validation error.
+
+### K-fold cross-validation to choose polynomial degree for tmin
+```{r}
+library(ggplot2)
+set.seed(42)
+num_folds <- 5
+n <- nrow(trips_per_day)
+trips_per_day <- trips_per_day %>%
+ mutate(fold = sample(rep(1:num_folds, length.out = n)))
+
+head(trips_per_day)
+
+# fit a model for each polynomial degree
+K <- 1:8
+avg_validate_err <- c()
+se_validate_err <- c()
+for (k in K) {
+
+ # do 5-fold cross-validation within each value of k
+ validate_err <- c()
+ for (f in 1:num_folds) {
+ # fit on the training data
+ trips_per_day_train <- filter(trips_per_day, fold != f)
+ model <- lm(num_trips ~ poly(tmin, k, raw = T), data=trips_per_day_train)
+
+ # evaluate on the validation data
+ trips_per_day_validate <- filter(trips_per_day, fold == f)
+ validate_err[f] <- sqrt(mean((predict(model, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
+ }
+
+ # compute the average validation error across folds
+ # and the standard error on this estimate
+ avg_validate_err[k] <- mean(validate_err)
+ se_validate_err[k] <- sd(validate_err) / sqrt(num_folds)
+}
+
+# plot the validate error, highlighting the value of k with the lowest average error
+plot_data <- data.frame(K, avg_validate_err, se_validate_err)
+ggplot(plot_data, aes(x=K, y=avg_validate_err)) +
+ geom_pointrange(aes(ymin=avg_validate_err - se_validate_err,
+ ymax=avg_validate_err + se_validate_err,
+ color=avg_validate_err == min(avg_validate_err))) +
+ geom_line(color = "red") +
+ scale_x_continuous(breaks=1:12) +
+ theme(legend.position="none") +
+ xlab('Polynomial Degree') +
+ ylab('RMSE on validation data')
+```
+
+The polynomial degree with the lowest validation error is 4.
+
+### Adding new features to the dataset
+
+```{r}
+trips_per_day<-mutate(trips_per_day, day_name = weekdays(ymd), snow_high = ifelse(snwd>mean(snwd), 1,0),avg_temp = (tmin + tmax) / 2, prcp_high = ifelse(prcp > 0.15, 1, 0),
+ month = month(ymd),
+ season = case_when(
+ month %in% c(12, 1, 2) ~ "Winter",
+ month %in% c(3, 4, 5) ~ "Spring",
+ month %in% c(6, 7, 8) ~ "Summer",
+ month %in% c(9, 10, 11) ~ "Fall"
+ )
+)
+trips_per_day<-mutate(trips_per_day, day_type = ifelse(day_name %in% c("Saturday", "Sunday"), 1, 0)) %>% select(- c(day_name))
+head(trips_per_day)
+```
+
+We added new columns to the dataset:
+- `day_type`: 1 if the day is a weekend (Saturday/Sunday), 0 if a weekday.
+- `snow_high`: Binary variable indicating if the snow depth is above average.
+- `avg_temp`: Average temperature for the day.
+- `prcp_high`: Binary variable indicating if the precipitation is above 0.15 inches.
+- `season`: Season of the year based on the month.
+
+### Fitting the final model with selected features on Validation data and Training data to look at the RMSE
+```{r}
+set.seed(42)
+# Number of folds
+num_folds <- 8
+n <- nrow(trips_per_day)
+
+# Assign fold labels
+trips_per_day <- trips_per_day %>%
+ mutate(fold = sample(rep(1:num_folds, length.out = n)))
+
+# Store RMSE for each fold
+validate_err <- numeric(num_folds)
+train_err <- numeric(num_folds)
+
+for (f in 1:num_folds) {
+ # Split data
+ train_data <- filter(trips_per_day, fold != f)
+ validate_data <- filter(trips_per_day, fold == f)
+ # Fit model with selected features
+ model <- lm(num_trips ~ prcp_high + avg_temp + day_type + season*prcp, data = train_data)
+ # RMSE on training data
+ train_pred <- predict(model, train_data)
+ train_err[f] <- sqrt(mean((train_pred - train_data$num_trips)^2))
+ # RMSE on validation data
+ val_pred <- predict(model, validate_data)
+ validate_err[f] <- sqrt(mean((val_pred - validate_data$num_trips)^2))
+}
+
+# Summary results
+mean_train_rmse <- mean(train_err)
+mean_validate_rmse <- mean(validate_err)
+
+print(paste("Avg Training RMSE:", round(mean_train_rmse, 2)))
+print(paste("Avg Validation RMSE:", round(mean_validate_rmse, 2)))
+
+validate_data_w_prediction <- mutate(validate_data, predicted_num_trips = predict(model, validate_data))
+summary(validate_data_w_prediction)
+
+```
+
+In order to evaluate the model's performance, we will look at the RMSE on training and validation datasets. The average RMSE was around 3600 trips for both the datasets.
+
+### Visualizing the results
+### Graph 1 for date vs predicted values and actual values of validation data
+
+
+
+```{r, echo=FALSE, message=FALSE}
+#Graph 1 for date vs predicted values and actual values of validation data
+ggplot(validate_data_w_prediction, aes(x = ymd, y = num_trips)) +
+ geom_line(color = "blue", size = 1) +
+ geom_line(aes(y = predicted_num_trips), color = "red", size = 1, linetype = "dashed") +
+ labs(title = "Actual vs Predicted Number of Trips",
+ x = "Date",
+ y = "Number of Trips") +
+ theme_minimal()
+
+```
+
+This shows the predicted number of trips (in red) and the actual number of trips (in blue) over time. The model seems to capture the trend in the data well, although there are some discrepancies on certain days.
+
+### Graph 2 for actual vs predicted values of validation data
+```{r, echo=FALSE, message=FALSE}
+# Graph 2 for actual vs predicted values of validation data
+ggplot(validate_data_w_prediction, aes(x = num_trips, y = predicted_num_trips)) +
+ geom_point(color = "blue", alpha = 0.5) +
+ geom_smooth(method = "lm", color = "red") +
+ labs(title = "Actual vs Predicted Number of Trips",
+ x = "Actual Number of Trips",
+ y = "Predicted Number of Trips") +
+ theme_minimal()
+```
+This scatter plot shows the relationship between the actual number of trips and the predicted number of trips. The red line represents the linear regression line fitted to the data.
+
+
+
+
+``` {r }
+
+save(model, file = "model.RData")
+
+tmp_env <- new.env()
+load("model.RData", envir = tmp_env)
+ls(tmp_env) # Lists all objects saved in the file
+
+print(tmp_env$model)
+
+```
\ No newline at end of file
diff --git a/week4/test_citbike_predictions.Rmd b/week4/test_citbike_predictions.Rmd
new file mode 100644
index 000000000..efde59cf7
--- /dev/null
+++ b/week4/test_citbike_predictions.Rmd
@@ -0,0 +1,77 @@
+---
+title: "Test Citibike Prediction"
+author: "Drishya Shrestha"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+
+
+## Setup (loading packages and data)
+
+
+```{r setup}
+library(tidyverse)
+library(scales)
+library(modelr)
+library(readr)
+theme_set(theme_bw())
+```
+
+
+
+## Loading data
+
+
+```{r pressure, echo=FALSE}
+trips_per_day_2015 <- read_tsv("trips_per_day_2015.tsv")
+weather_2015 <- read_csv("weather_2015.csv")
+summary(weather_2015)
+str(weather_2015)
+summary(trips_per_day_2015)
+str(trips_per_day_2015)
+
+tmp_env <- new.env()
+load("model.RData", envir = tmp_env)
+model <- tmp_env$model
+summary(model)
+formula(model)
+
+
+
+str(trips_per_day_2015)
+head(trips_per_day_2015)
+head(weather_2015)
+joined_df <- inner_join(weather_2015, trips_per_day_2015, by = c("DATE" = "ymd"))
+head(joined_df)
+names(joined_df) <- tolower(names(joined_df))
+
+joined_df<-mutate(joined_df, day_name = weekdays(date), snow_high = ifelse(snwd>5, 1,0),avg_temp = (tmin/10 + tmax/10) / 2, prcp_high = ifelse(prcp > 0.15, 1, 0),
+ month = month(date),
+ season = case_when(
+ month %in% c(12, 1, 2) ~ "Winter",
+ month %in% c(3, 4, 5) ~ "Spring",
+ month %in% c(6, 7, 8) ~ "Summer",
+ month %in% c(9, 10, 11) ~ "Fall"
+ )
+)
+joined_df<-mutate(joined_df, day_type = ifelse(day_name %in% c("Saturday", "Sunday"), 1, 0)) %>% select(- c(day_name,tavg))
+str(joined_df)
+view(joined_df)
+##Testing it
+actual <- joined_df$num_trips
+predicted <- predict(model, newdata = joined_df)
+
+rmse <- sqrt(mean((actual - predicted)^2))
+rmse
+
+#Checking and debugging the new data
+
+
+colSums(is.na(joined_df))
+
+#Features used check
+attr(model$terms, "term.labels")
+# Check types
+str(joined_df)
+```