diff --git a/week1/Extra_Day4_Book_Excercies_Chapter3_Ahmed_sols.R b/week1/Extra_Day4_Book_Excercies_Chapter3_Ahmed_sols.R new file mode 100644 index 000000000..96da0e8e9 --- /dev/null +++ b/week1/Extra_Day4_Book_Excercies_Chapter3_Ahmed_sols.R @@ -0,0 +1,158 @@ +#Excercie 3.3.1 + +library(tidyverse) + + +dta_mpg <- mpg + +#---------------------------------------------------------------------------------------- +# Excercies from chapter 3 first edition + +# 3.3.1 Excercies + +# Question 1: +# A fix for the code: +ggplot(mpg, aes(x = displ, y = hwy)) + + geom_point(color = "blue") + +# Another sol + +ggplot(mpg) + +geom_point(aes(x = displ, y= hwy), color = "blue") + +# The issue was that the color was included in aes(). So, it was treated as aesthetic. +#The color = "blue" was interpreted as a categorical variable which only takes a single value "blue" + +# Question 2: +# I used ?mpg to check this: + +# categorial variables in mpg are: manufacturer, model, year, trans, drv, fl, class +# Continuous variable in mpg are: displ, year, cyl, cty, hwy + +#Question 3: + +#Makes teh color as a scale not distinct colors (same for size) +ggplot(mpg, aes(x = displ, y = hwy, color = cty)) + +geom_point() + +ggplot(mpg, aes(x = displ, y = hwy, size = cty)) + +geom_point() + +# Gives error for shapes because we cannot make a scale of shapes +ggplot(mpg, aes(x = displ, y = hwy, shape = cty)) + +geom_point() + +#----------------------------------------------------------------------------------------- + +# 3.5.1 Excercies + +# question 1 +#Answer: The continuous variable is convereted to a categorical variable, then the grahp contains a facet for each distinct value +ggplot( + mpg, aes(x = displ, y = hwy)) + + geom_point() + facet_grid(.~cty) + +# Question 4 + +# class is faceted +ggplot(data = mpg) + + geom_point(mapping = aes(x = displ, y = hwy)) + + facet_wrap(~ class, nrow = 2) + +# class mapped to color +ggplot(data = mpg) + + geom_point(mapping = aes(x = displ, y = hwy, color 
= class)) + + +#Advantages: +# 1- It's easier to distinguish and see the different classes +# 2- There is no overlapping between the point, so it is easier to see the trends. On the other hand, the with color, there is huge overlapping. + +#Disadvantages: +# 1- It is diffcult to compare as the points are on different and separate graphs + +#----------------------------------------------------------------------------------------- + +# 3.6.1 Excercies: + +# Question 5: +#graph 1 +ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + + geom_point() + + geom_smooth() + +#graph 2 +ggplot() + + geom_point(data = mpg, mapping = aes(x = displ, y = hwy)) + + geom_smooth(data = mpg, mapping = aes(x = displ, y = hwy)) + + +#Ans: No, because both geom_point() and geom_smooth will take the same data as input in graph 2. +# In graph 1, they will take from data from mapping. So, the two graphs will look the same. + + +# Question 6: + +?mpg +?geom_smooth +#graph 1 +ggplot( + mpg, aes(x = displ, y = hwy) +) + geom_point() + geom_smooth(se = FALSE) + +#graph 2 +ggplot( + mpg, aes(x = displ, y = hwy) +) + geom_point() + geom_smooth(aes(group = drv), se = FALSE) + +#graph 3 +ggplot( + mpg, aes(x = displ, y = hwy, color = drv) +) + geom_point() + geom_smooth(se = FALSE) + + +#graph 4 +ggplot( + mpg, aes(x = displ, y = hwy) +) + geom_point(aes(color = drv)) + geom_smooth(se = FALSE) + + +#graph 5 +ggplot( + mpg, aes(x = displ, y = hwy) +) + geom_point(aes(color = drv)) + geom_smooth(aes(linetype = drv), se = FALSE) + +#graph 6 +ggplot( + mpg, aes(x = displ, y = hwy, color = drv) +) + geom_point(size = 4, color = "white") + geom_point() + + +#----------------------------------------------------------------------------------------- + +#3.8.1 Excercies + +#Question 1: + +ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + + geom_point() + +# Problem is that there is overlapping because there are multiple observations for each combination cty and hwy + +# Quick fix would be: + 
+ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + + geom_point(position = "jitter") + +#jitter shows the positions where there are more observations + + +#Question 2: + +?geom_jitter() + +#The two arguments would be width and height examples: + +ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_jitter(width = 0) + +ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_jitter(width = 40) diff --git a/week1/citibike.R b/week1/citibike.R index ad01de1d3..714e59263 100644 --- a/week1/citibike.R +++ b/week1/citibike.R @@ -3,16 +3,16 @@ library(lubridate) ######################################## # READ AND TRANSFORM THE DATA -######################################## +##########@############################## # read one month of data -trips <- read_csv('201402-citibike-tripdata.csv') +trips <- read_csv('./week1/201402-citibike-tripdata.csv') # replace spaces in column names with underscores names(trips) <- gsub(' ', '_', names(trips)) # convert dates strings to dates -# trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime)) +#trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime)) # recode gender as a factor 0->"Unknown", 1->"Male", 2->"Female" trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unknown","Male","Female"))) @@ -24,25 +24,81 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn # count the number of trips (= rows in the data frame) +#print(nrow(trips)) ==> 224736 + # find the earliest and latest birth years (see help for max and min to deal with NAs) +#birth_year_col <- trips$birth_year +#birth_year_col_new <- as.numeric(birth_year_col) +#min(birth_year_col_new) +# Ans: [1] 1899 +# max(birth_year_col_new) +# Ans: [1] 1997 + + + # use filter and grepl to find all trips that either start or end on broadway +#filter(trips ,grepl("Broadway", start_station_name) | grepl("Broadway", end_station_name)) + # do the same, but find all 
trips that both start and end on broadway +#filter(trips ,grepl("Broadway", start_station_name) & grepl("Broadway", end_station_name)) + + # find all unique station names +#uniq_start_station_name <- unique(trips$start_station_name) +#uniq_end_station_name <- unique(trips$end_station_name) +#combine_stations <- paste(uniq_end_station_name, uniq_start_station_name) +#unique(combine_stations) + # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender # do this all at once, by using summarize() with multiple arguments + +# trips %>% +# group_by(gender) %>% +# summarize (count = n(), avg = mean(tripduration), std = sd(tripduration)) + + + # find the 10 most frequent station-to-station trips +trips %>% + group_by(start_station_name, end_station_name) %>% + summarize(count = n()) %>% + arrange(desc(count)) %>% + slice_head(n = 10) + # find the top 3 end stations for trips starting from each start station +trips %>% group_by(start_station_name, end_station_name) %>% summarize (count = n()) %>% arrange(desc(count), start_station_name) %>% slice_head(n=3) + + +trips %>% group_by(start_station_name, end_station_name) %>% summarize (count = n()) %>% group_by(start_station_name) %>% arrange(desc(count)) %>% slice(1:3) + # find the top 3 most common station-to-station trips by gender +trips %>% group_by(start_station_name, end_station_name, gender) %>% summarize (count = n()) %>% group_by(gender) %>% arrange(desc(count)) %>% slice(1:3) + + # find the day with the most trips # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package) + +trips %>% mutate( date = as.Date(starttime)) %>% group_by(date) %>% summarise(count = n()) %>% arrange(desc(count)) %>% slice(1) + + + + + # compute the average number of trips taken during each of the 24 hours of the day across the entire month # what time(s) of day tend to be peak hour(s)? 
+ +trips %>% mutate( hours = hour(starttime)) %>% + group_by(hours) %>% summarise(count = n(), day_in_month_count = days_in_month(starttime), avg = count / day_in_month_count) + +trips %>% mutate( hours = hour(starttime)) %>% + group_by(hours) %>% summarise(count = n()) %>% arrange(desc(count)) %>% slice(1) + diff --git a/week1/citibike.sh b/week1/citibike.sh index 25604f545..b164e9be0 100755 --- a/week1/citibike.sh +++ b/week1/citibike.sh @@ -5,19 +5,75 @@ # count the number of unique stations +# cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c | wc -l +# Ans: 330 + # count the number of unique bikes +# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l +# Ans: 5700 + # count the number of trips per day +# cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c + +# 12771 2014-02-01 +# 13816 2014-02-02 +# 2600 2014-02-03 +# 8709 2014-02-04 +# 2746 2014-02-05 +# 7196 2014-02-06 + 8495 2014-02-07 + 5986 2014-02-08 + 4996 2014-02-09 + 6846 2014-02-10 + 8343 2014-02-11 + 8580 2014-02-12 + 876 2014-02-13 + 3609 2014-02-14 + 2261 2014-02-15 + 3003 2014-02-16 + 4854 2014-02-17 + 5140 2014-02-18 + 8506 2014-02-19 + 11792 2014-02-20 + 8680 2014-02-21 + 13044 2014-02-22 + 13324 2014-02-23 + 12922 2014-02-24 + 12830 2014-02-25 + 11188 2014-02-26 + 12036 2014-02-27 + 9587 2014-02-28 + # find the day with the most rides +#cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1 +# 13816 2014-02-02 + # find the day with the fewest rides +# cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2| tail -n1 + # find the id of the bike with the most rides +# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n2 | tail -n1 +# 128 16151 + + # count the number of rides by gender and birth year +# cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c + + + # count the number of trips that start on cross streets that both contain numbers 
(e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +#cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].* &.*[0-9].*' | wc -l +#90549 + + # compute the average trip duration +#awk '{sum += $1} END {avg = sum / NR; print "Average:" avg}' 201402-citibike-tripdata.csv +# 874.516 \ No newline at end of file diff --git a/week1/musical_pairs.sh b/week1/musical_pairs.sh new file mode 100644 index 000000000..da9b18791 --- /dev/null +++ b/week1/musical_pairs.sh @@ -0,0 +1,10 @@ +members=("ahmed" "aisha" "alou" "naomi" "sara" "sofia" "srijana" "vaishnavi" "vanessa" "dereck" "drishya" "yehtut") + +if command -v md5sum &> /dev/null; then + seed=$(date +%F | md5sum | awk '{print $1}') +else + seed=$(date +%F | md5sum | awk '{print $NF}') + +fi + +printf "%s\n" "${members[@]}" | shuf --random-source=<(echo $seed) | xargs -n2 echo "Pair: " \ No newline at end of file diff --git a/week1/plot_trips.R b/week1/plot_trips.R index 4f25437ba..0276fb3f1 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -19,25 +19,95 @@ load('trips.RData') # plot the distribution of trip times across all rides (compare a histogram vs. a density plot) +ggplot(trips, aes(x = tripduration)) + + geom_histogram(bins = 30, fill = "blue", color = "black") + scale_x_log10() + + + ggplot(trips, aes(x = tripduration)) + + geom_density(fill = "blue", color = "black") + scale_x_log10() + + + + # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. 
a density plot) + ggplot(trips, aes(x = tripduration, color = usertype, fill = usertype)) + + geom_histogram() + scale_x_log10() + + facet_grid(~ usertype) + + ylab("Number of trips") + + + # Density : + ggplot(trips, aes(x = tripduration, color = usertype, fill = usertype)) + + geom_density()+ scale_x_log10() + + facet_grid(~ usertype) + + + # plot the total number of trips on each day in the dataset +trips %>% mutate(day = as.Date(starttime)) %>% + ggplot(aes(x = day)) + + geom_histogram(bins = 30, fill = "blue") + + + + # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) + +trips %>% mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>% + ggplot(aes(x = age, , fill = gender)) + geom_histogram(bins = 30, alpha = 0.9) + + ylab("Total number of trips") + + + + + # plot the ratio of male to female trips (on the y axis) by age (on the x axis) # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) + + +trips_age <- trips %>% mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) + +trips_age_grouped <- group_by(trips_age, age, gender) %>% summarise(count = n()) + +trips_age_grouped_pivoted <- pivot_wider(trips_age_grouped, names_from = gender, values_from = count) + +trips_age_grouped_pivoted_with_ratio <- mutate(trips_age_grouped_pivoted, ratio = Male / Female) + +ggplot(trips_age_grouped_pivoted_with_ratio, aes(x = age, y = ratio)) + geom_point() + scale_x_log10(label = comma) + +geom_smooth(se = FALSE) + + + + ######################################## # plot weather data ######################################## + # plot the minimum temperature (on the y axis) over each day (on the x axis) +ggplot(weather, aes(x = date, y = tmin)) + geom_point() + + + # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over 
each day (on the x axis) # hint: try using the pivot_longer() function for this to reshape things before plotting # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet) +new_weather_pivoted <- pivot_longer(weather, cols = c(tmin, tmax), names_to = "temp_type", values_to = "Temp") +ggplot(new_weather_pivoted, aes(x = date, y = Temp, color =temp_type )) + geom_point() + labs( + x = "Date", + y = "Temp" +) + + + + + ######################################## # plot trip and weather data ######################################## @@ -48,15 +118,63 @@ trips_with_weather <- inner_join(trips, weather, by="ymd") # plot the number of trips as a function of the minimum temperature, where each point represents a day # you'll need to summarize the trips and join to the weather data to do this +daily_trips <- trips_with_weather %>% mutate(date = as.Date(starttime)) %>% + group_by(date) %>% summarize(trip_count = n(), min_temp = min(tmin)) + + ggplot(daily_trips, aes(x = min_temp, y = trip_count)) + geom_point() + xlab("Min Temp") + ylab("Number of trips") + # repeat this, splitting results by whether there was substantial precipitation or not # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this +daily_summary <- trips_with_weather %>% mutate(date = as.Date(starttime)) %>% + group_by(date) %>% summarize(trip_count_after = n(), min_temp_after = min(tmin), substantial_prcp = min (prcp) >= 0.3) + + ggplot(daily_summary, aes(x = min_temp_after, y = trip_count_after, color =substantial_prcp )) + + geom_point() + xlab("Min Temp") + ylab("Number of trips") + + + # add a smoothed fit on top of the previous plot, using geom_smooth +daily_summary <- trips_with_weather %>% mutate(date = as.Date(starttime)) %>% + group_by(date) %>% summarize(trip_count_after = n(), min_temp_after = min(tmin), substantial_prcp = min (prcp) >= 0.3) + + ggplot(daily_summary, aes(x = min_temp_after, y = 
trip_count_after, color =substantial_prcp )) + + geom_point() + geom_smooth(method = "lm") + xlab("Min Temp") + ylab("Number of trips") + + # compute the average number of trips and standard deviation in number of trips by hour of the day # hint: use the hour() function from the lubridate package +hour_summary <- trips_with_weather %>% mutate(date = as.Date(starttime), hour = hour(starttime)) %>% + group_by(date, hour) %>% summarize(trips_per_hr = n(), .groups = "drop") %>% group_by(hour) %>% + summarize(avg = mean(trips_per_hr) , sd_trips = sd(trips_per_hr)) + + + + # plot the above +hour_summary <- trips_with_weather %>% mutate(date = as.Date(starttime), hour = hour(starttime)) %>% + group_by(date, hour) %>% summarize(trips_per_hr = n(), .groups = "drop") %>% group_by(hour) %>% + summarize(avg = mean(trips_per_hr) , sd_trips = sd(trips_per_hr)) + + +ggplot(hour_summary, aes(x = hour, y = avg)) + geom_point(color = 'blue') + + geom_errorbar(aes(ymin = avg - sd_trips, ymax = avg + sd_trips)) + + # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. 
weekend days # hint: use the wday() function from the lubridate package + +hour_by_day <- trips_with_weather %>% mutate(date = as.Date(starttime), hour = hour(starttime), day_of_week = wday(date)) %>% + group_by(date, day_of_week, hour) %>% summarize(trips_per_hr = n(), .groups = "drop") %>% + group_by(day_of_week, hour) %>% summarize(avg_trip = mean(trips_per_hr), sd_trips = sd(trips_per_hr), .groups = "drop") + + + ggplot(hour_by_day, aes(x = hour, y = avg_trip, color = day_of_week)) + geom_point(color = 'blue') + + geom_errorbar(aes(ymin = avg_trip - sd_trips, ymax = avg_trip + sd_trips)) + scale_x_continuous(breaks = 0:23) + + + + diff --git a/week2/Day5_book_excercies.R b/week2/Day5_book_excercies.R new file mode 100644 index 000000000..d30f5817c --- /dev/null +++ b/week2/Day5_book_excercies.R @@ -0,0 +1,79 @@ +library(tidyverse) +view(table4a) +view(table4b) + + +#Section 12.2.1, exercise 2 + +#Question 2 for table2 +view(table2) +table2_pivoted <- pivot_wider(table2, names_from = type, values_from = count) + +view(table2_pivoted) + +table2_pivoted_with_rate <- mutate(table2_pivoted, rate = (cases / population) * 10000) + + +view(table2_pivoted_with_rate) + + +#Question 2 for table4a and table4b + +table4a_pivoted <- pivot_longer(table4a, names_to = "Year", values_to = "Cases" , -country) +table4b_pivoted <- pivot_longer(table4b, names_to = "Year", values_to = "pop" , -country) + +view(table4a_pivoted) +view(table4b_pivoted) + +tables_4_combined <- inner_join(table4a_pivoted, table4b_pivoted) + +view(tables_4_combined) + +tables_4_combined_mutated <- mutate(tables_4_combined, rate = (Cases / pop) * 10000) + +view(tables_4_combined_mutated) + +#-------------------------------------------------------------------------------------------- + +# Section 12.3.3 exercises 1 + +stocks <- tibble( + year = c(2015, 2015, 2016, 2016), + half = c( 1, 2, 1, 2), + return = c(1.88, 0.59, 0.92, 0.17) +) + +head(stocks) + +view(stocks) + +stocks %>% + pivot_wider(names_from = 
year, values_from = return) %>% + pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return") + +#pivot_wider makes the observations from year to columns names + +#pivot_longer changed the years from being columns names to observations, which make it characters. + + +# Section 12.3.3 exercises 3 + +people <- tribble( + ~name, ~names, ~values, + #-----------------|--------|------ + "Phillip Woods", "age", 45, + "Phillip Woods", "height", 186, + "Phillip Woods", "age", 50, + "Jessica Cordero", "age", 37, + "Jessica Cordero", "height", 156 +) +people_piovted <- pivot_wider(people, names_from = names, values_from = values) + +view(people_piovted) +# Phillip Woods has two observations for two values in the age column because there are two "Phillip Woods" exist twice in the data, which would mean there are two "Phillip Woods" +# The wider used the height for both "Phillip Woods" +# A solution for this, we can use a unique key or ID for each person, so that they would be treated as two different people in the data + + + + diff --git a/week2/Day7_Day8_Excercies.R b/week2/Day7_Day8_Excercies.R new file mode 100644 index 000000000..1e15b4599 --- /dev/null +++ b/week2/Day7_Day8_Excercies.R @@ -0,0 +1,235 @@ +library(tidyverse) +library(dplyr) +magents <- read.csv("D:/coursework/week2/magnets.csv") +View(magents) + + +# Chapter 9 Section 1 +#Question 1 +avg_change <- magents |> mutate(change_avg = mean(change)) +view(avg_change) + +#Question 2 +str(magents) + +# The type of data in active is characters. The answers to the question is that active is factor not numeric. +# It's two different levels either 1 or 2. 
+ +avg_change_for_active = mean(magents$change[1:29]) +avg_change_for_nonactive = mean(magents$change[30:50]) + +view(avg_change_for_active) +view(avg_change_for_nonactive) + + +# Question 4 +std_for_active = sd(magents$change[1:29]) +std_for_non_active = sd(magents$change[30:50]) + +print(std_for_active) +print(std_for_non_active) + +# Question 5 +#box_plot for active: + +boxplot(magents$change[1:29]) + +boxplot(magents$change[30:50]) #I see there are three outliers + + +# Chapter 10 Section 1 + +#Question 1: + +#Normal (3,2) ==> 3 is the mean & 2 is the variance + +mu <- 3 +variance <- 2 +std <- sqrt(2) + +x_bar <- rep(0, 10^3) +x_mid <- rep(0, 10^3) + +for(i in 1:10^3){ + x <- rnorm(100, mu, std) + x_bar[i] <- mean(x) + x_mid[i] <- median(x) +} + +x_bar +x_mid + +mean(x_bar) # mean of the means +mean(x_mid) # mean of the medians + +var(x_bar) #variance for x_bar +var(x_mid) # variance for x_mid + +# The variance of x_bar is going to be around 0.2, but the variance of x_mid is going to be around 0.3 + +#Question 2: + +min <- 0.5 +max <- 5 + +x_bar <- rep(0, 10^3) +x_mid <- rep(0, 10^3) + +for(i in 1:10^3){ + x <- runif(100, min, max) + x_bar[i] <- mean(x) + x_mid[i] <- median(x) +} + +x_bar +x_mid + +mean(x_bar) # mean of the means +mean(x_mid) # mean of the medians + +var(x_bar) #variance for x_bar +var(x_mid) # variance for x_mid + + +# Chapter 10 Section 2 +library(tidyverse) +population_data<- read.csv("D:/coursework/week2/pop2.csv") +sample_population_data<- read.csv("D:/coursework/week2/ex2.csv") +view(population_data) +view(sample_population_data) + + +#Question 1 +people_from_sample_with_high_bp = count(sample_population_data, group == "HIGH") +ans <- people_from_sample_with_high_bp / 150 +print(ans) + +# Another sol: +mean (sample_population_data$group == "HIGH") + + +#Question 2: + +mean (population_data$group == "HIGH") + + +#Question 3: + +p_hat <- rep(0, 10^3) + +for(i in 1: 10^3){ + y <- sample(population_data$group, 150) # we are getting 150 as a 
sample size to simulate the sample that we have
+  p_hat[i] <- mean(y == "HIGH")  # fixed: store the sample proportion; assigning the raw length-150 logical vector into one slot is an error
+
+  # view(y)  # disabled: calling view() inside a 1,000-iteration loop would open 1,000 viewer tabs
+}
+mean(p_hat)
+
+# Question 4
+
+var(p_hat)
+
+# Question 5
+
+p <- mean (population_data$group == "HIGH")
+
+variance <- p * (1-p) / 150
+print(variance)
+
+# Chapter 2 Section 2 from Intro to Stat with Randomization and Simulation
+
+# Question a)
+
+Dead_from_control_group = 30/34
+
+Dead_from_treatment_group = 25/69
+
+
+# Question b) part 1)
+
+#H0 ==> the null hypothesis is that the treatment is not effective
+#HA ==> the alternative hypothesis is that the treatment is effective
+
+# Question b) part 2)
+
+# 28 alive, 75 dead, 69 received the treatment, 34 control group, the null hypothesis, the percentage of the alive in the group
+
+
+# Question C)
+
+41/99 #==> this suggests that the treatment is effective
+
+
+
+
+
+# Chapter 2 Section 6 from Intro to Stat with Randomization and Simulation:
+
+# Question 1
+
+#The hypotheses are:
+# H0: Yawning next to someone doesn't affect the person p_treatment = p_control
+# HA: (There is an association between being near a yawner and yawning) p_treatment does not equal to p_control
+
+# Question 2
+
+p_hat_treatment = 10/34
+p_hat_control = 4/16
+
+my_diff = p_hat_treatment - p_hat_control
+print(my_diff)
+# So our difference is 0.04411765
+
+# Question 3:
+
+# some notes from graph are:
+#The null distribution is approximately normally distributed around the zero (0.0)
+
+#Our observation is 0.04411765, so I located this on the x-axis
+
+# The p-value = p(p_hat_treatment - p_hat_control >= 0.044) + p(p_hat_treatment - p_hat_control <= -0.044)
+# This is because my hypothesis is double-sided
+
+#This p-value will be around the zero, and since the largest bars (large proportion) of the distribution lies between -0.1 and +0.1
+
+#let's guess that 4,000 to 6,000 lies in this distribution
+
+#p-value = (4000 to 6000) / 10,000 = 0.4 to 0.6
+
+#So, I would say that the p-value would be larger than 5% anyway, which would not be enough to reject
the null hypothesis. + + +# Chapter 3 Section 1 from Intro to Stat with Randomization and Simulation + +#Question 1: + +#The answer is false because of the central limit theorem conditions: n >= 30, np >= 10 & n(1-p) >= 10 + + + +#Chapter 9 Section 2 from Introduction to Statistical Thinking + +# Question 1 +# the goal we are trying to do replication to test the statistic mentioned in the question +mu <- 3.5 +sig1 <- 3 +sig2 <- 1.5 +stat_test <- rep(0, 10^3) + +for(i in 1:10^3){ + x1 <- rnorm(29,mu, sig1) + x2 <- rnorm(21, mu, sig2) + x1_bar <- mean(x1) + x1_variance <- var(x1) + x2_bar <- mean(x2) + x2_variance <- var(x2) + stat_test[i] <- (x1_bar - x2_bar) / sqrt((x1_variance/29) + (x2_variance/21)) +} +quantile(stat_test, c(0.025, 0.975), na.rm = TRUE) + +#The answer would be an interval between the two numbers [-2.018366, 2.011073] This is just an example because everytime the code would +# run, the number would be different, depending on the samples that would be generated + + +#Question 2 + diff --git a/week2/Excercises-Day6/Question 4.1.jpeg b/week2/Excercises-Day6/Question 4.1.jpeg new file mode 100644 index 000000000..721b0a3ed Binary files /dev/null and b/week2/Excercises-Day6/Question 4.1.jpeg differ diff --git a/week2/Excercises-Day6/Question 4.2.jpeg b/week2/Excercises-Day6/Question 4.2.jpeg new file mode 100644 index 000000000..18f496512 Binary files /dev/null and b/week2/Excercises-Day6/Question 4.2.jpeg differ diff --git a/week2/Excercises-Day6/Question6.R b/week2/Excercises-Day6/Question6.R new file mode 100644 index 000000000..746b2a5d9 --- /dev/null +++ b/week2/Excercises-Day6/Question6.R @@ -0,0 +1,15 @@ +# Question 6.1.1 +1 - pnorm(650,560,57) + +# Question 6.1.2 +1 - pnorm(650,630,61) + +# Question 6.1.3 +tenth_percentile <- qnorm(0.1,560,57) #==> gives the 10%-percentile +ninty_percentile <- qnorm(0.9,560,57) #==> gives the 90%-percentile +# the final answer would be an interval of these two from 10 to 90 + +# Question 6.1.4 
+tenth_percentile_new <- qnorm(0.1,630,61) #==> gives the 10%-percentile +ninty_percentile_new <- qnorm(0.9,630,61) #==> gives the 90%-percentile +# the final answer would be an interval of these two from 10 to 90 diff --git a/week2/Week2_Excercises.R b/week2/Week2_Excercises.R new file mode 100644 index 000000000..425c84f6f --- /dev/null +++ b/week2/Week2_Excercises.R @@ -0,0 +1,409 @@ +library(tidyverse) +library(dplyr) +magents <- read.csv("D:/coursework/week2/magnets.csv") +View(magents) + + +# Chapter 9 Section 1 +#Question 1 +avg_change <- magents |> mutate(change_avg = mean(change)) +view(avg_change) + +#Question 2 +str(magents) + +# The type of data in active is characters. The answers to the question is that active is factor not numeric. +# It's two different levels either 1 or 2. + +avg_change_for_active = mean(magents$change[1:29]) +avg_change_for_nonactive = mean(magents$change[30:50]) + +view(avg_change_for_active) +view(avg_change_for_nonactive) + + +# Question 4 +std_for_active = sd(magents$change[1:29]) +std_for_non_active = sd(magents$change[30:50]) + +print(std_for_active) +print(std_for_non_active) + +# Question 5 +#box_plot for active: + +boxplot(magents$change[1:29]) + +boxplot(magents$change[30:50]) #I see there are three outliers + + +# Chapter 10 Section 1 + +#Question 1: + +#Normal (3,2) ==> 3 is the mean & 2 is the variance + +mu <- 3 +variance <- 2 +std <- sqrt(2) + +x_bar <- rep(0, 10^3) +x_mid <- rep(0, 10^3) + +for(i in 1:10^3){ + x <- rnorm(100, mu, std) + x_bar[i] <- mean(x) + x_mid[i] <- median(x) +} + +x_bar +x_mid + +mean(x_bar) # mean of the means +mean(x_mid) # mean of the medians + +var(x_bar) #variance for x_bar +var(x_mid) # variance for x_mid + +# The variance of x_bar is going to be around 0.2, but the variance of x_mid is going to be around 0.3 + +#Question 2: + +min <- 0.5 +max <- 5 + +x_bar <- rep(0, 10^3) +x_mid <- rep(0, 10^3) + +for(i in 1:10^3){ + x <- runif(100, min, max) + x_bar[i] <- mean(x) + x_mid[i] <- 
median(x)
+}
+
+x_bar
+x_mid
+
+mean(x_bar) # mean of the means
+mean(x_mid) # mean of the medians
+
+var(x_bar) #variance for x_bar
+var(x_mid) # variance for x_mid
+
+
+# Chapter 10 Section 2
+library(tidyverse)
+population_data<- read.csv("D:/coursework/week2/pop2.csv")
+sample_population_data<- read.csv("D:/coursework/week2/ex2.csv")
+view(population_data)
+view(sample_population_data)
+
+
+#Question 1
+people_from_sample_with_high_bp = count(sample_population_data, group == "HIGH")
+ans <- people_from_sample_with_high_bp / 150
+print(ans)
+
+# Another sol:
+mean (sample_population_data$group == "HIGH")
+
+
+#Question 2:
+
+mean (population_data$group == "HIGH")
+
+
+#Question 3:
+
+p_hat <- rep(0, 10^3)
+
+for(i in 1: 10^3){
+  y <- sample(population_data$group, 150) # we are getting 150 as a sample size to simulate the sample that we have
+  p_hat[i] <- mean(y == "HIGH")  # fixed: store the sample proportion; assigning the raw length-150 logical vector into one slot is an error
+
+  # view(y)  # disabled: calling view() inside a 1,000-iteration loop would open 1,000 viewer tabs
+}
+mean(p_hat)
+
+# Question 4
+
+var(p_hat)
+
+# Question 5
+
+p <- mean (population_data$group == "HIGH")
+
+variance <- p * (1-p) / 150
+print(variance)
+
+# Chapter 2 Section 2 from Intro to Stat with Randomization and Simulation
+
+# Question a)
+
+Dead_from_control_group = 30/34
+
+Dead_from_treatment_group = 25/69
+
+
+# Question b) part 1)
+
+#H0 ==> the null hypothesis is that the treatment is not effective
+#HA ==> the alternative hypothesis is that the treatment is effective
+
+# Question b) part 2)
+
+# 28 alive, 75 dead, 69 received the treatment, 34 control group, the null hypothesis, the percentage of the alive in the group
+
+
+# Question C)
+
+41/99 #==> this suggests that the treatment is effective
+
+
+
+
+
+# Chapter 2 Section 6 from Intro to Stat with Randomization and Simulation:
+
+# Question 1
+
+#The hypotheses are:
+# H0: Yawning next to someone doesn't affect the person p_treatment = p_control
+# HA: (There is an association between being near a yawner and yawning) p_treatment does not equal to p_control
+
+# Question 2
+
+p_hat_treatment = 10/34
+p_hat_control =
4/16
+
+my_diff = p_hat_treatment - p_hat_control
+print(my_diff)
+# So our difference is 0.04411765
+
+# Question 3:
+
+# some notes from graph are:
+#The null distribution is approximately normally distributed around the zero (0.0)
+
+#Our observation is 0.04411765, so I located this on the x-axis
+
+# The p-value = p(p_hat_treatment - p_hat_control >= 0.044) + p(p_hat_treatment - p_hat_control <= -0.044)
+# This is because my hypothesis is double-sided
+
+#This p-value will be around the zero, and since the largest bars (large proportion) of the distribution lies between -0.1 and +0.1
+
+#let's guess that 4,000 to 6,000 lies in this distribution
+
+#p-value = (4000 to 6000) / 10,000 = 0.4 to 0.6
+
+#So, I would say that the p-value would be larger than 5% anyway, which would not be enough to reject the null hypothesis.
+
+
+# Chapter 3 Section 1 from Intro to Stat with Randomization and Simulation
+
+#Question 1:
+
+#The answer is false because of the central limit theorem conditions: n >= 30, np >= 10 & n(1-p) >= 10
+
+
+#------------------------------------------------------------------------
+
+#Chapter 9 Section 2 from Introduction to Statistical Thinking
+
+# Question 1
+# the goal we are trying to do replication to test the statistic mentioned in the question
+mu <- 3.5
+sig1 <- 3
+sig2 <- 1.5
+stat_test <- rep(0, 10^3)
+
+for(i in 1:10^3){
+  x1 <- rnorm(29,mu, sig1)
+  x2 <- rnorm(21, mu, sig2)
+  x1_bar <- mean(x1)
+  x1_variance <- var(x1)
+  x2_bar <- mean(x2)
+  x2_variance <- var(x2)
+  stat_test[i] <- (x1_bar - x2_bar) / sqrt((x1_variance/29) + (x2_variance/21))
+}
+quantile(stat_test, c(0.025, 0.975), na.rm = TRUE)
+
+#The answer would be an interval between the two numbers [-2.018366, 2.011073] This is just an example because every time the code would
+# run, the number would be different, depending on the samples that would be generated
+
+
+#Question 2
+x1_bar <- mean(magents$change[1:29])
+x2_bar <- mean(magents$change[30:50])
+
+x1_variance <-
var(magents$change[1:29])
x2_variance <- var(magents$change[30:50])

# Observed test statistic for the real data, on the same formula as the simulation
ans <- (x1_bar - x2_bar) / sqrt((x1_variance/29) + (x2_variance/21))
print(ans)

# The value of 5.985601 doesn't fall in this interval


#------------------------------------------------------------------------

#Chapter 12 Section 1

#Question 1

view(magents)

# The null Hypothesis H0: the placebo doesn't have an effect (E(X) = 0)
# The alternative Hypothesis HA: the placebo's effect is not zero (the placebo has an effect) (E(X) != 0)

# Note: this is going to be a two-sided test

#Question 2

# The observations used to test this null hypothesis are the people who received placebo as a treatment.
# In other words, the observations that have active == '2'; i.e. rows 30 to 50 ==> magents$change[30:50]


#Question 3

t.test(magents$change[30:50])

# By default, t.test compares the mean against zero (mu = 0).

# The result from running the t-test:

#One Sample t-test

#data: magents$change[30:50]

#t = 3.1804, df = 20, p-value = 0.004702
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# 0.3768845 1.8135916
#sample estimates:
#mean of x
# 1.095238


# Since the p-value is less than 0.05, we reject the null hypothesis, which would mean that the placebo might have an effect.


#------------------------------------------------------------------------

#Chapter 13 Section 1

#Question 1

# To compare the expectation of the reported pain score between two groups, we use the Welch Two Sample t-test.
# We compare score1 grouped by active to differentiate between group 1 (actual treatment) and group 2 (placebo)

# This test gives the difference between the means of the two groups, the standard error of the difference,
# the t-statistic (how extreme the difference is, given the null hypothesis that the means are equal), and the p-value to decide.
t.test(magents$score1 ~ magents$active)

# Since the p-value = 0.6806, which is greater than 0.05, we fail to reject the null hypothesis that the difference between the means = 0.

#Question 2

# Same idea as question 1 but for the variance this time

var.test(magents$score1 ~ magents$active)

# Since the p-value = 0.3687, which is greater than 0.05, we fail to reject the
# null hypothesis. NOTE: var.test's null hypothesis is that the *ratio* of the
# two variances equals 1 (an F test), not that their difference equals 0.


#Question 3

# Same idea as the last two questions but using `change` this time, because that is the after-treatment measurement

t.test(magents$change ~ magents$active)

# Since the p-value is very small (less than 0.05), we reject the null hypothesis that the two groups
# (control and treatment) have the same mean of change after the treatment.


#Question 4

var.test(magents$change ~ magents$active)

# Since the p-value is very small (less than 0.05), we reject the null hypothesis that the two groups
# (control and treatment) have the same variance (variance ratio = 1) after the treatment.



#------------------------------------------------------------------------

#Chapter 5 Exercises 5.20 from Introduction to Statistical Thinking:-

# Question A:

# Equation for any line: y = mx + b
# m represents the slope = r (correlation coefficient) * (standard deviation of height (y-axis) / standard deviation of shoulder girth (x-axis))

0.67 * (9.41 / 10.37) #==> slope ==> m = 0.6079749

# b is the intercept = (mean of height) - slope * (mean of shoulder girth)

171.14 - 0.6079749 * 108.2 # the intercept ==> b = 105.3571

# Final equation of the regression line: y = 0.6079749x + 105.3571

# Question B:

# 0.6079749 ==> slope ==> for each additional cm in shoulder girth, 0.6079749 cm is added to the predicted height

# 105.3571 ==> intercept ==> when the shoulder girth equals zero, the predicted height is 105.3571.
# The intercept is very hard to interpret in reality (no one has a shoulder girth of zero)

# Question C:

0.67^2
# R^2 = r^2 = (0.67)^2 = 0.4489 ==> about 45% of the variation in height can be explained by the linear relationship with shoulder girth


# Question D:
# Predicted height for a shoulder girth of 100 cm, using the fitted line
0.6079749 * 100 + 105.3571

# His height would be about 166.1546 cm


# Question E:

# Residual = Actual - Predicted ==> 160 - 166.16 = -6.16 ==> the model overestimated the student's height by 6.16 cm


# Question F:

# I don't think so, because a shoulder girth of 56 cm is far outside the range of the points in the graph
# (extrapolation). So I don't believe his height would be well predicted by this model.


#------------------------------------------------------------------------

#Chapter 5 Exercises 29 from Introduction to Statistical Thinking:-

#Question A:

# There is a positive linear relationship between weight and height. There are some outliers.

#Question B:

# Regression line equation: y = 1.0716x - 105.0113

# Slope ==> 1.0716 ==> with each additional cm in height, there is a 1.0716 kg increase in predicted weight
# Intercept ==> at a height of zero, the predicted weight is -105.0113. This is unrealistic to interpret,
# because no one has a height of zero or a negative weight.

#Question C:

# Null hypothesis H0: the height has no effect on the weight
# Alternative Hypothesis HA: the height has an effect on the weight
# The p-value is (approximately) zero, clearly less than 0.05 (sig level). This means we reject the null hypothesis.
+# So, indeed, there is a realtionship between height and weight + +#Question D: + +0.72^2 + +# R^2 = (0) = (0.72)^2 = 0.5184 ==> this means about 51% of the change (variability) in weight is explained by the height + + + diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd index 3003fdbd2..f893a1507 100644 --- a/week2/diamond-sizes.Rmd +++ b/week2/diamond-sizes.Rmd @@ -3,6 +3,11 @@ title: "Diamond sizes" date: 2016-08-25 output: html_document --- +```{r, echo=FALSE} +knitr::opts_chunk$set( + echo = FALSE +) +``` ```{r setup, include = FALSE} library(ggplot2) @@ -21,4 +26,33 @@ below: smaller %>% ggplot(aes(carat)) + geom_freqpoly(binwidth = 0.01) +``` + + + +Stricking features are that there are only `r nrow(diamonds) - nrow(smaller)`, which are larger than 2.5 carats. +Also, the majority of the diamonds are less than 1 carat + + + + + + +```{r} +diamonds %>% ggplot(aes(x = carat, color = cut, fill = cut)) + +geom_histogram(bins = 30) +``` + + + +```{r} +diamonds %>% ggplot(aes(x = carat, color = color, fill = color)) + +geom_histogram(bins = 30) +``` + + + +```{r} +diamonds %>% ggplot(aes(x = carat, color = clarity, fill = clarity)) + +geom_histogram(bins = 30) ``` \ No newline at end of file diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html new file mode 100644 index 000000000..420d8c82f --- /dev/null +++ b/week2/diamond-sizes.html @@ -0,0 +1,418 @@ + + + + + + + + + + + + + + +Diamond sizes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

We have data about 53940 diamonds. Only 126 are larger than 2.5 +carats. The distribution of the remainder is shown below:

+

+ +

Striking features are that only 126 diamonds are larger than 2.5
carats. Also, the majority of the diamonds are less than 1 carat.

+ + +

+ +

+ +

+ + + + +
+ + + + + + + + + + + + + + + diff --git a/week3/Week3_Excercises.R b/week3/Week3_Excercises.R new file mode 100644 index 000000000..e3f42034a --- /dev/null +++ b/week3/Week3_Excercises.R @@ -0,0 +1,298 @@ +library(tidyverse) +library(MASS) +library(ISLR) + +#Reproducing the table in ISRS 5.29 from the bodydata + +bodydata <- read.table('D:/coursework/week3/body.dat.txt', header = TRUE, sep = ' ', stringsAsFactors = FALSE) +view(bodydata) + +bodydata <- bodydata |> rename("Height" = X174.0, "Weight" = X65.6) + +ggplot(bodydata, aes(x = Height, y = Weight)) + geom_point(color = "lightblue") + +lm.fit <- lm(Weight ~ Height, data = bodydata) + +summary(lm.fit) + + +#------------------------------------------------------------------------- + +# Lab 3.6.3 + +library(MASS) +library(ISLR) + +view(Boston) + +lm.fit <- lm(medv ~ lstat + age, data = Boston) #lm(y ∼ x1 + x2 + x3) +summary(lm.fit) + +lm.fit <- lm(medv ~ ., data = Boston) +summary(lm.fit) +str(Boston) +install.packages('car') +library(car) + +vif(lm.fit) # vif for all the variables + +lm.fit1 <- lm(medv ~ . - age, data = Boston) # for all the variables except the age + +vif(lm.fit1) +summary(lm.fit1) + + +lm.fit1 <- update(lm.fit, ~ . - age) # Another way to do the model using every variable except the age using a model you did before with all the variables + +#------------------------------------------------------------------------- + + +# Lab 3.6.4 + +#The syntax lstat:age tells R to include an interaction term between lstat and age. 

# lstat * age simultaneously includes lstat, age, and the interaction term lstat×age as predictors

summary(lm(medv ~ lstat * age, data = Boston))

#-------------------------------------------------------------------

# Lab 3.6.5

# BUG FIX: this script never calls attach(Boston), so every lm() call below
# needs an explicit `data = Boston` argument — otherwise R stops with
# "object 'medv' not found".
lm.fit2 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(lm.fit2)

# anova() further quantifies the extent to which the quadratic fit is superior to the linear fit

lm.fit <- lm(medv ~ lstat, data = Boston)
anova(lm.fit, lm.fit2) # compares the two models (linear vs quadratic) ==> prints the Analysis of Variance table

# The null hypothesis is that the two models fit the data equally well; the alternative is that the full model is superior

lm.fit3 <- lm(medv ~ lstat + I(lstat^2) + I(lstat^3), data = Boston) # out of curiosity: how does the cubic term compare

par(mfrow = c(2, 2))
plot(lm.fit2)
plot(lm.fit)
plot(lm.fit3)


lm.fit5 <- lm(medv ~ poly(lstat, 5), data = Boston)

plot(lm.fit5)
summary(lm.fit5)
summary(lm(medv ~ log(rm), data = Boston))

#-------------------------------------------------------------------

# Lab 3.6.6

head(Carseats)
view(Carseats)

lm.fit <- lm(Sales ~ . 
+ Income:Advertising + Price:Age, data = Carseats) +summary(lm.fit) +attach(Carseats) # makes the columns names avaliable to use without needing to mention the dataset everytime + +contrasts(ShelveLoc) # Shows the coding in R for qualitative variables + + +#------------------------------------------------------------------- + +# Exercises 6.1 + +babyweights <- read.table('D:/coursework/week3/babyweights.txt') +view(babyweights) + +# Question a) + +#Avg weight of the baby = -8.94 * smoke + 123.05 + +# Question b) + +# Slope (-8.94) ==> means that if the mother is a smoker, there will be a decrease of 8.94 in the baby's weight + +123.05 - 8.94 +# the baby weight for a smoker = 114.11 +# the baby weight for a non-smoker = 123.05 + + +# Question c) +# Yes, because the p-value for smoke is less than 5% ==> then the variable is statistically sig +# This will lead to the fact that there is a realtionship between smoking and average weight for the baby + + +#Replication part from actual data + +lm.fit1 <- lm(bwt ~ smoke, data = babyweights) +summary(lm.fit1) + +#------------------------------------------------------------------- + +# Exercises 6.2 + +# Question a) + +# Avg weight of the baby = -1.93 * parity + 120.07 + +# Question b) + +# Slope (-1.93) ==> means that if the baby is the first born, there will be a decrease of 1.93 in the average baby's weight + + +120.07 - 1.93 +# the baby weight if the baby is first-born = 118.14 +# the baby weight if the baby isn't first-born = 120.07 + +# Question c) +# Yes, because the p-value for smoke is greater than 5% ==> then the variable isn't statistically sig +# This will lead to the fact that there isn't a clear realtionship between parity and average weight for the baby + + +#Replication part from actual data + +lm.fit2 <- lm(bwt ~ parity, data = babyweights) +summary(lm.fit2) +#------------------------------------------------------------------- + + +# Exercises 6.3 + +# Question a) + +# Avg weight of the baby = 0.44 * 
gestation - 3.33 * parity - 0.01 * age + 1.15 * height + 0.05 weight - 8.4 * smoke - 80.41 + + +# Question b) + +# Slope for gestation (0.44) ==> if the length of the pregnancy increased by a day, there will be 0.44 ounces added to the average weight +# Slope for age (-0.01) ==> if the mother's age increased by 1 year, there will be 0.01 ounces decreament in the avg weight + +#Question c) + +# This might be explained by the collinearity among the variables. Also, each new variable added to the model would have an effect on the other variables + + +#Question d) + +0.44 * 284 - 3.33 * 0 - 0.01 * 27 + 1.15 * 62 + 0.05 * 100 - 8.4 * 0 - 80.41 +# predicted value : 120.58 +# Acutal avg weight : 120 + +# The residual = 120 - 120.58 = -0.58 + + +#Question E) + +# R^2 = 1- variance of residuals / variance of the birth weights + +1- 249.28/332.57 + +# R^2 = 0.2504435 + +# Adj R^2 = 1- variance of residuals / variance of the birth weights * (n-1) / n-k-1 +# n ==> number of obs ==> 1236 +# K ==> number of independent variables (predictors) ==> 6 + +1- (249.28/332.57 * (1236-1)/ (1236 - 6 -1) ) + +# Adj R^2 = 0.2467842 + + +#Replication part from actual data + +lm.fit3 <- lm(bwt ~ ., data = babyweights) +summary(lm.fit3) + + +#------------------------------------------------------------------- +# Lab 5.3.1 The Validation Set Approach + +#set.seed() function in order to set a seed for seed R’s random number generator +install.packages('ISLR2') +library(ISLR2) +set.seed(1) +train <- sample(392, 196) #sample is for splitting the data into havles +lm.fit <- lm(mpg ~ horsepower , data = Auto, subset = train) # model between mpg and horsepower +summary(lm.fit) + +#The mean() function to calculate the MSE of the 196 observations in the validation set + +attach(Auto) +mean((mpg - predict(lm.fit, Auto))[-train]^2) + +lm.fit2 <- lm(mpg ~ poly(horsepower , 2), data = Auto, subset = train) + +mean((mpg - predict(lm.fit2, Auto))[-train]^2) # get the MSE for this model + +lm.fit3 <- lm(mpg ~ 
poly(horsepower , 3), data = Auto, subset = train) + +summary(lm.fit3) + +mean((mpg - predict(lm.fit3, Auto))[-train]^2) + +set.seed(2) +train <- sample(392, 196) + +lm.fit <- lm(mpg ~ horsepower , subset = train) + +mean((mpg - predict(lm.fit, Auto))[-train]^2) + +lm.fit2 <- lm(mpg ~ poly(horsepower , 2), data = Auto, subset = train) +summary(lm.fit2) +mean((mpg - predict(lm.fit2 , Auto))[-train]^2) + +lm.fit3 <- lm(mpg ~ poly(horsepower , 3), data = Auto, subset = train) + +mean((mpg - predict(lm.fit3, Auto))[-train]^2) + +#------------------------------------------------------------------- +# Lab 5.3.2 The Validation Set Approach + +# LOOCV (Leave-One-Out Cross-Validation) estimate can be automatically computed for any generalized linear model using the glm() and cv.glm() functions + +glm.fit <- glm(mpg ~ horsepower , data = Auto) # this just work as lm +summary(glm.fit) +coef(glm.fit) # give coffient and slope for the variables in the model + +# Just to prove the idea that glm in this case works as lm + +lm.fit <- lm(mpg ~ horsepower , data = Auto) +summary(lm.fit) +coef(lm.fit) + +# Final Notes: + +# glm is more for generalized linear model when we specify a family argument. Ex: family = binomial +# lm is for linear models +# cv.glm() is used to perform cross-validation for generalized linear models. 
+ + +library(boot) +glm.fit <- glm(mpg ~ horsepower , data = Auto) +summary(glm.fit) +coef(glm.fit) +cv.err <- cv.glm(Auto, glm.fit) +cv.err$delta + + +cv.error <- rep(0, 10) +for (i in 1:10) { +glm.fit <- glm(mpg ~ poly(horsepower , i), data = Auto) +cv.error[i] <- cv.glm(Auto, glm.fit)$delta[1] +} + +cv.error + +#------------------------------------------------------------------- +# Lab 5.3.3 The Validation Set Approach + +#The cv.glm() function can also be used to implement k-fold CV + +set.seed(17) +cv.error.10 <- rep(0, 10) +for (i in 1:10) { + glm.fit <- glm(mpg ~ poly(horsepower , i), data = Auto) + cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1] + } +cv.error.10 # to get the error for our models diff --git a/week3/allbut.pl b/week3/allbut.pl new file mode 100644 index 000000000..a22afad08 --- /dev/null +++ b/week3/allbut.pl @@ -0,0 +1,35 @@ +#!/usr/bin/env perl + +# get args +if (@ARGV < 3) { + print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n"; + exit 1; +} +$basename = shift; +$start = shift; +$stop = shift; +$maxtest = shift; + +# open files +open( TESTFILE, ">$basename.test" ) or die "Cannot open $basename.test for writing\n"; +open( BASEFILE, ">$basename.train" ) or die "Cannot open $basename.train for writing\n"; + +# init variables +$testcnt = 0; + +while (<>) { + ($user) = split /::/, $_, 2; + if (! 
defined $ratingcnt{$user}) { + $ratingcnt{$user} = 1; + } else { + ++$ratingcnt{$user}; + } + if (($testcnt < $maxtest || $maxtest <= 0) + && $ratingcnt{$user} >= $start && $ratingcnt{$user} <= $stop) { + ++$testcnt; + print TESTFILE; + } + else { + print BASEFILE; + } +} diff --git a/week3/movielens.Rmd b/week3/movielens.Rmd index 78a442d9c..fb0b9d473 100644 --- a/week3/movielens.Rmd +++ b/week3/movielens.Rmd @@ -13,6 +13,8 @@ output: library(scales) library(tidyverse) library(knitr) +library(ggplot2) +library(patchwork) # set plot theme theme_set(theme_bw()) @@ -35,6 +37,10 @@ head(ratings) %>% kable() ```{r dist-ratings} # plot the distribution of rating values https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=26 + +ratings |> ggplot(aes(x = rating)) + geom_histogram(binwidth = 0.5, fill = "skyblue", color = "white") + +labs(x = "Rating" , y = "Number of Ratings", title = "How many ratings are there at each level?") + +scale_y_continuous(labels = scales::comma) ``` ## Per-movie stats @@ -42,16 +48,53 @@ head(ratings) %>% kable() ```{r aggregate-by-movie} # aggregate ratings by movie, computing mean rating and number of ratings # hint: use the n() function for easy counting within a group + +movie_summary <- ratings |> group_by(movie_id) |> summarise(mean_rating = mean(rating), num_ratings = n()) |> +arrange(desc(num_ratings)) + +view(movie_summary) ``` ```{r dist-movie-popularity} # plot distribution of movie popularity (= number of ratings the movie received) # hint: try scale_x_log10() for a logarithmic x axis + +movie_summary |> ggplot(aes(x = num_ratings)) + +geom_histogram() + +scale_x_log10() + +labs( + title = "Distribution of Movie popularity", + x = "Number of Ratings", + y = "Number of Movies" +) ``` ```{r dist-mean-ratings-by-movie} # plot distribution of mean ratings by movie https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=28 # hint: try geom_histogram and 
geom_density + + +hist_plot <- movie_summary |> ggplot(aes(x = mean_rating)) + +geom_histogram(alpha = 0.7) + +labs( + title = "Distribution of mean ratings by movie", + x = "Mean Rating by Movie", + y = "Number of Movies" +) + +density_plot <- movie_summary |> ggplot(aes(x = mean_rating)) + +geom_density(fill = "#232222", size = 1) +labs( + title = "Distribution of mean ratings by movie", + x = "Mean Rating by Movie", + y = "Number of Movies" +) + + +hist_plot + density_plot + + + ``` ```{r cdf-movie-pop} @@ -59,7 +102,30 @@ head(ratings) %>% kable() # hint: use dplyr's rank and arrange functions, and the base R sum and cumsum functions # store the result in a new data frame so you can use it in creating figure 2 from the paper below + +#rank movies by popularity & CDF part + +movie_cdf <- movie_summary |> arrange(desc(num_ratings)) |> +mutate( + rank = row_number(), + cum_ratings = cumsum(num_ratings), # cumulative sum of number of ratings + total_ratings = sum(num_ratings), # This is a constant anyway because we know how many ratings we got overall + cdf = cum_ratings / total_ratings +) + # plot the CDF of movie popularity + +movie_cdf |> ggplot(aes(x = rank, y = cdf)) + +geom_line(size = 1) + +scale_y_continuous(labels = label_percent())+ +labs( + title = "What fraction of ratings are given to the most popular movies?", + x = "Rank", + y = "CDF" +) + + + ``` @@ -67,11 +133,22 @@ head(ratings) %>% kable() ```{r aggregate-by-user} # aggregate ratings by user, computing mean and number of ratings +user_summary <- ratings |> group_by(user_id) |> summarise(mean_rating = mean(rating), num_ratings = n()) |> +arrange(desc(num_ratings)) +view(user_summary) ``` ```{r dist-user-activity} # plot distribution of user activity (= number of ratings the user made) # hint: try a log scale here + +user_summary |> ggplot(aes(x = num_ratings)) + +geom_histogram() + +scale_x_log10()+ +labs( + title = "Distribution of user activity", + x = "Number of ratings by users", +) ``` # 
Anatomy of the long tail @@ -91,4 +168,94 @@ head(ratings) %>% kable() # paper, produce one curve for the 100% user satisfaction level and # another for 90%---do not, however, bother implementing the null # model (shown in the dashed lines). + +view(movie_cdf) +head(ratings, 50) + +total_number_of_users = n_distinct(ratings$user_id) +print(total_number_of_users) + + +movie_summary_with_rank <- ratings |> group_by(movie_id) |> summarise(mean_rating = mean(rating), num_ratings = n()) |> +arrange(desc(num_ratings)) + + + +movie_summary_with_rank <- movie_summary_with_rank |> mutate(rank = row_number()) + +view(movie_summary_with_rank) + + +user_movies <- ratings |> group_by(user_id) + + +total_users_movies_rank <- left_join(user_movies, movie_summary_with_rank, by ="movie_id") + +user_weirdest_movie_rank <- total_users_movies_rank |> group_by(user_id) |> summarise(rank_of_weirdes_movie = max(rank)) |> arrange(rank_of_weirdes_movie) + +view(user_weirdest_movie_rank) + +number_of_users_watch_this_weird_movie <- user_weirdest_movie_rank |> group_by(rank_of_weirdes_movie) |> summarise(number_users_for_100 = n()) + +view(number_of_users_watch_this_weird_movie) + + +satisfied_customers <- number_of_users_watch_this_weird_movie |> +mutate( + satisfied_customers_num_for_100 = cumsum(number_users_for_100), + satisfied_customers_num_percent_for_100 = satisfied_customers_num_for_100 /total_number_of_users +) + +view(satisfied_customers) + +satisfied_customers |> ggplot(aes(x = rank_of_weirdes_movie, y = satisfied_customers_num_percent_for_100)) + geom_line()+ +labs( + x = "Inventory Size", + y = "Percent of the user" +) + + + +#Start the 90% satisfaction rate + +total_users_movies_rank <- left_join(user_movies, movie_summary_with_rank, by ="movie_id") + +ninety_qunatile_try <- total_users_movies_rank |> group_by(user_id) |> summarise(ninety_qunatile = quantile(rank, 0.9)) |> arrange(ninety_qunatile) + + +number_of_user_watch_this_ninety <- ninety_qunatile_try |> 
group_by(ninety_qunatile) |> summarise(number_users_ninety = n()) + +view(number_of_user_watch_this_ninety) + +satisfied_customers_ninety_percent <- number_of_user_watch_this_ninety |> +mutate( + satisfied_customers_num_for_90 = cumsum(number_users_ninety), + satisfied_customers_num_percent_for_90 = satisfied_customers_num_for_90 /total_number_of_users +) + +view(satisfied_customers_ninety_percent) + +satisfied_customers_ninety_percent |> ggplot(aes(x = ninety_qunatile, y = satisfied_customers_num_percent_for_90)) + geom_line() + +labs( + x = "Inventory Size", + y = "Percent of the user" +) + +#final_graph + +ggplot() + +geom_line(data = satisfied_customers_ninety_percent, aes(x = ninety_qunatile, y = satisfied_customers_num_percent_for_90, color = "90% Satisfaction"), size = 1) + +geom_line(data = satisfied_customers, aes(x = rank_of_weirdes_movie, y = satisfied_customers_num_percent_for_100, color = "100% Satisfaction"), size = 1) + +scale_color_manual(values = c("100% Satisfaction" = "Blue", "90% Satisfaction" = " red")) + +labs(color = "Level of Satisfaction") + +labs( + x = "Inventory Size", + y = "Percent of the user" +) + + + + + + ``` diff --git a/week3/ngrams/01_download_1grams.sh b/week3/ngrams/01_download_1grams.sh index 1d6d5bf10..30a07f6ce 100644 --- a/week3/ngrams/01_download_1grams.sh +++ b/week3/ngrams/01_download_1grams.sh @@ -2,6 +2,9 @@ # use curl or wget to download the version 2 1gram file with all terms starting with "1", googlebooks-eng-all-1gram-20120701-1.gz + +curl -O "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-1.gz" + # update the timestamp on the resulting file using touch # do not remove, this will keep make happy and avoid re-downloading of the data once you have it touch googlebooks-eng-all-1gram-20120701-1.gz diff --git a/week3/ngrams/02_filter_1grams.sh b/week3/ngrams/02_filter_1grams.sh index 3b8e9ec29..4f501173d 100644 --- a/week3/ngrams/02_filter_1grams.sh +++ 
b/week3/ngrams/02_filter_1grams.sh @@ -1,6 +1,13 @@ #!/bin/bash # filter original 1gram file googlebooks-eng-all-1gram-20120701-1.gz to only lines where the ngram exactly matches a year (18xx, 19xx, or 20xx, where x is a digit) + + # decompress the first using gunzip, zless, zcat or similar +gunzip -k googlebooks-eng-all-1gram-20120701-1.gz + # then filter out rows that match using grep -E, egrep, awk, or similar # write results to year_counts.tsv + +grep -E '18[0-9][0-9]|19[0-9][0-9]|20[0-9][0-9]' googlebooks-eng-all-1gram-20120701-1 > year_counts.tsv + diff --git a/week3/ngrams/03_download_totals.sh b/week3/ngrams/03_download_totals.sh index f53381e8e..934a0cdd5 100644 --- a/week3/ngrams/03_download_totals.sh +++ b/week3/ngrams/03_download_totals.sh @@ -2,6 +2,8 @@ # use curl or wget to download the version 2 of the total counts file, googlebooks-eng-all-totalcounts-20120701.txt +curl -O "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-totalcounts-20120701.txt" + # update the timestamp on the resulting file using touch # do not remove, this will keep make happy and avoid re-downloading of the data once you have it touch googlebooks-eng-all-totalcounts-20120701.txt diff --git a/week3/ngrams/04_reformat_totals.sh b/week3/ngrams/04_reformat_totals.sh index 0445c1ff6..89572e18d 100644 --- a/week3/ngrams/04_reformat_totals.sh +++ b/week3/ngrams/04_reformat_totals.sh @@ -3,3 +3,8 @@ # reformat total counts in googlebooks-eng-all-totalcounts-20120701.txt to a valid csv # use tr, awk, or sed to convert tabs to newlines # write results to total_counts.csv + + +awk 'BEGIN { OFS = "\n" } { $1=$1; print }' googlebooks-eng-all-totalcounts-20120701.txt > total_counts.csv + + diff --git a/week3/ngrams/05_final_report.Rmd b/week3/ngrams/05_final_report.Rmd index a7c90c1fb..de601c9f1 100644 --- a/week3/ngrams/05_final_report.Rmd +++ b/week3/ngrams/05_final_report.Rmd @@ -15,6 +15,7 @@ output: library(here) library(scales) library(tidyverse) +library(ggplot2) 
theme_set(theme_bw()) @@ -53,13 +54,32 @@ Load in the `year_counts.tsv` and `total_counts.csv` files. Use the `here()` fun ```{r load-counts} +total_counts_r <- read_csv('total_counts.csv', + col_names = c('year','total_volume','page_count','book_count')) + +year_counts_r <- read_tsv('year_counts.tsv', + col_names = c('term', 'year', 'volume', 'book_count')) ``` ## Your written answer Add a line below using Rmarkdown's inline syntax to print the total number of lines in each dataframe you've created. +```{r number_lines} + +number_lines_in_total_file = nrow(total_counts_r) + +number_lines_in_years_file = nrow(year_counts_r) + +print(paste("Total number of lines in total counts: " , number_lines_in_total_file)) + +print(paste("Total number of lines in years file: " , number_lines_in_years_file)) + + +``` + + # Part B > Recreate the main part of figure 3a of Michel et al. (2011). To recreate this figure, you will need two files: the one you downloaded in part (a) and the “total counts” file, which you can use to convert the raw counts into proportions. Note that the total counts file has a structure that may make it a bit hard to read in. Does version 2 of the NGram data produce similar results to those presented in Michel et al. (2011), which are based on version 1 data? 
@@ -70,6 +90,18 @@ Join the raw year term counts with the total counts and divide to get a proporti ```{r join-years-and-totals} +head(total_counts_r) + +years_and_total <- left_join(year_counts_r, total_counts_r, by = "year") + + + +years_and_total <- years_and_total |> mutate( + proportions = volume / total_volume +) + +head(years_and_total) + ``` @@ -79,12 +111,26 @@ Plot the proportion of mentions for the terms "1883", "1910", and "1950" over ti ```{r plot-proportion-over-time} + +years_and_total <- years_and_total |> filter(term == "1883" | term == "1910" | term == "1950") |> filter(year >= "1850" & year <= "2012") + +head(years_and_total) + +years_and_total |> ggplot(aes(x = year, y = proportions * 10000, color = term)) + geom_line() + +labs( + x = "Year", + y = "Frequency" +) + ``` ## Your written answer Write up your answer to Part B here. +In our graph which is based on version 2, the peaks are a little bit higher than the lines in graph on the Ngram viewer, especially 1950. Also, from what I see is that the graph in our version starts right before the year itself not 10 years compared to the graph in the Ngram version. + + # Part C > Now check your graph against the graph created by the [NGram Viewer](https://books.google.com/ngrams/). @@ -93,6 +139,10 @@ Write up your answer to Part B here. Go to the ngram viewer, enter the terms "1883", "1910", and "1950" and take a screenshot. +`![](ngrams 1883.png)` + +Added to what I am menioned in part b, the scales are different on y-axis + ## Your written answer Add your screenshot for Part C below this line using the `![](figure_filename.png)` syntax and comment on similarities / differences. 
@@ -108,6 +158,15 @@ Plot the raw counts for the terms "1883", "1910", and "1950" over time from 1850 ```{r plot-raw-mentions-over-time} +head(years_and_total) + +years_and_total |> ggplot(aes(x = year, y = volume , color = term)) + geom_line() + +scale_y_continuous(labels = comma) + +labs( + x = "Year", + y = "Frequency" +) + ``` # Part E @@ -122,12 +181,29 @@ Plot the total counts for each year over time, from 1850 to 2012. Use the `comma ```{r plot-totals} +view(total_counts_r) + +total_counts_r_filtered <- total_counts_r |> filter(year >= "1850" & year <= "2012") + + + +view(total_counts_r_filtered) + +total_counts_r_filtered |> ggplot(aes(x = year, y = total_volume)) + geom_line() + +scale_y_continuous(labels = comma) + +labs( + x = "Year", + y = "Total count" +) + ``` ## Your written answer Write up your answer to Part E here. +Yes, this leads to reevaluate any of the results of Michel et al. (2011). I believe now that the people are not forgetting about the year based on this graph. There is a upward trend across the different the three lines for the three years after they reach the peak and go down a little bit. + # Part F > Now, using the proportion of mentions, replicate the inset of figure 3a. That is, for each year between 1875 and 1975, calculate the half-life of that year. The half-life is defined to be the number of years that pass before the proportion of mentions reaches half its peak value. Note that Michel et al. (2011) do something more complicated to estimate the half-life—see section III.6 of the Supporting Online Information—but they claim that both approaches produce similar results. Does version 2 of the NGram data produce similar results to those presented in Michel et al. (2011), which are based on version 1 data? (Hint: Don’t be surprised if it doesn’t.) 
@@ -138,6 +214,43 @@ For each year term, find the year where its proportion of mentions peaks (hits i ```{r compute-peaks} +all_terms_with_total <- left_join(year_counts_r, total_counts_r, by = "year") + +all_terms_with_total <- all_terms_with_total |> mutate( + proportions = volume / total_volume +) + +str(all_terms_with_total) + + +all_terms_with_total <- all_terms_with_total |> filter(grepl("^(18|19|20|21)[0-9]{2}$", term)) + +view(all_terms_with_total) + + +#All the terms with peak year and peak value +all_terms_with_peak_year_and_peak_value <- all_terms_with_total |> filter(year >= "1850" & year <= "2012") |> group_by(term) |> +slice_max(order_by = proportions, n = 1, with_ties = FALSE) |> rename(peak_year = year, peak_proportion = proportions)|> +select(term, peak_year, peak_proportion) + + +view(all_terms_with_peak_year_and_peak_value) + + + +# The three terms + +all_terms_with_peak_year_and_peak_value |> filter(term %in% c(1883, 1910, 1950)) + + +# It's the numbers but the values are rounded when they are saved in the dataframes + +view(years_and_total) +years_and_total_highest_value_for_each_term <- years_and_total |> group_by(term) |> filter(proportions == max(proportions)) |> select(term, year, proportions) +view(years_and_total_highest_value_for_each_term) + + + ``` ## Compute half-lifes @@ -146,6 +259,28 @@ Now, for each year term, find the minimum number of years it takes for the propo ```{r compute-half-lifes} + +joined_all_terms_with_peak_year_and_peak_value <- all_terms_with_total |> +inner_join(all_terms_with_peak_year_and_peak_value, by = "term") |> +filter(year > peak_year) + +view(joined_all_terms_with_peak_year_and_peak_value) + + +half_life_all_terms <- joined_all_terms_with_peak_year_and_peak_value |> +filter(proportions <= 0.5 * peak_proportion) |> +group_by(term) |> +slice_min(order_by = year, n = 1, with_ties = FALSE) |> +mutate(years_to_half = year - peak_year, half_year = year) |> +select(term, half_year, peak_year, years_to_half, 
peak_proportion, year) + + +view(half_life_all_terms) + +# To view the three main years "terms" + +half_life_all_terms |> filter(term %in% c(1883, 1910, 1950)) + ``` ## Plot the inset of figure 3a @@ -155,16 +290,85 @@ Plot the half-life of each term over time from 1850 to 2012. Each point should r ```{r plot-half-lifes} +half_life_all_terms <- half_life_all_terms |> mutate(highlight = ifelse(term %in% c("1883", "1910", "1950"), term, "Other")) + +view(half_life_all_terms) + + +half_life_all_terms |> filter(term %in% c(1883, 1910, 1950)) + + +half_life_all_terms |> ggplot(aes(x = year, y = years_to_half, color = highlight)) + +geom_point(size = 4) + +geom_smooth(method = "loess", se = FALSE, color = "black") + +scale_x_continuous(limits = c(1850, 2012)) + +scale_color_manual(values = c("1883" = "blue", "1910" = "#034503", "1950" = "red")) + +labs( + x = "Year", + y = "Half-life (yrs)", + color = "Terms" +) + ``` ## Your written answer Write up your answer to Part F here. +No, they don't look the same as the graph in version 1. In version 1, there is a downward trend in the half-life years over the years. +The slope in version 1 is clearly negative. + # Part G > Were there any years that were outliers such as years that were forgotten particularly quickly or particularly slowly? Briefly speculate about possible reasons for that pattern and explain how you identified the outliers. +Yes, there were some terms that have a high half-life, which means that they were slow to be forgotten. +Also, there were some terms that have a very short half-life, close to zero. 
Below is the code that shows how I identified the outliers + +```{r outliers} + +view(half_life_all_terms) + +str(half_life_all_terms) +#Note about quantile: it needs a vector to be passed, not a dataframe +Q1_half_life <- quantile(half_life_all_terms$years_to_half, 0.25) + +Q3_half_life <- quantile(half_life_all_terms$years_to_half, 0.75) + +IQR_half_life <- Q3_half_life - Q1_half_life + +lower_bound <- Q1_half_life - 1.5* IQR_half_life +upper_bound <- Q3_half_life + 1.5 * IQR_half_life + +outliers <- half_life_all_terms |> filter(years_to_half < lower_bound | years_to_half > upper_bound) + +print(min(half_life_all_terms$years_to_half)) + +min_half_life <- min(half_life_all_terms$years_to_half) + +terms_min_half_life <- half_life_all_terms |> filter(years_to_half == min_half_life) + +max_half_life <- max(half_life_all_terms$years_to_half) + +terms_max_half_life <- half_life_all_terms |> filter(years_to_half == max_half_life) + + +#Outliers + +print("Outliers: ") +print(outliers) +view(outliers) + +print("Term with the min years to half life: ") +print(terms_min_half_life) + +print("Term with the max years to half life: ") +print(terms_max_half_life) + +``` + + + ## Your written answer Write up your answer to Part G here. Include code that shows the years with the smallest and largest half-lifes. @@ -172,3 +376,4 @@ Write up your answer to Part G here. Include code that shows the years with the # Makefile Edit the `Makefile` in this directory to execute the full set of scripts that download the data, clean it, and produce this report. This must be turned in with your assignment such that running `make` on the command line produces the final report as a pdf file. 
+ diff --git a/week3/ngrams/Makefile b/week3/ngrams/Makefile index 1b22923b1..46b3b148b 100644 --- a/week3/ngrams/Makefile +++ b/week3/ngrams/Makefile @@ -1,17 +1,26 @@ # target to make the file report all: 05_final_report.html -googlebooks-eng-all-1gram-20120701-1.gz: # add filenames for data and/or code this output depends on here - # add code to run script to download the 1grams data here +googlebooks-eng-all-1gram-20120701-1.gz: + bash 01_download_1grams.sh -year_counts.tsv: # add filenames for data and/or code this output depends on here - # add code to run script to filter the 1grams data here +year_counts.tsv: googlebooks-eng-all-1gram-20120701-1.gz + bash 02_filter_1grams.sh -googlebooks-eng-all-totalcounts-20120701.txt: # add filenames for data and/or code this output depends on here - # add code to run script to download the total counts data here +googlebooks-eng-all-totalcounts-20120701.txt: + bash 03_download_totals.sh -total_counts.csv: # add filenames for data and/or code this output depends on here - # add code to run script to reformat the total counts data here +total_counts.csv: googlebooks-eng-all-totalcounts-20120701.txt + bash 04_reformat_totals.sh -05_final_report.html: # add filenames for data and/or code this output depends on here +05_final_report.html: year_counts.tsv total_counts.csv 05_final_report.Rmd Rscript -e "rmarkdown::render('05_final_report.Rmd')" + + +clean: + rm -f googlebooks-eng-all-1gram-20120701-1.gz year_counts.tsv \ + googlebooks-eng-all-totalcounts-20120701.txt total_counts.csv \ + 05_final_report + +.PHONY: all clean + diff --git a/week3/ngrams/ngrams 1883.png b/week3/ngrams/ngrams 1883.png new file mode 100644 index 000000000..31f94d3af Binary files /dev/null and b/week3/ngrams/ngrams 1883.png differ diff --git a/week4/Bestmodel.RData b/week4/Bestmodel.RData new file mode 100644 index 000000000..2038ce0cb Binary files /dev/null and b/week4/Bestmodel.RData differ diff --git a/week4/Citibike.Rmd b/week4/Citibike.Rmd new 
file mode 100644 index 000000000..a1c4c1309 --- /dev/null +++ b/week4/Citibike.Rmd @@ -0,0 +1,440 @@ +```{r setup, include=FALSE} +library(here) +library(scales) +library(tidyverse) +library(ggplot2) +library(modelr) + +theme_set(theme_bw()) + +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r Load_data} +trips_per_day <- read_tsv('trips_per_day.tsv') + +#Manipulate the dataframe to include: weekends +trips_per_day <- trips_per_day |> mutate( + day_name = weekdays(ymd), +) +trips_per_day$is_weekend <- ifelse(trips_per_day$day_name %in% c("Saturday", "Sunday"), 1, 0) + +trips_per_day <- trips_per_day |> mutate( + avg_temp = tmax - tmin, + avg_prcp = mean(prcp) +) + +trips_per_day <- trips_per_day |> mutate( + month = month(ymd), + season = case_when( + month %in% c(12, 1, 2) ~ "Winter", + month %in% c(3, 4, 5) ~ "Spring", + month %in% c(6, 7, 8) ~ "Summer", + month %in% c(9, 10, 11) ~ "Fall" + ) +) + + +view(trips_per_day) + + + +``` + +```{r setup the data} + +set.seed(42) + +# Train and validation set +num_days <- nrow(trips_per_day) +frac_train_validation <- 0.9 +num_train_validation <- floor(num_days * frac_train_validation) + +# randomly sample rows for the training & validation sets +ndx <- sample(1:num_days, num_train_validation, replace=F) +trips_per_day_train_validation <- trips_per_day[ndx, ] + +# Trips for test +trips_per_day_test <- trips_per_day[-ndx, ] + + +# split the train and validation data +set.seed(42) +num_days_overall_train_validation <- nrow(trips_per_day_train_validation) +frac_train <- 0.6 +num_train <- floor(num_days_overall_train_validation * frac_train) +ndx_trian <- sample(1:num_days_overall_train_validation, num_train, replace = F) +trips_per_day_train <- trips_per_day_train_validation[ndx_trian, ] + +# Trips for validation: +trips_per_day_validate <- trips_per_day_train_validation [-ndx_trian, ] + +``` + +```{r model} + +# fit a model for each polynomial degree +K <- 1:8 +train_err <- c() +validate_err <- c() +for (k in K) { + + # fit 
on the training data + model <- lm(num_trips ~ poly(tmin, k, raw = T), data=trips_per_day_train) + + # evaluate on the training data + train_err[k] <- sqrt(mean((predict(model, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err[k] <- sqrt(mean((predict(model, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) +} + +``` + + +```{r model plot} + +plot_data <- data.frame(K, train_err, validate_err) %>% + gather("split", "error", -K) + +ggplot(plot_data, aes(x=K, y=error, color=split)) + + geom_line() + + scale_x_continuous(breaks=K) + + xlab('Polynomial Degree') + + ylab('RMSE') + +``` + + + +```{r model prediction vs actual} + + +model <- lm(num_trips ~ poly(tmin, 5, raw = T), data = trips_per_day_train) + +trips_per_day_train <- trips_per_day_train %>% + add_predictions(model) %>% + mutate(split = "train") +trips_per_day_validate <- trips_per_day_validate %>% + add_predictions(model) %>% + mutate(split = "validate") +plot_data <- bind_rows(trips_per_day_train, trips_per_day_validate) + +ggplot(plot_data, aes(x = tmin, y = num_trips)) + + geom_point(aes(color = split)) + + geom_line(aes(y = pred)) + + xlab('Minimum temperature') + + ylab('Daily trips') + + scale_y_continuous() +``` + + +# New Models + + +# Model with everything + +```{r model with avg temp and snow and is_weekend} + +train_err <- c() +validate_err <- c() + +view(trips_per_day_train) + model_with_everything <- lm(num_trips ~ avg_temp +is_weekend + prcp + snow + snwd, data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_everything, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_everything, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # Gives the RMSE for the training set # about 7997.772 + validate_err # Gives the RMSE for the validation set # about 7723.385 + 
summary(model_with_everything) +``` + + +# Model with rain +```{r model with rain} + +train_err <- c() +validate_err <- c() + + model_with_rain <- lm(num_trips ~ (tmin + prcp), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # Gives the RMSE for the train # about 5141 + validate_err # Gives the RMSE for the train # about 4920 + + +``` + +# Model with rain and snow + + +```{r model with rain and snow} + +train_err <- c() +validate_err <- c() + + model_with_rain_snow <- lm(num_trips ~ (tmin + prcp + snow), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain_snow, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain_snow, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # Gives the RMSE for the train # about 5141 + validate_err # Gives the RMSE for the train # about 4920 + + # Approximately no difference between it and just with rain + + +``` + + +# model with rain and snow and their interaction +```{r model with rain and snow and their interaction} + +train_err <- c() +validate_err <- c() + + model_with_rain_snow_theirinteraction <- lm(num_trips ~ (tmin + prcp * snow), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain_snow_theirinteraction, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain_snow_theirinteraction, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # Gives the RMSE for the train # about 5093 + validate_err 
# Gives the RMSE for the train # about 4898 + + # Approximately no difference between it and just with rain + + +``` + + + +# model with rain and snow and their interaction and their polynomial + + +```{r model} + +# fit a model for each polynomial degree +K <- 1:8 +train_err <- c() +validate_err <- c() +for (k in K) { + + # fit on the training data + model_with_rain_snow_theirinteraction_with_poly <- lm(num_trips ~ poly((tmin + prcp * snow), k, raw = T), data=trips_per_day_train) + + # evaluate on the training data + train_err[k] <- sqrt(mean((predict(model_with_rain_snow_theirinteraction_with_poly, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err[k] <- sqrt(mean((predict(model_with_rain_snow_theirinteraction_with_poly, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) +} + +train_err # there are higher than the rain and snow and their interaction Not helpul + +validate_err + +``` + +# model with rain and snow and their interaction & weekend effect & tmax + +```{r model with rain and snow and their interaction & weekend effect & tmax} + +train_err <- c() +validate_err <- c() + +view(trips_per_day_train) + model_with_rain_snow_theirinter_avg_temp_weekend <- lm(num_trips ~ (tmax + is_weekend + prcp * snow), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_avg_temp_weekend, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_avg_temp_weekend, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # 4143.062 + validate_err # 3938.039 + + summary(model_with_rain_snow_theirinter_avg_temp_weekend) + +``` + + +# model with tmax interacting with snow depth and prcp interacting with snow & weekend effect + +```{r model with tmax interacting with snow depth and prcp} + +train_err <- c() 
+validate_err <- c() + +view(trips_per_day_train) + model_with_rain_snow_theirinter_avg_temp_weekend <- lm(num_trips ~ (tmax * snwd + is_weekend + prcp * snow), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_avg_temp_weekend, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_avg_temp_weekend, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # 4051.437 + validate_err # 3810.801 +``` + + + + +# model with tmax interacting with snow depth and prcp interacting with snow & weekend effect + +```{r model with tmax interacting with snow depth and prcp} + +train_err <- c() +validate_err <- c() + +view(trips_per_day_train) + model_with_rain_snow_theirinter_temp_snwd_weekend_season <- lm(num_trips ~ (tmax * snwd + is_weekend * season + prcp * snow), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_temp_snwd_weekend_season, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_temp_snwd_weekend_season, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # 3620 + validate_err # 4108.801 + + #summary(model_with_rain_snow_theirinter_temp_snwd_weekend_season) +``` + + + + + + +# Let's see which degree is the best for every variable: + +``` {r model check variables} + +K <- 1:8 +train_err <- c() +validate_err <- c() +for (k in K) { + + # fit on the training data + model <- lm(num_trips ~ poly(snwd, k, raw = T), data=trips_per_day_train) + + # evaluate on the training data + train_err[k] <- sqrt(mean((predict(model, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err[k] <- sqrt(mean((predict(model, 
trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) +} + + +``` + + +``` {r model check the best degree for the variables} +plot_data <- data.frame(K, train_err, validate_err) %>% + gather("split", "error", -K) + +ggplot(plot_data, aes(x=K, y=error, color=split)) + + geom_line() + + scale_x_continuous(breaks=K) + + xlab('Polynomial Degree') + + ylab('RMSE') +``` + + + +# My best model so far + +```{r best model} + +train_err <- c() +validate_err <- c() + +view(trips_per_day_train) + model_with_rain_snow_theirinter_temp_snwd_weekend_season <- lm(num_trips ~ (tmax * snwd + is_weekend * season + prcp * snow), data=trips_per_day_train) + + # evaluate on the training data + train_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_temp_snwd_weekend_season, trips_per_day_train) - trips_per_day_train$num_trips)^2)) + + # evaluate on the validate data + validate_err <- sqrt(mean((predict(model_with_rain_snow_theirinter_temp_snwd_weekend_season, trips_per_day_validate) - trips_per_day_validate$num_trips)^2)) + + train_err # 3620 + validate_err # 4108.801 + + #summary(model_with_rain_snow_theirinter_temp_snwd_weekend_season) +``` + + +# My Plot for the best model graph 1 + +```{r plot best model graph 1} + +trips_per_day_train <- trips_per_day_train %>% + add_predictions(model_with_rain_snow_theirinter_temp_snwd_weekend_season) %>% + mutate(split = "train") + +trips_per_day_validate <- trips_per_day_validate %>% + add_predictions(model_with_rain_snow_theirinter_temp_snwd_weekend_season) %>% + mutate(split = "validate") + +plot_data <- bind_rows(trips_per_day_train, trips_per_day_validate) + +view(plot_data) + +ggplot(plot_data, aes(x = date)) + + geom_point(aes(y = num_trips, color = split)) + + geom_line(aes(y = pred)) + + xlab('Date ') + + ylab('Daily trips') + + scale_y_continuous() + +``` + + +# My Plot for the best model graph 2 + +```{r plot best model graph 2} + +view(plot_data) + +ggplot(plot_data) + + geom_point(aes(x = pred, y = num_trips)) + + 
geom_abline(color ="blue") + + xlab('Prediction') + + ylab('Daily trips') + + scale_y_continuous() + +``` + +# save my best model + +```{r save my best model} +save(model_with_rain_snow_theirinter_temp_snwd_weekend_season, file = "Bestmodel.RData") +``` + + + + + + + + diff --git a/week4/test_citibike_predictions.Rmd b/week4/test_citibike_predictions.Rmd new file mode 100644 index 000000000..2644a6d0f --- /dev/null +++ b/week4/test_citibike_predictions.Rmd @@ -0,0 +1,87 @@ +```{r setup, include=FALSE} +library(here) +library(scales) +library(tidyverse) +library(ggplot2) +library(modelr) +library(lubridate) + +theme_set(theme_bw()) + +knitr::opts_chunk$set(echo = TRUE) + + +``` + + +```{r Load_data} +trips_per_2015_day <- read_tsv('trips_per_day_2015.tsv') +weather_2015 <- read.csv('weather_2015.csv') +view(weather_2015) +view(trips_per_2015_day) + + +str(weather_2015) +str(trips_per_2015_day) + + +weather_2015$DATE <- ymd(weather_2015$DATE) + + +trips_per_day <- inner_join(trips_per_2015_day, weather_2015, by = c("ymd" = "DATE")) + +view(trips_per_day) + +names(trips_per_day) <- tolower(names(trips_per_day)) + +view(trips_per_day) + +#Manipulate the dataframe to include: weekends +trips_per_day <- trips_per_day |> mutate( + day_name = weekdays(ymd), +) + +trips_per_day$is_weekend <- ifelse(trips_per_day$day_name %in% c("Saturday", "Sunday"), 1, 0) + +trips_per_day <- trips_per_day |> mutate( + avg_temp = tmax - tmin, + avg_prcp = mean(prcp) +) + +trips_per_day <- trips_per_day |> mutate( + month = month(ymd), + season = case_when( + month %in% c(12, 1, 2) ~ "Winter", + month %in% c(3, 4, 5) ~ "Spring", + month %in% c(6, 7, 8) ~ "Summer", + month %in% c(9, 10, 11) ~ "Fall" + ) +) +view(trips_per_day) + + +trips_per_day <- trips_per_day |> select(ymd, num_trips, prcp, snow, snwd, tmax, tmin, day_name, is_weekend, avg_temp, avg_prcp, month, season) + +trips_per_day <- trips_per_day |> mutate( + tmax = tmax /10 +) +view(trips_per_day) +``` + +```{r test 2015} + 
+test_err <- c() + +view(trips_per_day) + +model <- load("Bestmodel.RData") + +model <- get(model) + +summary(model) +test_err <- c() +test_err <- sqrt(mean((predict(model, trips_per_day) - trips_per_day$num_trips)^2)) +test_err + +#test_err RMSE 7785.088 +```