msr-ds3 · ahmedlotfi2510 · May 28, 2025 · May 28, 2025 · May 29, 2025 · May 29, 2025
diff --git a/week1/Extra_Day4_Book_Excercies_Chapter3_Ahmed_sols.R b/week1/Extra_Day4_Book_Excercies_Chapter3_Ahmed_sols.R
@@ -0,0 +1,158 @@
+#Excercie 3.3.1
+
+library(tidyverse)
+
+
+dta_mpg <- mpg  # NOTE(review): this copy is not referenced anywhere else in this file — confirm it is needed
+
+#----------------------------------------------------------------------------------------
+# Exercises from chapter 3, first edition
+
+# 3.3.1 Exercises
+
+# Question 1:
+# A fix for the code: set the color as a constant outside aes()
+ggplot(mpg, aes(x = displ, y = hwy)) + 
+  geom_point(color = "blue")
+
+# Another solution: keep the mapping in the layer, with the constant color still outside aes()
+
+ggplot(mpg) + 
+geom_point(aes(x = displ, y= hwy), color = "blue")
+
+# The issue was that color was included in aes(), so it was treated as an aesthetic mapping.
+# The color = "blue" was interpreted as a categorical variable that takes the single value "blue".
+
+# Question 2:
+# I used ?mpg to check this:
+
+# Categorical variables in mpg are: manufacturer, model, trans, drv, fl, class
+# Continuous variables in mpg are: displ, year, cyl, cty, hwy
+# (year had been listed in both groups; it is stored as a number, so it is kept only in the continuous list)
+# Question 3:
+
+# A continuous variable gives a gradient scale rather than distinct colors (same idea for size)
+ggplot(mpg, aes(x = displ, y = hwy, color = cty)) +
+geom_point()
+
+ggplot(mpg, aes(x = displ, y = hwy, size = cty)) +
+geom_point()
+
+# Gives an error for shape, because a continuous variable cannot be mapped to a scale of shapes
+ggplot(mpg, aes(x = displ, y = hwy, shape = cty)) +
+geom_point()
+
+#-----------------------------------------------------------------------------------------
+
+# 3.5.1 Exercises
+
+# Question 1
+# Answer: The continuous variable is converted to a categorical variable, then the graph contains a facet for each distinct value
+ggplot(
+  mpg, aes(x = displ, y = hwy)) +  
+  geom_point() +  facet_grid(.~cty)
+
+# Question 4
+
+# class is faceted
+ggplot(data = mpg) + 
+  geom_point(mapping = aes(x = displ, y = hwy)) + 
+  facet_wrap(~ class, nrow = 2)
+
+# class mapped to color
+ggplot(data = mpg) + 
+  geom_point(mapping = aes(x = displ, y = hwy, color =  class))
+
+
+# Advantages of faceting over the color aesthetic:
+# 1- It's easier to distinguish and see the different classes
+# 2- Points from different classes do not overlap, so it is easier to see the trends. With color, on the other hand, there is heavy overlap.
+
+# Disadvantages:
+# 1- It is difficult to compare classes because the points are on separate graphs
+
+#-----------------------------------------------------------------------------------------
+
+# 3.6.1 Exercises:
+
+# Question 5:
+# graph 1: data and mapping are given once in ggplot() and used by both layers
+ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
+  geom_point() + 
+  geom_smooth()
+
+# graph 2: the same data and mapping are repeated inside each layer
+ggplot() + 
+  geom_point(data = mpg, mapping = aes(x = displ, y = hwy)) + 
+  geom_smooth(data = mpg, mapping = aes(x = displ, y = hwy))
+
+
+# Ans: No. In graph 2, geom_point() and geom_smooth() are each given the same data and mapping explicitly.
+# In graph 1, both layers take that same data and mapping from ggplot(). So, the two graphs will look the same.
+
+
+# Question 6:
+
+?mpg
+?geom_smooth
+# graph 1: points plus one smooth line (se = FALSE)
+ggplot(
+  mpg, aes(x = displ, y = hwy)
+) + geom_point() + geom_smooth(se = FALSE)
+
+# graph 2: points plus one smooth line per drv (group = drv)
+ggplot(
+  mpg, aes(x = displ, y = hwy)
+)  +  geom_point() + geom_smooth(aes(group  = drv), se = FALSE) 
+
+# graph 3: color = drv is set in ggplot(), so it applies to both the points and the smooth lines
+ggplot(
+  mpg, aes(x = displ, y = hwy, color = drv)
+) + geom_point() + geom_smooth(se = FALSE) 
+
+
+# graph 4: color = drv is mapped only in geom_point(); geom_smooth() has no drv mapping
+ggplot(
+  mpg, aes(x = displ, y = hwy)
+) + geom_point(aes(color = drv)) + geom_smooth(se = FALSE) 
+
+
+# graph 5: points colored by drv, smooth lines distinguished by linetype = drv
+ggplot(
+  mpg, aes(x = displ, y = hwy)
+) + geom_point(aes(color = drv)) + geom_smooth(aes(linetype = drv), se = FALSE) 
+
+# graph 6: a layer of size-4 white points is added first, then a second point layer that uses color = drv from ggplot()
+ggplot(
+  mpg, aes(x = displ, y = hwy, color = drv)
+) + geom_point(size = 4, color = "white") +  geom_point() 
+
+
+#-----------------------------------------------------------------------------------------
+
+# 3.8.1 Exercises
+
+# Question 1:
+
+ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + 
+  geom_point()
+
+# The problem is overplotting: there are multiple observations for each combination of cty and hwy
+
+# Quick fix would be:
+
+ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + 
+  geom_point(position = "jitter")
+
+# Jittering reveals the positions where there are more observations
+
+
+# Question 2:
+
+?geom_jitter()
+
+# The two arguments are width and height. Examples that vary width:
+
+ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_jitter(width = 0)
+
+ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_jitter(width = 40)
diff --git a/week1/citibike.R b/week1/citibike.R
@@ -3,16 +3,16 @@ library(lubridate)
 
 ########################################
 # READ AND TRANSFORM THE DATA
-########################################
+########################################
 
 # read one month of data
-trips <- read_csv('201402-citibike-tripdata.csv')
+trips <- read_csv('./week1/201402-citibike-tripdata.csv')
 
 # replace spaces in column names with underscores
 names(trips) <- gsub(' ', '_', names(trips))
 
 # convert dates strings to dates
-# trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime))
+#trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime))
 
 # recode gender as a factor 0->"Unknown", 1->"Male", 2->"Female"
 trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unknown","Male","Female")))
@@ -24,25 +24,81 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
 
 # count the number of trips (= rows in the data frame)
 
+#print(nrow(trips)) ==> 224736
+
 # find the earliest and latest birth years (see help for max and min to deal with NAs)
 
+#birth_year_col <- trips$birth_year
+#birth_year_col_new <- as.numeric(birth_year_col)
+#min(birth_year_col_new, na.rm = TRUE)
+# Ans: [1] 1899
+# max(birth_year_col_new, na.rm = TRUE)
+# Ans: [1] 1997
+
+
+
 # use filter and grepl to find all trips that either start or end on broadway
 
+#filter(trips ,grepl("Broadway", start_station_name) |  grepl("Broadway", end_station_name))
+
 # do the same, but find all trips that both start and end on broadway
 
+#filter(trips ,grepl("Broadway", start_station_name) &  grepl("Broadway", end_station_name))
+
+
 # find all unique station names
 
+#uniq_start_station_name <- unique(trips$start_station_name)
+#uniq_end_station_name <- unique(trips$end_station_name)
+#combine_stations <- c(uniq_start_station_name, uniq_end_station_name)
+#unique(combine_stations)
+
 # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
 # do this all at once, by using summarize() with multiple arguments
 
+
+# trips %>% 
+#   group_by(gender) %>% 
+#   summarize (count = n(), avg = mean(tripduration), std = sd(tripduration))
+
+
+
 # find the 10 most frequent station-to-station trips
 
+trips %>%
+    group_by(start_station_name, end_station_name) %>%
+    summarize(count = n()) %>%
+    arrange(desc(count)) %>%
+    slice_head(n = 10)
+
 # find the top 3 end stations for trips starting from each start station
 
+trips %>% group_by(start_station_name, end_station_name) %>% summarize (count = n()) %>% arrange(desc(count), start_station_name) %>% slice_head(n=3)
+
+
+trips %>% group_by(start_station_name, end_station_name) %>%  summarize (count = n()) %>% group_by(start_station_name) %>% arrange(desc(count)) %>% slice(1:3)
+
 # find the top 3 most common station-to-station trips by gender
+trips %>% group_by(start_station_name, end_station_name, gender) %>%  summarize (count = n()) %>%  group_by(gender) %>% arrange(desc(count)) %>% slice(1:3)
+# group_by(gender) replaces the grouping left by summarize(), so slice(1:3) keeps the three highest counts within each gender.
+
 
 # find the day with the most trips
 # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
 
+# Reduce each start timestamp to its calendar date, tally trips per date, and keep the date with the highest tally.
+trips %>%
+    mutate(date = as.Date(starttime)) %>%
+    group_by(date) %>%
+    summarise(count = n()) %>%
+    arrange(desc(count)) %>%
+    slice(1)
 # compute the average number of trips taken during each of the 24 hours of the day across the entire month
 # what time(s) of day tend to be peak hour(s)?
+# Average trips per hour of day = trips starting in that hour / number of days in the month.
+# days_in_month() returns one value per input, so reduce starttime to a single value per group to keep one row per hour.
+trips %>% mutate(hours = hour(starttime)) %>% group_by(hours) %>%
+    summarise(count = n(), day_in_month_count = unname(days_in_month(first(starttime))), avg = count / day_in_month_count)
+# Peak hour: the hour of day with the largest total number of trips.
+trips %>% mutate(hours = hour(starttime)) %>%
+    group_by(hours) %>% summarise(count = n()) %>% arrange(desc(count)) %>% slice(1)
diff --git a/week1/citibike.sh b/week1/citibike.sh
@@ -5,19 +5,75 @@
 
 # count the number of unique stations
 
+#  cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c  | wc -l
+#  Ans: 330 
+
 # count the number of unique bikes
 
+# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c  | wc -l
+# Ans: 5700
+
 # count the number of trips per day
 
+# cut -d, -f2 201402-citibike-tripdata.csv | cut -d' '  -f1 | sort | uniq -c
+
+# 12771 2014-02-01
+# 13816 2014-02-02
+# 2600 2014-02-03
+#   8709 2014-02-04
+#  2746 2014-02-05
+#  7196 2014-02-06
+  8495 2014-02-07
+   5986 2014-02-08
+   4996 2014-02-09
+   6846 2014-02-10
+   8343 2014-02-11
+   8580 2014-02-12
+    876 2014-02-13
+   3609 2014-02-14
+   2261 2014-02-15
+   3003 2014-02-16
+   4854 2014-02-17
+   5140 2014-02-18
+   8506 2014-02-19
+  11792 2014-02-20
+   8680 2014-02-21
+  13044 2014-02-22
+  13324 2014-02-23
+  12922 2014-02-24
+  12830 2014-02-25
+  11188 2014-02-26
+  12036 2014-02-27
+   9587 2014-02-28
+
 # find the day with the most rides
 
+#cut -d, -f2 201402-citibike-tripdata.csv | cut -d' '  -f1 | sort | uniq -c | sort -r | head -n1
+# 13816 2014-02-02
+
 # find the day with the fewest rides
+# cut -d, -f2 201402-citibike-tripdata.csv | cut -d' '  -f1 | sort | uniq -c | sort | head -n2| tail -n1 
+
 
 # find the id of the bike with the most rides
 
+# cut -d, -f12  201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n2 | tail -n1
+# 128 16151 
+
+
 # count the number of rides by gender and birth year
 
+# cut -d, -f14,15  201402-citibike-tripdata.csv | sort | uniq -c
+
+
+
 # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)
 
 
+#cut -d, -f5  201402-citibike-tripdata.csv | grep '.*[0-9].* &.*[0-9].*' | wc -l
+#90549 
+
+
 # compute the average trip duration
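+#awk -F, 'NR > 1 {sum += $1} END {avg = sum / (NR - 1); print "Average:" avg}' 201402-citibike-tripdata.csv
+# 874.516 (recorded while the header row was still counted in the divisor; re-run to refresh)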
diff --git a/week1/musical_pairs.sh b/week1/musical_pairs.sh
@@ -0,0 +1,10 @@
+members=("ahmed" "aisha" "alou" "naomi" "sara" "sofia" "srijana" "vaishnavi" "vanessa" "dereck" "drishya" "yehtut")
+
+if command -v md5sum &> /dev/null; then 
+    seed=$(date +%F | md5sum | awk '{print $1}')
+else
+     seed=$(date +%F | md5sum | awk '{print $NF}')
+
+fi 
+
+printf "%s\n" "${members[@]}" | shuf --random-source=<(echo $seed) | xargs -n2 echo "Pair: "