Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
65c1af7
This is my assignmen for day2
ahmedlotfi2510 May 28, 2025
f783c19
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 May 28, 2025
a8e4d5f
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 May 29, 2025
0d234aa
Day3 assignment -Ahmed
ahmedlotfi2510 May 29, 2025
180d85a
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 May 30, 2025
ecce991
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 May 30, 2025
eb12f3d
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 May 30, 2025
d65e61b
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 May 30, 2025
2a4f3a6
Day4 assignment --Ahmed
ahmedlotfi2510 May 31, 2025
3cc78fa
Extra Excercies from Book Day4 -Ahmed
ahmedlotfi2510 Jun 2, 2025
26d513c
Extra Excercies from the Book Day 4 -Ahmed
ahmedlotfi2510 Jun 2, 2025
d165855
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 2, 2025
d965b0e
This is my day5 file with everything
ahmedlotfi2510 Jun 2, 2025
e02fe94
Day5 Book Exercies -Ahmed
ahmedlotfi2510 Jun 2, 2025
48cbdff
Diamond Excercies -Ahmed
ahmedlotfi2510 Jun 2, 2025
b9e7774
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 2, 2025
d18a057
musical_pairs try -Ahmed
ahmedlotfi2510 Jun 3, 2025
cf43331
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 3, 2025
77d54b2
Day 6 Excercies -Ahmed
ahmedlotfi2510 Jun 3, 2025
e05e572
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 4, 2025
46f7f24
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 5, 2025
00dd575
Day7 & Day 8 assigments
ahmedlotfi2510 Jun 6, 2025
ed08181
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 6, 2025
ab61b20
Week2 Excercies -Ahmed
ahmedlotfi2510 Jun 9, 2025
24094f5
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 9, 2025
240d144
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 10, 2025
4407170
Week3_Excercises -Ahmed
ahmedlotfi2510 Jun 11, 2025
af03fa8
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 11, 2025
f7d2ada
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 12, 2025
80ff02f
allbut.pl file
ahmedlotfi2510 Jun 12, 2025
66639e5
Movielens project -Ahmed
ahmedlotfi2510 Jun 13, 2025
5cc842c
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 13, 2025
8076b0c
ngrams project -Ahmed
ahmedlotfi2510 Jun 16, 2025
5ad6efc
Movielens Task -Ahmed
ahmedlotfi2510 Jun 16, 2025
4cefbbc
Week3 Excercies -Ahmed
ahmedlotfi2510 Jun 16, 2025
6721e9a
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 16, 2025
f5fa4ff
Citibike model -Ahmed
ahmedlotfi2510 Jun 16, 2025
4c856a1
Merge branch 'master' of https://github.com/msr-ds3/coursework
ahmedlotfi2510 Jun 17, 2025
df36898
My 2015 test RMSE ==> 214126.6 -Ahmed
ahmedlotfi2510 Jun 17, 2025
08fe1a2
New test 2015 -Ahmed
ahmedlotfi2510 Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions week1/Extra_Day4_Book_Excercies_Chapter3_Ahmed_sols.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Exercises 3.3.1

library(tidyverse)


# Copy of the mpg data set; it is not referenced again in this script.
dta_mpg <- mpg

# ----------------------------------------------------------------------------
# Exercises from chapter 3 (first edition)

# 3.3.1 Exercises ----

# Question 1 ----
# In the original code `color = "blue"` sat inside aes(), so it was treated as
# an aesthetic mapping: a categorical variable with the single value "blue".
# Setting the color outside aes() fixes it.
ggplot(data = mpg, mapping = aes(displ, hwy)) +
  geom_point(color = "blue")

# Alternative fix: declare the mapping on the geom instead of in ggplot().
ggplot(data = mpg) +
  geom_point(mapping = aes(displ, hwy), color = "blue")

# Question 2 ----
# Checked with ?mpg:
#   categorical variables: manufacturer, model, trans, drv, fl, class
#   continuous variables:  displ, year, cyl, cty, hwy

# Question 3 ----
# With a continuous variable, color and size are drawn as a continuous scale
# rather than as distinct colors / sizes.
ggplot(data = mpg, mapping = aes(displ, hwy, color = cty)) +
  geom_point()

ggplot(data = mpg, mapping = aes(displ, hwy, size = cty)) +
  geom_point()

# Mapping a continuous variable to shape raises an error, because shapes
# cannot form a continuous scale.
ggplot(data = mpg, mapping = aes(displ, hwy, shape = cty)) +
  geom_point()

#-----------------------------------------------------------------------------------------

# 3.5.1 Exercises ----

# Question 1 ----
# Answer: the continuous variable is converted to a categorical variable, and
# the graph contains one facet for each distinct value.
ggplot(data = mpg, mapping = aes(displ, hwy)) +
  geom_point() +
  facet_grid(. ~ cty)

# Question 4 ----

# class shown with facets
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 2)

# class mapped to color
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = class))

# Advantages of faceting:
# 1- It is easier to distinguish and see the different classes.
# 2- Points from different classes do not overlap, so the trends are easier
#    to see. With color, on the other hand, there is heavy overlapping.

# Disadvantages of faceting:
# 1- It is difficult to compare classes, because the points sit on separate
#    graphs.

#-----------------------------------------------------------------------------------------

# 3.6.1 Exercises ----

# Question 5 ----
# graph 1: data and mapping declared once in ggplot()
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()

# graph 2: the same data and mapping repeated in each geom
ggplot() +
  geom_point(aes(x = displ, y = hwy), data = mpg) +
  geom_smooth(aes(x = displ, y = hwy), data = mpg)

# Answer: No, the two graphs look the same. In graph 1 both geom_point() and
# geom_smooth() inherit the data and mapping from ggplot(); in graph 2 the
# same data and mapping are passed to each geom directly.

# Question 6 ----

?mpg
?geom_smooth

# graph 1
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(se = FALSE)

# graph 2
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(mapping = aes(group = drv), se = FALSE)

# graph 3
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)

# graph 4
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = drv)) +
  geom_smooth(se = FALSE)

# graph 5
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = drv)) +
  geom_smooth(mapping = aes(linetype = drv), se = FALSE)

# graph 6: larger white points drawn first, colored points on top
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(size = 4, color = "white") +
  geom_point()


#-----------------------------------------------------------------------------------------

# 3.8.1 Exercises ----

# Question 1 ----
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()

# Problem: the points overlap, because there are multiple observations for
# each combination of cty and hwy.

# Quick fix:
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point(position = "jitter")

# Jittering shows the positions where there are more observations.

# Question 2 ----
?geom_jitter()

# The two arguments are width and height. Examples:
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_jitter(width = 0)

ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_jitter(width = 40)
62 changes: 59 additions & 3 deletions week1/citibike.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ library(lubridate)

########################################
# READ AND TRANSFORM THE DATA
########################################
##########@##############################

# read one month of data
trips <- read_csv('201402-citibike-tripdata.csv')
trips <- read_csv('./week1/201402-citibike-tripdata.csv')

# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))

# convert dates strings to dates
# trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime))
#trips <- mutate(trips, starttime = mdy_hms(starttime), stoptime = mdy_hms(stoptime))

# recode gender as a factor 0->"Unknown", 1->"Male", 2->"Female"
trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unknown","Male","Female")))
Expand All @@ -24,25 +24,81 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn

# count the number of trips (= rows in the data frame)

#print(nrow(trips)) ==> 224736

# find the earliest and latest birth years (see help for max and min to deal with NAs)

#birth_year_col <- trips$birth_year
#birth_year_col_new <- as.numeric(birth_year_col)
#min(birth_year_col_new)
# Ans: [1] 1899
# max(birth_year_col_new)
# Ans: [1] 1997



# use filter and grepl to find all trips that either start or end on broadway

#filter(trips ,grepl("Broadway", start_station_name) | grepl("Broadway", end_station_name))

# do the same, but find all trips that both start and end on broadway

#filter(trips ,grepl("Broadway", start_station_name) & grepl("Broadway", end_station_name))


# find all unique station names

#uniq_start_station_name <- unique(trips$start_station_name)
#uniq_end_station_name <- unique(trips$end_station_name)
#combine_stations <- union(uniq_start_station_name, uniq_end_station_name)
#combine_stations

# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments


# trips %>%
# group_by(gender) %>%
# summarize (count = n(), avg = mean(tripduration), std = sd(tripduration))



# find the 10 most frequent station-to-station trips

# Count trips for every (start, end) station pair and keep the 10 largest.
# summarize() would otherwise leave the result grouped by start_station_name,
# and slice_head() would then return up to 10 rows *per start station*;
# dropping the grouping makes it a global top 10.
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(count = n(), .groups = "drop") %>%
  arrange(desc(count)) %>%
  slice_head(n = 10)

# find the top 3 end stations for trips starting from each start station

# Variant 1: summarize() leaves the result grouped by start_station_name, so
# slice_head() keeps the first 3 rows of each start station after sorting.
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count), start_station_name) %>%
  slice_head(n = 3)


# Variant 2: same query with the per-start-station grouping stated explicitly.
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(count = n()) %>%
  group_by(start_station_name) %>%
  arrange(desc(count)) %>%
  slice_head(n = 3)

# find the top 3 most common station-to-station trips by gender
# Count trips per (start, end, gender), regroup by gender only, and keep the
# 3 most frequent station pairs within each gender.
trips %>%
  group_by(start_station_name, end_station_name, gender) %>%
  summarize(count = n()) %>%
  group_by(gender) %>%
  arrange(desc(count)) %>%
  slice_head(n = 3)



# find the day with the most trips
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)


# Strip the time of day from starttime, count trips per calendar date, and
# keep the date with the highest count.
trips %>%
  mutate(date = as.Date(starttime)) %>%
  group_by(date) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_head(n = 1)





# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?

# Average number of trips in each hour of the day: the month's total trips
# for that hour divided by the number of days in the month.
# days_in_month() is vectorised, so it is applied to a single timestamp per
# group; passing the whole starttime column would make summarise() return one
# row per trip instead of one row per hour.
trips %>%
  mutate(hours = hour(starttime)) %>%
  group_by(hours) %>%
  summarise(
    count = n(),
    day_in_month_count = unname(days_in_month(first(starttime))),
    avg = count / day_in_month_count
  )

# Peak hour: count trips per hour of the day and keep the busiest one.
trips %>%
  mutate(hours = hour(starttime)) %>%
  group_by(hours) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_head(n = 1)

56 changes: 56 additions & 0 deletions week1/citibike.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,75 @@

# count the number of unique stations

# cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c | wc -l
# Ans: 330

# count the number of unique bikes

# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l
# Ans: 5700

# count the number of trips per day

# cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c

# 12771 2014-02-01
# 13816 2014-02-02
# 2600 2014-02-03
# 8709 2014-02-04
# 2746 2014-02-05
# 7196 2014-02-06
# 8495 2014-02-07
# 5986 2014-02-08
# 4996 2014-02-09
# 6846 2014-02-10
# 8343 2014-02-11
# 8580 2014-02-12
# 876 2014-02-13
# 3609 2014-02-14
# 2261 2014-02-15
# 3003 2014-02-16
# 4854 2014-02-17
# 5140 2014-02-18
# 8506 2014-02-19
# 11792 2014-02-20
# 8680 2014-02-21
# 13044 2014-02-22
# 13324 2014-02-23
# 12922 2014-02-24
# 12830 2014-02-25
# 11188 2014-02-26
# 12036 2014-02-27
# 9587 2014-02-28

# find the day with the most rides

#cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1
# 13816 2014-02-02

# find the day with the fewest rides
# cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2| tail -n1


# find the id of the bike with the most rides

# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n2 | tail -n1
# 128 16151


# count the number of rides by gender and birth year

# cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c



# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)


#cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].* &.*[0-9].*' | wc -l
#90549


# compute the average trip duration
#awk '{sum += $1} END {avg = sum / NR; print "Average:" avg}' 201402-citibike-tripdata.csv
# 874.516
10 changes: 10 additions & 0 deletions week1/musical_pairs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Shuffle the member list and print it two names per line as "Pair: a b".
# The shuffle's random source is derived from today's date, so runs on the
# same day read the same random bytes.
members=("ahmed" "aisha" "alou" "naomi" "sara" "sofia" "srijana" "vaishnavi" "vanessa" "dereck" "drishya" "yehtut")

# Hash today's date (YYYY-MM-DD) to build the seed. Use md5sum when it is
# installed; otherwise fall back to `md5`.
# NOTE(review): the fallback assumes a BSD/macOS-style `md5` utility whose
# last output field is the digest - confirm on the target machines.
if command -v md5sum &> /dev/null; then
    seed=$(date +%F | md5sum | awk '{print $1}')
else
    seed=$(date +%F | md5 | awk '{print $NF}')
fi

# One name per line -> shuffle with the seeded source -> group in twos.
printf "%s\n" "${members[@]}" | shuf --random-source=<(echo "$seed") | xargs -n2 echo "Pair: "
Loading