msr-ds3 · drishyashrestha · May 29, 2025 · May 30, 2025 · May 30, 2025 · May 30, 2025
diff --git a/week1/citibike.R b/week1/citibike.R
@@ -5,8 +5,13 @@ library(lubridate)
 # READ AND TRANSFORM THE DATA
 ########################################
 
+
+
+
 # read one month of data
-trips <- read_csv('201402-citibike-tripdata.csv')
+trips <- read_csv('./coursework/week1/201402-citibike-tripdata.csv')  # path is relative to the working directory
+# inspect the freshly loaded data with view()
+view(trips)
 
 # replace spaces in column names with underscores
 names(trips) <- gsub(' ', '_', names(trips))
@@ -23,26 +28,88 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
 ########################################
 
 # count the number of trips (= rows in the data frame)
+nrow(trips)
+
 
 # find the earliest and latest birth years (see help for max and min to deal with NAs)
 
-# use filter and grepl to find all trips that either start or end on broadway
+# coerce birth_year to numeric, then report the latest and earliest years (NAs ignored)
+trips <- mutate(trips, birth_year = as.numeric(birth_year))
+max(trips[["birth_year"]], na.rm = TRUE)
+min(trips[["birth_year"]], na.rm = TRUE)
 
+# use filter and grepl to find all trips that either start or end on broadway
+filtered_df <- trips %>% filter(grepl("Broadway", start_station_name) | grepl("Broadway", end_station_name))
+view(filtered_df)
 # do the same, but find all trips that both start and end on broadway
-
+filtered_both_df <- trips %>% filter(grepl("Broadway", start_station_name), grepl("Broadway", end_station_name))  # comma-separated conditions are ANDed
+nrow(filtered_both_df)
 # find all unique station names
+# rejected: nrow(distinct(trips[ ,"start_station_name"])) + nrow(distinct(trips[ ,"end_station_name"])) counts a station twice when it appears as both a start and an end; union() keeps each name once
+
+union(trips$start_station_name, trips$end_station_name)
 
 # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
 # do this all at once, by using summarize() with multiple arguments
+grouped_df <- summarize(  # one row per gender: trip count, mean and sd of tripduration / 60
+    group_by(trips, gender),
+    num_trips = n(),
+    average_trip = mean(tripduration) / 60,
+    standard_deviation = sd(tripduration) / 60)
+grouped_df
+
+
+
 
 # find the 10 most frequent station-to-station trips
 
+# tally trips per start/end station pair, busiest pairs first
+frequent_df <- arrange(
+  summarize(group_by(trips, start_station_name, end_station_name), num_trips = n()),
+  desc(num_trips))
+
+head(frequent_df, n = 10)
+
 # find the top 3 end stations for trips starting from each start station
 
+# keep the result: it is printed below as top3_end_stations
+top3_end_stations <- trips %>%
+  group_by(start_station_name, end_station_name) %>%
+  summarize(num_trips = n()) %>%
+  group_by(start_station_name) %>%
+  # arrange() sorts all rows by count; slice() then takes the first 3 rows per start station
+  arrange(desc(num_trips)) %>%
+  slice(1:3)
+
+top3_end_stations
 # find the top 3 most common station-to-station trips by gender
+# tally trips per (start, end, gender), then keep the 3 largest tallies within each gender
+trips %>%
+    count(start_station_name, end_station_name, gender, name = "num_trips") %>%
+    group_by(gender) %>%
+    arrange(desc(num_trips)) %>%
+    slice(1:3)
+
+
+
 
 # find the day with the most trips
 # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
+# tally trips per calendar day, largest tally first, and keep the top row
+trips %>%
+    count(date_only = as.Date(starttime), sort = TRUE) %>%
+    slice(1)
+
 
 # compute the average number of trips taken during each of the 24 hours of the day across the entire month
 # what time(s) of day tend to be peak hour(s)?
+view(trips)
+trips %>% mutate( hours = hour(starttime)) %>%  # days_in_month() is reduced to a single value per hour so summarise() returns one row per hour
+    group_by(hours) %>% summarise(count = n(), day_in_month_count = unname(days_in_month(first(starttime))), avg = count /  day_in_month_count)
+
+trips %>% mutate( hours = hour(starttime)) %>%
+    group_by(hours) %>% summarise(count = n()) %>%  arrange(desc(count)) %>% slice(1)
+
+
+
+
diff --git a/week1/citibike.sh b/week1/citibike.sh
@@ -4,20 +4,58 @@
 #
 
 # count the number of unique stations
+#$ cut -d , -f4 201402-citibike-tripdata.csv | sort | uniq | wc -l
+#330
+
 
 # count the number of unique bikes
+#$ cut -d , -f12 201402-citibike-tripdata.csv | sort | uniq | wc -l
+#5700
 
 # count the number of trips per day
+# $ cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2
+#       1 starttime
+#     876 2014-02-13
 
 # find the day with the most rides
+ #$cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1
+ # 13816 2014-02-02
+
 
 # find the day with the fewest rides
+#$cut -d , -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2
 
 # find the id of the bike with the most rides
+#$cut -d , -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n1
 
 # count the number of rides by gender and birth year
+#$ cut -d, -f15,14 201402-citibike-tripdata.csv | sort | uniq -c | sort -r
 
 # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)
-
+# $ cut -d, -f5 201402-citibike-tripdata.csv | tail -n +2 | grep -E '.*[0-9].*&.*[0-9].*' | wc -l
+# 90549
 
 # compute the average trip duration
+#  awk -F, '{sum += $1; count++} END {print sum/count}' 201402-citibike-tripdata.csv
+# 874.516
+
+
+#running average script for first 1000 lines
+# $ head -n 1000 201402-citibike-tripdata.csv | awk '{ 
+#   window[NR % 3] = $1
+#   if (NR >= 3) {
+#     print (window[(NR-1)%3] + window[(NR-2)%3] + window[(NR-3)%3]) / 3
+#   }
+# }'
+
+#MUSICAL CHAIR  in python
+# import random
+
+# names = ["Alou", "Srijana", "Sara", "Drishya", "Dereck", "Ahmed",
+#          "Aisha", "Vaishnavi", "Naomi", "Sofia", "Ye", "Vanessa"]
+
+# random.shuffle(names)
+
+# for i in range(0, 12, 2):
+#     print(f"{names[i]} & {names[i+1]}")
+
diff --git a/week1/plot_trips.R b/week1/plot_trips.R
@@ -11,32 +11,98 @@ theme_set(theme_bw())
 
 # load RData file output by load_trips.R
 load('trips.RData')
-
+head(trips)
 
 ########################################
 # plot trip data
-########################################
+######################################
 
 # plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
+# histogram of trip duration in minutes, on a log-scaled x axis
+ggplot(trips, aes(tripduration / 60)) +
+    geom_histogram(fill = 'blue', bins = 50) +
+    scale_x_log10(labels = comma) +
+    scale_y_continuous(labels = comma) +
+    labs(x = 'Trip duration', y = 'Frequency', title = 'Histogram of trip duration in minutes')
+
+# density plot of the same variable; the y axis is a density estimate, not a count
+ggplot(trips, aes(tripduration / 60)) +
+    geom_density(fill = 'red') +
+    scale_x_log10(labels = comma) +
+    labs(x = 'Trip duration ',
+    y = 'Density',
+    title = 'Density plot in minutes')
+
+head(trips)
 
 # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)
+ggplot(trips, aes(x = tripduration, fill=usertype))+  # x is raw tripduration (not divided by 60); bars are filled by usertype
+    geom_histogram(bins = 40)+
+    labs(x='Trip duration')+
+    scale_x_log10(labels=comma)+
+    facet_grid(~usertype)  # one panel per usertype
+
+ggplot(trips, aes(x = tripduration, fill=usertype))+  # same mapping as the histogram, drawn as a density curve
+    geom_density()+
+    labs(x='Trip duration')+
+    scale_x_log10(labels=comma)+
+    facet_grid(~usertype)  # one panel per usertype
 
 # plot the total number of trips on each day in the dataset
+trips%>%
+    mutate(date = as.Date(starttime))%>%
+    ggplot(aes(x =date))+
+        geom_bar()+  # one bar per distinct date; fixed-count histogram bins do not line up with calendar days
+        scale_y_continuous(labels=comma)+
+        scale_x_date(labels = date_format("%Y-%m-%d"))
 
 # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
+trips%>%
+    mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>%  # age = year of ymd minus birth_year
+    ggplot(aes(age, fill = gender))+
+    geom_histogram(bins=40,alpha = 0.8)+  # trip counts per age bin, filled by gender
+    scale_y_continuous(labels = comma)
 
 # plot the ratio of male to female trips (on the y axis) by age (on the x axis)
 # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
 # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
+trips %>% mutate(age = as.numeric(format(ymd, "%Y")) - as.numeric(birth_year)) %>% # age = year of ymd minus birth_year
+    group_by(age, gender) %>% summarise(num_trips = n(),  .groups = "drop")%>%  # trip count per (age, gender)
+    pivot_wider(names_from = gender, values_from = num_trips) %>% mutate(male_to_female = Male/ Female) %>% # one column per gender value; the ratio uses the Male and Female columns
+    ggplot( aes(age, male_to_female))+  geom_line(color = "steelblue", size = 1) +
+    geom_smooth( color = "red", linetype = "dashed") +
+    scale_x_log10()+ # log-scaled age axis
+    labs(x = "AGE", y = "Male to Female Ratio", title = "Male/Female trip ratio by Age") +
+    theme_minimal()
 
 ########################################
 # plot weather data
 ########################################
 # plot the minimum temperature (on the y axis) over each day (on the x axis)
+view(weather)
+weather%>%
+    ggplot(aes(x= ymd, y= tmin, color=tmin))+  # tmin drives both the y position and the point colour
+    geom_point()+
+    scale_color_gradient(low = "blue", high = "red") +  # low tmin maps to blue, high tmin to red
+    labs(
+        title="Scatterplot of min temp over each day",
+        x=  "Day",
+        y= "Minimum temperature")
+
 
 # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis)
 # hint: try using the pivot_longer() function for this to reshape things before plotting
 # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)
+view(weather)
+
+# stack tmin and tmax into one temperature column so each gets its own coloured line
+weather %>%
+    pivot_longer(cols = c(tmin, tmax), names_to = "temp_type", values_to = "temperature") %>%
+    ggplot(aes(x = ymd, y = temperature, color = temp_type)) +
+    geom_line() + scale_x_date() +
+    labs(x = "Date", y = "Temperature", color = "Temperature Type", title = "Daily Min and Max Temperatures")
+
+
 
 ########################################
 # plot trip and weather data
@@ -45,18 +111,105 @@ load('trips.RData')
 # join trips and weather
 trips_with_weather <- inner_join(trips, weather, by="ymd")
 
+head(trips_with_weather)
+
 # plot the number of trips as a function of the minimum temperature, where each point represents a day
 # you'll need to summarize the trips and join to the weather data to do this
 
+str(weather)
+head(trips)
+# one row per (ymd, tmin) pair holding its number of trips, plotted against tmin
+trips_with_weather %>%
+    count(ymd, tmin, name = "num_trips") %>%
+    ggplot(aes(x = tmin, y = num_trips)) + geom_point()
+
+#works only for this data frame since tmin is not different    
+# trips_by_day <- trips %>%
+#     mutate(date = as.Date(starttime)) %>%
+#     group_by(date)%>%
+#     summarise(num_trips = n())
+# weather %>%
+#     mutate(date = as.Date(date))%>%
+#     inner_join(trips_by_day,weather_by_day, by ='date') %>%
+#     ggplot(aes(x= mean(tmin),y= num_trips))+
+#         geom_point() +
+#         labs(x = "Minimum Temperature", y = "Number of Trips",
+#         title = "Trips vs. Min Temperature")
+
 # repeat this, splitting results by whether there was substantial precipitation or not
 # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this
+trips_with_weather %>%
+    # "substantial" = above the mean prcp, where the mean is taken over trip rows (not over days)
+    mutate(precptf = ifelse(prcp > mean(prcp), 'T', 'F'))%>%
+    group_by(ymd,tmin, precptf)%>%
+    summarise(num_trips = n(), .groups = "drop")%>%
+    ggplot(aes(tmin, num_trips))+geom_point() +
+    labs( x = "minimum temp",
+        y = "number of trip",
+        title = "Substantial precipitation on num of trips") +
+    facet_wrap(~precptf)
+
+
 
 # add a smoothed fit on top of the previous plot, using geom_smooth
+trips_with_weather %>%
+    # "substantial" = above the mean prcp, where the mean is taken over trip rows (not over days)
+    mutate(precptf = ifelse(prcp > mean(prcp), 'T', 'F'))%>%
+    group_by(ymd,tmin, precptf)%>%
+    summarise(num_trips = n(), .groups = "drop")%>%
+    ggplot(aes(tmin, num_trips))+geom_point() + geom_smooth()+
+    labs( x = "minimum temp",
+        y = "number of trip",
+        title = "Substantial precipitation on num of trips") +
+    facet_wrap(~precptf)
 
 # compute the average number of trips and standard deviation in number of trips by hour of the day
 # hint: use the hour() function from the lubridate package
+library(lubridate)
+
+# first count trips per (hour, date), then take the mean and sd of those
+# per-date counts within each hour of the day
+trips_with_weather %>%
+    count(hour = hour(starttime), date = as.Date(starttime), name = "num_trips") %>%
+    group_by(hour) %>%
+    summarise(
+        average_trips = mean(num_trips),
+        sd_trips = sd(num_trips)
+    )
 
 # plot the above
+trips_with_weather %>%
+    mutate(hour = hour(starttime), date = as.Date(starttime))%>%
+    group_by(hour,date)%>%
+    summarise(num_trips = n(), .groups = "drop")%>%
+    group_by(hour)%>%
+    summarise(
+        average_trips = mean(num_trips),
+        sd_trips = sd(num_trips)) %>%
+    ggplot( aes(hour, average_trips))+
+    # fill the ribbon blue so the plot matches what the subtitle describes
+    geom_line(color = "red") + geom_ribbon(aes(ymin = average_trips - sd_trips, ymax = average_trips + sd_trips), fill = "blue", alpha = 0.25)+
+     labs(
+        x = "Hour of Day",
+        y = "Average Number of Trips",
+        title = "Average Number of Trips by Hour with ±1 SD Ribbon",
+        subtitle = "Red line: Mean trips per hour; Blue ribbon: ±1 standard deviation"
+    ) +
+    theme_minimal()
+
 
 # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
 # hint: use the wday() function from the lubridate package
+trips %>%
+    mutate(
+        hour = hour(starttime),
+        day = as.Date(starttime),
+        weekday = wday(starttime, label =TRUE) )%>%
+    group_by(hour, weekday, day)%>%
+    summarise(trip_count = n(), .groups = "drop")%>%  # trips per hour on each individual day
+    group_by(hour,weekday)%>%
+    summarise(average = mean(trip_count),  # mean and sd across the days that share a weekday
+        standarddev = sd(trip_count),
+        .groups = "drop") %>%
+        ggplot(aes(hour, average)) + geom_line(color = "red")+ geom_ribbon(aes(ymin = average - standarddev, ymax = average + standarddev), alpha = 0.25)+
+        facet_wrap(~weekday)  # one panel per weekday