########################################
# week1/citibike.R
# Load one month of Citibike trip data and answer the exercise questions.
# Requires: tidyverse, lubridate (loaded at the top of the file).
########################################

# read one month of data
trips <- read_csv('./week1/201402-citibike-tripdata.csv')

# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))

########################################
# answer exercise questions
########################################

# count the number of trips (= rows in the data frame)
print(nrow(trips))

# find the earliest and latest birth years
# birth_year contains "\N" for missing values, so coercing to numeric
# produces NAs (with a warning) which we then drop
birth_years <- na.omit(as.numeric(trips$birth_year))
min(birth_years)
max(birth_years)

# all trips that either start or end on Broadway
filter(trips,
       grepl("Broadway", start_station_name) |
         grepl("Broadway", end_station_name))

# all trips that both start and end on Broadway
filter(trips,
       grepl("Broadway", start_station_name) &
         grepl("Broadway", end_station_name))

# find all unique station names
# use c() to take the union of the two columns; paste() would glue each
# start/end pair into a single combined string, which is not a station name
unique_stations <- unique(c(trips$start_station_name, trips$end_station_name))
unique_stations

# count of trips, mean trip time, and sd of trip time, by gender,
# all in one summarize() call
trips %>%
  group_by(gender) %>%
  summarize(count = n(),
            mean = mean(tripduration),
            std = sd(tripduration))

# the 10 most frequent station-to-station trips
# note: %>% must end a line, never begin one, or the pipeline breaks
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(frequency = n(), .groups = "drop") %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 10)

# top 3 end stations for trips starting from each start station
trips %>%
  group_by(start_station_name, end_station_name) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(start_station_name) %>%
  arrange(desc(count)) %>%
  slice_head(n = 3)

# top 3 most common station-to-station trips by gender
# count by all three columns at once; a second group_by() call would
# silently discard the station grouping
trips %>%
  count(gender, start_station_name, end_station_name, name = "frequency") %>%
  group_by(gender) %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 3)

# find the day with the most trips
trips %>%
  group_by(date = as.Date(starttime)) %>%
  summarize(frequency = n()) %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 1)

# average number of trips taken during each of the 24 hours of the day:
# total trips in that hour divided by the number of distinct days observed
trips %>%
  mutate(hour = hour(starttime), date = as.Date(starttime)) %>%
  group_by(hour) %>%
  summarize(avg = n() / n_distinct(date)) %>%
  arrange(desc(avg))

# peak hour(s): the hour with the most trips overall
trips %>%
  mutate(hour = hour(starttime)) %>%
  group_by(hour) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1)
+# 11188 2014-02-26 +# 12036 2014-02-27 +# 9587 2014-02-28 + # find the day with the most rides -# find the day with the fewest rides +# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1 | sort | uniq -c| sort -r | head -n1 +# 13816 2014-02-02 +# find the day with the fewest rides +# $ cut -d, -f2 201402-citibike-tripdata.csv | cut -d" " -f1| sort | uniq -c | sort | head -n2 +# 1 starttime +# 876 2014-02-13 # find the id of the bike with the most rides +# $ cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | head -n1 +# 10 14529 # count the number of rides by gender and birth year +# $ cut -d, -f15,14 201402-citibike-tripdata.csv | sort | uniq -c +# 6717 \N,0 +# 9 1899,1 +# 68 1900,1 +# 11 1901,1 +# 5 1907,1 +# 4 1910,1 +# 1 1913,1 +# 3 1917,1 +# 1 1921,1 +# 32 1922,1 +# 5 1926,2 +# 2 1927,1 +# 1 1932,1 +# 7 1932,2 +# 10 1933,1 +# 21 1934,1 +# 14 1935,1 +# 31 1936,1 +# 24 1937,1 +# 70 1938,1 +# 5 1938,2 +# 24 1939,1 +# 19 1939,2 +# 83 1940,1 +# 1 1940,2 +# 148 1941,1 +# 16 1941,2 +# 173 1942,1 +# 9 1942,2 +# 108 1943,1 +# 22 1943,2 +# 277 1944,1 +# 34 1944,2 +# 171 1945,1 +# 43 1945,2 +# 424 1946,1 +# 30 1946,2 +# 391 1947,1 +# 60 1947,2 +# 664 1948,1 +# 143 1948,2 +# 624 1949,1 +# 101 1949,2 +# 738 1950,1 +# 152 1950,2 +# 6 1951,0 +# 1006 1951,1 +# 146 1951,2 +# 1040 1952,1 +# 143 1952,2 +# 1474 1953,1 +# 301 1953,2 +# 1636 1954,1 +# 306 1954,2 +# 1568 1955,1 +# 349 1955,2 +# 1777 1956,1 +# 542 1956,2 +# 1676 1957,1 +# 562 1957,2 +# 2333 1958,1 +# 643 1958,2 +# 2281 1959,1 +# 539 1959,2 +# 2679 1960,1 +# 776 1960,2 +# 2315 1961,1 +# 432 1961,2 +# 2808 1962,1 +# 833 1962,2 +# 3514 1963,1 +# 715 1963,2 +# 3679 1964,1 +# 570 1964,2 +# 2957 1965,1 +# 687 1965,2 +# 3440 1966,1 +# 565 1966,2 +# 4016 1967,1 +# 634 1967,2 +# 3931 1968,1 +# 545 1968,2 +# 4557 1969,1 +# 898 1969,2 +# 4657 1970,1 +# 1079 1970,2 +# 4132 1971,1 +# 791 1971,2 +# 4066 1972,1 +# 962 1972,2 +# 4097 1973,1 +# 877 1973,2 +# 4957 1974,1 +# 891 1974,2 +# 4185 1975,1 +# 699 1975,2 +# 
4557 1976,1 +# 1022 1976,2 +# 4817 1977,1 +# 1140 1977,2 +# 5645 1978,1 +# 1231 1978,2 +# 6433 1979,1 +# 1338 1979,2 +# 6173 1980,1 +# 1488 1980,2 +# 6620 1981,1 +# 1588 1981,2 +# 6244 1982,1 +# 1724 1982,2 +# 6890 1983,1 +# 1889 1983,2 +# 7348 1984,1 +# 1791 1984,2 +# 7043 1985,1 +# 2262 1985,2 +# 6147 1986,1 +# 1962 1986,2 +# 5776 1987,1 +# 1696 1987,2 +# 6449 1988,1 +# 1599 1988,2 +# 5408 1989,1 +# 1435 1989,2 +# 4541 1990,1 +# 1156 1990,2 +# 8 1991,0 +# 2377 1991,1 +# 689 1991,2 +# 1758 1992,1 +# 410 1992,2 +# 1398 1993,1 +# 289 1993,2 +# 927 1994,1 +# 288 1994,2 +# 664 1995,1 +# 163 1995,2 +# 234 1996,1 +# 100 1996,2 +# 164 1997,1 +# 87 1997,2 +# 1 birth year,gender # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +# $ cut -d, -f5 201402-citibike-tripdata.csv | grep '[1-9].*[1-9]'| wc -l +# 122406 # compute the average trip duration + +# $ awk -F, '{sum+=$1; count++} END {print sum/count}' 201402-citibike-tripdata.csv +# 874.516 \ No newline at end of file diff --git a/week1/intro_command_line.ipynb b/week1/intro_command_line.ipynb index 816197c9..f3750b64 100644 --- a/week1/intro_command_line.ipynb +++ b/week1/intro_command_line.ipynb @@ -919,7 +919,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "collapsed": false }, @@ -937,7 +937,7 @@ ], "source": [ "# count trips by gender\n", - "cut -d, -f15 201402-citibike-tripdata.csv | sort | uniq -c" + "cut -d, -f15 201402-citibike-tripdata.ccsv | sort | uniq -c" ] }, { @@ -1012,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": { "collapsed": false }, @@ -1030,7 +1030,7 @@ ], "source": [ "# use awk to count trips by gender without having to sort\n", - "awk -F, '{counts[$15]++} END {for (k in counts) print counts[k]\"\\t\" k }' 201402-citibike-tripdata.csv" + "awk -F, '{counts[$15]++} END git 201402-citibike-tripdata.csv" ] }, { diff 
########################################
# week1/plot_trips.R
# Plot Citibike trip data joined with daily weather.
# Requires: tidyverse, lubridate, scales (loaded at the top of the file).
########################################

# load RData file output by load_trips.R
# assumes it defines `trips` and `weather` data frames -- TODO confirm
load('trips.RData')

########################################
# plot trip data
########################################

# distribution of trip times across all rides: histogram vs. density
ggplot(trips, aes(x = tripduration)) +
  geom_histogram(fill = 'blue', bins = 30) +
  scale_x_log10(labels = comma) +
  ylab('Number of trips')

ggplot(trips, aes(x = tripduration)) +
  geom_density(fill = 'grey') +
  scale_x_log10(labels = comma)

# distribution of trip times by rider type, indicated with color and fill
ggplot(trips, aes(x = tripduration, color = usertype, fill = usertype)) +
  geom_histogram(bins = 30) +
  scale_x_log10(labels = comma) +
  ylab('Number of trips') +
  facet_grid(~ usertype)

# use alpha instead of a hard-coded fill so the per-usertype fill
# aesthetic is not silently overridden
ggplot(trips, aes(x = tripduration, color = usertype, fill = usertype)) +
  geom_density(alpha = 0.5) +
  scale_x_log10(labels = comma) +
  facet_grid(~ usertype)

# total number of trips on each day in the dataset
# count exact daily totals rather than binning dates into a histogram
trips %>%
  mutate(date = as.Date(starttime)) %>%
  count(date) %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  scale_y_continuous(labels = comma) +
  scale_x_date(date_labels = "%Y-%m-%d") +
  ylab('Number of trips')

# total number of trips by age and gender
# fill must be mapped inside aes(); a quoted 'gender' outside aes()
# is just a literal color name and produces no grouping
trips %>%
  mutate(age = year(ymd) - as.numeric(birth_year)) %>%
  ggplot(aes(x = age, fill = gender)) +
  geom_histogram(bins = 30)

# ratio of male to female trips by age, via pivot_wider()
trips %>%
  mutate(age = year(ymd) - as.numeric(birth_year)) %>%
  group_by(age, gender) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = gender, values_from = count) %>%
  ggplot(aes(x = age, y = Male / Female)) +
  geom_point() +
  scale_y_log10()

########################################
# plot weather data
########################################

# minimum temperature over each day
ggplot(weather, aes(x = date, y = tmin)) +
  geom_point()

# minimum and maximum temperature over each day, distinguished by color,
# after reshaping tmin/tmax into long format
weather %>%
  pivot_longer(names_to = "min_max", values_to = "temp",
               -c(date, prcp, snwd, snow, ymd)) %>%
  ggplot(aes(x = date, y = temp, color = min_max)) +
  geom_point()

########################################
# plot trip and weather data
########################################

# join trips and weather on the shared ymd column
trips_with_weather <- inner_join(trips, weather, by = "ymd")

# number of trips vs. minimum temperature, one point per day
trips_with_weather %>%
  group_by(ymd, tmin) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = tmin, y = count)) +
  geom_point() +
  xlab("minimum temperature") +
  ylab("number of trips")

# split by whether there was substantial precipitation,
# defined here as prcp above its mean
trips_with_weather %>%
  mutate(substantial_prcp = prcp > mean(prcp)) %>%
  group_by(ymd, tmin, substantial_prcp) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = tmin, y = count)) +
  geom_point() +
  xlab("minimum temperature") +
  ylab("number of trips") +
  facet_wrap(~ substantial_prcp)

# same plot with a smoothed fit on top
# (axis labels follow the aesthetics: tmin on x, trip count on y)
trips_with_weather %>%
  mutate(substantial_prcp = prcp > mean(prcp)) %>%
  group_by(ymd, tmin, substantial_prcp) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = tmin, y = count)) +
  geom_point() +
  geom_smooth() +
  xlab("minimum temperature") +
  ylab("number of trips") +
  facet_wrap(~ substantial_prcp)

# average and standard deviation of trips by hour of the day
trips %>%
  group_by(hour = hour(starttime), ymd) %>%
  summarise(n_trips = n(), .groups = "drop") %>%
  group_by(hour) %>%
  summarise(avg = mean(n_trips), std = sd(n_trips))

# plot the above; alpha is a fixed value, so it belongs outside aes()
trips %>%
  group_by(hour = hour(starttime), ymd) %>%
  summarise(n_trips = n(), .groups = "drop") %>%
  group_by(hour) %>%
  summarise(avg = mean(n_trips), std = sd(n_trips)) %>%
  ggplot(aes(x = hour, y = avg)) +
  geom_ribbon(aes(ymin = avg - std, ymax = avg + std), alpha = 0.25) +
  geom_line(color = "red")
# repeat the hourly averages, split by day of the week
# (wday() from lubridate labels each date Mon..Sun)
trips %>%
  mutate(day_week = wday(ymd, label = TRUE)) %>%
  group_by(day_week, hour = hour(starttime), ymd) %>%
  summarise(n_trips = n(), .groups = "drop") %>%
  group_by(day_week, hour) %>%
  summarise(avg = mean(n_trips), std = sd(n_trips), .groups = "drop") %>%
  ggplot(aes(x = hour, y = avg)) +
  geom_ribbon(aes(ymin = avg - std, ymax = avg + std), alpha = 0.5) +
  geom_line() +
  facet_wrap(~ day_week)

---
title: "trips_model"
date: '`r Sys.time()`'
output:
  html_document:
    #code_folding: hide
    number_sections: yes
    toc: yes
    toc_depth: 3
---

```{r setup, include=FALSE}
library(scales)
library(tidyverse)
library(knitr)

# set plot theme
theme_set(theme_bw())
```

```{r load-data, include=FALSE}
# chunk labels must be unique within an Rmd file or knitting aborts,
# so each chunk below gets its own descriptive label
set.seed(42)

# read relative to the document's directory so this knits on any machine
# (avoid hard-coded absolute paths)
data <- read_tsv("trips_per_day.tsv")
n <- nrow(data)
shuffled_indices <- sample(n)

# hold out 10% of the days as a final test set
n_test <- floor(0.1 * n)
test_idx <- shuffled_indices[1:n_test]
test_set <- data[test_idx, ]

remaining_idx <- shuffled_indices[(n_test + 1):n]
remaining_data <- data[remaining_idx, ]

# assign the remaining days round-robin to 5 cross-validation folds
num_folds <- 5
trips_per_day <- remaining_data %>%
  mutate(fold = ((row_number() - 1) %% num_folds) + 1)

head(trips_per_day)
```

```{r cross-validate, include=FALSE}
# fit a polynomial in tmin for each degree k, estimating generalization
# error with 5-fold cross-validation
K <- 1:8
avg_validate_err <- c()
se_validate_err <- c()
for (k in K) {
  validate_err <- c()
  for (f in 1:num_folds) {
    # fit on the training folds
    trips_per_day_train <- filter(trips_per_day, fold != f)
    model <- lm(num_trips ~ poly(tmin, k, raw = TRUE),
                data = trips_per_day_train)

    # evaluate RMSE on the held-out fold
    trips_per_day_validate <- filter(trips_per_day, fold == f)
    validate_err[f] <- sqrt(mean((predict(model, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
  }

  # average validation error across folds and its standard error
  avg_validate_err[k] <- mean(validate_err)
  se_validate_err[k] <- sd(validate_err) / sqrt(num_folds)
}
```

```{r plot-cv-error, include=FALSE}
# plot the validation error, highlighting the degree with the lowest
# average error; x-axis breaks match the degrees actually tried
plot_data <- data.frame(K, avg_validate_err, se_validate_err)
ggplot(plot_data, aes(x = K, y = avg_validate_err)) +
  geom_pointrange(aes(ymin = avg_validate_err - se_validate_err,
                      ymax = avg_validate_err + se_validate_err,
                      color = avg_validate_err == min(avg_validate_err))) +
  geom_line(color = "red") +
  scale_x_continuous(breaks = K) +
  theme(legend.position = "none") +
  xlab('Polynomial Degree') +
  ylab('RMSE on validation data')
```

```{r train-validate-split, include=FALSE}
# simple 80/20 train/validation split for the feature-selection models
n <- nrow(trips_per_day)
train_size <- floor(0.8 * n)

# randomly shuffle row indices
shuffled_indices <- sample(n)

# index split
train_indices <- shuffled_indices[1:train_size]
validation_indices <- shuffled_indices[(train_size + 1):n]

# create the two datasets
trips_per_day_train <- trips_per_day[train_indices, ]
trips_per_day_validate <- trips_per_day[validation_indices, ]
```

```{r model-all-features, include=FALSE}
# model with main effects only: prcp snwd snow tmax tmin
model_with_all_features <- trips_per_day_train |> select(-ymd, -date, -fold)

model_with_all <- lm(num_trips ~ ., data = model_with_all_features)
model_with_all

validate_err_with_all <- sqrt(mean((predict(model_with_all, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
validate_err_with_all
```

```{r model-all-interactions, include=FALSE}
# model with all pairwise interactions between the features
model_with_all_interactions <- lm(num_trips ~ .^2,
                                  data = model_with_all_features)
model_with_all_interactions

validate_err_with_interactions_all <- sqrt(mean((predict(model_with_all_interactions, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
validate_err_with_interactions_all
```

```{r model-tmin-interactions, include=FALSE}
# main effects plus interactions with tmin only
# (snwd main effect included so the model stays hierarchical alongside
# its snwd:tmin interaction; single assignment -- the stray
# `<- model <-` double assignment is gone)
model_with_all_interactions_with_tmin <- lm(
  num_trips ~ prcp + snwd + snow + tmax + tmin +
    prcp:tmin + snwd:tmin + snow:tmin + tmax:tmin,
  data = model_with_all_features
)
model_with_all_interactions_with_tmin

validate_err_with_interactions_with_min_all <- sqrt(mean((predict(model_with_all_interactions_with_tmin, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
validate_err_with_interactions_with_min_all
```

```{r model-poly4-tmin-interactions, include=FALSE}
# degree-4 polynomial in tmin plus linear interactions with tmin,
# completing the formula that was left unfinished
model_with_poly4_and_linear_interactions <- lm(
  num_trips ~ poly(tmin, 4, raw = TRUE) + prcp + snwd + snow + tmax +
    prcp:tmin + snwd:tmin + snow:tmin + tmax:tmin,
  data = model_with_all_features
)

# predict on the validation set
predictions <- predict(model_with_poly4_and_linear_interactions, trips_per_day_validate)

# compute validation RMSE
validate_err_poly4_with_interactions <- sqrt(
  mean((predictions - trips_per_day_validate$num_trips)^2)
)

# print validation error
validate_err_poly4_with_interactions
```