Donsven · lkvehling · Nov 15, 2025 · Jan 5, 2026
diff --git a/TODO_PIPELINES/lillie_pipeline.md b/TODO_PIPELINES/lillie_pipeline.md
@@ -0,0 +1,27 @@
+## APIs/Data Resources:
+1. https://collegefootballdata.com/
+- advertises free data on NCAA football
+- not sure if it is just FBS or FCS also
+- not sure about its update speed or accuracy, but it seems like a good starting place
+2. https://www.basketball-reference.com/
+- another supposedly free site on NBA, ABA, G League, and WNBA
+- team, player, league information
+- not sure how the data is downloaded
+3. https://www.sports-reference.com/
+- a more general, multi-sport counterpart to basketball-reference
+- covers more sports, including baseball, football (pro and college), and basketball (pro and college)
+- not sure on update time but has good historical data
+- not betting specific
+4. https://www.kaggle.com/datasets/ehallmar/nba-historical-stats-and-betting-data
+- moneyline betting information for NBA games
+5. https://www.kaggle.com/datasets/scottfree/sports-lines
+- betting information for line, over/under, and game results for select seasons of select sports
+- offers a variety of data and also an AlphaPy Python model to analyze trends in the game results
+
+## High Level WorkFlow
+1. Collect data from APIs or data resources
+2. Filter and clean the data down to the desired values and parameters
+3. Split the data into train and test
+4. Fit a linear regression model
+5. Evaluate accuracy --> RMSE, MAE, R^2
+6. Adjust and improve
diff --git a/data_scraping/Lillie_QB_passing.py b/data_scraping/Lillie_QB_passing.py
@@ -0,0 +1,83 @@
+"""
+NFL QB Passing Yards + Interceptions data collection.
+
+Outputs RAW CSV to:
+  sports_data/raw/qb_passing_2023_2024_raw.csv
+
+Notes:
+- Uses weekly player stats from nflreadpy
+- "Starting QB" is approximated as the QB with the most pass attempts
+  for each (season, week, team)
+"""
+
+from pathlib import Path
+
+import nflreadpy as nfl
+import polars as pl
+
+SEASONS = [2023, 2024]
+
+
+def pick_starting_qb_per_team_week(qb_stats: pl.DataFrame) -> pl.DataFrame:
+    """
+    Select one QB per (season, week, team) based on highest passing attempts.
+    """
+    required = {
+        "season",
+        "week",
+        "team",
+        "attempts",
+        "passing_yards",
+        "passing_interceptions",
+    }
+    missing = required - set(qb_stats.columns)
+    if missing:
+        raise ValueError(f"Missing required columns: {sorted(missing)}")
+
+    qb_stats = qb_stats.sort(
+        ["season", "week", "team", "attempts"],
+        descending=[False, False, False, True],
+    )
+
+    return qb_stats.unique(subset=["season", "week", "team"], keep="first")
+
+
+def main() -> None:
+    stats = nfl.load_player_stats(SEASONS)
+
+    qbs = stats.filter(
+        (pl.col("position") == "QB")
+        & (pl.col("attempts").is_not_null())
+        & (pl.col("attempts") > 0)
+        & (pl.col("season_type") == "REG")
+    )
+
+    starters = pick_starting_qb_per_team_week(qbs)
+
+    raw = starters.select(
+        [
+            "season",
+            "week",
+            "team",
+            "opponent_team",
+            "player_id",
+            "player_display_name",
+            "attempts",
+            "passing_yards",
+            "passing_interceptions",
+        ]
+    )
+
+    repo_root = Path(__file__).resolve().parents[1]
+    out_dir = repo_root / "sports_data" / "raw"
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    out_path = out_dir / "qb_passing_2023_2024_raw.csv"
+    raw.write_csv(out_path)
+
+    print(f"✅ Wrote raw data -> {out_path}")
+    print(f"Rows: {raw.height}, Columns: {len(raw.columns)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data_scraping/Lillie_QB_passing_preprocessing.py b/data_scraping/Lillie_QB_passing_preprocessing.py
@@ -0,0 +1,117 @@
+# data_scraping/Lillie_QB_passing_preprocessing.py
+"""
+Preprocessing script for NFL QB passing data.
+
+Input:
+  sports_data/raw/qb_passing_2023_2024_raw.csv
+
+Output:
+  sports_data/processed/qb_passing_2023_2024_processed.csv
+
+Final dataset columns:
+  - passing_yards
+  - passing_interceptions
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import polars as pl
+
+
+def cap_outliers_3sigma(df: pl.DataFrame, column: str) -> pl.DataFrame:
+    """
+    Caps values outside mean ± 3 * std (winsorization).
+    """
+    stats = df.select(
+        pl.col(column).mean().alias("mean"),
+        pl.col(column).std().alias("std"),
+    ).row(0)
+
+    mean, std = stats
+    if mean is None or std is None or std == 0:
+        return df
+
+    lower = mean - 3 * std
+    upper = mean + 3 * std
+
+    return df.with_columns(
+        pl.when(pl.col(column) < lower).then(lower)
+        .when(pl.col(column) > upper).then(upper)
+        .otherwise(pl.col(column))
+        .alias(column)
+    )
+
+
+def main() -> None:
+    repo_root = Path(__file__).resolve().parents[1]
+
+    raw_path = repo_root / "sports_data" / "raw" / "qb_passing_2023_2024_raw.csv"
+    if not raw_path.exists():
+        raise FileNotFoundError(f"Raw data file not found: {raw_path}")
+
+    df = pl.read_csv(raw_path)
+
+    # -------------------------
+    # a) Handle Missing Values
+    # -------------------------
+    df = df.drop_nulls(subset=["passing_yards", "passing_interceptions"])
+
+    # -------------------------
+    # b) Remove Duplicates
+    # -------------------------
+    # If duplicates exist for a team/week, keep the one with most attempts
+    df = df.sort(["season", "week", "team", "attempts"], descending=[False, False, False, True])
+    df = df.unique(subset=["season", "week", "team"], keep="first")
+
+    # -------------------------
+    # c) Data Type Validation
+    # -------------------------
+    df = df.with_columns(
+        [
+            pl.col("season").cast(pl.Int32, strict=False),
+            pl.col("week").cast(pl.Int32, strict=False),
+            pl.col("attempts").cast(pl.Int32, strict=False),
+            pl.col("passing_yards").cast(pl.Int32, strict=False),
+            pl.col("passing_interceptions").cast(pl.Int32, strict=False),
+        ]
+    )
+
+    # -------------------------
+    # d) Outlier Detection
+    # -------------------------
+    df = cap_outliers_3sigma(df, "passing_yards")
+    df = cap_outliers_3sigma(df, "passing_interceptions")
+
+    # -------------------------
+    # e) Consistency Checks
+    # -------------------------
+    df = df.with_columns(
+        [
+            pl.col("team").cast(pl.Utf8).str.strip_chars().str.to_uppercase(),
+            pl.col("opponent_team").cast(pl.Utf8).str.strip_chars().str.to_uppercase(),
+            # sanity: no negative values
+            pl.when(pl.col("passing_yards") < 0).then(0).otherwise(pl.col("passing_yards")).alias("passing_yards"),
+            pl.when(pl.col("passing_interceptions") < 0)
+            .then(0)
+            .otherwise(pl.col("passing_interceptions"))
+            .alias("passing_interceptions"),
+        ]
+    )
+
+    # Final dataset: ONLY 2 columns (per assignment)
+    processed = df.select(["passing_yards", "passing_interceptions"])
+
+    out_dir = repo_root / "sports_data" / "processed"
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    out_path = out_dir / "qb_passing_2023_2024_processed.csv"
+    processed.write_csv(out_path)
+
+    print(f"✅ Processed data written to {out_path}")
+    print(f"Rows: {processed.height}, Columns: {len(processed.columns)}")
+
+
+if __name__ == "__main__":
+    main()