diff --git a/TODO_PIPELINES/lillie_pipeline.md b/TODO_PIPELINES/lillie_pipeline.md
new file mode 100644
index 0000000..2a111c2
--- /dev/null
+++ b/TODO_PIPELINES/lillie_pipeline.md
@@ -0,0 +1,27 @@
+## APIs/Data Resources:
+1. https://collegefootballdata.com/
+- advertises free data on NCAA football
+- not sure if it covers just FBS or FCS as well
+- not sure about its update speed or accuracy, but it seems like a starting place
+2. https://www.basketball-reference.com/
+- another supposedly free site covering the NBA, ABA, G League, and WNBA
+- team, player, and league information
+- not sure how the data is downloaded
+3. https://www.sports-reference.com/
+- the more general sports site behind basketball-reference
+- includes more sports, including baseball, pro and college football, and pro and college basketball
+- not sure about update timing, but it has good historical data
+- not betting-specific
+4. https://www.kaggle.com/datasets/ehallmar/nba-historical-stats-and-betting-data
+- moneyline betting information for NBA games
+5. https://www.kaggle.com/datasets/scottfree/sports-lines
+- betting lines, over/unders, and game results for select seasons of select sports
+- offers variety, plus an AlphaPy Python model for analyzing trends in the game results
+
+## High-Level Workflow
+1. Collect data from the APIs or data resources
+2. Filter and clean the data down to the desired values and parameters
+3. Split the data into train and test sets
+4. Fit a linear regression model
+5. Evaluate accuracy --> RMSE, MAE, R^2
+6. Adjust and improve
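A minimal sketch of workflow steps 3-5, assuming scikit-learn is available (it is not referenced anywhere else in this diff) and using the processed QB passing CSV produced by the preprocessing script further down; treating `passing_interceptions` as the lone feature and `passing_yards` as the target is an illustrative choice only, not something the pipeline has decided yet:

```python
# Sketch of workflow steps 3-5 (split, fit, evaluate) on the processed QB data.
# Assumes scikit-learn; the feature/target assignment is illustrative only.
import numpy as np
import polars as pl
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

df = pl.read_csv("sports_data/processed/qb_passing_2023_2024_processed.csv")
X = df.select("passing_interceptions").to_numpy()  # single illustrative feature
y = df.get_column("passing_yards").to_numpy()       # target

# 3. Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Fit a linear regression model
model = LinearRegression().fit(X_train, y_train)

# 5. Evaluate accuracy --> RMSE, MAE, R^2
preds = model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
print("MAE: ", mean_absolute_error(y_test, preds))
print("R^2: ", r2_score(y_test, preds))
```

With a single feature the R^2 will likely be low; the point is the split/fit/evaluate mechanics, not a finished model.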
+ """ + required = { + "season", + "week", + "team", + "attempts", + "passing_yards", + "passing_interceptions", + } + missing = required - set(qb_stats.columns) + if missing: + raise ValueError(f"Missing required columns: {sorted(missing)}") + + qb_stats = qb_stats.sort( + ["season", "week", "team", "attempts"], + descending=[False, False, False, True], + ) + + return qb_stats.unique(subset=["season", "week", "team"], keep="first") + + +def main() -> None: + stats = nfl.load_player_stats(SEASONS) + + qbs = stats.filter( + (pl.col("position") == "QB") + & (pl.col("attempts").is_not_null()) + & (pl.col("attempts") > 0) + & (pl.col("season_type") == "REG") + ) + + starters = pick_starting_qb_per_team_week(qbs) + + raw = starters.select( + [ + "season", + "week", + "team", + "opponent_team", + "player_id", + "player_display_name", + "attempts", + "passing_yards", + "passing_interceptions", + ] + ) + + repo_root = Path(__file__).resolve().parents[1] + out_dir = repo_root / "sports_data" / "raw" + out_dir.mkdir(parents=True, exist_ok=True) + + out_path = out_dir / "qb_passing_2023_2024_raw.csv" + raw.write_csv(out_path) + + print(f"✅ Wrote raw data -> {out_path}") + print(f"Rows: {raw.height}, Columns: {len(raw.columns)}") + + +if __name__ == "__main__": + main() diff --git a/data_scraping/Lillie_QB_passing_preprocessing.py b/data_scraping/Lillie_QB_passing_preprocessing.py new file mode 100644 index 0000000..1a12298 --- /dev/null +++ b/data_scraping/Lillie_QB_passing_preprocessing.py @@ -0,0 +1,117 @@ +# data_scraping/Lillie_QB_passing_preprocessing.py +""" +Preprocessing script for NFL QB passing data. + +Input: + sports_data/raw/qb_passing_2023_2024_raw.csv + +Output: + sports_data/processed/qb_passing_2023_2024_processed.csv + +Final dataset columns: + - passing_yards + - passing_interceptions +""" + +from __future__ import annotations + +from pathlib import Path + +import polars as pl + + +def cap_outliers_3sigma(df: pl.DataFrame, column: str) -> pl.DataFrame: + """ + Caps values outside mean ± 3 * std (winsorization). 
+ """ + stats = df.select( + pl.col(column).mean().alias("mean"), + pl.col(column).std().alias("std"), + ).row(0) + + mean, std = stats + if mean is None or std is None or std == 0: + return df + + lower = mean - 3 * std + upper = mean + 3 * std + + return df.with_columns( + pl.when(pl.col(column) < lower).then(lower) + .when(pl.col(column) > upper).then(upper) + .otherwise(pl.col(column)) + .alias(column) + ) + + +def main() -> None: + repo_root = Path(__file__).resolve().parents[1] + + raw_path = repo_root / "sports_data" / "raw" / "qb_passing_2023_2024_raw.csv" + if not raw_path.exists(): + raise FileNotFoundError(f"Raw data file not found: {raw_path}") + + df = pl.read_csv(raw_path) + + # ------------------------- + # a) Handle Missing Values + # ------------------------- + df = df.drop_nulls(subset=["passing_yards", "passing_interceptions"]) + + # ------------------------- + # b) Remove Duplicates + # ------------------------- + # If duplicates exist for a team/week, keep the one with most attempts + df = df.sort(["season", "week", "team", "attempts"], descending=[False, False, False, True]) + df = df.unique(subset=["season", "week", "team"], keep="first") + + # ------------------------- + # c) Data Type Validation + # ------------------------- + df = df.with_columns( + [ + pl.col("season").cast(pl.Int32, strict=False), + pl.col("week").cast(pl.Int32, strict=False), + pl.col("attempts").cast(pl.Int32, strict=False), + pl.col("passing_yards").cast(pl.Int32, strict=False), + pl.col("passing_interceptions").cast(pl.Int32, strict=False), + ] + ) + + # ------------------------- + # d) Outlier Detection + # ------------------------- + df = cap_outliers_3sigma(df, "passing_yards") + df = cap_outliers_3sigma(df, "passing_interceptions") + + # ------------------------- + # e) Consistency Checks + # ------------------------- + df = df.with_columns( + [ + pl.col("team").cast(pl.Utf8).str.strip_chars().str.to_uppercase(), + pl.col("opponent_team").cast(pl.Utf8).str.strip_chars().str.to_uppercase(), + # sanity: no negative values + pl.when(pl.col("passing_yards") < 0).then(0).otherwise(pl.col("passing_yards")).alias("passing_yards"), + pl.when(pl.col("passing_interceptions") < 0) + .then(0) + .otherwise(pl.col("passing_interceptions")) + .alias("passing_interceptions"), + ] + ) + + # Final dataset: ONLY 2 columns (per assignment) + processed = df.select(["passing_yards", "passing_interceptions"]) + + out_dir = repo_root / "sports_data" / "processed" + out_dir.mkdir(parents=True, exist_ok=True) + + out_path = out_dir / "qb_passing_2023_2024_processed.csv" + processed.write_csv(out_path) + + print(f"✅ Processed data written to {out_path}") + print(f"Rows: {processed.height}, Columns: {len(processed.columns)}") + + +if __name__ == "__main__": + main()