Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions TODO_PIPELINES/lillie_pipeline.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
## APIs/Data Resources:
1. https://collegefootballdata.com/
- advertises free data on NCAA football
- not sure if it is just FBS or FCS also
- not sure on its speed for updates or accuracy but seems like a starting place
2. https://www.basketball-reference.com/
- another supposedly free site on NBA, ABA, G League, and WNBA
- team, player, league information
- not sure how the data is downloaded
3. https://www.sports-reference.com/
- more general sports coverage than basketball-reference
- covers more sports, including baseball, football (pro and college), and basketball (pro and college)
- not sure on update time but has good historical data
- not betting specific
4. https://www.kaggle.com/datasets/ehallmar/nba-historical-stats-and-betting-data
- moneyline betting information for NBA games
5. https://www.kaggle.com/datasets/scottfree/sports-lines
- betting information for line, over/under, and game results for select seasons of select sports
- offers variety and also an AlphaPy Python model to analyze the trend data in the game results

## High Level WorkFlow
1. Collect data from APIs or data resources
2. Filter and Clean Data into desired values and parameters
3. Split the data into train and test
4. Fit a linear regression model
5. Evaluate accuracy --> RMSE, MAE, R^2
6. Adjust and improve
83 changes: 83 additions & 0 deletions data_scraping/Lillie_QB_passing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
NFL QB Passing Yards + Interceptions data collection.

Outputs RAW CSV to:
sports_data/raw/qb_passing_2023_2024_raw.csv

Notes:
- Uses weekly player stats from nflreadpy
- "Starting QB" is approximated as the QB with the most pass attempts
for each (season, week, team)
"""

from pathlib import Path

import nflreadpy as nfl
import polars as pl

# Seasons passed to nfl.load_player_stats() in main().
SEASONS = [2023, 2024]


def pick_starting_qb_per_team_week(qb_stats: pl.DataFrame) -> pl.DataFrame:
    """
    Select one QB per (season, week, team) based on highest passing attempts.

    Args:
        qb_stats: Per-player rows. Must contain the columns ``season``,
            ``week``, ``team``, ``attempts``, ``passing_yards`` and
            ``passing_interceptions``.

    Returns:
        A frame with one row per (season, week, team): the row with the most
        ``attempts``. Rows come back ordered by season, week, team.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    required = {
        "season",
        "week",
        "team",
        "attempts",
        "passing_yards",
        "passing_interceptions",
    }
    missing = required - set(qb_stats.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    # Highest-attempt row first within each team-week. nulls_last=True keeps a
    # row with unknown attempts from outranking rows with real attempt counts
    # under the descending sort.
    # NOTE(review): ties on attempts are not broken explicitly.
    ranked = qb_stats.sort(
        ["season", "week", "team", "attempts"],
        descending=[False, False, False, True],
        nulls_last=True,
    )

    # maintain_order=True preserves the sorted row order in the result;
    # without it polars does not guarantee the order of the returned rows.
    return ranked.unique(
        subset=["season", "week", "team"],
        keep="first",
        maintain_order=True,
    )


def main() -> None:
    """
    Collect starting-QB passing stats and write them as a raw CSV.

    Loads weekly player stats for ``SEASONS``, keeps regular-season QB rows
    with at least one pass attempt, picks one QB per (season, week, team),
    and writes the selected columns to
    ``<repo_root>/sports_data/raw/qb_passing_2023_2024_raw.csv``.
    """
    # Row filters, named individually so the combined predicate reads clearly.
    is_qb = pl.col("position") == "QB"
    has_attempts = pl.col("attempts").is_not_null() & (pl.col("attempts") > 0)
    is_regular_season = pl.col("season_type") == "REG"

    weekly_stats = nfl.load_player_stats(SEASONS)
    qb_rows = weekly_stats.filter(is_qb & has_attempts & is_regular_season)

    output_columns = [
        "season",
        "week",
        "team",
        "opponent_team",
        "player_id",
        "player_display_name",
        "attempts",
        "passing_yards",
        "passing_interceptions",
    ]
    raw = pick_starting_qb_per_team_week(qb_rows).select(output_columns)

    # The repository root is one directory above this script's folder.
    out_dir = Path(__file__).resolve().parents[1] / "sports_data" / "raw"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "qb_passing_2023_2024_raw.csv"

    raw.write_csv(out_path)

    print(f"✅ Wrote raw data -> {out_path}")
    print(f"Rows: {raw.height}, Columns: {len(raw.columns)}")


# Run the data collection only when this file is executed as a script.
if __name__ == "__main__":
    main()
117 changes: 117 additions & 0 deletions data_scraping/Lillie_QB_passing_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# data_scraping/Lillie_QB_passing_preprocessing.py
"""
Preprocessing script for NFL QB passing data.

Input:
sports_data/raw/qb_passing_2023_2024_raw.csv

Output:
sports_data/processed/qb_passing_2023_2024_processed.csv

Final dataset columns:
- passing_yards
- passing_interceptions
"""

from __future__ import annotations

from pathlib import Path

import polars as pl


def cap_outliers_3sigma(df: pl.DataFrame, column: str) -> pl.DataFrame:
    """
    Winsorize ``column`` by clamping values to mean ± 3 * std.

    Args:
        df: Frame containing ``column``.
        column: Name of the column to cap.

    Returns:
        ``df`` unchanged when the mean or std is unavailable or the std is
        zero; otherwise a frame in which values of ``column`` below the lower
        bound are replaced by that bound and values above the upper bound are
        replaced by that bound.
    """
    target = pl.col(column)

    summary = df.select(
        target.mean().alias("mean"),
        target.std().alias("std"),
    )
    center, spread = summary.row(0)

    # Nothing to cap without a usable mean and a non-zero spread.
    if center is None or spread is None or spread == 0:
        return df

    half_width = 3 * spread
    lower = center - half_width
    upper = center + half_width

    capped = (
        pl.when(target < lower)
        .then(lower)
        .when(target > upper)
        .then(upper)
        .otherwise(target)
        .alias(column)
    )
    return df.with_columns(capped)


def main() -> None:
    """
    Clean the raw QB passing CSV and write the two-column processed dataset.

    Reads ``<repo_root>/sports_data/raw/qb_passing_2023_2024_raw.csv``, then:
    validates types and normalizes team codes, drops rows with missing
    targets, removes duplicate team-weeks, caps 3-sigma outliers, clamps
    negative values to zero, and writes ``passing_yards`` and
    ``passing_interceptions`` to
    ``<repo_root>/sports_data/processed/qb_passing_2023_2024_processed.csv``.

    Raises:
        FileNotFoundError: If the raw CSV does not exist.
    """
    repo_root = Path(__file__).resolve().parents[1]

    raw_path = repo_root / "sports_data" / "raw" / "qb_passing_2023_2024_raw.csv"
    if not raw_path.exists():
        raise FileNotFoundError(f"Raw data file not found: {raw_path}")

    df = pl.read_csv(raw_path)

    # -------------------------
    # c) Data Type Validation
    # -------------------------
    # Done first: the non-strict casts turn unparseable values into nulls, so
    # they must run before the null drop below. Team codes are normalized here
    # as well so the duplicate check compares canonical values.
    df = df.with_columns(
        [
            pl.col("season").cast(pl.Int32, strict=False),
            pl.col("week").cast(pl.Int32, strict=False),
            pl.col("attempts").cast(pl.Int32, strict=False),
            pl.col("passing_yards").cast(pl.Int32, strict=False),
            pl.col("passing_interceptions").cast(pl.Int32, strict=False),
            pl.col("team").cast(pl.Utf8).str.strip_chars().str.to_uppercase(),
            pl.col("opponent_team").cast(pl.Utf8).str.strip_chars().str.to_uppercase(),
        ]
    )

    # -------------------------
    # a) Handle Missing Values
    # -------------------------
    df = df.drop_nulls(subset=["passing_yards", "passing_interceptions"])

    # -------------------------
    # b) Remove Duplicates
    # -------------------------
    # If duplicates exist for a team/week, keep the one with most attempts.
    # nulls_last=True keeps a row with unknown attempts from sorting ahead of
    # real attempt counts; maintain_order=True makes the output order
    # deterministic (season, week, team).
    df = df.sort(
        ["season", "week", "team", "attempts"],
        descending=[False, False, False, True],
        nulls_last=True,
    )
    df = df.unique(
        subset=["season", "week", "team"],
        keep="first",
        maintain_order=True,
    )

    # -------------------------
    # d) Outlier Detection
    # -------------------------
    df = cap_outliers_3sigma(df, "passing_yards")
    df = cap_outliers_3sigma(df, "passing_interceptions")

    # -------------------------
    # e) Consistency Checks
    # -------------------------
    # sanity: no negative values
    df = df.with_columns(
        [
            pl.when(pl.col("passing_yards") < 0)
            .then(0)
            .otherwise(pl.col("passing_yards"))
            .alias("passing_yards"),
            pl.when(pl.col("passing_interceptions") < 0)
            .then(0)
            .otherwise(pl.col("passing_interceptions"))
            .alias("passing_interceptions"),
        ]
    )

    # Final dataset: ONLY 2 columns (per assignment)
    processed = df.select(["passing_yards", "passing_interceptions"])

    out_dir = repo_root / "sports_data" / "processed"
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / "qb_passing_2023_2024_processed.csv"
    processed.write_csv(out_path)

    print(f"✅ Processed data written to {out_path}")
    print(f"Rows: {processed.height}, Columns: {len(processed.columns)}")


# Run the preprocessing only when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading