Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions data/data_scraping/Logan_OilFutures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Download ~2 years of daily front-month WTI Crude (CL) OHLCV bars from
Databento, attach each bar's contract expiration date, and save as CSV.

Requires the DATABENTO_API_KEY environment variable to be set.
"""

import datetime as dt
import os
from pathlib import Path

import databento as db

# -------------------
# 1. Config
# -------------------

DATASET = "GLBX.MDP3"    # CME Globex futures dataset
SYMBOL = "CL.n.0"        # Front-month continuous WTI Crude futures
STYPE_IN = "continuous"  # Continuous-contract symbology
SCHEMA = "ohlcv-1d"      # Daily OHLCV bars

today = dt.date.today()
start_date = (today - dt.timedelta(days=365 * 2)).isoformat()  # ~2 years back
end_date = today.isoformat()


# Fail fast with a clear message if the key is missing.
api_key = os.getenv("DATABENTO_API_KEY")
if not api_key:
    raise RuntimeError(
        "DATABENTO_API_KEY is not set. Please set it in your environment."
    )
client = db.Historical(api_key)

# -------------------
# Daily OHLCV data
# -------------------
bars = client.timeseries.get_range(
    dataset=DATASET,
    symbols=[SYMBOL],
    stype_in=STYPE_IN,
    schema=SCHEMA,
    start=start_date,
    end=end_date,
)

bars_df = bars.to_df()
bars_df["date"] = bars_df.index.date  # calendar date of each daily bar


# -------------------
# Get Expiration Dates
# -------------------
# Instrument definitions for the same window. `end` bounds the request to
# match the bars query above (an unbounded request costs more and can pull
# rows we never join against).
defs = client.timeseries.get_range(
    dataset=DATASET,
    symbols="ALL_SYMBOLS",
    schema="definition",
    start=start_date,
    end=end_date,
)
defs_df = defs.to_df()


# Filter to CL futures only (ALL_SYMBOLS includes every instrument class).
fut_df = defs_df[defs_df["instrument_class"] == db.InstrumentClass.FUTURE]
cl_defs = fut_df[fut_df["asset"] == "CL"].copy()

# Keep only what we need + drop duplicate instrument_ids (definitions can
# repeat when a contract's record is re-published during the window).
cl_defs = cl_defs[["instrument_id", "raw_symbol", "expiration"]].drop_duplicates(
    "instrument_id"
)

# -------------------
# Join expirations onto OHLCV
# -------------------
# Left join keeps every bar even if a definition row is missing
# (expiration becomes NaT in that case).
merged = bars_df.merge(
    cl_defs,
    on="instrument_id",
    how="left",
)


# -------------------
# Keep only your requested fields
# -------------------
final_df = merged[
    [
        "date",        # day of the bar
        "open",        # daily open price
        "high",        # daily high price
        "low",         # daily low price
        "close",       # daily closing price
        "volume",      # daily trading volume
        "expiration",  # contract expiration date
    ]
]


# Choose where you want to save it
BASE_DIR = Path(r"C:\Users\mammo\Documents\AISC\Predictive_Modeling")
output_path = BASE_DIR / "data" / "finance_data" / "raw" / "Logan_OilFuturesRaw.csv"
# Make sure the folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save to CSV (no index column in the file)
final_df.to_csv(output_path, index=False)

print(f"Saved CSV to: {output_path}")
35 changes: 35 additions & 0 deletions data/data_scraping/Logan_OilFutures_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Preprocess the raw WTI futures CSV: parse the date columns, drop rows
whose z-score exceeds 3 population standard deviations on any numeric
column, and save the cleaned result.
"""

from pathlib import Path

import pandas as pd

# NOTE(review): this path differs from where the scraping script saves the
# raw CSV (...\Predictive_Modeling\data\finance_data\raw\...) — confirm the
# raw file actually lives here.
csv_path = r"C:\Users\mammo\Documents\AISC\OilFutures-API\Logan_OilFuturesRaw.csv"

# parse_dates already yields datetime64[ns] columns, so no further
# pd.to_datetime conversion is needed.
df = pd.read_csv(csv_path, parse_dates=["date", "expiration"])


# Outlier removal: flag any row whose value is beyond 3 standard deviations
# from the mean on ANY numeric column (population std, ddof=0).
num_df = df.select_dtypes(include=["float64", "int64"])
means = num_df.mean()
stds = num_df.std(ddof=0)

# Z-score calculation
z_scores = (num_df - means) / stds

# Rows with any |z| > 3 are treated as outliers and dropped.
outlier_mask = (z_scores.abs() > 3).any(axis=1)
df = df[~outlier_mask].copy()

BASE_DIR = Path(r"C:\Users\mammo\Documents\AISC\Predictive_Modeling")
out_path = (
    BASE_DIR / "data" / "finance_data" / "processed" / "Logan_OilFuturesProcessed.csv"
)
# Create the destination folder first — the scraping script only creates
# raw/, so to_csv would otherwise fail with FileNotFoundError on first run.
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
Loading