Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
950 changes: 950 additions & 0 deletions notebooks/eda_credit_history_demo.ipynb

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/Example Usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#Example Usage
# Demo snippet: assumes X_train (a DataFrame containing these columns) and
# Subset2CategoricalPerformanceTransformer (from src/transformers.py) are
# already in scope — this file is not runnable standalone.
cat_cols_subset2 = ["pymnt_plan", "purpose", "title", "zip_code", "addr_state"]

# Build the transformer over the subset-2 categorical columns.
transformer = Subset2CategoricalPerformanceTransformer(cat_cols=cat_cols_subset2)

# Learn per-column rare-category sets and fit the one-hot encoder on train data.
transformer.fit(X_train[cat_cols_subset2])

# Dense numpy array: one-hot columns followed by two numeric text features.
X_train_cat = transformer.transform(X_train[cat_cols_subset2])
Binary file added src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file added src/__pycache__/eda_credit_history.cpython-39.pyc
Binary file not shown.
Binary file added src/__pycache__/transformers.cpython-39.pyc
Binary file not shown.
31 changes: 31 additions & 0 deletions src/cleaning_categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
def normalize_cat_strings(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Lowercase and strip whitespace in the given categorical columns.

    Missing values are left as NaN. (The previous implementation ran
    ``astype(str)`` on the whole column, turning NaN into the literal
    string ``"nan"``, which then silently escaped the sentinel fill in
    ``fill_cat_missing_with_sentinel`` and leaked a bogus "nan" category.)

    Returns a copy; the input DataFrame is not mutated.
    """
    df = df.copy()
    for c in cols:
        # Normalize only the non-null entries so NaNs survive for fillna.
        mask = df[c].notna()
        df.loc[mask, c] = df.loc[mask, c].astype(str).str.lower().str.strip()
    return df


def fill_cat_missing_with_sentinel(
    df: pd.DataFrame,
    cols: list[str],
    label: str = "Missing",
) -> pd.DataFrame:
    """Replace NaNs and empty strings in *cols* with the sentinel *label*.

    Returns a copy; the input DataFrame is not mutated.
    """
    out = df.copy()
    for column in cols:
        out[column] = out[column].fillna(label).replace("", label)
    return out

def group_rare_categories(
    df: pd.DataFrame,
    col: str,
    min_count: int = 100,
) -> pd.DataFrame:
    """Collapse categories of *col* seen fewer than *min_count* times into "Other".

    NaNs are never counted as a category and are left untouched.
    Returns a copy; the input DataFrame is not mutated.
    """
    out = df.copy()
    frequencies = out[col].value_counts()
    rare_labels = frequencies[frequencies < min_count].index
    # Boolean mask is False for NaN rows (isin never matches NaN here),
    # matching the original where/~isin behavior.
    is_rare = out[col].isin(rare_labels)
    out.loc[is_rare, col] = "Other"
    return out
28 changes: 28 additions & 0 deletions src/cleaning_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
def add_desc_length(df: pd.DataFrame) -> pd.DataFrame:
    """Add "desc_len": character length of the "desc" column (0 for missing).

    Returns a copy; the input DataFrame is not mutated.
    """
    result = df.copy()
    desc_as_text = result["desc"].fillna("").astype(str)
    result["desc_len"] = desc_as_text.str.len()
    return result


def add_title_word_count(df: pd.DataFrame) -> pd.DataFrame:
    """Add "title_word_count": whitespace-delimited word count of "title" (0 for missing).

    Returns a copy; the input DataFrame is not mutated.
    """
    result = df.copy()
    title_as_text = result["title"].fillna("").astype(str)
    result["title_word_count"] = title_as_text.map(lambda text: len(text.split()))
    return result

###### Implement a functional composition helper:

def apply_cat_steps(
    df: pd.DataFrame,
    cols: list[str],
    steps: list,
) -> pd.DataFrame:
    """Pipe *df* through each ``step(df, cols) -> df`` in order.

    A small higher-order composition helper: each step receives the
    previous step's output. An empty *steps* list returns *df* unchanged.
    """
    current = df
    for transform in steps:
        current = transform(current, cols)
    return current

116 changes: 116 additions & 0 deletions src/eda_credit_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

import numpy as np
import pandas as pd
from typing import Dict, Any, Callable

# Numeric credit-history features explored against the target column below:
# debt ratios, delinquency/derogatory history, account counts, balances,
# revolving utilization, and recent credit inquiries.
CREDIT_NUMERIC_COLS = ["dti", "dti_joint", "delinq_2yrs", "mths_since_last_delinq", "mths_since_last_record", "mths_since_last_major_derog",
                       "open_acc", "total_acc", "pub_rec", "acc_now_delinq", "revol_bal", "revol_util", "total_rev_hi_lim",
                       "tot_coll_amt", "tot_cur_bal", "total_bal_il", "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m",
                       "mths_since_rcnt_il", "open_rv_12m", "open_rv_24m", "max_bal_bc", "all_util",
                       "inq_last_6mths", "inq_last_12m", "inq_fi", "collections_12_mths_ex_med"]

"""Goal:
Explore how credit history, balances, utilization, and inquiries
relate to loan_status"""

class CreditHistoryEDA:
    """EDA helpers relating numeric credit-history columns to a 0/1 target.

    All methods read columns listed in the module-level CREDIT_NUMERIC_COLS
    and assume those columns exist (and are numeric) in ``self.df``.
    """

    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
        """Store the full DataFrame and the name of the target column."""
        self.df = df
        self.target_col = target_col

    def credit_structure_summary(self) -> pd.DataFrame:
        """
        One row per CREDIT_NUMERIC_COLS column with:
          - column, dtype, n_missing, missing_pct, mean, std

        mean/std are computed unconditionally — the listed columns are all
        expected to be numeric (the old is_numeric_dtype branch was dead
        commented-out code and has been removed).
        """
        numeric_df = self.df[CREDIT_NUMERIC_COLS]
        n_rows = len(numeric_df)
        rows = []
        for col in CREDIT_NUMERIC_COLS:
            series = numeric_df[col]
            n_missing = int(series.isna().sum())
            rows.append({
                "column": col,
                "dtype": str(series.dtype),  # .dtype is the Series attribute (.dtypes is the DataFrame one)
                "n_missing": n_missing,
                # Guard the empty-frame case to avoid ZeroDivisionError.
                "missing_pct": (n_missing / n_rows) * 100 if n_rows else 0.0,
                "mean": series.mean(),
                "std": series.std(),
            })
        return pd.DataFrame(
            rows,
            columns=["column", "dtype", "n_missing", "missing_pct", "mean", "std"],
        )

    def default_rate_by_bucket(self, col: str, bins: int = 4):
        """
        Quantile-bucket a numeric credit column (e.g. dti, revol_util) and
        compute the default rate (mean of the 0/1 target) per bucket.

        Returns a DataFrame with columns:
          - <col> (interval bucket), n_loans, default_rate

        NOTE(review): pd.qcut raises on heavily tied data (duplicate bin
        edges); behavior intentionally left as pass-through — confirm
        callers only use well-spread columns.
        """
        buckets = pd.qcut(self.df[col], q=bins)  # NaNs fall outside every bucket
        return (
            self.df.groupby(buckets)[self.target_col]
            .agg(n_loans="count", default_rate="mean")
            .reset_index()
        )

    def correlation_with_default(self) -> pd.Series:
        """
        Pearson correlation of each numeric credit column with the target
        (assumes the target is encoded as 0/1 — TODO confirm upstream).
        Returns a Series indexed by column name.
        """
        correlation = {
            col: self.df[col].corr(self.df[self.target_col])
            for col in CREDIT_NUMERIC_COLS
        }
        return pd.Series(correlation, name="correlation_with_default")
###################### part 2
def credit_history_report(eda:CreditHistoryEDA):
    """Run a fixed suite of EDA steps and return step_name -> output.

    Demonstrates higher-order functions: the callables are stored in a
    dict first and only invoked afterwards.
    """
    steps: Dict[str, Callable[[], Any]] = {
        "structure_summary": eda.credit_structure_summary,
        "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
        "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
        "correlation_with_default": eda.correlation_with_default,
    }
    return {name: run() for name, run in steps.items()}




# # 2. Functional credit-history report
# # Add a functional report generator that coordinates several EDA steps:
# # """def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:"""
# # Build a dict of step_name -> callable and run them to produce
# # a combined report.
# # Example steps:
# # - "structure_summary": eda.credit_structure_summary
# # - "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5)
# # - "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5)
# # - "correlation_with_default": eda.correlation_with_default
# # Iterate over this dict, call each function, and return
# # a result dict: step_name -> output.
# # Example idea:
# # """ def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:
# # steps: Dict[str, Callable[[], Any]] = {
# # "structure_summary": eda.credit_structure_summary,
# # "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
# # "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
# # "correlation_with_default": eda.correlation_with_default,
# # }
# # report: Dict[str, Any] = {}
# # for name, func in steps.items():
# # report[name] = func()
# # return report"""
# This should clearly show higher-order functions (functions stored and called later).
58 changes: 58 additions & 0 deletions src/test_subset2_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#tests/test_subset2_transformer.py
import pandas as pd
from transformers import Subset2CategoricalPerformanceTransformer

def test_no_nans_after_transform():
    """Transformed output must be NaN-free even when inputs contain missing values."""
    frame = pd.DataFrame({
        "pymnt_plan": ["y", None, "n"],
        "purpose": ["debt", "wedding", None],
        "title": ["abc def", None, "xxx"],
        "zip_code": ["123xx", "456xx", "123xx"],
        "addr_state": ["CA", "ZZ", "CA"],
    })

    transformer = Subset2CategoricalPerformanceTransformer(
        cat_cols=list(frame.columns), min_count=2
    )
    transformer.fit(frame)
    encoded = transformer.transform(frame)

    assert not pd.isna(encoded).any()


def test_rare_category_grouping():
    """Every 'purpose' category occurs once (< min_count=3), so fit must
    record all of them as rare.

    The previous assertion ("... or len(tr.rare_maps_['purpose']) == 5")
    was a tautology for this fixture — it could never fail. Pin the actual
    contract of rare_maps_ instead.
    """
    df = pd.DataFrame({
        "purpose": ["a", "b", "c", "d", "e"],  # all rare
        "pymnt_plan": ["y", "y", "y", "y", "y"],
        "title": ["t"]*5,
        "zip_code": ["111"]*5,
        "addr_state": ["CA","CA","CA","CA","CA"],
    })

    tr = Subset2CategoricalPerformanceTransformer(
        cat_cols=df.columns.tolist(),
        min_count=3
    )
    tr.fit(df)

    # All five singleton categories are rare.
    assert tr.rare_maps_["purpose"] == {"a", "b", "c", "d", "e"}
    # "pymnt_plan" has one category with count 5 >= 3 — nothing rare.
    assert tr.rare_maps_["pymnt_plan"] == set()


def test_text_features_numeric():
    """The last two output columns (desc_len, title_word_count) must be numeric."""
    frame = pd.DataFrame({
        "pymnt_plan": ["y","n"],
        "purpose": ["debt","car"],
        "title": ["abc def","hello"],
        "zip_code": ["123","456"],
        "addr_state": ["CA","NY"],
    })

    transformer = Subset2CategoricalPerformanceTransformer(cat_cols=frame.columns.tolist())
    transformer.fit(frame)
    encoded = transformer.transform(frame)

    # Text features are appended last; dtype kind must be int or float.
    assert encoded[:, -2:].dtype.kind in ("i", "f")
50 changes: 50 additions & 0 deletions src/transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# In src/transformers.py, implement Subset2CategoricalPerformanceTransformer:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

# Local cleaning helpers used by the transformer below — they were called
# without ever being imported, which raised NameError at fit/transform time.
from cleaning_categorical import fill_cat_missing_with_sentinel, normalize_cat_strings
from cleaning_text import add_desc_length, add_title_word_count

class Subset2CategoricalPerformanceTransformer(BaseEstimator, TransformerMixin):
    """Clean, rare-group, and one-hot encode a fixed set of categorical
    columns, then append two numeric text features.

    ``transform`` returns a dense 2-D numpy array laid out as:
    [one-hot encoded columns ..., desc_len, title_word_count].
    """

    def __init__(self, cat_cols: list[str], min_count: int = 100):
        # cat_cols: categorical columns to encode.
        # min_count: categories seen fewer than this many times in fit are
        # collapsed into "Other".
        self.cat_cols = cat_cols
        self.min_count = min_count
        # NOTE(review): `sparse=` was renamed `sparse_output=` in
        # scikit-learn 1.2 and removed in 1.4 — confirm the pinned version.
        self.encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
        self.rare_maps_ = {}  # per-column set of rare categories learned in fit

    def _clean(self, X) -> pd.DataFrame:
        """Shared preprocessing: frame-ify input, normalize strings, fill missing."""
        df = pd.DataFrame(X, columns=self.cat_cols).copy()
        df = normalize_cat_strings(df, self.cat_cols)
        return fill_cat_missing_with_sentinel(df, self.cat_cols)

    def fit(self, X, y=None):
        """Learn per-column rare-category sets and fit the one-hot encoder.

        BUG FIX: the original read ``pd.DataFrame(X, columns[self.cat_cols])``
        (missing ``=``), which raised NameError on the undefined name
        ``columns`` the first time fit ran.
        """
        df = self._clean(X)
        for col in self.cat_cols:
            counts = df[col].value_counts()
            self.rare_maps_[col] = set(counts[counts < self.min_count].index)
            df[col] = df[col].where(~df[col].isin(self.rare_maps_[col]), "Other")
        self.encoder.fit(df[self.cat_cols])
        return self

    def transform(self, X):
        """Apply cleaning, rare grouping (maps from fit), OHE, and text features."""
        df = self._clean(X)
        for col in self.cat_cols:
            rare = self.rare_maps_.get(col, set())
            df[col] = df[col].where(~df[col].isin(rare), "Other")
        encoded = self.encoder.transform(df[self.cat_cols])

        # Text-derived numeric features; default missing columns to "" so the
        # helpers still work when "desc"/"title" are not among cat_cols.
        df_text = df.copy()
        df_text["desc"] = df_text.get("desc", "")
        df_text["title"] = df_text.get("title", "")
        df_text = add_desc_length(df_text)
        df_text = add_title_word_count(df_text)
        text_features = df_text[["desc_len", "title_word_count"]].values.astype(float)

        # Final output: one-hot block followed by the two numeric text columns.
        return np.hstack([encoded, text_features])