Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
950 changes: 950 additions & 0 deletions notebooks/eda_credit_history_demo.ipynb

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/Example Usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#Example Usage
# Demo snippet: assumes X_train (a DataFrame containing these columns) and
# Subset2CategoricalPerformanceTransformer (from src/transformers.py) are
# already in scope — this file is not runnable standalone.
cat_cols_subset2 = ["pymnt_plan", "purpose", "title", "zip_code", "addr_state"]

# Build the transformer over the subset-2 categorical columns.
transformer = Subset2CategoricalPerformanceTransformer(cat_cols=cat_cols_subset2)

# Learn per-column rare-category sets and fit the one-hot encoder on train data.
transformer.fit(X_train[cat_cols_subset2])

# Dense numpy array: one-hot columns followed by two numeric text features.
X_train_cat = transformer.transform(X_train[cat_cols_subset2])
Binary file added src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file added src/__pycache__/eda_credit_history.cpython-39.pyc
Binary file not shown.
Binary file added src/__pycache__/transformers.cpython-39.pyc
Binary file not shown.
31 changes: 31 additions & 0 deletions src/cleaning_categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
def normalize_cat_strings(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Lowercase and strip whitespace in the given categorical columns.

    Missing values are left as NaN. (The previous implementation ran
    ``astype(str)`` on the whole column, turning NaN into the literal
    string ``"nan"``, which then silently escaped the sentinel fill in
    ``fill_cat_missing_with_sentinel`` and leaked a bogus "nan" category.)

    Returns a copy; the input DataFrame is not mutated.
    """
    df = df.copy()
    for c in cols:
        # Normalize only the non-null entries so NaNs survive for fillna.
        mask = df[c].notna()
        df.loc[mask, c] = df.loc[mask, c].astype(str).str.lower().str.strip()
    return df


def fill_cat_missing_with_sentinel(
    df: pd.DataFrame,
    cols: list[str],
    label: str = "Missing",
) -> pd.DataFrame:
    """Replace NaNs and empty strings in *cols* with the sentinel *label*.

    Returns a copy; the input DataFrame is not mutated.
    """
    out = df.copy()
    for column in cols:
        out[column] = out[column].fillna(label).replace("", label)
    return out

def group_rare_categories(
    df: pd.DataFrame,
    col: str,
    min_count: int = 100,
) -> pd.DataFrame:
    """Collapse categories of *col* seen fewer than *min_count* times into "Other".

    NaNs are never counted as a category and are left untouched.
    Returns a copy; the input DataFrame is not mutated.
    """
    out = df.copy()
    frequencies = out[col].value_counts()
    rare_labels = frequencies[frequencies < min_count].index
    # Boolean mask is False for NaN rows (isin never matches NaN here),
    # matching the original where/~isin behavior.
    is_rare = out[col].isin(rare_labels)
    out.loc[is_rare, col] = "Other"
    return out
28 changes: 28 additions & 0 deletions src/cleaning_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
def add_desc_length(df: pd.DataFrame) -> pd.DataFrame:
    """Add "desc_len": character length of the "desc" column (0 for missing).

    Returns a copy; the input DataFrame is not mutated.
    """
    result = df.copy()
    desc_as_text = result["desc"].fillna("").astype(str)
    result["desc_len"] = desc_as_text.str.len()
    return result


def add_title_word_count(df: pd.DataFrame) -> pd.DataFrame:
    """Add "title_word_count": whitespace-delimited word count of "title" (0 for missing).

    Returns a copy; the input DataFrame is not mutated.
    """
    result = df.copy()
    title_as_text = result["title"].fillna("").astype(str)
    result["title_word_count"] = title_as_text.map(lambda text: len(text.split()))
    return result

###### Implement a functional composition helper:

def apply_cat_steps(
    df: pd.DataFrame,
    cols: list[str],
    steps: list,
) -> pd.DataFrame:
    """Pipe *df* through each ``step(df, cols) -> df`` in order.

    A small higher-order composition helper: each step receives the
    previous step's output. An empty *steps* list returns *df* unchanged.
    """
    current = df
    for transform in steps:
        current = transform(current, cols)
    return current

116 changes: 116 additions & 0 deletions src/eda_credit_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

import numpy as np
import pandas as pd
from typing import Dict, Any, Callable

# Numeric credit-history features explored against the target column below:
# debt ratios, delinquency/derogatory history, account counts, balances,
# revolving utilization, and recent credit inquiries.
CREDIT_NUMERIC_COLS = ["dti", "dti_joint", "delinq_2yrs", "mths_since_last_delinq", "mths_since_last_record", "mths_since_last_major_derog",
                       "open_acc", "total_acc", "pub_rec", "acc_now_delinq", "revol_bal", "revol_util", "total_rev_hi_lim",
                       "tot_coll_amt", "tot_cur_bal", "total_bal_il", "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m",
                       "mths_since_rcnt_il", "open_rv_12m", "open_rv_24m", "max_bal_bc", "all_util",
                       "inq_last_6mths", "inq_last_12m", "inq_fi", "collections_12_mths_ex_med"]

"""Goal:
Explore how credit history, balances, utilization, and inquiries
relate to loan_status"""

class CreditHistoryEDA:
    """EDA helpers relating numeric credit-history columns to a 0/1 target.

    All methods read columns listed in the module-level CREDIT_NUMERIC_COLS
    and assume those columns exist (and are numeric) in ``self.df``.
    """

    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
        """Store the full DataFrame and the name of the target column."""
        self.df = df
        self.target_col = target_col

    def credit_structure_summary(self) -> pd.DataFrame:
        """
        One row per CREDIT_NUMERIC_COLS column with:
          - column, dtype, n_missing, missing_pct, mean, std

        mean/std are computed unconditionally — the listed columns are all
        expected to be numeric (the old is_numeric_dtype branch was dead
        commented-out code and has been removed).
        """
        numeric_df = self.df[CREDIT_NUMERIC_COLS]
        n_rows = len(numeric_df)
        rows = []
        for col in CREDIT_NUMERIC_COLS:
            series = numeric_df[col]
            n_missing = int(series.isna().sum())
            rows.append({
                "column": col,
                "dtype": str(series.dtype),  # .dtype is the Series attribute (.dtypes is the DataFrame one)
                "n_missing": n_missing,
                # Guard the empty-frame case to avoid ZeroDivisionError.
                "missing_pct": (n_missing / n_rows) * 100 if n_rows else 0.0,
                "mean": series.mean(),
                "std": series.std(),
            })
        return pd.DataFrame(
            rows,
            columns=["column", "dtype", "n_missing", "missing_pct", "mean", "std"],
        )

    def default_rate_by_bucket(self, col: str, bins: int = 4):
        """
        Quantile-bucket a numeric credit column (e.g. dti, revol_util) and
        compute the default rate (mean of the 0/1 target) per bucket.

        Returns a DataFrame with columns:
          - <col> (interval bucket), n_loans, default_rate

        NOTE(review): pd.qcut raises on heavily tied data (duplicate bin
        edges); behavior intentionally left as pass-through — confirm
        callers only use well-spread columns.
        """
        buckets = pd.qcut(self.df[col], q=bins)  # NaNs fall outside every bucket
        return (
            self.df.groupby(buckets)[self.target_col]
            .agg(n_loans="count", default_rate="mean")
            .reset_index()
        )

    def correlation_with_default(self) -> pd.Series:
        """
        Pearson correlation of each numeric credit column with the target
        (assumes the target is encoded as 0/1 — TODO confirm upstream).
        Returns a Series indexed by column name.
        """
        correlation = {
            col: self.df[col].corr(self.df[self.target_col])
            for col in CREDIT_NUMERIC_COLS
        }
        return pd.Series(correlation, name="correlation_with_default")
###################### part 2
def credit_history_report(eda:CreditHistoryEDA):
    """Run a fixed suite of EDA steps and return step_name -> output.

    Demonstrates higher-order functions: the callables are stored in a
    dict first and only invoked afterwards.
    """
    steps: Dict[str, Callable[[], Any]] = {
        "structure_summary": eda.credit_structure_summary,
        "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
        "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
        "correlation_with_default": eda.correlation_with_default,
    }
    return {name: run() for name, run in steps.items()}




# # 2. Functional credit-history report
# # Add a functional report generator that coordinates several EDA steps:
# # """def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:"""
# # Build a dict of step_name -> callable and run them to produce
# # a combined report.
# # Example steps:
# # - "structure_summary": eda.credit_structure_summary
# # - "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5)
# # - "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5)
# # - "correlation_with_default": eda.correlation_with_default
# # Iterate over this dict, call each function, and return
# # a result dict: step_name -> output.
# # Example idea:
# # """ def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:
# # steps: Dict[str, Callable[[], Any]] = {
# # "structure_summary": eda.credit_structure_summary,
# # "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
# # "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
# # "correlation_with_default": eda.correlation_with_default,
# # }
# # report: Dict[str, Any] = {}
# # for name, func in steps.items():
# # report[name] = func()
# # return report"""
# This should clearly show higher-order functions (functions stored and called later).
58 changes: 58 additions & 0 deletions src/test_subset2_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#tests/test_subset2_transformer.py
import pandas as pd
from transformers import Subset2CategoricalPerformanceTransformer

def test_no_nans_after_transform():
    """Transformed output must be NaN-free even when inputs contain missing values."""
    frame = pd.DataFrame({
        "pymnt_plan": ["y", None, "n"],
        "purpose": ["debt", "wedding", None],
        "title": ["abc def", None, "xxx"],
        "zip_code": ["123xx", "456xx", "123xx"],
        "addr_state": ["CA", "ZZ", "CA"],
    })

    transformer = Subset2CategoricalPerformanceTransformer(
        cat_cols=list(frame.columns), min_count=2
    )
    transformer.fit(frame)
    encoded = transformer.transform(frame)

    assert not pd.isna(encoded).any()


def test_rare_category_grouping():
    """Every 'purpose' category occurs once (< min_count=3), so fit must
    record all of them as rare.

    The previous assertion ("... or len(tr.rare_maps_['purpose']) == 5")
    was a tautology for this fixture — it could never fail. Pin the actual
    contract of rare_maps_ instead.
    """
    df = pd.DataFrame({
        "purpose": ["a", "b", "c", "d", "e"],  # all rare
        "pymnt_plan": ["y", "y", "y", "y", "y"],
        "title": ["t"]*5,
        "zip_code": ["111"]*5,
        "addr_state": ["CA","CA","CA","CA","CA"],
    })

    tr = Subset2CategoricalPerformanceTransformer(
        cat_cols=df.columns.tolist(),
        min_count=3
    )
    tr.fit(df)

    # All five singleton categories are rare.
    assert tr.rare_maps_["purpose"] == {"a", "b", "c", "d", "e"}
    # "pymnt_plan" has one category with count 5 >= 3 — nothing rare.
    assert tr.rare_maps_["pymnt_plan"] == set()


def test_text_features_numeric():
    """The last two output columns (desc_len, title_word_count) must be numeric."""
    frame = pd.DataFrame({
        "pymnt_plan": ["y","n"],
        "purpose": ["debt","car"],
        "title": ["abc def","hello"],
        "zip_code": ["123","456"],
        "addr_state": ["CA","NY"],
    })

    transformer = Subset2CategoricalPerformanceTransformer(cat_cols=frame.columns.tolist())
    transformer.fit(frame)
    encoded = transformer.transform(frame)

    # Text features are appended last; dtype kind must be int or float.
    assert encoded[:, -2:].dtype.kind in ("i", "f")
50 changes: 50 additions & 0 deletions src/transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# In src/transformers.py, implement Subset2CategoricalPerformanceTransformer:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

# Local cleaning helpers used by the transformer below — they were called
# without ever being imported, which raised NameError at fit/transform time.
from cleaning_categorical import fill_cat_missing_with_sentinel, normalize_cat_strings
from cleaning_text import add_desc_length, add_title_word_count

class Subset2CategoricalPerformanceTransformer(BaseEstimator, TransformerMixin):
    """Clean, rare-group, and one-hot encode a fixed set of categorical
    columns, then append two numeric text features.

    ``transform`` returns a dense 2-D numpy array laid out as:
    [one-hot encoded columns ..., desc_len, title_word_count].
    """

    def __init__(self, cat_cols: list[str], min_count: int = 100):
        # cat_cols: categorical columns to encode.
        # min_count: categories seen fewer than this many times in fit are
        # collapsed into "Other".
        self.cat_cols = cat_cols
        self.min_count = min_count
        # NOTE(review): `sparse=` was renamed `sparse_output=` in
        # scikit-learn 1.2 and removed in 1.4 — confirm the pinned version.
        self.encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
        self.rare_maps_ = {}  # per-column set of rare categories learned in fit

    def _clean(self, X) -> pd.DataFrame:
        """Shared preprocessing: frame-ify input, normalize strings, fill missing."""
        df = pd.DataFrame(X, columns=self.cat_cols).copy()
        df = normalize_cat_strings(df, self.cat_cols)
        return fill_cat_missing_with_sentinel(df, self.cat_cols)

    def fit(self, X, y=None):
        """Learn per-column rare-category sets and fit the one-hot encoder.

        BUG FIX: the original read ``pd.DataFrame(X, columns[self.cat_cols])``
        (missing ``=``), which raised NameError on the undefined name
        ``columns`` the first time fit ran.
        """
        df = self._clean(X)
        for col in self.cat_cols:
            counts = df[col].value_counts()
            self.rare_maps_[col] = set(counts[counts < self.min_count].index)
            df[col] = df[col].where(~df[col].isin(self.rare_maps_[col]), "Other")
        self.encoder.fit(df[self.cat_cols])
        return self

    def transform(self, X):
        """Apply cleaning, rare grouping (maps from fit), OHE, and text features."""
        df = self._clean(X)
        for col in self.cat_cols:
            rare = self.rare_maps_.get(col, set())
            df[col] = df[col].where(~df[col].isin(rare), "Other")
        encoded = self.encoder.transform(df[self.cat_cols])

        # Text-derived numeric features; default missing columns to "" so the
        # helpers still work when "desc"/"title" are not among cat_cols.
        df_text = df.copy()
        df_text["desc"] = df_text.get("desc", "")
        df_text["title"] = df_text.get("title", "")
        df_text = add_desc_length(df_text)
        df_text = add_title_word_count(df_text)
        text_features = df_text[["desc_len", "title_word_count"]].values.astype(float)

        # Final output: one-hot block followed by the two numeric text columns.
        return np.hstack([encoded, text_features])