From acc7ef8664a0b79bcedac7eaa6c60f2e82a73ad3 Mon Sep 17 00:00:00 2001
From: aaghafari-dev <aa.ghafari@gmail.com>
Date: Fri, 28 Nov 2025 15:29:38 +0100
Subject: [PATCH 1/2] My Homework

---
 notebooks/eda_credit_history_demo.ipynb       | 950 ++++++++++++++++++
 src/__pycache__/__init__.cpython-39.pyc       | Bin 0 -> 186 bytes
 .../eda_credit_history.cpython-39.pyc         | Bin 0 -> 3574 bytes
 src/eda_credit_history.py                     | 116 +++
 4 files changed, 1066 insertions(+)
 create mode 100644 notebooks/eda_credit_history_demo.ipynb
 create mode 100644 src/__pycache__/__init__.cpython-39.pyc
 create mode 100644 src/__pycache__/eda_credit_history.cpython-39.pyc
 create mode 100644 src/eda_credit_history.py
diff --git a/notebooks/eda_credit_history_demo.ipynb b/notebooks/eda_credit_history_demo.ipynb
new file mode 100644
index 0000000..5cc3090
--- /dev/null
+++ b/notebooks/eda_credit_history_demo.ipynb
@@ -0,0 +1,950 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "df20702c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "repo_root = r\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\"\n",
+    "sys.path.insert(0, repo_root)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "849444e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<module 'src.eda_credit_history' from 'D:\\\\edu_laptop\\\\Ironhack_AI_&_DataScience\\\\Month_1_2\\\\Week5\\\\Tusday\\\\ml-model-git-lab\\\\src\\\\eda_credit_history.py'>"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import importlib\n",
+    "import src.eda_credit_history\n",
+    "importlib.reload(src.eda_credit_history)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "f71547c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from src.eda_credit_history import CreditHistoryEDA, credit_history_report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "bbc716e1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(src.eda_credit_history.__file__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "8cc26390",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\ASUS\\AppData\\Local\\Temp\\ipykernel_4312\\1681293082.py:1: DtypeWarning: Columns (19,55) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv(\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Monday\\loan.csv\")\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>member_id</th>\n",
+       "      <th>loan_amnt</th>\n",
+       "      <th>funded_amnt</th>\n",
+       "      <th>funded_amnt_inv</th>\n",
+       "      <th>term</th>\n",
+       "      <th>int_rate</th>\n",
+       "      <th>installment</th>\n",
+       "      <th>grade</th>\n",
+       "      <th>sub_grade</th>\n",
+       "      <th>...</th>\n",
+       "      <th>total_bal_il</th>\n",
+       "      <th>il_util</th>\n",
+       "      <th>open_rv_12m</th>\n",
+       "      <th>open_rv_24m</th>\n",
+       "      <th>max_bal_bc</th>\n",
+       "      <th>all_util</th>\n",
+       "      <th>total_rev_hi_lim</th>\n",
+       "      <th>inq_fi</th>\n",
+       "      <th>total_cu_tl</th>\n",
+       "      <th>inq_last_12m</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1077501</td>\n",
+       "      <td>1296599</td>\n",
+       "      <td>5000.0</td>\n",
+       "      <td>5000.0</td>\n",
+       "      <td>4975.0</td>\n",
+       "      <td>36 months</td>\n",
+       "      <td>10.65</td>\n",
+       "      <td>162.87</td>\n",
+       "      <td>B</td>\n",
+       "      <td>B2</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1077430</td>\n",
+       "      <td>1314167</td>\n",
+       "      <td>2500.0</td>\n",
+       "      <td>2500.0</td>\n",
+       "      <td>2500.0</td>\n",
+       "      <td>60 months</td>\n",
+       "      <td>15.27</td>\n",
+       "      <td>59.83</td>\n",
+       "      <td>C</td>\n",
+       "      <td>C4</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1077175</td>\n",
+       "      <td>1313524</td>\n",
+       "      <td>2400.0</td>\n",
+       "      <td>2400.0</td>\n",
+       "      <td>2400.0</td>\n",
+       "      <td>36 months</td>\n",
+       "      <td>15.96</td>\n",
+       "      <td>84.33</td>\n",
+       "      <td>C</td>\n",
+       "      <td>C5</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3 rows × 74 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \\\n",
+       "0  1077501    1296599     5000.0       5000.0           4975.0   36 months   \n",
+       "1  1077430    1314167     2500.0       2500.0           2500.0   60 months   \n",
+       "2  1077175    1313524     2400.0       2400.0           2400.0   36 months   \n",
+       "\n",
+       "   int_rate  installment grade sub_grade  ... total_bal_il il_util  \\\n",
+       "0     10.65       162.87     B        B2  ...          NaN     NaN   \n",
+       "1     15.27        59.83     C        C4  ...          NaN     NaN   \n",
+       "2     15.96        84.33     C        C5  ...          NaN     NaN   \n",
+       "\n",
+       "  open_rv_12m  open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi  \\\n",
+       "0         NaN          NaN        NaN      NaN              NaN    NaN   \n",
+       "1         NaN          NaN        NaN      NaN              NaN    NaN   \n",
+       "2         NaN          NaN        NaN      NaN              NaN    NaN   \n",
+       "\n",
+       "  total_cu_tl inq_last_12m  \n",
+       "0         NaN          NaN  \n",
+       "1         NaN          NaN  \n",
+       "2         NaN          NaN  \n",
+       "\n",
+       "[3 rows x 74 columns]"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(r\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Monday\\loan.csv\")\n",
+    "df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "bbef40c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "default_map = {\n",
+    "    \"Fully Paid\": 0,\n",
+    "    \"Current\": 0,\n",
+    "    \"In Grace Period\": 0,\n",
+    "    \"Issued\": 0,\n",
+    "    \"Does not meet the credit policy. Status:Fully Paid\": 0,\n",
+    "\n",
+    "    \"Charged Off\": 1,\n",
+    "    \"Default\": 1,\n",
+    "    \"Late (31-120 days)\": 1,\n",
+    "    \"Late (16-30 days)\": 1,\n",
+    "    \"Does not meet the credit policy. Status:Charged Off\": 1\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "75edcf62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"loan_status_binary\"] = df[\"loan_status\"].map(default_map)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "fe151cf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instantiate the EDA class:\n",
+    "eda = CreditHistoryEDA(df, target_col=\"loan_status_binary\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "220be1c5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Fully Paid', 'Charged Off', 'Current', 'Default',\n",
+       "       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',\n",
+       "       'Does not meet the credit policy. Status:Fully Paid',\n",
+       "       'Does not meet the credit policy. Status:Charged Off', 'Issued'],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.loan_status.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "da3e1e57",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
+      "  result = self.df.groupby(Buckets)[self.target_col].agg(n_loans=\"count\", default_rate=\"mean\").reset_index()\n",
+      "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
+      "  result = self.df.groupby(Buckets)[self.target_col].agg(n_loans=\"count\", default_rate=\"mean\").reset_index()\n",
+      "C:\\Users\\ASUS\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python39\\site-packages\\numpy\\lib\\_function_base_impl.py:2922: RuntimeWarning: invalid value encountered in divide\n",
+      "  c /= stddev[:, None]\n",
+      "C:\\Users\\ASUS\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python39\\site-packages\\numpy\\lib\\_function_base_impl.py:2923: RuntimeWarning: invalid value encountered in divide\n",
+      "  c /= stddev[None, :]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run the report:\n",
+    "report = credit_history_report(eda)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "308d0c04",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>column</th>\n",
+       "      <th>dtype</th>\n",
+       "      <th>n_missing</th>\n",
+       "      <th>missing_pct</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>dti</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>18.157039</td>\n",
+       "      <td>17.190626</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>dti_joint</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>886870</td>\n",
+       "      <td>99.942640</td>\n",
+       "      <td>18.310118</td>\n",
+       "      <td>7.169233</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>delinq_2yrs</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0.003268</td>\n",
+       "      <td>0.314442</td>\n",
+       "      <td>0.862244</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mths_since_last_delinq</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>454312</td>\n",
+       "      <td>51.197065</td>\n",
+       "      <td>34.063798</td>\n",
+       "      <td>21.884940</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>mths_since_last_record</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>750326</td>\n",
+       "      <td>84.555303</td>\n",
+       "      <td>70.117903</td>\n",
+       "      <td>28.127914</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>mths_since_last_major_derog</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>665676</td>\n",
+       "      <td>75.015974</td>\n",
+       "      <td>44.104838</td>\n",
+       "      <td>22.179841</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>open_acc</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0.003268</td>\n",
+       "      <td>11.548469</td>\n",
+       "      <td>5.317313</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>total_acc</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0.003268</td>\n",
+       "      <td>25.268026</td>\n",
+       "      <td>11.840561</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>pub_rec</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0.003268</td>\n",
+       "      <td>0.195307</td>\n",
+       "      <td>0.582091</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>acc_now_delinq</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0.003268</td>\n",
+       "      <td>0.004991</td>\n",
+       "      <td>0.077625</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>revol_bal</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>16920.787533</td>\n",
+       "      <td>22426.791896</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>revol_util</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>502</td>\n",
+       "      <td>0.056571</td>\n",
+       "      <td>55.067693</td>\n",
+       "      <td>23.834344</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>total_rev_hi_lim</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>70276</td>\n",
+       "      <td>7.919502</td>\n",
+       "      <td>32068.620045</td>\n",
+       "      <td>37498.258326</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>tot_coll_amt</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>70276</td>\n",
+       "      <td>7.919502</td>\n",
+       "      <td>225.702610</td>\n",
+       "      <td>10311.367195</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>tot_cur_bal</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>70276</td>\n",
+       "      <td>7.919502</td>\n",
+       "      <td>139458.189336</td>\n",
+       "      <td>153749.966885</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>total_bal_il</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>36552.811389</td>\n",
+       "      <td>43103.833619</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>open_acc_6m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>1.109021</td>\n",
+       "      <td>1.242675</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>open_il_6m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>2.928832</td>\n",
+       "      <td>3.089987</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>open_il_12m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>0.761651</td>\n",
+       "      <td>0.996035</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>open_il_24m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>1.674574</td>\n",
+       "      <td>1.688725</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>mths_since_rcnt_il</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866569</td>\n",
+       "      <td>97.654892</td>\n",
+       "      <td>20.912686</td>\n",
+       "      <td>27.209081</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>open_rv_12m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>1.389060</td>\n",
+       "      <td>1.520129</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>open_rv_24m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>2.975482</td>\n",
+       "      <td>2.631886</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>max_bal_bc</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>5887.979740</td>\n",
+       "      <td>5284.701239</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>all_util</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>60.831939</td>\n",
+       "      <td>20.013254</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>inq_last_6mths</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0.003268</td>\n",
+       "      <td>0.694623</td>\n",
+       "      <td>0.998448</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>inq_last_12m</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>1.977307</td>\n",
+       "      <td>2.874067</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>inq_fi</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>866007</td>\n",
+       "      <td>97.591559</td>\n",
+       "      <td>0.943945</td>\n",
+       "      <td>1.446872</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>collections_12_mths_ex_med</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>145</td>\n",
+       "      <td>0.016340</td>\n",
+       "      <td>0.014380</td>\n",
+       "      <td>0.134191</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         column    dtype  n_missing  missing_pct  \\\n",
+       "0                           dti  float64          0     0.000000   \n",
+       "1                     dti_joint  float64     886870    99.942640   \n",
+       "2                   delinq_2yrs  float64         29     0.003268   \n",
+       "3        mths_since_last_delinq  float64     454312    51.197065   \n",
+       "4        mths_since_last_record  float64     750326    84.555303   \n",
+       "5   mths_since_last_major_derog  float64     665676    75.015974   \n",
+       "6                      open_acc  float64         29     0.003268   \n",
+       "7                     total_acc  float64         29     0.003268   \n",
+       "8                       pub_rec  float64         29     0.003268   \n",
+       "9                acc_now_delinq  float64         29     0.003268   \n",
+       "10                    revol_bal  float64          0     0.000000   \n",
+       "11                   revol_util  float64        502     0.056571   \n",
+       "12             total_rev_hi_lim  float64      70276     7.919502   \n",
+       "13                 tot_coll_amt  float64      70276     7.919502   \n",
+       "14                  tot_cur_bal  float64      70276     7.919502   \n",
+       "15                 total_bal_il  float64     866007    97.591559   \n",
+       "16                  open_acc_6m  float64     866007    97.591559   \n",
+       "17                   open_il_6m  float64     866007    97.591559   \n",
+       "18                  open_il_12m  float64     866007    97.591559   \n",
+       "19                  open_il_24m  float64     866007    97.591559   \n",
+       "20           mths_since_rcnt_il  float64     866569    97.654892   \n",
+       "21                  open_rv_12m  float64     866007    97.591559   \n",
+       "22                  open_rv_24m  float64     866007    97.591559   \n",
+       "23                   max_bal_bc  float64     866007    97.591559   \n",
+       "24                     all_util  float64     866007    97.591559   \n",
+       "25               inq_last_6mths  float64         29     0.003268   \n",
+       "26                 inq_last_12m  float64     866007    97.591559   \n",
+       "27                       inq_fi  float64     866007    97.591559   \n",
+       "28   collections_12_mths_ex_med  float64        145     0.016340   \n",
+       "\n",
+       "             mean            std  \n",
+       "0       18.157039      17.190626  \n",
+       "1       18.310118       7.169233  \n",
+       "2        0.314442       0.862244  \n",
+       "3       34.063798      21.884940  \n",
+       "4       70.117903      28.127914  \n",
+       "5       44.104838      22.179841  \n",
+       "6       11.548469       5.317313  \n",
+       "7       25.268026      11.840561  \n",
+       "8        0.195307       0.582091  \n",
+       "9        0.004991       0.077625  \n",
+       "10   16920.787533   22426.791896  \n",
+       "11      55.067693      23.834344  \n",
+       "12   32068.620045   37498.258326  \n",
+       "13     225.702610   10311.367195  \n",
+       "14  139458.189336  153749.966885  \n",
+       "15   36552.811389   43103.833619  \n",
+       "16       1.109021       1.242675  \n",
+       "17       2.928832       3.089987  \n",
+       "18       0.761651       0.996035  \n",
+       "19       1.674574       1.688725  \n",
+       "20      20.912686      27.209081  \n",
+       "21       1.389060       1.520129  \n",
+       "22       2.975482       2.631886  \n",
+       "23    5887.979740    5284.701239  \n",
+       "24      60.831939      20.013254  \n",
+       "25       0.694623       0.998448  \n",
+       "26       1.977307       2.874067  \n",
+       "27       0.943945       1.446872  \n",
+       "28       0.014380       0.134191  "
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the key report outputs:\n",
+    "report[\"structure_summary\"] # structure of all credit-history columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "a8d40f26",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>dti</th>\n",
+       "      <th>n_loans</th>\n",
+       "      <th>default_rate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>(-0.001, 10.62]</td>\n",
+       "      <td>177693</td>\n",
+       "      <td>0.060019</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>(10.62, 15.4]</td>\n",
+       "      <td>177520</td>\n",
+       "      <td>0.063672</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>(15.4, 19.97]</td>\n",
+       "      <td>177323</td>\n",
+       "      <td>0.070487</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>(19.97, 25.53]</td>\n",
+       "      <td>177519</td>\n",
+       "      <td>0.077327</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>(25.53, 9999.0]</td>\n",
+       "      <td>177324</td>\n",
+       "      <td>0.073211</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               dti  n_loans  default_rate\n",
+       "0  (-0.001, 10.62]   177693      0.060019\n",
+       "1    (10.62, 15.4]   177520      0.063672\n",
+       "2    (15.4, 19.97]   177323      0.070487\n",
+       "3   (19.97, 25.53]   177519      0.077327\n",
+       "4  (25.53, 9999.0]   177324      0.073211"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "report[\"dti_buckets\"] # default rate by DTI bucket\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "6e8fc3d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>revol_util</th>\n",
+       "      <th>n_loans</th>\n",
+       "      <th>default_rate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>(-0.001, 33.3]</td>\n",
+       "      <td>177564</td>\n",
+       "      <td>0.052978</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>(33.3, 49.1]</td>\n",
+       "      <td>178278</td>\n",
+       "      <td>0.060271</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>(49.1, 62.8]</td>\n",
+       "      <td>177072</td>\n",
+       "      <td>0.068243</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>(62.8, 77.5]</td>\n",
+       "      <td>176707</td>\n",
+       "      <td>0.076115</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>(77.5, 892.3]</td>\n",
+       "      <td>177256</td>\n",
+       "      <td>0.087021</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       revol_util  n_loans  default_rate\n",
+       "0  (-0.001, 33.3]   177564      0.052978\n",
+       "1    (33.3, 49.1]   178278      0.060271\n",
+       "2    (49.1, 62.8]   177072      0.068243\n",
+       "3    (62.8, 77.5]   176707      0.076115\n",
+       "4   (77.5, 892.3]   177256      0.087021"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "report[\"revol_util_buckets\"] # default rate by revol_util bucket\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "02549940",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dti                            0.008322\n",
+       "dti_joint                     -0.040340\n",
+       "delinq_2yrs                   -0.002174\n",
+       "mths_since_last_delinq        -0.006165\n",
+       "mths_since_last_record         0.043935\n",
+       "mths_since_last_major_derog   -0.014210\n",
+       "open_acc                      -0.017776\n",
+       "total_acc                     -0.019214\n",
+       "pub_rec                       -0.015158\n",
+       "acc_now_delinq                -0.000019\n",
+       "revol_bal                     -0.020264\n",
+       "revol_util                     0.046479\n",
+       "total_rev_hi_lim              -0.037400\n",
+       "tot_coll_amt                  -0.001642\n",
+       "tot_cur_bal                   -0.038387\n",
+       "total_bal_il                        NaN\n",
+       "open_acc_6m                         NaN\n",
+       "open_il_6m                          NaN\n",
+       "open_il_12m                         NaN\n",
+       "open_il_24m                         NaN\n",
+       "mths_since_rcnt_il                  NaN\n",
+       "open_rv_12m                         NaN\n",
+       "open_rv_24m                         NaN\n",
+       "max_bal_bc                          NaN\n",
+       "all_util                            NaN\n",
+       "inq_last_6mths                 0.082200\n",
+       "inq_last_12m                        NaN\n",
+       "inq_fi                              NaN\n",
+       "collections_12_mths_ex_med    -0.007635\n",
+       "Name: correlation_with_default, dtype: float64"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "report[\"correlation_with_default\"] # correlation of each credit feature with default"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/__pycache__/__init__.cpython-39.pyc b/src/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8835e08fb5e179011b26073661e7da6d0deed938
GIT binary patch
literal 186
zcmYe~<>g`kf|e7inIQTxh(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o120#VRH>r8GV#
zv7jWsAjY#OKQALOIXm9bGhQv;C9xziI5{&lFF7^FH$Sf=Bi=CHC?-5LHQO{Mq_j9C
zu`(t%M>jV=B{fGkJ+nkNCow6exF|U$K0Y%qvm`!Vub}c4hYe7^G$+*#WbtPpW&i+e
C1u*LX

literal 0
HcmV?d00001

diff --git a/src/__pycache__/eda_credit_history.cpython-39.pyc b/src/__pycache__/eda_credit_history.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84c396dc5f021036bb39158c3632bdef94b4ef75
GIT binary patch
literal 3574
zcmb7HOK%m)74G+a*f{196c{CH<uT9^I0MOyq8WmQ*f2qw@hF%~B+HYcZ&%swOFyow
zssUTqD=Z}&{s9Ync4VJ@en)MyVm+HEo8&vy*Vh*?i?r1BICWlq>Z^0A(=%uK4xWGg
zJ;~Jt^fZ}$d`v#atG-0T9qty+x@&uH-AC^gK^d-xWwaiZo%N3EY&qQj&Ry?*;qZWm
z2M!OPd+R;X<Izi(cX;<8SnqRp%}IK%q0w1Qd=sqXnKu5NYS(m^)1pWRg-CvEe6DlT
zL&Ju3UTHJOMUhuK?B=di=3=S06;pYY309;^vv%=ax)fP0x%uF2{xTib5>m3>GTnM4
zDwbxM>FHXhg&lTA&45ALEMS3E_0wY$Jt>~l1skM=>9@m1=Y?5pML}WPIV<wg%!9(R
zx`4hC?%P=-tspJI4s<Yx&K=ulpOmI=CwW2Rb^+rXH)kd{e^Z)sGv6b#N}s5aPiD9=
zA?|*eKC{LKndu_J%zC^)(X_$)gtVFY>72|(G~3M0W%>e<>AbEKG&5^hJY%Kc8SUWg
z6JYW=UiAQtut-_=_z#3X3`EGimmc?da1gFXMByP25%CTX(cz0C6kXl}N1xAuqX&+@
z<(TJZz%j=cfSI#qaajtU>p$d5*K+sn${jOT)M>?(PIaSRdB)?Lrms`EC2Z_70`T!e
zV-N2#UbTp3?2aAgj6Jlj1*Nz5*%}m!SZ|AXvnh&rCDrLYnU*3>D{jdZP3lcMZVkl<
zQBzh+tMcbiN{CjX*p$D(M1F|ol_RgB`%k{|>4xA9Vm{LKXyd-DtL-#<%<kN0A2Kqu
zmgNG7-uSAn^ftS}Zf^Wph{qpqJZuzCcQ?x7dRYVL*SB(g9mixt$qWipmf4^HcQz_Z
zBOD~g@+#MiorPgV`{H<!yLajB@GTwlC9a92(2cAzm$)ZS4?SfF8?plo#?H{u?j>jF
zahH29+<kY;+4r=&;(YtfzK?+Vmz@1z931*b&S7xm?1$q}hvVS6`vEe;1!q6PS_FCr
zeFuGa>>YN;zV3~KVgI>14#&~BGw$->Wpw1a&dy~$H=G}P$Sc8jE)QRLz(KV4&wf04
zzOF<p>!<NZ$awj|-Ie<f+3KIZy8Gb%GF$%o%eBeg#7}d*{pnQb`h+|~;Cgo?X66+u
zbA<%6HA9<F*eKJpRFPKk)qFFqno`I-OJ>%T=BF<`M8E?s#OO6gRFlX=Er?8L9WlMB
zeKR*>)&wQ286SF~pwr5z<ctNIIe$t3Opw(8M3Ad0g<Mk_zYvwY39ItgXiSujkp4L7
zlqRy)mGMU0^rqNPdh!af_W34TZ9mfZ2)gm%fQb~oS}6Gusk}jaU2>8=L5A$WQ5x6-
z&^8Hi{AdDM*laSOPB3^@rgHZpR;dyJcIVu{?YdoW-d*(22eG@Cyo2BO>3L0<Pds@C
zoe2hcrQS#dA;`pGas68iv@=997<yx1dkk#9@PKB_5B;%!L@NO>j~qrB?1g6$43q<l
z;oiTe`+cvLae54cxOMFqfLF!R*3z{Y_3T=FQZ|xnQ(>^DTEvgY*GKW7$sP->Y}U!@
za@1gni_NqtbS#OF@ZK^l^`|->*b@vNrdwx$Kk-4$9)MOB;H<Qe*r8d}R*aIWnyS5i
zY?_hjDH#ZKcTAYoO{FcM63=udvSxnbtrbZ+@^^M4cCtpdSnX`dx)}|2ZH6>{y0vBI
zq)<rPdBw%EB(#vT`zOC3+0O6VkWijd2*cZux;Sf)4R%@kg@j5Kp>gIfc(J#4<sAr}
zF84+n^dU}=nZhM^_Fe5D>Bak~9)ON_$vJFC&JZaOkOA$Mv(weVFdU=0AX!rC^7sBR
zjo<QQUvWc`qDZO4QehEkw*4LuQ5KsiJX1|qQ-$+`WEIa;!#Gzl$_Xk4k5d(Y^s5`m
z(o|xKlr?*JV;giZ4t9?LOLgvqA1C(2OGbX+K4jrYiHvxYNPdhEwY!U?Bc<KicH^Vg
zOyakRWNt=TyTw!>Px6CYI}Uai11CZv@P6W-cKqMt)N10&pFtAcJ?iGEB9fPknLcAE
zqfJ5MdB%2{v}jkXtMWGZ<!99VlA60{jE_?y=~5%Vp@zJhfuHrA(wh7!8oF30a=YtZ
z2p2D(J8!A4T;rouzkw=36upElMQ;8L#r3bwrbWv^*N%Md9dto=Q4oWD7q~-;I`obR
zd*?Qgq^Q_Fr#p-fv@MMZW8rEBWp6x`>(1j<3uwp;0h0R`lEJDZFJ4pg%C(0Lo&8$-
zWgDQMY>nyOD$;Vm)7!-HJzi(;$|MU<E+Duzk7}uxZWVQw7V0(zt;GN97KKv&g&Ic9
zBWg}@Nv2q^dosWv3C!7d$_oA-@FUS?1ZB>hB;aG3-BZBxF4-VmCd{=c74p!xcBZr#
zDat+cY4Rb_g3YGNW*oAW?T4qv;*a3Bv3J}Bcj3R07r2Y=yeo&GCvzsm)d5A?t41ao
z;i8$U$+5wu1gBcktyL0CByD7S&IoH(zV*Py#WvQB?yVN%w@Fmd6%HlTz2J5G3wQnp
D@(JEh

literal 0
HcmV?d00001

diff --git a/src/eda_credit_history.py b/src/eda_credit_history.py
new file mode 100644
index 0000000..907607f
--- /dev/null
+++ b/src/eda_credit_history.py
@@ -0,0 +1,116 @@
+
+import numpy as np
+import pandas as pd
+from typing import Dict, Any, Callable
+# Numeric credit-history columns that the CreditHistoryEDA methods iterate over.
+CREDIT_NUMERIC_COLS = ["dti", "dti_joint", "delinq_2yrs", "mths_since_last_delinq", "mths_since_last_record", "mths_since_last_major_derog",
+"open_acc", "total_acc", "pub_rec", "acc_now_delinq", "revol_bal", "revol_util", "total_rev_hi_lim",
+"tot_coll_amt", "tot_cur_bal", "total_bal_il", "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m",
+"mths_since_rcnt_il", "open_rv_12m", "open_rv_24m", "max_bal_bc", "all_util",
+"inq_last_6mths", "inq_last_12m", "inq_fi", "collections_12_mths_ex_med"]
+
+"""Goal:
+Explore how credit history, balances, utilization, and inquiries 
+relate to loan_status"""
+
+class CreditHistoryEDA:
+    """Exploratory helpers relating the credit-history columns of a loans frame to its target."""
+
+    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
+        """Store the full DataFrame and the name of the target column."""
+        self.df = df
+        self.target_col = target_col
+
+    def credit_structure_summary(self) -> pd.DataFrame:
+        """
+        Summarize every column listed in CREDIT_NUMERIC_COLS.
+
+        Returns a DataFrame with one row per column and the fields:
+        - column
+        - dtype
+        - n_missing
+        - missing_pct (percentage, 0-100)
+        - mean
+        - std
+        """
+        n_rows = len(self.df)
+        rows = []
+        for col in CREDIT_NUMERIC_COLS:
+            series = self.df[col]
+            n_missing = int(series.isna().sum())
+            rows.append({
+                "column": col,
+                "dtype": str(series.dtype),
+                "n_missing": n_missing,
+                # Guarded so that an empty frame reports 0.0 instead of dividing by zero.
+                "missing_pct": (n_missing / n_rows) * 100 if n_rows else 0.0,
+                "mean": series.mean(),
+                "std": series.std(),
+            })
+        return pd.DataFrame(rows, columns=["column", "dtype", "n_missing", "missing_pct", "mean", "std"])
+
+    def default_rate_by_bucket(self, col: str, bins: int = 4):
+        """
+        Split a numeric credit column (e.g., dti, revol_util) into at most `bins`
+        quantile buckets and compute the default rate per bucket.
+
+        Rows where `col` is NaN belong to no bucket and are left out.
+
+        Returns a DataFrame with columns:
+        - <col>: the bucket interval (the column keeps the name of `col`)
+        - n_loans: number of non-null target values in the bucket
+        - default_rate: mean of the target in the bucket
+        """
+        # duplicates="drop": heavily tied data can produce repeated quantile edges, on
+        # which qcut would otherwise raise ValueError; fewer buckets are returned instead.
+        buckets = pd.qcut(self.df[col], q=bins, duplicates="drop")
+        grouped = self.df.groupby(buckets, observed=False)[self.target_col]
+        return grouped.agg(n_loans="count", default_rate="mean").reset_index()
+
+    def correlation_with_default(self) -> pd.Series:
+        """
+        Correlate each CREDIT_NUMERIC_COLS column with the target
+        (assuming the target is encoded as 0/1).
+
+        Returns a Series indexed by column name, named "correlation_with_default".
+        """
+        target = self.df[self.target_col]
+        correlation = {col: self.df[col].corr(target) for col in CREDIT_NUMERIC_COLS}
+        return pd.Series(correlation, name="correlation_with_default")
+###################### part 2
+def credit_history_report(eda:CreditHistoryEDA):
+    """Run the named EDA steps in order and return a dict of step name -> step output."""
+    pipeline: Dict[str, Callable[[], Any]] = {
+        "structure_summary": eda.credit_structure_summary,
+        "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
+        "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
+        "correlation_with_default": eda.correlation_with_default,
+    }
+    return {label: run() for label, run in pipeline.items()}
+
+
+# # 2. Functional credit-history report
+# # Add a functional report generator that coordinates several EDA steps:
+# # """def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:"""
+# # Build a dict of step_name -> callable and run them to produce
+# # a combined report.
+# # Example steps:
+# #   - "structure_summary": eda.credit_structure_summary
+# #   - "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5)
+# #   - "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5)
+# #   - "correlation_with_default": eda.correlation_with_default
+# # Iterate over this dict, call each function, and return
+# # a result dict: step_name -> output.
+# # Example idea:
+# # """ def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:
+# # steps: Dict[str, Callable[[], Any]] = {
+# # "structure_summary": eda.credit_structure_summary,
+# # "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
+# # "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
+# # "correlation_with_default": eda.correlation_with_default,
+# # }
+# # report: Dict[str, Any] = {}
+# # for name, func in steps.items():
+# #     report[name] = func()
+# # return report"""
+# This should clearly show higher-order functions (functions stored and called later).
\ No newline at end of file

From d26f592430567626a384e9dde358b19b446f35e5 Mon Sep 17 00:00:00 2001
From: aaghafari-dev <aa.ghafari@gmail.com>
Date: Sun, 30 Nov 2025 19:24:55 +0100
Subject: [PATCH 2/2] My Homework_Issue#2

---
 src/Example Usage.py                        |   8 +++
 src/__pycache__/transformers.cpython-39.pyc | Bin 0 -> 2020 bytes
 src/cleaning_categorical.py                 |  31 +++++++++++
 src/cleaning_text.py                        |  28 ++++++++++
 src/test_subset2_transformer.py             |  58 ++++++++++++++++++++
 src/transformers.py                         |  50 +++++++++++++++++
 6 files changed, 175 insertions(+)
 create mode 100644 src/Example Usage.py
 create mode 100644 src/__pycache__/transformers.cpython-39.pyc
 create mode 100644 src/cleaning_categorical.py
 create mode 100644 src/cleaning_text.py
 create mode 100644 src/test_subset2_transformer.py
 create mode 100644 src/transformers.py

diff --git a/src/Example Usage.py b/src/Example Usage.py
new file mode 100644
index 0000000..c1d2d86
--- /dev/null
+++ b/src/Example Usage.py	
@@ -0,0 +1,8 @@
+# Example usage (snippet only: X_train and the transformer class must already be in scope).
+cat_cols_subset2 = ["pymnt_plan", "purpose", "title", "zip_code", "addr_state"]
+# Build the transformer for the subset-2 categorical columns.
+transformer = Subset2CategoricalPerformanceTransformer(cat_cols=cat_cols_subset2)
+# Fit on the training columns only.
+transformer.fit(X_train[cat_cols_subset2])
+# Transform the same training columns with the fitted transformer.
+X_train_cat = transformer.transform(X_train[cat_cols_subset2])
diff --git a/src/__pycache__/transformers.cpython-39.pyc b/src/__pycache__/transformers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..879be20c3c9ca340ac6b80e877dfb61390a5a270
GIT binary patch
literal 2020
zcma)7OK;pZ5GJX|YS+8=#&J@(kDyJ9A_9^CG15z66lq><4%rk)fEL0Q1TAvcvL$Lm
za_iVw_hMi32Z|on$Nrz5c<804{Dq!6qqQA3=phs|;*7}QXufYoqSxyX7{9LHEdELe
z`5TpsBLd|f-1a^QNhB>vMKfBlj8W8C8B}2wHuIp2vKVYZ*~*gdhzw<POk|{}Vn<=t
zmcet9#&5t6Nty567gjy8uBe2ojqkrOLfcVoDrKG)uL_+~-+QLj_qBVZ^I9r1$Z_Yz
z5rT3LZhIGmA{mt=W3r<H6|NH%N%lLFfeeo^swElA)-lamGLh|Lk|i>NXuAH!^XZmV
zZsUP)YFwKl7v&GiU>ri{>OxR&Bpd>9C%JGuuS@IORiR-v)h-X<-Pyo7M)m-1`vr&@
zoe@rE3?y|-QVCJLMlC1^$WRi9Bpmp-80*@o$0^H}w}qCa;#0k=>pksbI}ygJG%_u4
zXaFc;@C@Hli{$u@5k~P!Oe{|WA6iw8-md+Fm}OY~PcFY6Dmmq)n7Dc}d}3<7E%IId
z?Gyebe<+-Io)_Ry4WHK9ZSy;PWB8*|yI&1oOsy3A!>YVh0Tkt}ap7*2VrytkK6Gz^
zv$rSvKH<F3h2tEfv?vIT;aa8#ciuPIx6gbG{*zk#_!#acwZFl{kvW|~QfFi*SSKfJ
z9-ffHz(unFEk_LI@!L7t9+IC2hauR2BJ&okj%Ij~on)7+lb!Zq>^igf2q4>8nuVL>
zbY=JS*_*{DbOp-bgrQ}Vyu1r5m**>>En=V)Km|bAz(>#AwlZnovx)R=%<p3(D&<>1
zpJ}CS1CS5%8nSi;psz$(94L+iv(6NHZ2h%SQI?IRDy#()-zxwatF$Y$Dt-4CQBGkC
zvd}iX@KK?qdgTLPwdq2Re6$BqC?6KE-z=de7!@vUnI6vRPoD03CPzMiuzb95<wN9C
z!<vOyfwkrn5J)7u27=J5tP59%c4;5bz0OwY!OizZ7(&Pyx;Y(4lyhwV1jUgZz!tI!
zCG(oi1Hg*85X)$X0kRBGKA)ovkiC0|D8%#DEO5yY@BxqpoB-+e>9aEn8~hmH2X6qs
z4cM_X?=E}+hbS;&aaueBPuM_UZ7gAJqerKDv`HMW;fMlbmKRT01x;qmfJZCyzPkjz
zT5ffI8TbX(6AG+I1}Qa|J#z_!52doX2Z&tZN(D^`R9-5b^&11<tBqWMo35Ev+;$m7
z(*&;I>?(>6P&7~nV*vd_R6j!TF^UH9_IZ_ND?&<scF@>uXKQG9_dFjr%(6Zf*6mM}
zk49xJoM(FC<8AApZKO%VAae~*`4q)xC_V>q)|Vvo+Yxu_mCJf4`A7*jHOjurtQ)Wi
z{a6f{&^6WtR&{_^*Xb%t;MoV6u(jY|<GmSn{>hMXWL8QCX&c*<h9czLcQ}V`Hib6J
zeV6l>Q&FD23CqGds32#2P-tiRu)86@!8k9(j>TY_(!l>D!ea5HgkFt1eS@~NlXdNG
zsf5wDx8RRwkn;xr5!sb9`^2b;sdLp7_@b}J6ZioL+hFb^J*_7D<|gh$pZ^ztPc|1H
P^9>%*wB0_W5R>*VLOcy*

literal 0
HcmV?d00001

diff --git a/src/cleaning_categorical.py b/src/cleaning_categorical.py
new file mode 100644
index 0000000..bc251d9
--- /dev/null
+++ b/src/cleaning_categorical.py
@@ -0,0 +1,31 @@
+import pandas as pd
+def normalize_cat_strings(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
+    """Return a copy with ``cols`` lower-cased and stripped; missing values stay missing."""
+    df = df.copy()
+    for c in cols:
+        # astype(str) would turn NaN/None into the text "nan"/"None"; keep them missing.
+        df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.lower().str.strip())
+    return df
+
+def fill_cat_missing_with_sentinel(
+    df: pd.DataFrame,
+    cols: list[str],
+    label: str = "Missing",
+) -> pd.DataFrame:
+    """Return a copy where NaN and empty-string values in ``cols`` become ``label``."""
+    result = df.copy()
+    for name in cols:
+        result[name] = result[name].fillna(label).replace("", label)
+    return result
+
+def group_rare_categories(
+    df: pd.DataFrame,
+    col: str,
+    min_count: int = 100,
+) -> pd.DataFrame:
+    """Return a copy of ``df`` where values of ``col`` seen fewer than ``min_count`` times become "Other"."""
+    result = df.copy()
+    frequencies = result[col].value_counts()
+    infrequent = frequencies[frequencies < min_count].index
+    result[col] = result[col].mask(result[col].isin(infrequent), "Other")
+    return result
\ No newline at end of file
diff --git a/src/cleaning_text.py b/src/cleaning_text.py
new file mode 100644
index 0000000..6a685dc
--- /dev/null
+++ b/src/cleaning_text.py
@@ -0,0 +1,28 @@
+import pandas as pd
+def add_desc_length(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of ``df`` with "desc_len": the character length of "desc" (0 for NaN)."""
+    result = df.copy()
+    descriptions = result["desc"].fillna("").astype(str)
+    result["desc_len"] = descriptions.str.len()
+    return result
+
+def add_title_word_count(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of ``df`` with "title_word_count": whitespace-separated words in "title"."""
+    result = df.copy()
+    titles = result["title"].fillna("").astype(str)
+    tokens = titles.str.split()
+    result["title_word_count"] = tokens.apply(len)
+    return result
+
+###### Implement a functional composition helper:
+
+def apply_cat_steps(
+    df: pd.DataFrame,
+    cols: list[str],
+    steps: list,
+) -> pd.DataFrame:
+    """Thread ``df`` through each ``step(df, cols) -> df`` callable in order; return the last result."""
+    current = df
+    for transform in steps:
+        current = transform(current, cols)
+    return current
\ No newline at end of file
diff --git a/src/test_subset2_transformer.py b/src/test_subset2_transformer.py
new file mode 100644
index 0000000..0a813ac
--- /dev/null
+++ b/src/test_subset2_transformer.py
@@ -0,0 +1,58 @@
+#tests/test_subset2_transformer.py
+import pandas as pd
+from transformers import Subset2CategoricalPerformanceTransformer
+
+def test_no_nans_after_transform():
+    """Transforming a frame with missing categoricals must yield a NaN-free array."""
+    frame = pd.DataFrame({
+        "pymnt_plan": ["y", None, "n"],
+        "purpose": ["debt", "wedding", None],
+        "title": ["abc def", None, "xxx"],
+        "zip_code": ["123xx", "456xx", "123xx"],
+        "addr_state": ["CA", "ZZ", "CA"],
+    })
+    transformer = Subset2CategoricalPerformanceTransformer(
+        cat_cols=list(frame.columns), min_count=2
+    )
+    # fit() returns the transformer itself, so the two calls can be chained.
+    result = transformer.fit(frame).transform(frame)
+
+    assert not pd.isna(result).any()
+
+
+def test_rare_category_grouping():
+    """Categories seen fewer than min_count times are recorded as rare during fit."""
+    df = pd.DataFrame({
+        "purpose": ["a", "b", "c", "d", "e"],   # all rare
+        "pymnt_plan": ["y", "y", "y", "y", "y"],
+        "title": ["t"]*5,
+        "zip_code": ["111"]*5,
+        "addr_state": ["CA","CA","CA","CA","CA"],
+    })
+
+    tr = Subset2CategoricalPerformanceTransformer(
+        cat_cols=df.columns.tolist(),
+        min_count=3
+    )
+    transformed = tr.fit(df).transform(df)
+
+    # purpose has 5 categories, each with count=1 < 3, so all five are recorded as rare.
+    assert tr.rare_maps_["purpose"] == {"a", "b", "c", "d", "e"}
+    assert transformed.shape[0] == len(df)
+
+
+def test_text_features_numeric():
+    """The two trailing output columns (text features) must have a numeric dtype."""
+    frame = pd.DataFrame({
+        "pymnt_plan": ["y","n"],
+        "purpose": ["debt","car"],
+        "title": ["abc def","hello"],
+        "zip_code": ["123","456"],
+        "addr_state": ["CA","NY"],
+    })
+
+    transformer = Subset2CategoricalPerformanceTransformer(cat_cols=frame.columns.tolist())
+    result = transformer.fit(frame).transform(frame)
+
+    text_block = result[:, -2:]
+    assert text_block.dtype.kind in ("i", "f")
diff --git a/src/transformers.py b/src/transformers.py
new file mode 100644
index 0000000..4cd5225
--- /dev/null
+++ b/src/transformers.py
@@ -0,0 +1,50 @@
+# In src/transformers.py, implement Subset2CategoricalPerformanceTransformer:
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import OneHotEncoder
+import pandas as pd
+import numpy as np
+
+class Subset2CategoricalPerformanceTransformer(BaseEstimator, TransformerMixin):
+    """Clean, rare-group and one-hot encode categorical columns, then append text features.
+
+    ``transform`` returns a dense array: one-hot columns, then ``desc_len`` and ``title_word_count``.
+    """
+
+    def __init__(self, cat_cols: list[str], min_count: int = 100):
+        self.cat_cols = cat_cols
+        self.min_count = min_count
+        # ``sparse=False`` was removed in scikit-learn 1.4; keep the default sparse output
+        # and densify in transform() so the class works on old and new releases alike.
+        self.encoder = OneHotEncoder(handle_unknown="ignore")
+        self.rare_maps_ = {}  # column -> set of rare categories, filled by fit()
+
+    def _clean(self, X) -> pd.DataFrame:
+        """Return ``cat_cols`` of X normalized, with missing values replaced by a sentinel."""
+        from cleaning_categorical import fill_cat_missing_with_sentinel, normalize_cat_strings
+        df = pd.DataFrame(X, columns=self.cat_cols).copy()
+        return fill_cat_missing_with_sentinel(normalize_cat_strings(df, self.cat_cols), self.cat_cols)
+
+    def fit(self, X, y=None):
+        """Learn the rare categories of each column and fit the one-hot encoder; return self."""
+        df = self._clean(X)
+        self.rare_maps_ = {}  # reset so a refit never keeps entries from an earlier fit
+        for col in self.cat_cols:
+            counts = df[col].value_counts()
+            self.rare_maps_[col] = set(counts[counts < self.min_count].index)
+            df[col] = df[col].mask(df[col].isin(self.rare_maps_[col]), "Other")
+        self.encoder.fit(df[self.cat_cols])
+        return self
+
+    def transform(self, X):
+        """Encode X with the fitted state and append the two numeric text features."""
+        from cleaning_text import add_desc_length, add_title_word_count
+        df = self._clean(X)
+        for col in self.cat_cols:
+            df[col] = df[col].mask(df[col].isin(self.rare_maps_.get(col, set())), "Other")
+        encoded = self.encoder.transform(df[self.cat_cols]).toarray()
+        # Text features use the raw values: a cleaned title may already have been replaced
+        # by the "Missing"/"Other" labels, which would distort its word count.
+        df_text = pd.DataFrame(X, columns=self.cat_cols).copy()
+        df_text["desc"], df_text["title"] = df_text.get("desc", ""), df_text.get("title", "")
+        df_text = add_title_word_count(add_desc_length(df_text))
+        return np.hstack([encoded, df_text[["desc_len", "title_word_count"]].values.astype(float)])
\ No newline at end of file

	id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade	...	total_bal_il	il_util	open_rv_12m	open_rv_24m	max_bal_bc	all_util	total_rev_hi_lim	inq_fi	total_cu_tl	inq_last_12m
0	1077501	1296599	5000.0	5000.0	4975.0	36 months	10.65	162.87	B	B2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1077430	1314167	2500.0	2500.0	2500.0	60 months	15.27	59.83	C	C4	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	1077175	1313524	2400.0	2400.0	2400.0	36 months	15.96	84.33	C	C5	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	column	dtype	n_missing	missing_pct	mean	std
0	dti	float64	0	0.000000	18.157039	17.190626
1	dti_joint	float64	886870	99.942640	18.310118	7.169233
2	delinq_2yrs	float64	29	0.003268	0.314442	0.862244
3	mths_since_last_delinq	float64	454312	51.197065	34.063798	21.884940
4	mths_since_last_record	float64	750326	84.555303	70.117903	28.127914
5	mths_since_last_major_derog	float64	665676	75.015974	44.104838	22.179841
6	open_acc	float64	29	0.003268	11.548469	5.317313
7	total_acc	float64	29	0.003268	25.268026	11.840561
8	pub_rec	float64	29	0.003268	0.195307	0.582091
9	acc_now_delinq	float64	29	0.003268	0.004991	0.077625
10	revol_bal	float64	0	0.000000	16920.787533	22426.791896
11	revol_util	float64	502	0.056571	55.067693	23.834344
12	total_rev_hi_lim	float64	70276	7.919502	32068.620045	37498.258326
13	tot_coll_amt	float64	70276	7.919502	225.702610	10311.367195
14	tot_cur_bal	float64	70276	7.919502	139458.189336	153749.966885
15	total_bal_il	float64	866007	97.591559	36552.811389	43103.833619
16	open_acc_6m	float64	866007	97.591559	1.109021	1.242675
17	open_il_6m	float64	866007	97.591559	2.928832	3.089987
18	open_il_12m	float64	866007	97.591559	0.761651	0.996035
19	open_il_24m	float64	866007	97.591559	1.674574	1.688725
20	mths_since_rcnt_il	float64	866569	97.654892	20.912686	27.209081
21	open_rv_12m	float64	866007	97.591559	1.389060	1.520129
22	open_rv_24m	float64	866007	97.591559	2.975482	2.631886
23	max_bal_bc	float64	866007	97.591559	5887.979740	5284.701239
24	all_util	float64	866007	97.591559	60.831939	20.013254
25	inq_last_6mths	float64	29	0.003268	0.694623	0.998448
26	inq_last_12m	float64	866007	97.591559	1.977307	2.874067
27	inq_fi	float64	866007	97.591559	0.943945	1.446872
28	collections_12_mths_ex_med	float64	145	0.016340	0.014380	0.134191
	dti	n_loans	default_rate
0	(-0.001, 10.62]	177693	0.060019
1	(10.62, 15.4]	177520	0.063672
2	(15.4, 19.97]	177323	0.070487
3	(19.97, 25.53]	177519	0.077327
4	(25.53, 9999.0]	177324	0.073211
	revol_util	n_loans	default_rate
0	(-0.001, 33.3]	177564	0.052978
1	(33.3, 49.1]	178278	0.060271
2	(49.1, 62.8]	177072	0.068243
3	(62.8, 77.5]	176707	0.076115
4	(77.5, 892.3]	177256	0.087021