From 9b6d943ad07002612c0a7f1fc6ee1e1e27f047ec Mon Sep 17 00:00:00 2001
From: D V <dv@Mac.fritz.box>
Date: Tue, 25 Nov 2025 21:42:13 +0100
Subject: [PATCH 01/12] eda_borrower done

---
 src/eda_borrower.py | 114 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 src/eda_borrower.py

diff --git a/src/eda_borrower.py b/src/eda_borrower.py
new file mode 100644
index 0000000..3318a34
--- /dev/null
+++ b/src/eda_borrower.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+"""eda_borrower.py
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1Xvxr8HvPPx6PW8OzzI-Ki4Z4_UV3SCBZ
+"""
+
+import pandas as pd
+from typing import Dict, Any, List, Callable
+
+BORROWER_COLS = [
+    "id", "member_id",
+    "emp_title", "emp_length",
+    "home_ownership",
+    "annual_inc", "annual_inc_joint",
+    "verification_status", "verification_status_joint",
+    "zip_code", "addr_state",
+    "purpose", "title", "desc",
+    "issue_d", "pymnt_plan", "policy_code",
+    "url",
+]
+
+class BorrowerProfileEDA:
+    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
+        """
+        Store the full DataFrame and the name of the target column.
+        """
+        self.df = df
+        self.target_col = target_col
+
+    def structure_summary(self) -> pd.DataFrame:
+        """
+        Return a DataFrame with one row per column in BORROWER_COLS:
+        - column: column name
+        - dtype: pandas dtype
+        - n_missing: number of missing values
+        - missing_pct: percentage of missing values
+        - n_unique: number of unique values
+        - goal is to create a function that tells me all the information above about the columns in borrower_cols
+        - if I dont find any data return none
+        """
+
+        rows = []
+
+        for col in BORROWER_COLS:
+            if col not in  self.df.columns:
+                rows.append({
+                    "column" : col,
+                    "dtype" : None,
+                    "n_missing" : None,
+                    "missing_pct" : None,
+                    "n_unique" : None
+            })
+            else:
+                s = self.df[col]
+
+                rows.append({
+                    "column" : col,
+                    "dtype" : s.dtype,
+                    "n_missing" : s.isna().sum(),
+                    "missing_pct" : (s.isna().mean() *100),
+                    "n_unique" : s.nunique(dropna=True)
+            })
+        return pd.DataFrame(rows)
+
+    def income_summary(self) -> pd.DataFrame:
+        """
+        Return basic stats (count, mean, std, min, max, quartiles)
+        for:
+        - annual_inc
+        - annual_inc_joint
+
+        Use df[["annual_inc", "annual_inc_joint"]].describe().T
+        or equivalent.
+        """
+
+        return self.df[["annual_inc", "annual_inc_joint"]].describe().T
+
+    def categorical_freqs(self, max_levels: int = 10) -> Dict[str, pd.Series]:
+        """
+        For important categorical borrower columns (e.g. home_ownership,
+        addr_state, purpose), return a dict:
+
+            {
+              "home_ownership": Series of top levels,
+              "addr_state": Series of top levels,
+              ...
+            }
+
+        Each Series should be the result of value_counts().head(max_levels).
+        """
+        cat_cols = ['home_ownership', 'addr_state', 'purpose']
+
+        result = {}
+
+        for col in cat_cols:
+          if col in self.df.columns:
+            result[col] = self.df[col].value_counts.head(max_levels)
+          else:
+            result[col] = None
+
+        return result
+
+    def default_rate_by_category(self, col: str) -> pd.Series:
+        """
+        For a given categorical column (e.g. 'home_ownership' or 'purpose'),
+        compute the default rate per category.
+
+        Default rate = mean of self.target_col for each category.
+        Return a pandas Series indexed by category, with values in [0, 1].
+        """
+        return self.df.groupby(col)[self.target_col].mean()
\ No newline at end of file

From 77f202ba6891ac08f84865b24a92a76c671d90d8 Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Wed, 26 Nov 2025 15:49:39 +0100
Subject: [PATCH 02/12] eda demo

---
 notebooks/eda_borrower_demo.ipynb | 79 +++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 notebooks/eda_borrower_demo.ipynb

diff --git a/notebooks/eda_borrower_demo.ipynb b/notebooks/eda_borrower_demo.ipynb
new file mode 100644
index 0000000..192e255
--- /dev/null
+++ b/notebooks/eda_borrower_demo.ipynb
@@ -0,0 +1,79 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "knS_VAzEkWuW"
+      },
+      "outputs": [],
+      "source": [
+        "\"\"\"\n",
+        "1.  Load the dataset:\n",
+        "\n",
+        "        import pandas as pd\n",
+        "        from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n",
+        "\n",
+        "        df = pd.read_csv(\"data/loan_sample.csv\")  # or the correct path in your repo\n",
+        "\n",
+        "\n",
+        "2.  Instantiate the EDA class:\n",
+        "\n",
+        "        eda = BorrowerProfileEDA(df, target_col=\"loan_status\")\n",
+        "\n",
+        "\n",
+        "3.  Run the pipeline and inspect the results:\n",
+        "\n",
+        "        report = run_borrower_eda_pipeline(eda)\n",
+        "\n",
+        "\n",
+        "4.  Display at least:\n",
+        "\n",
+        "        report[\"structure\"]          # table of borrower column structure\n",
+        "        report[\"income\"]             # income stats\n",
+        "        report[\"freqs\"]              # categorical frequencies\n",
+        "        report[\"default_by_home_ownership\"]  # default rate by home_ownership\n",
+        "        report[\"default_by_purpose\"]         # default rate by purpose\n",
+        "\n",
+        "\n",
+        "\n",
+        "You can add markdown cells explaining what each result means in plain language (e.g., class imbalance, missingness, etc.).\n",
+        "\n",
+        "Acceptance Criteria ✅\n",
+        "---------------------\n",
+        "\n",
+        "-   `BorrowerProfileEDA`:\n",
+        "\n",
+        "    -   Initializes correctly with a DataFrame.\n",
+        "    -   `structure_summary()` returns a DataFrame with the requested columns/metrics.\n",
+        "    -   `income_summary()` returns a DataFrame with stats for `annual_inc` and `annual_inc_joint`.\n",
+        "    -   `categorical_freqs()` returns a dict of Series with top categories.\n",
+        "    -   `default_rate_by_category(col)` returns a Series of default rates per category.\n",
+        "-   Functional pipeline:\n",
+        "\n",
+        "    -   `borrower_eda_steps(eda)` returns a dict of callables.\n",
+        "    -   `run_borrower_eda_pipeline(eda)` iterates over that dict, calls each function, and returns a dict of results.\n",
+        "-   Notebook:\n",
+        "\n",
+        "    -   Runs top-to-bottom without errors.\n",
+        "    -   Shows the structure summary, income summary, categorical frequencies, and default-rate-by-category analysis.\n",
+        "    -   Contains only EDA (no model training).\n",
+        "    \"\"\""
+      ]
+    }
+  ]
+}
\ No newline at end of file

From 6a53fd6b28a1f39e914bed08c72e4f5dd452856a Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Wed, 26 Nov 2025 15:52:27 +0100
Subject: [PATCH 03/12] eda with pipeline completed

---
 .DS_Store     | Bin 0 -> 8196 bytes
 src/.DS_Store | Bin 0 -> 6148 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 src/.DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a95fbfd24f88ddd81883ab3d66b7ef9b741dad95
GIT binary patch
literal 8196
zcmeHMO>fgc5S?vG<4^&Spi(&?SuMvZq+bYxkkT|cR00kSf&-us*9NS*-jFyAsH&7Z
z{09C4XZ{HPg%iBl-4xqRPeHVTU1@hVyYG3|Z)WXGLPVlI@K=cziO4}^Tb@TVq3}G{
zQ)R}CEWirziClV!x<t!pZLtbi1*`&A0jq#j;36u3cQzO2jQ75twXIdaD)3(_z|RL4
zm2J(~k+$m5fkveOu(KFe1%2cJisKqvGj^n{qT-r5dk}^yOo|~i9rG^3VQa>YwAFMH
znoh#3EKG(X)au~53QnS?t!=FWR)NzBaPEGMZc~>Al;F($&fjB_7zt=;Cz5+!Bt0Pc
zaBZPheyV~VAAO%XKz4GtuV>-BVZUpx_s#1@VLz!>zl)i|?4`M)Q*`E?SFLw)*h)Id
zpkMFwUht=<Qbyjm-|?P>!LVJvu`T1I6UISL5kfxz<mK})_T{iH2eIE*+?JkjN=~U=
zzIk-Cwp!g>aW^*Ck5}BI`)gJ7?`|9)mz;&=I}di-`_VxxKdD<IVV*X0MSnhNKcjbY
z<A<$Y9Le}C#u(4zlJ=;d6-3;^Tw1syJ)#}jrY1e17BzEd-a#7e&|}EF4~M*=kRl3l
z_#Zj^935a6KJZa~Co{a$^tDSgcD4tO#45SHE2n=+Ieh}-+}*lSy|?17udhw%)(N+k
zb8v}v;j}Gkz$HzpKxjYf4C2^N%XOK0%5j0RF@ht=ORT~p%_%Bu7w+icbmbf(I6Ttm
zvO>SdDGpWd04v1I!~PQbKuLCog2N@vcYq9l9N<Y%LH(G;tB^_K3&8hiFN@+Z5m!@T
zVo>KE8h_}-o7>5S!*$38ccN`8(GQgiB;+4EVS=C>A{nWQ;7d5F0(MMcR#*;>3>x0E
z#VRnJ0tHQ9<ov&R_V54GEn?YL0jt1;RzS=)T8%okIR3Vq-!13bF6wJkE{q##t0-tx
pIu5JSaoCeT4AFN1Wlqi5k+v8?`RhLfO!1B1zW<fWZ1WE&@C)G~wq5`L

literal 0
HcmV?d00001

diff --git a/src/.DS_Store b/src/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..7b8876ba1c870614d6fff60b6497919012fc4792
GIT binary patch
literal 6148
zcmeHK%}T>S5Z-O0-BN@c6!aGGS}?7xAYMYOFJMFuDm5WRgE3p$)Er77cYPsW#OHBl
zcOw?-Rm9G~?l-@?*$=Wmj4|%d;tpdrV@yCp<fyC=bgvB6Ofn+JF`{BRj#3$c{bpi+
z9q`*N7PFA$Ed2WY(Kt<}x$D03THV^%Y>0+viQC{o7C|27v#B3WuF<-XGL0%dj4qQ=
zF|@Z&WtN9YHkzn{BpyM^%~g`cvhd|Bi>InK&;ijD&7r+FpLaWMzwPw;hl{o|KkmAq
zAN3ZCrr6m(I5{6arcarC(Ts8+T*;Qf3SL3^T+nkc$x@l!gRjc3@(76mVt^PR2G*7V
zb0mn?+A=^3CkBXtpBTXX!GVV887wubtphr|K4aWML;)S&5{QmL&tRz$A|PCs0_sw3
zo)}!0gWoZEp21S1E@xcL4C9!YtH%phvxDDJ>5O|CsV4@Afn^5j+O+ZfKZ9Rp<s*N&
zge+o!82D!laBC0@d{~q@TfZ$2&sqoC9W)foD^URfed!Va2JR!fDyZWQ>X7FdEH&aN
S=vU={bP-U5P)7{>0s|jNhDu}r

literal 0
HcmV?d00001


From 3213f32aba3942885d5ea9c4f0edd1c75f14f40c Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Wed, 26 Nov 2025 16:03:57 +0100
Subject: [PATCH 04/12] eda with pipeline completed

---
 .DS_Store | Bin 8196 -> 8196 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index a95fbfd24f88ddd81883ab3d66b7ef9b741dad95..2e2ab3a8d1aca188ed5c3414f62f303878a35b85 100644
GIT binary patch
delta 16
XcmZp1XmQx^QDpKpA@|J|qGz}OJ}w5<

delta 16
XcmZp1XmQx^QDpLe0r$-oqGz}OKWGO1


From a89d0f126ad247c3e3143579d4f961f495163297 Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Wed, 26 Nov 2025 16:02:46 +0100
Subject: [PATCH 05/12] eda with pipeline completed

---
 src/eda_borrower.py | 76 +++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 44 deletions(-)

diff --git a/src/eda_borrower.py b/src/eda_borrower.py
index 3318a34..ed6d9b3 100644
--- a/src/eda_borrower.py
+++ b/src/eda_borrower.py
@@ -31,19 +31,9 @@ def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
         self.target_col = target_col
 
     def structure_summary(self) -> pd.DataFrame:
-        """
-        Return a DataFrame with one row per column in BORROWER_COLS:
-        - column: column name
-        - dtype: pandas dtype
-        - n_missing: number of missing values
-        - missing_pct: percentage of missing values
-        - n_unique: number of unique values
-        - goal is to create a function that tells me all the information above about the columns in borrower_cols
-        - if I dont find any data return none
-        """
-
+        # Create empty list
         rows = []
-
+        # Loop over BORROWER_COLS and if empty fields fill with dict
         for col in BORROWER_COLS:
             if col not in  self.df.columns:
                 rows.append({
@@ -54,8 +44,10 @@ def structure_summary(self) -> pd.DataFrame:
                     "n_unique" : None
             })
             else:
+                # create series
                 s = self.df[col]
 
+                # fill series with asked information about data
                 rows.append({
                     "column" : col,
                     "dtype" : s.dtype,
@@ -63,52 +55,48 @@ def structure_summary(self) -> pd.DataFrame:
                     "missing_pct" : (s.isna().mean() *100),
                     "n_unique" : s.nunique(dropna=True)
             })
+        # return Dataframe with Information about the data
         return pd.DataFrame(rows)
 
     def income_summary(self) -> pd.DataFrame:
-        """
-        Return basic stats (count, mean, std, min, max, quartiles)
-        for:
-        - annual_inc
-        - annual_inc_joint
-
-        Use df[["annual_inc", "annual_inc_joint"]].describe().T
-        or equivalent.
-        """
-
+        #provide information about two explicit columns via describe().T
         return self.df[["annual_inc", "annual_inc_joint"]].describe().T
 
-    def categorical_freqs(self, max_levels: int = 10) -> Dict[str, pd.Series]:
-        """
-        For important categorical borrower columns (e.g. home_ownership,
-        addr_state, purpose), return a dict:
-
-            {
-              "home_ownership": Series of top levels,
-              "addr_state": Series of top levels,
-              ...
-            }
 
-        Each Series should be the result of value_counts().head(max_levels).
-        """
+    def categorical_freqs(self, max_levels: int = 10) -> Dict[str, pd.Series]:
+        # create variable for list of columns
         cat_cols = ['home_ownership', 'addr_state', 'purpose']
-
+        # create empty dict to return info
         result = {}
-
+        # loop over specified columns and write value_counts in dict result
         for col in cat_cols:
           if col in self.df.columns:
             result[col] = self.df[col].value_counts.head(max_levels)
           else:
             result[col] = None
-
+        # return dict with result
         return result
 
+
     def default_rate_by_category(self, col: str) -> pd.Series:
-        """
-        For a given categorical column (e.g. 'home_ownership' or 'purpose'),
-        compute the default rate per category.
+        return self.df.groupby(col)[self.target_col].mean()
 
-        Default rate = mean of self.target_col for each category.
-        Return a pandas Series indexed by category, with values in [0, 1].
-        """
-        return self.df.groupby(col)[self.target_col].mean()
\ No newline at end of file
+def borrower_eda_steps(eda: BorrowerProfileEDA) -> Dict[str, Callable[[], Any]]:
+
+        return {
+        "structure": eda.structure_summary,
+        "income": eda.income_summary,
+        "freqs": lambda: eda.categorical_freqs(max_levels=10),
+        "default_by_home_ownership": lambda: eda.default_rate_by_category("home_ownership"),
+        "default_by_purpose": lambda: eda.default_rate_by_category("purpose"),
+    }
+
+
+    def run_borrower_eda_pipeline(eda: BorrowerProfileEDA) -> Dict[str, Any]:
+        steps = borrower_eda_steps(eda)
+        results = {}
+
+        for name, func in steps.items():
+            results[name] = func()  # call the stored function
+
+        return results
\ No newline at end of file

From 44e54af4ac0694d008e23a30adc45ad84af43782 Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Thu, 27 Nov 2025 15:22:16 +0100
Subject: [PATCH 06/12] cleaned out mistakes

---
 src/eda_borrower.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/eda_borrower.py b/src/eda_borrower.py
index ed6d9b3..729678e 100644
--- a/src/eda_borrower.py
+++ b/src/eda_borrower.py
@@ -83,7 +83,7 @@ def default_rate_by_category(self, col: str) -> pd.Series:
 
 def borrower_eda_steps(eda: BorrowerProfileEDA) -> Dict[str, Callable[[], Any]]:
 
-        return {
+    return {
         "structure": eda.structure_summary,
         "income": eda.income_summary,
         "freqs": lambda: eda.categorical_freqs(max_levels=10),
@@ -92,11 +92,11 @@ def borrower_eda_steps(eda: BorrowerProfileEDA) -> Dict[str, Callable[[], Any]]:
     }
 
 
-    def run_borrower_eda_pipeline(eda: BorrowerProfileEDA) -> Dict[str, Any]:
-        steps = borrower_eda_steps(eda)
-        results = {}
+def run_borrower_eda_pipeline(eda: BorrowerProfileEDA) -> Dict[str, Any]:
+    steps = borrower_eda_steps(eda)
+    results = {}
 
-        for name, func in steps.items():
-            results[name] = func()  # call the stored function
+    for name, func in steps.items():
+        results[name] = func()  # call the stored function
 
-        return results
\ No newline at end of file
+    return results
\ No newline at end of file

From 1549af3e1910d092c140bf72cdb978b39abd7e61 Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Thu, 27 Nov 2025 15:49:11 +0100
Subject: [PATCH 07/12] cleaned out mistakes

---
 src/eda_borrower.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eda_borrower.py b/src/eda_borrower.py
index 729678e..02ad710 100644
--- a/src/eda_borrower.py
+++ b/src/eda_borrower.py
@@ -71,7 +71,7 @@ def categorical_freqs(self, max_levels: int = 10) -> Dict[str, pd.Series]:
         # loop over specified columns and write value_counts in dict result
         for col in cat_cols:
           if col in self.df.columns:
-            result[col] = self.df[col].value_counts.head(max_levels)
+            result[col] = self.df[col].value_counts().head(max_levels)
           else:
             result[col] = None
         # return dict with result

From 66783341255ac003f535ac6cfeb2ed7841fd9cbd Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Thu, 27 Nov 2025 16:02:42 +0100
Subject: [PATCH 08/12] fixes

---
 src/.DS_Store                                 | Bin 6148 -> 6148 bytes
 .../eda_borrower-checkpoint.py                | 102 ++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 src/.ipynb_checkpoints/eda_borrower-checkpoint.py

diff --git a/src/.DS_Store b/src/.DS_Store
index 7b8876ba1c870614d6fff60b6497919012fc4792..5c7fced96d178afa572a26b342d8cc23773820a0 100644
GIT binary patch
delta 28
jcmZoMXffC@kCETh(4baFq1w{eR7b(Yz<l#s#&9tJdLak*

delta 28
jcmZoMXffC@kCETR)VNkhq1w{OKu5vE%w+Rg#&9tJdSnOs

diff --git a/src/.ipynb_checkpoints/eda_borrower-checkpoint.py b/src/.ipynb_checkpoints/eda_borrower-checkpoint.py
new file mode 100644
index 0000000..02ad710
--- /dev/null
+++ b/src/.ipynb_checkpoints/eda_borrower-checkpoint.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""eda_borrower.py
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1Xvxr8HvPPx6PW8OzzI-Ki4Z4_UV3SCBZ
+"""
+
+import pandas as pd
+from typing import Dict, Any, List, Callable
+
+BORROWER_COLS = [
+    "id", "member_id",
+    "emp_title", "emp_length",
+    "home_ownership",
+    "annual_inc", "annual_inc_joint",
+    "verification_status", "verification_status_joint",
+    "zip_code", "addr_state",
+    "purpose", "title", "desc",
+    "issue_d", "pymnt_plan", "policy_code",
+    "url",
+]
+
+class BorrowerProfileEDA:
+    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
+        """
+        Store the full DataFrame and the name of the target column.
+        """
+        self.df = df
+        self.target_col = target_col
+
+    def structure_summary(self) -> pd.DataFrame:
+        # Create empty list
+        rows = []
+        # Loop over BORROWER_COLS and if empty fields fill with dict
+        for col in BORROWER_COLS:
+            if col not in  self.df.columns:
+                rows.append({
+                    "column" : col,
+                    "dtype" : None,
+                    "n_missing" : None,
+                    "missing_pct" : None,
+                    "n_unique" : None
+            })
+            else:
+                # create series
+                s = self.df[col]
+
+                # fill series with asked information about data
+                rows.append({
+                    "column" : col,
+                    "dtype" : s.dtype,
+                    "n_missing" : s.isna().sum(),
+                    "missing_pct" : (s.isna().mean() *100),
+                    "n_unique" : s.nunique(dropna=True)
+            })
+        # return Dataframe with Information about the data
+        return pd.DataFrame(rows)
+
+    def income_summary(self) -> pd.DataFrame:
+        #provide information about two explicit columns via describe().T
+        return self.df[["annual_inc", "annual_inc_joint"]].describe().T
+
+
+    def categorical_freqs(self, max_levels: int = 10) -> Dict[str, pd.Series]:
+        # create variable for list of columns
+        cat_cols = ['home_ownership', 'addr_state', 'purpose']
+        # create empty dict to return info
+        result = {}
+        # loop over specified columns and write value_counts in dict result
+        for col in cat_cols:
+          if col in self.df.columns:
+            result[col] = self.df[col].value_counts().head(max_levels)
+          else:
+            result[col] = None
+        # return dict with result
+        return result
+
+
+    def default_rate_by_category(self, col: str) -> pd.Series:
+        return self.df.groupby(col)[self.target_col].mean()
+
+def borrower_eda_steps(eda: BorrowerProfileEDA) -> Dict[str, Callable[[], Any]]:
+
+    return {
+        "structure": eda.structure_summary,
+        "income": eda.income_summary,
+        "freqs": lambda: eda.categorical_freqs(max_levels=10),
+        "default_by_home_ownership": lambda: eda.default_rate_by_category("home_ownership"),
+        "default_by_purpose": lambda: eda.default_rate_by_category("purpose"),
+    }
+
+
+def run_borrower_eda_pipeline(eda: BorrowerProfileEDA) -> Dict[str, Any]:
+    steps = borrower_eda_steps(eda)
+    results = {}
+
+    for name, func in steps.items():
+        results[name] = func()  # call the stored function
+
+    return results
\ No newline at end of file

From b9070dae3172919605f81dc05eec9b2a0e5c0dbd Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Thu, 27 Nov 2025 16:03:46 +0100
Subject: [PATCH 09/12] fixes

---
 .DS_Store | Bin 8196 -> 8196 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index 2e2ab3a8d1aca188ed5c3414f62f303878a35b85..5b0ff84908dab68db8bcb2259114f817889004a5 100644
GIT binary patch
delta 41
xcmZp1XmQxUBFJxQXi%%8P;F^!s-s|HU_SYkknv_W!PU%@{|mTpwh%qT1pxiI4EO*5

delta 25
hcmZp1XmQxUA~^Y*h|^{_!PU%@uL-$twh%qT1ps~o34H(n


From b4e25bf878ed27886c53c666884de2b56dff5a54 Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Fri, 28 Nov 2025 11:13:08 +0100
Subject: [PATCH 10/12] beta for testing

---
 notebooks/eda_borrower_demo.ipynb | 109 ++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 15 deletions(-)

diff --git a/notebooks/eda_borrower_demo.ipynb b/notebooks/eda_borrower_demo.ipynb
index 192e255..5106165 100644
--- a/notebooks/eda_borrower_demo.ipynb
+++ b/notebooks/eda_borrower_demo.ipynb
@@ -1,18 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "code",
@@ -74,6 +60,99 @@
         "    -   Contains only EDA (no model training).\n",
         "    \"\"\""
       ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "EFc8nSn-gHZH",
+        "outputId": "94730639-f261-4b07-ada3-bf996dcbb600",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 349
+        }
+      },
+      "outputs": [
+        {
+          "output_type": "error",
+          "ename": "ModuleNotFoundError",
+          "evalue": "No module named 'src'",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+            "\u001b[0;32m/tmp/ipython-input-4123335800.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msrc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meda_borrower\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mBorrowerProfileEDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_borrower_eda_pipeline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"loan.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'",
+            "",
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
+          ],
+          "errorDetails": {
+            "actions": [
+              {
+                "action": "open_url",
+                "actionText": "Open Examples",
+                "url": "/notebooks/snippets/importing_libraries.ipynb"
+              }
+            ]
+          }
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n",
+        "df = pd.read_csv(\"/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "axMvdNZcgHZH"
+      },
+      "outputs": [],
+      "source": [
+        "eda = BorrowerProfileEDA(df, target_col=\"loan_status\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "report = run_borrower_eda_pipeline(eda)\n",
+        "\n",
+        "report[\"structure\"]\n",
+        "report[\"income\"]\n",
+        "report[\"freqs\"]\n",
+        "report[\"default_by_home_ownership\"]\n",
+        "report[\"default_by_purpose\"]"
+      ],
+      "metadata": {
+        "id": "dk0YDhTa6TPn"
+      },
+      "execution_count": null,
+      "outputs": []
     }
-  ]
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.13.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
\ No newline at end of file

From eb7e80253dcb41027f7715fde652abe765b0e571 Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Fri, 28 Nov 2025 12:30:24 +0100
Subject: [PATCH 11/12] final fixes

---
 notebooks/eda_borrower_demo.ipynb | 783 ++++++++++++++++++++++++------
 1 file changed, 638 insertions(+), 145 deletions(-)

diff --git a/notebooks/eda_borrower_demo.ipynb b/notebooks/eda_borrower_demo.ipynb
index 5106165..7be0c57 100644
--- a/notebooks/eda_borrower_demo.ipynb
+++ b/notebooks/eda_borrower_demo.ipynb
@@ -1,158 +1,651 @@
 {
-  "cells": [
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "knS_VAzEkWuW"
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "1.  Load the dataset:\n",
+    "\n",
+    "        import pandas as pd\n",
+    "        from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n",
+    "\n",
+    "        df = pd.read_csv(\"data/loan_sample.csv\")  # or the correct path in your repo\n",
+    "\n",
+    "\n",
+    "2.  Instantiate the EDA class:\n",
+    "\n",
+    "        eda = BorrowerProfileEDA(df, target_col=\"loan_status\")\n",
+    "\n",
+    "\n",
+    "3.  Run the pipeline and inspect the results:\n",
+    "\n",
+    "        report = run_borrower_eda_pipeline(eda)\n",
+    "\n",
+    "\n",
+    "4.  Display at least:\n",
+    "\n",
+    "        report[\"structure\"]          # table of borrower column structure\n",
+    "        report[\"income\"]             # income stats\n",
+    "        report[\"freqs\"]              # categorical frequencies\n",
+    "        report[\"default_by_home_ownership\"]  # default rate by home_ownership\n",
+    "        report[\"default_by_purpose\"]         # default rate by purpose\n",
+    "\n",
+    "\n",
+    "\n",
+    "You can add markdown cells explaining what each result means in plain language (e.g., class imbalance, missingness, etc.).\n",
+    "\n",
+    "Acceptance Criteria ✅\n",
+    "---------------------\n",
+    "\n",
+    "-   `BorrowerProfileEDA`:\n",
+    "\n",
+    "    -   Initializes correctly with a DataFrame.\n",
+    "    -   `structure_summary()` returns a DataFrame with the requested columns/metrics.\n",
+    "    -   `income_summary()` returns a DataFrame with stats for `annual_inc` and `annual_inc_joint`.\n",
+    "    -   `categorical_freqs()` returns a dict of Series with top categories.\n",
+    "    -   `default_rate_by_category(col)` returns a Series of default rates per category.\n",
+    "-   Functional pipeline:\n",
+    "\n",
+    "    -   `borrower_eda_steps(eda)` returns a dict of callables.\n",
+    "    -   `run_borrower_eda_pipeline(eda)` iterates over that dict, calls each function, and returns a dict of results.\n",
+    "-   Notebook:\n",
+    "\n",
+    "    -   Runs top-to-bottom without errors.\n",
+    "    -   Shows the structure summary, income summary, categorical frequencies, and default-rate-by-category analysis.\n",
+    "    -   Contains only EDA (no model training).\n",
+    "    \"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "if not os.path.exists(\"../src/eda_borrower.py\"):\n",
+    "    raise FileNotFoundError(\"⚠️ eda_borrower.py missing in src/. Check your repo structure!\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "repo_root = r\"/Users/dv/Documents/cloned_repos/ml-model-git-lab\"\n",
+    "sys.path.insert(0, repo_root)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "knS_VAzEkWuW"
-      },
-      "outputs": [],
-      "source": [
-        "\"\"\"\n",
-        "1.  Load the dataset:\n",
-        "\n",
-        "        import pandas as pd\n",
-        "        from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n",
-        "\n",
-        "        df = pd.read_csv(\"data/loan_sample.csv\")  # or the correct path in your repo\n",
-        "\n",
-        "\n",
-        "2.  Instantiate the EDA class:\n",
-        "\n",
-        "        eda = BorrowerProfileEDA(df, target_col=\"loan_status\")\n",
-        "\n",
-        "\n",
-        "3.  Run the pipeline and inspect the results:\n",
-        "\n",
-        "        report = run_borrower_eda_pipeline(eda)\n",
-        "\n",
-        "\n",
-        "4.  Display at least:\n",
-        "\n",
-        "        report[\"structure\"]          # table of borrower column structure\n",
-        "        report[\"income\"]             # income stats\n",
-        "        report[\"freqs\"]              # categorical frequencies\n",
-        "        report[\"default_by_home_ownership\"]  # default rate by home_ownership\n",
-        "        report[\"default_by_purpose\"]         # default rate by purpose\n",
-        "\n",
-        "\n",
-        "\n",
-        "You can add markdown cells explaining what each result means in plain language (e.g., class imbalance, missingness, etc.).\n",
-        "\n",
-        "Acceptance Criteria ✅\n",
-        "---------------------\n",
-        "\n",
-        "-   `BorrowerProfileEDA`:\n",
-        "\n",
-        "    -   Initializes correctly with a DataFrame.\n",
-        "    -   `structure_summary()` returns a DataFrame with the requested columns/metrics.\n",
-        "    -   `income_summary()` returns a DataFrame with stats for `annual_inc` and `annual_inc_joint`.\n",
-        "    -   `categorical_freqs()` returns a dict of Series with top categories.\n",
-        "    -   `default_rate_by_category(col)` returns a Series of default rates per category.\n",
-        "-   Functional pipeline:\n",
-        "\n",
-        "    -   `borrower_eda_steps(eda)` returns a dict of callables.\n",
-        "    -   `run_borrower_eda_pipeline(eda)` iterates over that dict, calls each function, and returns a dict of results.\n",
-        "-   Notebook:\n",
-        "\n",
-        "    -   Runs top-to-bottom without errors.\n",
-        "    -   Shows the structure summary, income summary, categorical frequencies, and default-rate-by-category analysis.\n",
-        "    -   Contains only EDA (no model training).\n",
-        "    \"\"\""
+     "data": {
+      "text/plain": [
+       "<module 'src.eda_borrower' from '/Users/dv/Documents/cloned_repos/ml-model-git-lab/src/eda_borrower.py'>"
       ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import importlib\n",
+    "import src.eda_borrower\n",
+    "importlib.reload(src.eda_borrower)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/dv/Documents/cloned_repos/ml-model-git-lab/src/eda_borrower.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(src.eda_borrower.__file__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 349
     },
+    "id": "EFc8nSn-gHZH",
+    "outputId": "94730639-f261-4b07-ada3-bf996dcbb600"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "EFc8nSn-gHZH",
-        "outputId": "94730639-f261-4b07-ada3-bf996dcbb600",
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 349
-        }
-      },
-      "outputs": [
-        {
-          "output_type": "error",
-          "ename": "ModuleNotFoundError",
-          "evalue": "No module named 'src'",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-            "\u001b[0;32m/tmp/ipython-input-4123335800.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msrc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meda_borrower\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mBorrowerProfileEDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_borrower_eda_pipeline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"loan.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'",
-            "",
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
-          ],
-          "errorDetails": {
-            "actions": [
-              {
-                "action": "open_url",
-                "actionText": "Open Examples",
-                "url": "/notebooks/snippets/importing_libraries.ipynb"
-              }
-            ]
-          }
-        }
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/gn/zvx5kxqj5ng9ng0cg5f15yk80000gn/T/ipykernel_62771/4134569851.py:3: DtypeWarning: Columns (19,55) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv(\"/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n",
+    "df = pd.read_csv(\"/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "default_map = {\n",
+    "    \"Fully Paid\": 0,\n",
+    "    \"Current\": 0,\n",
+    "    \"In Grace Period\": 0,\n",
+    "    \"Issued\": 0,\n",
+    "    \"Does not meet the credit policy. Status:Fully Paid\": 0,\n",
+    "\n",
+    "    \"Charged Off\": 1,\n",
+    "    \"Default\": 1,\n",
+    "    \"Late (31-120 days)\": 1,\n",
+    "    \"Late (16-30 days)\": 1,\n",
+    "    \"Does not meet the credit policy. Status:Charged Off\": 1\n",
+    "}\n",
+    "\n",
+    "df[\"loan_status_binary\"] = df[\"loan_status\"].map(default_map)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "id": "axMvdNZcgHZH"
+   },
+   "outputs": [],
+   "source": [
+    "eda = BorrowerProfileEDA(df, target_col=\"loan_status_binary\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "id": "dk0YDhTa6TPn"
+   },
+   "outputs": [],
+   "source": [
+    "report = run_borrower_eda_pipeline(eda)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>column</th>\n",
+       "      <th>dtype</th>\n",
+       "      <th>n_missing</th>\n",
+       "      <th>missing_pct</th>\n",
+       "      <th>n_unique</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>id</td>\n",
+       "      <td>int64</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>887379</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>member_id</td>\n",
+       "      <td>int64</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>887379</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>emp_title</td>\n",
+       "      <td>object</td>\n",
+       "      <td>51462</td>\n",
+       "      <td>5.799326</td>\n",
+       "      <td>299271</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>emp_length</td>\n",
+       "      <td>object</td>\n",
+       "      <td>44825</td>\n",
+       "      <td>5.051393</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>home_ownership</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>annual_inc</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0.000451</td>\n",
+       "      <td>49384</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>annual_inc_joint</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>886868</td>\n",
+       "      <td>99.942415</td>\n",
+       "      <td>308</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>verification_status</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>verification_status_joint</td>\n",
+       "      <td>object</td>\n",
+       "      <td>886868</td>\n",
+       "      <td>99.942415</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>zip_code</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>935</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>addr_state</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>51</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>purpose</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>title</td>\n",
+       "      <td>object</td>\n",
+       "      <td>153</td>\n",
+       "      <td>0.017242</td>\n",
+       "      <td>63143</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>desc</td>\n",
+       "      <td>object</td>\n",
+       "      <td>761353</td>\n",
+       "      <td>85.797951</td>\n",
+       "      <td>124468</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>issue_d</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>103</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>pymnt_plan</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>policy_code</td>\n",
+       "      <td>float64</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>url</td>\n",
+       "      <td>object</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>887379</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "import pandas as pd\n",
-        "from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n",
-        "df = pd.read_csv(\"/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv\")"
+      "text/plain": [
+       "                       column    dtype  n_missing  missing_pct  n_unique\n",
+       "0                          id    int64          0     0.000000    887379\n",
+       "1                   member_id    int64          0     0.000000    887379\n",
+       "2                   emp_title   object      51462     5.799326    299271\n",
+       "3                  emp_length   object      44825     5.051393        11\n",
+       "4              home_ownership   object          0     0.000000         6\n",
+       "5                  annual_inc  float64          4     0.000451     49384\n",
+       "6            annual_inc_joint  float64     886868    99.942415       308\n",
+       "7         verification_status   object          0     0.000000         3\n",
+       "8   verification_status_joint   object     886868    99.942415         3\n",
+       "9                    zip_code   object          0     0.000000       935\n",
+       "10                 addr_state   object          0     0.000000        51\n",
+       "11                    purpose   object          0     0.000000        14\n",
+       "12                      title   object        153     0.017242     63143\n",
+       "13                       desc   object     761353    85.797951    124468\n",
+       "14                    issue_d   object          0     0.000000       103\n",
+       "15                 pymnt_plan   object          0     0.000000         2\n",
+       "16                policy_code  float64          0     0.000000         1\n",
+       "17                        url   object          0     0.000000    887379"
       ]
-    },
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "report[\"structure\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "axMvdNZcgHZH"
-      },
-      "outputs": [],
-      "source": [
-        "eda = BorrowerProfileEDA(df, target_col=\"loan_status\")"
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>50%</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>annual_inc</th>\n",
+       "      <td>887375.0</td>\n",
+       "      <td>75027.587761</td>\n",
+       "      <td>64698.300142</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>45000.0</td>\n",
+       "      <td>65000.0</td>\n",
+       "      <td>90000.0</td>\n",
+       "      <td>9500000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>annual_inc_joint</th>\n",
+       "      <td>511.0</td>\n",
+       "      <td>109981.011585</td>\n",
+       "      <td>52730.379847</td>\n",
+       "      <td>17950.0</td>\n",
+       "      <td>76032.5</td>\n",
+       "      <td>101771.0</td>\n",
+       "      <td>132800.0</td>\n",
+       "      <td>500000.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                     count           mean           std      min      25%  \\\n",
+       "annual_inc        887375.0   75027.587761  64698.300142      0.0  45000.0   \n",
+       "annual_inc_joint     511.0  109981.011585  52730.379847  17950.0  76032.5   \n",
+       "\n",
+       "                       50%       75%        max  \n",
+       "annual_inc         65000.0   90000.0  9500000.0  \n",
+       "annual_inc_joint  101771.0  132800.0   500000.0  "
       ]
-    },
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "report[\"income\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
     {
-      "cell_type": "code",
-      "source": [
-        "report = run_borrower_eda_pipeline(eda)\n",
-        "\n",
-        "report[\"structure\"]\n",
-        "report[\"income\"]\n",
-        "report[\"freqs\"]\n",
-        "report[\"default_by_home_ownership\"]\n",
-        "report[\"default_by_purpose\"]"
-      ],
-      "metadata": {
-        "id": "dk0YDhTa6TPn"
-      },
-      "execution_count": null,
-      "outputs": []
+     "data": {
+      "text/plain": [
+       "{'home_ownership': home_ownership\n",
+       " MORTGAGE    443557\n",
+       " RENT        356117\n",
+       " OWN          87470\n",
+       " OTHER          182\n",
+       " NONE            50\n",
+       " ANY              3\n",
+       " Name: count, dtype: int64,\n",
+       " 'addr_state': addr_state\n",
+       " CA    129517\n",
+       " NY     74086\n",
+       " TX     71138\n",
+       " FL     60935\n",
+       " IL     35476\n",
+       " NJ     33256\n",
+       " PA     31393\n",
+       " OH     29631\n",
+       " GA     29085\n",
+       " VA     26255\n",
+       " Name: count, dtype: int64,\n",
+       " 'purpose': purpose\n",
+       " debt_consolidation    524215\n",
+       " credit_card           206182\n",
+       " home_improvement       51829\n",
+       " other                  42894\n",
+       " major_purchase         17277\n",
+       " small_business         10377\n",
+       " car                     8863\n",
+       " medical                 8540\n",
+       " moving                  5414\n",
+       " vacation                4736\n",
+       " Name: count, dtype: int64}"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
     }
-  ],
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.13.5"
+   ],
+   "source": [
+    "report[\"freqs\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "home_ownership\n",
+       "ANY         0.000000\n",
+       "MORTGAGE    0.060520\n",
+       "NONE        0.160000\n",
+       "OTHER       0.208791\n",
+       "OWN         0.064662\n",
+       "RENT        0.080395\n",
+       "Name: loan_status_binary, dtype: float64"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
     }
+   ],
+   "source": [
+    "report[\"default_by_home_ownership\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "purpose\n",
+       "car                   0.062733\n",
+       "credit_card           0.051435\n",
+       "debt_consolidation    0.071745\n",
+       "educational           0.208038\n",
+       "home_improvement      0.061471\n",
+       "house                 0.102509\n",
+       "major_purchase        0.067662\n",
+       "medical               0.087588\n",
+       "moving                0.104174\n",
+       "other                 0.089826\n",
+       "renewable_energy      0.111304\n",
+       "small_business        0.164017\n",
+       "vacation              0.077069\n",
+       "wedding               0.121858\n",
+       "Name: loan_status_binary, dtype: float64"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "report[\"default_by_purpose\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
\ No newline at end of file
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 5b35bee27849b9f5d16f354014cd18aec092f31c Mon Sep 17 00:00:00 2001
From: D V <dv@Air-von-D.fritz.box>
Date: Fri, 28 Nov 2025 13:47:40 +0100
Subject: [PATCH 12/12] add gitignore

---
 .DS_Store     | Bin 8196 -> 8196 bytes
 .gitignore    |  58 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/.DS_Store | Bin 6148 -> 6148 bytes
 3 files changed, 58 insertions(+)
 create mode 100644 .gitignore

diff --git a/.DS_Store b/.DS_Store
index 5b0ff84908dab68db8bcb2259114f817889004a5..854aed8ccfad65f1101329d6c488a13de677d169 100644
GIT binary patch
delta 20
ccmZp1XmQwZO=$9e0oKXiM4UFu3P0xo0A6?qeE<Le

delta 21
ccmZp1XmQwZO^Cz9z(PmC*w|?EJs~w-08W+$e*gdg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..798b178
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,58 @@
+
+
+# --- Python ---
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.so
+
+# --- Virtual Environments ---
+venv/
+env/
+*.env
+.venv/
+
+# --- Jupyter Notebook ---
+.ipynb_checkpoints/
+*.ipynb_checkpoints
+
+# --- Data files (raw + processed) ---
+notebooks/data/
+data/
+*.csv
+*.tsv
+*.xlsx
+*.xls
+*.parquet
+*.feather
+
+# If you want to track specific datasets, remove entries above individually
+
+# --- macOS Finder metadata ---
+.DS_Store
+../.DS_Store
+
+# --- Logs ---
+*.log
+
+# --- Cache ---
+.cache/
+*.tmp
+*.temp
+
+# --- Editor swap files (Vim) ---
+*.swp
+*.swo
+
+# --- IDEs ---
+.vscode/
+.idea/
+
+# --- Colab artifacts ---
+*.json
+*.pbtxt
+*.ipynb
+drive/
+content/
+
diff --git a/src/.DS_Store b/src/.DS_Store
index 5c7fced96d178afa572a26b342d8cc23773820a0..db6eadba38a04c28750b6e7339a10f1bc612b099 100644
GIT binary patch
delta 350
zcmah_J&VFX6r4>0iX0@MSZk++ZY;0AMi3DW#ZJ_OLr^!aQL(b|2N?VbwstA+FWK82
zHydrNWWU~fGdufckN(j=l`W@`tjbBVDx9SEd+{t=IOh*>JGclDV2lwaP|zF-JJvLs
zh_H|1NoTW;HGnq0vBw%Q(FG(1WX`c+9I`6$MU|LKqR-qYm}-D+hAB*++$C8Cnew(e
zU`cAg=QYoWk_K83YH(LX!Yuy5jMo~B$81!Rr70lU)_Vipr-5Sj(yP_t!)paKv`~L|
gPKEU+(p=un^VgW%?Xz3@^aQ|}ZI(IHVd-lB1Kq$yo&W#<

delta 76
zcmZoMXfc=|#>B`mu~2NHo+2aD#DLwC4MbQb^Rv9%EW#nova#V0<7Rdaeh#3T&4L`?
ZnJ4p$SPC!z0V4wg6O?Az93irX836KI5Uv0K