diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..854aed8 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..798b178 --- /dev/null +++ b/.gitignore @@ -0,0 +1,58 @@ + + +# --- Python --- +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.so + +# --- Virtual Environments --- +venv/ +env/ +*.env +.venv/ + +# --- Jupyter Notebook --- +.ipynb_checkpoints/ +*.ipynb_checkpoints + +# --- Data files (raw + processed) --- +notebooks/data/ +data/ +*.csv +*.tsv +*.xlsx +*.xls +*.parquet +*.feather + +# If you want to track specific datasets, remove entries above individually + +# --- MacOS Finder crap --- +.DS_Store +../.DS_Store + +# --- Logs --- +*.log + +# --- Cache --- +.cache/ +*.tmp +*.temp + +# --- Config --- +*.swp +*.swo + +# --- IDEs --- +.vscode/ +.idea/ + +# --- Colab artifacts --- +*.json +*.pbtxt +*.ipynb +drive/ +content/ + diff --git a/notebooks/eda_borrower_demo.ipynb b/notebooks/eda_borrower_demo.ipynb new file mode 100644 index 0000000..7be0c57 --- /dev/null +++ b/notebooks/eda_borrower_demo.ipynb @@ -0,0 +1,651 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "knS_VAzEkWuW" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "1. Load the dataset:\n", + "\n", + " import pandas as pd\n", + " from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n", + "\n", + " df = pd.read_csv(\"data/loan_sample.csv\") # or the correct path in your repo\n", + "\n", + "\n", + "2. Instantiate the EDA class:\n", + "\n", + " eda = BorrowerProfileEDA(df, target_col=\"loan_status\")\n", + "\n", + "\n", + "3. Run the pipeline and inspect the results:\n", + "\n", + " report = run_borrower_eda_pipeline(eda)\n", + "\n", + "\n", + "4. 
Display at least:\n", + "\n", + " report[\"structure\"] # table of borrower column structure\n", + " report[\"income\"] # income stats\n", + " report[\"freqs\"] # categorical frequencies\n", + " report[\"default_by_home_ownership\"] # default rate by home_ownership\n", + " report[\"default_by_purpose\"] # default rate by purpose\n", + "\n", + "\n", + "\n", + "You can add markdown cells explaining what each result means in plain language (e.g., class imbalance, missingness, etc.).\n", + "\n", + "Acceptance Criteria ✅\n", + "---------------------\n", + "\n", + "- `BorrowerProfileEDA`:\n", + "\n", + " - Initializes correctly with a DataFrame.\n", + " - `structure_summary()` returns a DataFrame with the requested columns/metrics.\n", + " - `income_summary()` returns a DataFrame with stats for `annual_inc` and `annual_inc_joint`.\n", + " - `categorical_freqs()` returns a dict of Series with top categories.\n", + " - `default_rate_by_category(col)` returns a Series of default rates per category.\n", + "- Functional pipeline:\n", + "\n", + " - `borrower_eda_steps(eda)` returns a dict of callables.\n", + " - `run_borrower_eda_pipeline(eda)` iterates over that dict, calls each function, and returns a dict of results.\n", + "- Notebook:\n", + "\n", + " - Runs top-to-bottom without errors.\n", + " - Shows the structure summary, income summary, categorical frequencies, and default-rate-by-category analysis.\n", + " - Contains only EDA (no model training).\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if not os.path.exists(\"../src/eda_borrower.py\"):\n", + " raise FileNotFoundError(\"⚠️ eda_borrower.py missing in src/. 
Check your repo structure!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "repo_root = r\"/Users/dv/Documents/cloned_repos/ml-model-git-lab\"\n", + "sys.path.insert(0, repo_root)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import src.eda_borrower\n", + "importlib.reload(src.eda_borrower)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/dv/Documents/cloned_repos/ml-model-git-lab/src/eda_borrower.py\n" + ] + } + ], + "source": [ + "print(src.eda_borrower.__file__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 349 + }, + "id": "EFc8nSn-gHZH", + "outputId": "94730639-f261-4b07-ada3-bf996dcbb600" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/gn/zvx5kxqj5ng9ng0cg5f15yk80000gn/T/ipykernel_62771/4134569851.py:3: DtypeWarning: Columns (19,55) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(\"/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv\")\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline\n", + "df = pd.read_csv(\"/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "default_map = {\n", + " \"Fully Paid\": 0,\n", + " \"Current\": 0,\n", + " \"In Grace Period\": 0,\n", + " \"Issued\": 0,\n", + " \"Does not meet the credit policy. Status:Fully Paid\": 0,\n", + "\n", + " \"Charged Off\": 1,\n", + " \"Default\": 1,\n", + " \"Late (31-120 days)\": 1,\n", + " \"Late (16-30 days)\": 1,\n", + " \"Does not meet the credit policy. Status:Charged Off\": 1\n", + "}\n", + "\n", + "df[\"loan_status_binary\"] = df[\"loan_status\"].map(default_map)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "axMvdNZcgHZH" + }, + "outputs": [], + "source": [ + "eda = BorrowerProfileEDA(df, target_col=\"loan_status_binary\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "dk0YDhTa6TPn" + }, + "outputs": [], + "source": [ + "report = run_borrower_eda_pipeline(eda)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columndtypen_missingmissing_pctn_unique
0idint6400.000000887379
1member_idint6400.000000887379
2emp_titleobject514625.799326299271
3emp_lengthobject448255.05139311
4home_ownershipobject00.0000006
5annual_incfloat6440.00045149384
6annual_inc_jointfloat6488686899.942415308
7verification_statusobject00.0000003
8verification_status_jointobject88686899.9424153
9zip_codeobject00.000000935
10addr_stateobject00.00000051
11purposeobject00.00000014
12titleobject1530.01724263143
13descobject76135385.797951124468
14issue_dobject00.000000103
15pymnt_planobject00.0000002
16policy_codefloat6400.0000001
17urlobject00.000000887379
\n", + "
" + ], + "text/plain": [ + " column dtype n_missing missing_pct n_unique\n", + "0 id int64 0 0.000000 887379\n", + "1 member_id int64 0 0.000000 887379\n", + "2 emp_title object 51462 5.799326 299271\n", + "3 emp_length object 44825 5.051393 11\n", + "4 home_ownership object 0 0.000000 6\n", + "5 annual_inc float64 4 0.000451 49384\n", + "6 annual_inc_joint float64 886868 99.942415 308\n", + "7 verification_status object 0 0.000000 3\n", + "8 verification_status_joint object 886868 99.942415 3\n", + "9 zip_code object 0 0.000000 935\n", + "10 addr_state object 0 0.000000 51\n", + "11 purpose object 0 0.000000 14\n", + "12 title object 153 0.017242 63143\n", + "13 desc object 761353 85.797951 124468\n", + "14 issue_d object 0 0.000000 103\n", + "15 pymnt_plan object 0 0.000000 2\n", + "16 policy_code float64 0 0.000000 1\n", + "17 url object 0 0.000000 887379" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"structure\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
annual_inc887375.075027.58776164698.3001420.045000.065000.090000.09500000.0
annual_inc_joint511.0109981.01158552730.37984717950.076032.5101771.0132800.0500000.0
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% \\\n", + "annual_inc 887375.0 75027.587761 64698.300142 0.0 45000.0 \n", + "annual_inc_joint 511.0 109981.011585 52730.379847 17950.0 76032.5 \n", + "\n", + " 50% 75% max \n", + "annual_inc 65000.0 90000.0 9500000.0 \n", + "annual_inc_joint 101771.0 132800.0 500000.0 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"income\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'home_ownership': home_ownership\n", + " MORTGAGE 443557\n", + " RENT 356117\n", + " OWN 87470\n", + " OTHER 182\n", + " NONE 50\n", + " ANY 3\n", + " Name: count, dtype: int64,\n", + " 'addr_state': addr_state\n", + " CA 129517\n", + " NY 74086\n", + " TX 71138\n", + " FL 60935\n", + " IL 35476\n", + " NJ 33256\n", + " PA 31393\n", + " OH 29631\n", + " GA 29085\n", + " VA 26255\n", + " Name: count, dtype: int64,\n", + " 'purpose': purpose\n", + " debt_consolidation 524215\n", + " credit_card 206182\n", + " home_improvement 51829\n", + " other 42894\n", + " major_purchase 17277\n", + " small_business 10377\n", + " car 8863\n", + " medical 8540\n", + " moving 5414\n", + " vacation 4736\n", + " Name: count, dtype: int64}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"freqs\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "home_ownership\n", + "ANY 0.000000\n", + "MORTGAGE 0.060520\n", + "NONE 0.160000\n", + "OTHER 0.208791\n", + "OWN 0.064662\n", + "RENT 0.080395\n", + "Name: loan_status_binary, dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"default_by_home_ownership\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "purpose\n", + "car 0.062733\n", + "credit_card 0.051435\n", + "debt_consolidation 0.071745\n", + "educational 0.208038\n", + "home_improvement 0.061471\n", + "house 0.102509\n", + "major_purchase 0.067662\n", + "medical 0.087588\n", + "moving 0.104174\n", + "other 0.089826\n", + "renewable_energy 0.111304\n", + "small_business 0.164017\n", + "vacation 0.077069\n", + "wedding 0.121858\n", + "Name: loan_status_binary, dtype: float64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"default_by_purpose\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..db6eadb Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/.ipynb_checkpoints/eda_borrower-checkpoint.py b/src/.ipynb_checkpoints/eda_borrower-checkpoint.py new file mode 100644 index 0000000..02ad710 --- /dev/null +++ b/src/.ipynb_checkpoints/eda_borrower-checkpoint.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +"""eda_borrower.py + +Automatically generated by Colab. 
# -*- coding: utf-8 -*-
"""Borrower-profile exploratory data analysis (EDA) helpers.

Originally exported from a Colab notebook:
    https://colab.research.google.com/drive/1Xvxr8HvPPx6PW8OzzI-Ki4Z4_UV3SCBZ

The module offers a small class, ``BorrowerProfileEDA``, that wraps a pandas
DataFrame and produces descriptive summaries, plus two functions that expose
those summaries as a name -> callable pipeline.
"""

from typing import Any, Callable, Dict, List, Optional, Sequence

import pandas as pd

# Columns reported, in this order, by ``BorrowerProfileEDA.structure_summary``.
BORROWER_COLS: List[str] = [
    "id", "member_id",
    "emp_title", "emp_length",
    "home_ownership",
    "annual_inc", "annual_inc_joint",
    "verification_status", "verification_status_joint",
    "zip_code", "addr_state",
    "purpose", "title", "desc",
    "issue_d", "pymnt_plan", "policy_code",
    "url",
]

# Columns summarised by ``income_summary``.
_INCOME_COLS: List[str] = ["annual_inc", "annual_inc_joint"]

# Default columns counted by ``categorical_freqs``.
_CATEGORICAL_COLS: List[str] = ["home_ownership", "addr_state", "purpose"]

# Statistic labels produced by ``DataFrame.describe()`` for numeric data with
# the default percentiles; used to build an all-NaN summary when no income
# column exists at all.
_DESCRIBE_STATS: List[str] = [
    "count", "mean", "std", "min", "25%", "50%", "75%", "max",
]


class BorrowerProfileEDA:
    """Descriptive summaries over a borrower/loan DataFrame."""

    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
        """Store the DataFrame and the name of the target column.

        Args:
            df: The data to analyse. It is stored by reference, not copied.
            target_col: Name of the column averaged by
                ``default_rate_by_category``. Its values must support
                ``mean()`` (e.g. a 0/1 indicator).
        """
        self.df = df
        self.target_col = target_col

    def structure_summary(self) -> pd.DataFrame:
        """Describe dtype, missingness and cardinality of ``BORROWER_COLS``.

        Returns:
            A DataFrame with one row per entry of ``BORROWER_COLS`` and the
            columns ``column``, ``dtype``, ``n_missing``, ``missing_pct``
            (0-100 scale) and ``n_unique`` (missing values not counted).
            Columns absent from the data get a row whose metrics are ``None``.
        """
        rows = []
        for col in BORROWER_COLS:
            if col not in self.df.columns:
                # Keep a placeholder row so the report always lists every
                # expected column.
                rows.append({
                    "column": col,
                    "dtype": None,
                    "n_missing": None,
                    "missing_pct": None,
                    "n_unique": None,
                })
                continue

            series = self.df[col]
            is_missing = series.isna()  # computed once, used for both metrics
            rows.append({
                "column": col,
                "dtype": series.dtype,
                "n_missing": is_missing.sum(),
                "missing_pct": is_missing.mean() * 100,
                "n_unique": series.nunique(dropna=True),
            })
        return pd.DataFrame(rows)

    def income_summary(self) -> pd.DataFrame:
        """Return ``describe()`` statistics for the income columns.

        Returns:
            The transposed ``describe()`` output for ``annual_inc`` and
            ``annual_inc_joint`` (one row per column). An income column that
            is absent from the data is reported as an all-NaN row instead of
            raising ``KeyError``, mirroring how ``structure_summary`` and
            ``categorical_freqs`` tolerate missing columns.
        """
        present = [c for c in _INCOME_COLS if c in self.df.columns]
        absent = [c for c in _INCOME_COLS if c not in self.df.columns]

        if not present:
            # describe() refuses a frame without columns, so build the
            # placeholder result directly.
            return pd.DataFrame(
                index=absent, columns=_DESCRIBE_STATS, dtype=float
            )

        summary = self.df[present].describe().T
        if not absent:
            return summary

        # Append NaN rows for absent columns while keeping the canonical
        # order; rows describe() itself left out are not re-added.
        row_order = [
            c for c in _INCOME_COLS if c in summary.index or c in absent
        ]
        return summary.reindex(row_order)

    def categorical_freqs(
        self,
        max_levels: int = 10,
        cols: Optional[Sequence[str]] = None,
    ) -> Dict[str, Optional[pd.Series]]:
        """Return the most frequent values of selected categorical columns.

        Args:
            max_levels: Maximum number of categories kept per column.
            cols: Columns to count. Defaults to ``home_ownership``,
                ``addr_state`` and ``purpose``.

        Returns:
            A dict mapping each requested column name to the first
            ``max_levels`` entries of its ``value_counts()``, or to ``None``
            when the column is absent from the data.
        """
        wanted = _CATEGORICAL_COLS if cols is None else cols
        return {
            col: (
                self.df[col].value_counts().head(max_levels)
                if col in self.df.columns
                else None
            )
            for col in wanted
        }

    def default_rate_by_category(self, col: str) -> pd.Series:
        """Return the mean of the target column for each value of ``col``.

        Args:
            col: Name of the grouping column (or index level).

        Returns:
            A Series indexed by the groups of ``col`` holding the mean of
            ``self.target_col``.

        Raises:
            KeyError: If ``col`` or ``self.target_col`` is not available in
                the data.
        """
        if col not in self.df.columns and col not in self.df.index.names:
            raise KeyError(f"grouping column {col!r} not found in DataFrame")
        if self.target_col not in self.df.columns:
            raise KeyError(
                f"target column {self.target_col!r} not found in DataFrame"
            )
        return self.df.groupby(col)[self.target_col].mean()


def borrower_eda_steps(eda: BorrowerProfileEDA) -> Dict[str, Callable[[], Any]]:
    """Map report section names to zero-argument callables bound to ``eda``.

    Nothing is computed here; each callable runs its analysis when invoked.
    """
    return {
        "structure": eda.structure_summary,
        "income": eda.income_summary,
        "freqs": lambda: eda.categorical_freqs(max_levels=10),
        "default_by_home_ownership": lambda: eda.default_rate_by_category("home_ownership"),
        "default_by_purpose": lambda: eda.default_rate_by_category("purpose"),
    }


def run_borrower_eda_pipeline(eda: BorrowerProfileEDA) -> Dict[str, Any]:
    """Run every step from ``borrower_eda_steps`` and collect the results.

    Returns:
        A dict with the same keys (and order) as ``borrower_eda_steps(eda)``,
        each mapped to the value returned by its step. An exception raised by
        a step propagates to the caller.
    """
    return {name: step() for name, step in borrower_eda_steps(eda).items()}