From acc7ef8664a0b79bcedac7eaa6c60f2e82a73ad3 Mon Sep 17 00:00:00 2001 From: aaghafari-dev Date: Fri, 28 Nov 2025 15:29:38 +0100 Subject: [PATCH 1/2] My Homework --- notebooks/eda_credit_history_demo.ipynb | 950 ++++++++++++++++++ src/__pycache__/__init__.cpython-39.pyc | Bin 0 -> 186 bytes .../eda_credit_history.cpython-39.pyc | Bin 0 -> 3574 bytes src/eda_credit_history.py | 116 +++ 4 files changed, 1066 insertions(+) create mode 100644 notebooks/eda_credit_history_demo.ipynb create mode 100644 src/__pycache__/__init__.cpython-39.pyc create mode 100644 src/__pycache__/eda_credit_history.cpython-39.pyc create mode 100644 src/eda_credit_history.py diff --git a/notebooks/eda_credit_history_demo.ipynb b/notebooks/eda_credit_history_demo.ipynb new file mode 100644 index 0000000..5cc3090 --- /dev/null +++ b/notebooks/eda_credit_history_demo.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "id": "df20702c", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "repo_root = r\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\"\n", + "sys.path.insert(0, repo_root)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "849444e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import src.eda_credit_history\n", + "importlib.reload(src.eda_credit_history)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f71547c0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from src.eda_credit_history import CreditHistoryEDA, credit_history_report" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "bbc716e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py\n" + ] + } + ], + "source": [ + "print(src.eda_credit_history.__file__)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "8cc26390", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\ASUS\\AppData\\Local\\Temp\\ipykernel_4312\\1681293082.py:1: DtypeWarning: Columns (19,55) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Monday\\loan.csv\")\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_grade...total_bal_ilil_utilopen_rv_12mopen_rv_24mmax_bal_bcall_utiltotal_rev_hi_liminq_fitotal_cu_tlinq_last_12m
0107750112965995000.05000.04975.036 months10.65162.87BB2...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1107743013141672500.02500.02500.060 months15.2759.83CC4...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2107717513135242400.02400.02400.036 months15.9684.33CC5...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 74 columns

\n", + "
" + ], + "text/plain": [ + " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", + "0 1077501 1296599 5000.0 5000.0 4975.0 36 months \n", + "1 1077430 1314167 2500.0 2500.0 2500.0 60 months \n", + "2 1077175 1313524 2400.0 2400.0 2400.0 36 months \n", + "\n", + " int_rate installment grade sub_grade ... total_bal_il il_util \\\n", + "0 10.65 162.87 B B2 ... NaN NaN \n", + "1 15.27 59.83 C C4 ... NaN NaN \n", + "2 15.96 84.33 C C5 ... NaN NaN \n", + "\n", + " open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "\n", + " total_cu_tl inq_last_12m \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "\n", + "[3 rows x 74 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Monday\\loan.csv\") \n", + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "bbef40c3", + "metadata": {}, + "outputs": [], + "source": [ + "default_map = {\n", + " \"Fully Paid\": 0,\n", + " \"Current\": 0,\n", + " \"In Grace Period\": 0,\n", + " \"Issued\": 0,\n", + " \"Does not meet the credit policy. Status:Fully Paid\": 0,\n", + "\n", + " \"Charged Off\": 1,\n", + " \"Default\": 1,\n", + " \"Late (31-120 days)\": 1,\n", + " \"Late (16-30 days)\": 1,\n", + " \"Does not meet the credit policy. 
Status:Charged Off\": 1\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "75edcf62", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"loan_status_binary\"] = df[\"loan_status\"].map(default_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fe151cf7", + "metadata": {}, + "outputs": [], + "source": [ + "#Instantiate the EDA class:\n", + "eda = CreditHistoryEDA(df, target_col=\"loan_status_binary\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "220be1c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Fully Paid', 'Charged Off', 'Current', 'Default',\n", + " 'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',\n", + " 'Does not meet the credit policy. Status:Fully Paid',\n", + " 'Does not meet the credit policy. Status:Charged Off', 'Issued'],\n", + " dtype=object)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loan_status.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "da3e1e57", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " result = self.df.groupby(Buckets)[self.target_col].agg(n_loans=\"count\", default_rate=\"mean\").reset_index()\n", + "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " result = self.df.groupby(Buckets)[self.target_col].agg(n_loans=\"count\", default_rate=\"mean\").reset_index()\n", + "C:\\Users\\ASUS\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python39\\site-packages\\numpy\\lib\\_function_base_impl.py:2922: RuntimeWarning: invalid value encountered in divide\n", + " c /= stddev[:, None]\n", + "C:\\Users\\ASUS\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python39\\site-packages\\numpy\\lib\\_function_base_impl.py:2923: RuntimeWarning: invalid value encountered in divide\n", + " c /= stddev[None, :]\n" + ] + } + ], + "source": [ + "#Run the report:\n", + "report = credit_history_report(eda)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "308d0c04", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columndtypen_missingmissing_pctmeanstd
0dtifloat6400.00000018.15703917.190626
1dti_jointfloat6488687099.94264018.3101187.169233
2delinq_2yrsfloat64290.0032680.3144420.862244
3mths_since_last_delinqfloat6445431251.19706534.06379821.884940
4mths_since_last_recordfloat6475032684.55530370.11790328.127914
5mths_since_last_major_derogfloat6466567675.01597444.10483822.179841
6open_accfloat64290.00326811.5484695.317313
7total_accfloat64290.00326825.26802611.840561
8pub_recfloat64290.0032680.1953070.582091
9acc_now_delinqfloat64290.0032680.0049910.077625
10revol_balfloat6400.00000016920.78753322426.791896
11revol_utilfloat645020.05657155.06769323.834344
12total_rev_hi_limfloat64702767.91950232068.62004537498.258326
13tot_coll_amtfloat64702767.919502225.70261010311.367195
14tot_cur_balfloat64702767.919502139458.189336153749.966885
15total_bal_ilfloat6486600797.59155936552.81138943103.833619
16open_acc_6mfloat6486600797.5915591.1090211.242675
17open_il_6mfloat6486600797.5915592.9288323.089987
18open_il_12mfloat6486600797.5915590.7616510.996035
19open_il_24mfloat6486600797.5915591.6745741.688725
20mths_since_rcnt_ilfloat6486656997.65489220.91268627.209081
21open_rv_12mfloat6486600797.5915591.3890601.520129
22open_rv_24mfloat6486600797.5915592.9754822.631886
23max_bal_bcfloat6486600797.5915595887.9797405284.701239
24all_utilfloat6486600797.59155960.83193920.013254
25inq_last_6mthsfloat64290.0032680.6946230.998448
26inq_last_12mfloat6486600797.5915591.9773072.874067
27inq_fifloat6486600797.5915590.9439451.446872
28collections_12_mths_ex_medfloat641450.0163400.0143800.134191
\n", + "
" + ], + "text/plain": [ + " column dtype n_missing missing_pct \\\n", + "0 dti float64 0 0.000000 \n", + "1 dti_joint float64 886870 99.942640 \n", + "2 delinq_2yrs float64 29 0.003268 \n", + "3 mths_since_last_delinq float64 454312 51.197065 \n", + "4 mths_since_last_record float64 750326 84.555303 \n", + "5 mths_since_last_major_derog float64 665676 75.015974 \n", + "6 open_acc float64 29 0.003268 \n", + "7 total_acc float64 29 0.003268 \n", + "8 pub_rec float64 29 0.003268 \n", + "9 acc_now_delinq float64 29 0.003268 \n", + "10 revol_bal float64 0 0.000000 \n", + "11 revol_util float64 502 0.056571 \n", + "12 total_rev_hi_lim float64 70276 7.919502 \n", + "13 tot_coll_amt float64 70276 7.919502 \n", + "14 tot_cur_bal float64 70276 7.919502 \n", + "15 total_bal_il float64 866007 97.591559 \n", + "16 open_acc_6m float64 866007 97.591559 \n", + "17 open_il_6m float64 866007 97.591559 \n", + "18 open_il_12m float64 866007 97.591559 \n", + "19 open_il_24m float64 866007 97.591559 \n", + "20 mths_since_rcnt_il float64 866569 97.654892 \n", + "21 open_rv_12m float64 866007 97.591559 \n", + "22 open_rv_24m float64 866007 97.591559 \n", + "23 max_bal_bc float64 866007 97.591559 \n", + "24 all_util float64 866007 97.591559 \n", + "25 inq_last_6mths float64 29 0.003268 \n", + "26 inq_last_12m float64 866007 97.591559 \n", + "27 inq_fi float64 866007 97.591559 \n", + "28 collections_12_mths_ex_med float64 145 0.016340 \n", + "\n", + " mean std \n", + "0 18.157039 17.190626 \n", + "1 18.310118 7.169233 \n", + "2 0.314442 0.862244 \n", + "3 34.063798 21.884940 \n", + "4 70.117903 28.127914 \n", + "5 44.104838 22.179841 \n", + "6 11.548469 5.317313 \n", + "7 25.268026 11.840561 \n", + "8 0.195307 0.582091 \n", + "9 0.004991 0.077625 \n", + "10 16920.787533 22426.791896 \n", + "11 55.067693 23.834344 \n", + "12 32068.620045 37498.258326 \n", + "13 225.702610 10311.367195 \n", + "14 139458.189336 153749.966885 \n", + "15 36552.811389 43103.833619 \n", + "16 1.109021 1.242675 
\n", + "17 2.928832 3.089987 \n", + "18 0.761651 0.996035 \n", + "19 1.674574 1.688725 \n", + "20 20.912686 27.209081 \n", + "21 1.389060 1.520129 \n", + "22 2.975482 2.631886 \n", + "23 5887.979740 5284.701239 \n", + "24 60.831939 20.013254 \n", + "25 0.694623 0.998448 \n", + "26 1.977307 2.874067 \n", + "27 0.943945 1.446872 \n", + "28 0.014380 0.134191 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Display at least:\n", + "report[\"structure_summary\"] # structure of all credit-history columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "a8d40f26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dtin_loansdefault_rate
0(-0.001, 10.62]1776930.060019
1(10.62, 15.4]1775200.063672
2(15.4, 19.97]1773230.070487
3(19.97, 25.53]1775190.077327
4(25.53, 9999.0]1773240.073211
\n", + "
" + ], + "text/plain": [ + " dti n_loans default_rate\n", + "0 (-0.001, 10.62] 177693 0.060019\n", + "1 (10.62, 15.4] 177520 0.063672\n", + "2 (15.4, 19.97] 177323 0.070487\n", + "3 (19.97, 25.53] 177519 0.077327\n", + "4 (25.53, 9999.0] 177324 0.073211" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"dti_buckets\"] # default rate by DTI bucket\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "6e8fc3d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
revol_utiln_loansdefault_rate
0(-0.001, 33.3]1775640.052978
1(33.3, 49.1]1782780.060271
2(49.1, 62.8]1770720.068243
3(62.8, 77.5]1767070.076115
4(77.5, 892.3]1772560.087021
\n", + "
" + ], + "text/plain": [ + " revol_util n_loans default_rate\n", + "0 (-0.001, 33.3] 177564 0.052978\n", + "1 (33.3, 49.1] 178278 0.060271\n", + "2 (49.1, 62.8] 177072 0.068243\n", + "3 (62.8, 77.5] 176707 0.076115\n", + "4 (77.5, 892.3] 177256 0.087021" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"revol_util_buckets\"] # default rate by revol_util bucket\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "02549940", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dti 0.008322\n", + "dti_joint -0.040340\n", + "delinq_2yrs -0.002174\n", + "mths_since_last_delinq -0.006165\n", + "mths_since_last_record 0.043935\n", + "mths_since_last_major_derog -0.014210\n", + "open_acc -0.017776\n", + "total_acc -0.019214\n", + "pub_rec -0.015158\n", + "acc_now_delinq -0.000019\n", + "revol_bal -0.020264\n", + "revol_util 0.046479\n", + "total_rev_hi_lim -0.037400\n", + "tot_coll_amt -0.001642\n", + "tot_cur_bal -0.038387\n", + "total_bal_il NaN\n", + "open_acc_6m NaN\n", + "open_il_6m NaN\n", + "open_il_12m NaN\n", + "open_il_24m NaN\n", + "mths_since_rcnt_il NaN\n", + "open_rv_12m NaN\n", + "open_rv_24m NaN\n", + "max_bal_bc NaN\n", + "all_util NaN\n", + "inq_last_6mths 0.082200\n", + "inq_last_12m NaN\n", + "inq_fi NaN\n", + "collections_12_mths_ex_med -0.007635\n", + "Name: correlation_with_default, dtype: float64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"correlation_with_default\"] # correlation of each credit feature with default" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/__pycache__/__init__.cpython-39.pyc b/src/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8835e08fb5e179011b26073661e7da6d0deed938 GIT binary patch literal 186 zcmYe~<>g`kf|e7inIQTxh(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o120#VRH>r8GV# zv7jWsAjY#OKQALOIXm9bGhQv;C9xziI5{&lFF7^FH$Sf=Bi=CHC?-5LHQO{Mq_j9C zu`(t%M>jV=B{fGkJ+nkNCow6exF|U$K0Y%qvm`!Vub}c4hYe7^G$+*#WbtPpW&i+e C1u*LX literal 0 HcmV?d00001 diff --git a/src/__pycache__/eda_credit_history.cpython-39.pyc b/src/__pycache__/eda_credit_history.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84c396dc5f021036bb39158c3632bdef94b4ef75 GIT binary patch literal 3574 zcmb7HOK%m)74G+a*f{196c{CHZ^0A(=%uK4xWGg zJ;~Jt^fZ}$d`v#atG-0T9qty+x@&uH-AC^gK^d-xWwaiZo%N3EY&qQj&Ry?*;qZWm z2M!OPd+R;Xm3>GTnM4 zDwbxM>FHXhg&lTA&45ALEMS3E_0wY$Jt>~l1skM=>9@m1=Y?5pML}WPIV72|(G~3M0W%>e<>AbEKG&5^hJY%Kc8SUWg z6JYW=UiAQtut-_=_z#3X3`EGimmc?da1gFXMByP25%CTX(cz0C6kXl}N1xAuqX&+@ z<(TJZz%j=cfSI#qaajtU>p$d5*K+sn${jOT)M>?(PIaSRdB)?Lrms`EC2Z_70`T!e zV-N2#UbTp3?2aAgj6Jlj1*Nz5*%}m!SZ|AXvnh&rCDrLYnU*3>D{jdZP3lcMZVkl< zQBzh+tMcbiN{CjX*p$D(M1F|ol_RgB`%k{|>4xA9Vm{LKXyd-DtL-#<%jF zahH29+>YN;zV3~KVgI>14#&~BGw$->Wpw1a&dy~$H=G}P$Sc8jE)QRLz(KV4&wf04 zzOF

!%T=BF<`M8E?s#OO6gRFlX=Er?8L9WlMB zeKR*>)&wQ286SF~pwr5z8=L5A$WQ5x6- z&^8Hi{AdDM*laSOPB3^@rgHZpR;dyJcIVu{?YdoW-d*(22eG@Cyo2BO>3L0j~qrB?1g6$43q^DTEvgY*GKW7$sP->Y}U!@ za@1gni_NqtbS#OF@ZK^l^`|->*b@vNrdwx$Kk-4$9)MOB;H{y0vBI zq)gIfc(J#4Bak~9)ON_$vJFC&JZaOkOA$Mv(weVFdU=0AX!rC^7sBR zjoPx6CYI}Uai11CZv@P6W-cKqMt)N10&pFtAcJ?iGEB9fPknLcAE zqfJ5MdB%2{v}jkXtMWGZ(1j<3uwp;0h0R`lEJDZFJ4pg%C(0Lo&8$- zWgDQMY>nyOD$;Vm)7!-HJzi(;$|MUhB;aG3-BZBxF4-VmCd{=c74p!xcBZr# zDat+cY4Rb_g3YGNW*oAW?T4qv;*a3Bv3J}Bcj3R07r2Y=yeo&GCvzsm)d5A?t41ao z;i8$U$+5wu1gBcktyL0CByD7S&IoH(zV*Py#WvQB?yVN%w@Fmd6%HlTz2J5G3wQnp D@(JEh literal 0 HcmV?d00001 diff --git a/src/eda_credit_history.py b/src/eda_credit_history.py new file mode 100644 index 0000000..907607f --- /dev/null +++ b/src/eda_credit_history.py @@ -0,0 +1,116 @@ + +import numpy as np +import pandas as pd +from typing import Dict, Any, Callable + +CREDIT_NUMERIC_COLS = ["dti", "dti_joint", "delinq_2yrs", "mths_since_last_delinq", "mths_since_last_record", "mths_since_last_major_derog", +"open_acc", "total_acc", "pub_rec", "acc_now_delinq", "revol_bal", "revol_util", "total_rev_hi_lim", +"tot_coll_amt", "tot_cur_bal", "total_bal_il", "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m", +"mths_since_rcnt_il", "open_rv_12m", "open_rv_24m", "max_bal_bc", "all_util", +"inq_last_6mths", "inq_last_12m", "inq_fi", "collections_12_mths_ex_med"] + +"""Goal: +Explore how credit history, balances, utilization, and inquiries +relate to loan_status""" + +class CreditHistoryEDA: + def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"): + """Store the full DataFrame and the name of the target column.""" + self.df = df + self.target_col = target_col + + def credit_structure_summary(self) -> pd.DataFrame: + """ + One row per CREDIT_NUMERIC_COLS column with: + - column + - dtype + - n_missing + - missing_pct + - mean (if numeric) + - std (if numeric) + """ + df_Numeric = self.df[CREDIT_NUMERIC_COLS].copy(deep=True) + row = [] + for col in 
CREDIT_NUMERIC_COLS: + series = df_Numeric[col] + + n_missing = series.isna().sum() + missing_pct = (n_missing / len(series)) * 100 + #if pd.api.types.is_numeric_dtype(series): # as meansioned above in docs "mean (if numeric)"" + mean_val = series.mean() + std_val = series.std() + #else: + # mean_val = None + # std_val = None + row.append({"column": col, "dtype": str(series.dtypes), "n_missing": n_missing, "missing_pct": missing_pct, "mean": mean_val, "std": std_val}) + return pd.DataFrame(row, columns=["column", "dtype", "n_missing", "missing_pct", "mean", "std" ]) + + def default_rate_by_bucket(self, col: str, bins: int = 4): + """ + For a numeric credit column (e.g., dti, revol_util), + create `bins` buckets and compute default rate per bucket. + + Return a DataFrame with columns: + - bucket (interval) + - n_loans + - default_rate + """ + #df_drop = self.df[[col]].dropna(subset=[col]).copy(deep=True) + #print("df_drop OK") + Buckets = pd.qcut(self.df[col], q=bins) # create interval bucket i.e 4 + #print("self.df[bucket] is OK") + result = self.df.groupby(Buckets)[self.target_col].agg(n_loans="count", default_rate="mean").reset_index() + # print("result is OK") + return result + #print("function is OK") + + def correlation_with_default(self) -> pd.Series: + """ + Compute correlation of each numeric credit column with the target + (assuming loan_status is encoded as 0/1). + Return a Series indexed by column name. 
+ """ + correlation = {} + for col in CREDIT_NUMERIC_COLS: + #if pd.api.types.is_numeric_dtype(self.df[col]): + correlation[col] = self.df[col].corr(self.df[self.target_col]) + #else: + # correlation[col] = None + return pd.Series(correlation, name="correlation_with_default") +###################### part 2 +def credit_history_report(eda:CreditHistoryEDA): + steps: Dict[str, Callable[[], Any]] = {"structure_summary": eda.credit_structure_summary, "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5), + "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5), "correlation_with_default": eda.correlation_with_default} + report: Dict[str, Any] = {} + for name, func in steps.items(): + report[name] = func() + return report + + + + +# # 2. Functional credit-history report +# # Add a functional report generator that coordinates several EDA steps: +# # """def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:""" +# # Build a dict of step_name -> callable and run them to produce +# # a combined report. +# # Example steps: +# # - "structure_summary": eda.credit_structure_summary +# # - "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5) +# # - "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5) +# # - "correlation_with_default": eda.correlation_with_default +# # Iterate over this dict, call each function, and return +# # a result dict: step_name -> output. 
+# # Example idea: +# # """ def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]: +# # steps: Dict[str, Callable[[], Any]] = { +# # "structure_summary": eda.credit_structure_summary, +# # "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5), +# # "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5), +# # "correlation_with_default": eda.correlation_with_default, +# # } +# # report: Dict[str, Any] = {} +# # for name, func in steps.items(): +# # report[name] = func() +# # return report""" +# This should clearly show higher-order functions (functions stored and called later). \ No newline at end of file From d26f592430567626a384e9dde358b19b446f35e5 Mon Sep 17 00:00:00 2001 From: aaghafari-dev Date: Sun, 30 Nov 2025 19:24:55 +0100 Subject: [PATCH 2/2] My Homework_Issue#2 --- src/Example Usage.py | 8 +++ src/__pycache__/transformers.cpython-39.pyc | Bin 0 -> 2020 bytes src/cleaning_categorical.py | 31 +++++++++++ src/cleaning_text.py | 28 ++++++++++ src/test_subset2_transformer.py | 58 ++++++++++++++++++++ src/transformers.py | 50 +++++++++++++++++ 6 files changed, 175 insertions(+) create mode 100644 src/Example Usage.py create mode 100644 src/__pycache__/transformers.cpython-39.pyc create mode 100644 src/cleaning_categorical.py create mode 100644 src/cleaning_text.py create mode 100644 src/test_subset2_transformer.py create mode 100644 src/transformers.py diff --git a/src/Example Usage.py b/src/Example Usage.py new file mode 100644 index 0000000..c1d2d86 --- /dev/null +++ b/src/Example Usage.py @@ -0,0 +1,8 @@ +#Example Usage +cat_cols_subset2 = ["pymnt_plan", "purpose", "title", "zip_code", "addr_state"] + +transformer = Subset2CategoricalPerformanceTransformer(cat_cols=cat_cols_subset2) + +transformer.fit(X_train[cat_cols_subset2]) + +X_train_cat = transformer.transform(X_train[cat_cols_subset2]) diff --git a/src/__pycache__/transformers.cpython-39.pyc b/src/__pycache__/transformers.cpython-39.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..879be20c3c9ca340ac6b80e877dfb61390a5a270 GIT binary patch literal 2020 zcma)7OK;pZ5GJX|YS+8=#&J@(kDyJ9A_9^CG15z66lq><4%rk)fEL0Q1TAvcvL$Lm za_iVw_hMi32Z|on$Nrz5c<804{Dq!6qqQA3=phs|;*7}QXufYoqSxyX7{9LHEdELe z`5TpsBLd|f-1a^QNhB>vMKfBlj8W8C8B}2wHuIp2vKVYZ*~*gdhzwNXuAH!^XZmV zZsUP)YFwKl7v&GiU>ri{>OxR&Bpd>9C%JGuuS@IORiR-v)h-X<-Pyo7M)m-1`vr&@ zoe@rE3?y|-QVCJLMlC1^$WRi9Bpmp-80*@o$0^H}w}qCa;#0k=>pksbI}ygJG%_u4 zXaFc;@C@Hli{$u@5k~P!Oe{|WA6iw8-md+Fm}OY~PcFY6Dmmq)n7Dc}d}3<7E%IId z?Gyebe<+-Io)_Ry4WHK9ZSy;PWB8*|yI&1oOsy3A!>YVh0Tkt}ap7*2VrytkK6Gz^ zv$rSvKH^igf2q4>8nuVL> zbY=JS*_*{DbOp-bgrQ}Vyu1r5m**>>En=V)Km|bAz(>#AwlZnovx)R=%1 zpJ}CS1CS5%8nSi;psz$(94L+iv(6NHZ2h%SQI?IRDy#()-zxwatF$Y$Dt-4CQBGkC zvd}iX@KK?qdgTLPwdq2Re6$BqC?6KE-z=de7!@vUnI6vRPoD03CPzMiuzb959aEn8~hmH2X6qs z4cM_X?=E}+hbS;&aaueBPuM_UZ7gAJqerKDv`HMW;fMlbmKRT01x;qmfJZCyzPkjz zT5ffI8TbX(6AG+I1}Qa|J#z_!52doX2Z&tZN(D^`R9-5b^&11?(>6P&7~nV*vd_R6j!TF^UH9_IZ_ND?&uXKQG9_dFjr%(6Zf*6mM} zk49xJoM(FC<8AApZKO%VAae~*`4q)xC_V>q)|Vvo+Yxu_mCJf4`A7*jHOjurtQ)Wi z{a6f{&^6WtR&{_^*Xb%t;MoV6u(jY|hMXWL8QCX&c*Q&FD23CqGds32#2P-tiRu)86@!8k9(j>TY_(!l>D!ea5HgkFt1eS@~NlXdNG zsf5wDx8RRwkn;xr5!sb9`^2b;sdLp7_@b}J6ZioL+hFb^J*_7D<|gh$pZ^ztPc|1H P^9>%*wB0_W5R>*VLOcy* literal 0 HcmV?d00001 diff --git a/src/cleaning_categorical.py b/src/cleaning_categorical.py new file mode 100644 index 0000000..bc251d9 --- /dev/null +++ b/src/cleaning_categorical.py @@ -0,0 +1,31 @@ +import pandas as pd +def normalize_cat_strings(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame: + # Lowercase + strip whitespace in the given categorical columns. + df = df.copy() + for c in cols: + df[c] = df[c].astype(str).str.lower().str.strip() + return df + + +def fill_cat_missing_with_sentinel( + df: pd.DataFrame, + cols: list[str], + label: str = "Missing",) -> pd.DataFrame: + # Replace NaNs in categorical columns with a sentinel category. 
+ df = df.copy() + for c in cols: + df[c] = df[c].fillna(label) + df[c] = df[c].replace("", label) + return df + +def group_rare_categories( + df: pd.DataFrame, + col: str, + min_count: int = 100, +) -> pd.DataFrame: + # Replace rare categories in a single column by "Other". + df = df.copy() + counts = df[col].value_counts() + rare = counts[counts < min_count].index + df[col] = df[col].where(~df[col].isin(rare), "Other") + return df \ No newline at end of file diff --git a/src/cleaning_text.py b/src/cleaning_text.py new file mode 100644 index 0000000..6a685dc --- /dev/null +++ b/src/cleaning_text.py @@ -0,0 +1,28 @@ +import pandas as pd +def add_desc_length(df: pd.DataFrame) -> pd.DataFrame: + # Create "desc_len" as character length of "desc" (or 0 if NaN). + df = df.copy() + df["desc_len"] = df["desc"].fillna("").astype(str).str.len() + return df + + +def add_title_word_count(df: pd.DataFrame) -> pd.DataFrame: + # Create "title_word_count" as word count of "title". + df = df.copy() + df["title_word_count"] = ( + df["title"].fillna("").astype(str).str.split().apply(len) + ) + return df + +###### Implement a functional composition helper: + +def apply_cat_steps( + df: pd.DataFrame, + cols: list[str], + steps: list, +) -> pd.DataFrame: + # Apply a sequence of functions(df, cols) -> df to categorical columns. 
+ for step in steps: + df = step(df, cols) + return df + \ No newline at end of file diff --git a/src/test_subset2_transformer.py b/src/test_subset2_transformer.py new file mode 100644 index 0000000..0a813ac --- /dev/null +++ b/src/test_subset2_transformer.py @@ -0,0 +1,58 @@ +#tests/test_subset2_transformer.py +import pandas as pd +from transformers import Subset2CategoricalPerformanceTransformer + +def test_no_nans_after_transform(): + df = pd.DataFrame({ + "pymnt_plan": ["y", None, "n"], + "purpose": ["debt", "wedding", None], + "title": ["abc def", None, "xxx"], + "zip_code": ["123xx", "456xx", "123xx"], + "addr_state": ["CA", "ZZ", "CA"], + }) + + tr = Subset2CategoricalPerformanceTransformer( + cat_cols=list(df.columns), min_count=2 + ) + tr.fit(df) + out = tr.transform(df) + + assert not pd.isna(out).any() + + +def test_rare_category_grouping(): + df = pd.DataFrame({ + "purpose": ["a", "b", "c", "d", "e"], # all rare + "pymnt_plan": ["y", "y", "y", "y", "y"], + "title": ["t"]*5, + "zip_code": ["111"]*5, + "addr_state": ["CA","CA","CA","CA","CA"], + }) + + tr = Subset2CategoricalPerformanceTransformer( + cat_cols=df.columns.tolist(), + min_count=3 + ) + tr.fit(df) + + transformed = tr.transform(df) + + # purpose has 5 categories but each count=1 < 3 → all become Other + assert "Other" in tr.rare_maps_["purpose"] or len(tr.rare_maps_["purpose"]) == 5 + + +def test_text_features_numeric(): + df = pd.DataFrame({ + "pymnt_plan": ["y","n"], + "purpose": ["debt","car"], + "title": ["abc def","hello"], + "zip_code": ["123","456"], + "addr_state": ["CA","NY"], + }) + + tr = Subset2CategoricalPerformanceTransformer(cat_cols=df.columns.tolist()) + tr.fit(df) + out = tr.transform(df) + + # last 2 columns must be numeric + assert out[:, -2:].dtype.kind in ("i", "f") diff --git a/src/transformers.py b/src/transformers.py new file mode 100644 index 0000000..4cd5225 --- /dev/null +++ b/src/transformers.py @@ -0,0 +1,50 @@ +# In src/transformers.py, implement 
# Subset2CategoricalPerformanceTransformer:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

# The cleaning helpers live in sibling modules; flat imports match the way the
# test module imports this file (``from transformers import ...``).
from cleaning_categorical import (
    fill_cat_missing_with_sentinel,
    normalize_cat_strings,
)
from cleaning_text import add_desc_length, add_title_word_count


def _make_dense_encoder() -> OneHotEncoder:
    """Return a dense-output one-hot encoder on old and new scikit-learn."""
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only accept the ``sparse`` keyword.
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


class Subset2CategoricalPerformanceTransformer(BaseEstimator, TransformerMixin):
    """Clean, rare-group and one-hot encode categorical columns.

    ``transform`` returns a dense 2-D float array: the one-hot encoding of
    ``cat_cols`` followed by two numeric text columns, ``desc_len`` and
    ``title_word_count`` (in that order, as the last two columns).

    Parameters
    ----------
    cat_cols:
        Names of the categorical columns to clean and encode.
    min_count:
        Labels seen fewer than ``min_count`` times during ``fit`` are
        replaced by ``"Other"`` before encoding.

    Attributes
    ----------
    encoder:
        The underlying ``OneHotEncoder`` (unknown categories are ignored).
    rare_maps_:
        ``{column: set_of_rare_labels}`` learned by ``fit``.
    """

    def __init__(self, cat_cols: list[str], min_count: int = 100):
        self.cat_cols = cat_cols
        self.min_count = min_count
        self.encoder = _make_dense_encoder()
        self.rare_maps_ = {}  # column -> set of rare labels, rebuilt by fit()

    def _clean(self, X) -> pd.DataFrame:
        """Return a copy of ``cat_cols`` normalized and with missing filled."""
        df = pd.DataFrame(X, columns=self.cat_cols).copy()
        # normalize_cat_strings casts to str, which would turn NaN/None into
        # the literal labels "nan"/"none"; remember where values were missing
        # and restore them so the sentinel fill can actually see them.
        missing = df.isna()
        df = normalize_cat_strings(df, self.cat_cols)
        df = df.mask(missing)
        return fill_cat_missing_with_sentinel(df, self.cat_cols)

    def _group_rare(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace labels recorded in ``rare_maps_`` by ``"Other"``."""
        for col in self.cat_cols:
            rare = self.rare_maps_.get(col, set())
            df[col] = df[col].where(~df[col].isin(rare), "Other")
        return df

    def _text_features(self, X) -> np.ndarray:
        """Return ``[desc_len, title_word_count]`` computed from the raw input.

        The raw ``desc``/``title`` values are used (not the cleaned,
        rare-grouped labels) so the counts describe the original text. A
        column that is absent from the input contributes zeros.
        """
        if isinstance(X, pd.DataFrame):
            raw = X
        else:
            raw = pd.DataFrame(X, columns=self.cat_cols)
        text = pd.DataFrame(index=raw.index)
        for col in ("desc", "title"):
            text[col] = raw[col] if col in raw.columns else ""
        text = add_title_word_count(add_desc_length(text))
        return text[["desc_len", "title_word_count"]].to_numpy(dtype=float)

    def fit(self, X, y=None):
        """Learn the rare labels per column and fit the one-hot encoder.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        df = self._clean(X)
        # Start from an empty mapping so a refit never keeps stale columns.
        self.rare_maps_ = {}
        for col in self.cat_cols:
            counts = df[col].value_counts()
            self.rare_maps_[col] = set(counts[counts < self.min_count].index)
        df = self._group_rare(df)
        self.encoder.fit(df[self.cat_cols])
        return self

    def transform(self, X):
        """Return the dense one-hot encoding followed by the text features."""
        df = self._group_rare(self._clean(X))
        encoded = self.encoder.transform(df[self.cat_cols])
        return np.hstack([encoded, self._text_features(X)])