diff --git a/notebooks/eda_credit_history_demo.ipynb b/notebooks/eda_credit_history_demo.ipynb new file mode 100644 index 0000000..5cc3090 --- /dev/null +++ b/notebooks/eda_credit_history_demo.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "id": "df20702c", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "repo_root = r\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\"\n", + "sys.path.insert(0, repo_root)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "849444e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import src.eda_credit_history\n", + "importlib.reload(src.eda_credit_history)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f71547c0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from src.eda_credit_history import CreditHistoryEDA, credit_history_report" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "bbc716e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py\n" + ] + } + ], + "source": [ + "print(src.eda_credit_history.__file__)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "8cc26390", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\ASUS\\AppData\\Local\\Temp\\ipykernel_4312\\1681293082.py:1: DtypeWarning: Columns (19,55) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Monday\\loan.csv\")\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_grade...total_bal_ilil_utilopen_rv_12mopen_rv_24mmax_bal_bcall_utiltotal_rev_hi_liminq_fitotal_cu_tlinq_last_12m
0107750112965995000.05000.04975.036 months10.65162.87BB2...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1107743013141672500.02500.02500.060 months15.2759.83CC4...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2107717513135242400.02400.02400.036 months15.9684.33CC5...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 74 columns

\n", + "
" + ], + "text/plain": [ + " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", + "0 1077501 1296599 5000.0 5000.0 4975.0 36 months \n", + "1 1077430 1314167 2500.0 2500.0 2500.0 60 months \n", + "2 1077175 1313524 2400.0 2400.0 2400.0 36 months \n", + "\n", + " int_rate installment grade sub_grade ... total_bal_il il_util \\\n", + "0 10.65 162.87 B B2 ... NaN NaN \n", + "1 15.27 59.83 C C4 ... NaN NaN \n", + "2 15.96 84.33 C C5 ... NaN NaN \n", + "\n", + " open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "\n", + " total_cu_tl inq_last_12m \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "\n", + "[3 rows x 74 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Monday\\loan.csv\") \n", + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "bbef40c3", + "metadata": {}, + "outputs": [], + "source": [ + "default_map = {\n", + " \"Fully Paid\": 0,\n", + " \"Current\": 0,\n", + " \"In Grace Period\": 0,\n", + " \"Issued\": 0,\n", + " \"Does not meet the credit policy. Status:Fully Paid\": 0,\n", + "\n", + " \"Charged Off\": 1,\n", + " \"Default\": 1,\n", + " \"Late (31-120 days)\": 1,\n", + " \"Late (16-30 days)\": 1,\n", + " \"Does not meet the credit policy. 
Status:Charged Off\": 1\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "75edcf62", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"loan_status_binary\"] = df[\"loan_status\"].map(default_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fe151cf7", + "metadata": {}, + "outputs": [], + "source": [ + "#Instantiate the EDA class:\n", + "eda = CreditHistoryEDA(df, target_col=\"loan_status_binary\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "220be1c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Fully Paid', 'Charged Off', 'Current', 'Default',\n", + " 'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',\n", + " 'Does not meet the credit policy. Status:Fully Paid',\n", + " 'Does not meet the credit policy. Status:Charged Off', 'Issued'],\n", + " dtype=object)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loan_status.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "da3e1e57", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " result = self.df.groupby(Buckets)[self.target_col].agg(n_loans=\"count\", default_rate=\"mean\").reset_index()\n", + "D:\\edu_laptop\\Ironhack_AI_&_DataScience\\Month_1_2\\Week5\\Tusday\\ml-model-git-lab\\src\\eda_credit_history.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " result = self.df.groupby(Buckets)[self.target_col].agg(n_loans=\"count\", default_rate=\"mean\").reset_index()\n", + "C:\\Users\\ASUS\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python39\\site-packages\\numpy\\lib\\_function_base_impl.py:2922: RuntimeWarning: invalid value encountered in divide\n", + " c /= stddev[:, None]\n", + "C:\\Users\\ASUS\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python39\\site-packages\\numpy\\lib\\_function_base_impl.py:2923: RuntimeWarning: invalid value encountered in divide\n", + " c /= stddev[None, :]\n" + ] + } + ], + "source": [ + "#Run the report:\n", + "report = credit_history_report(eda)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "308d0c04", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columndtypen_missingmissing_pctmeanstd
0dtifloat6400.00000018.15703917.190626
1dti_jointfloat6488687099.94264018.3101187.169233
2delinq_2yrsfloat64290.0032680.3144420.862244
3mths_since_last_delinqfloat6445431251.19706534.06379821.884940
4mths_since_last_recordfloat6475032684.55530370.11790328.127914
5mths_since_last_major_derogfloat6466567675.01597444.10483822.179841
6open_accfloat64290.00326811.5484695.317313
7total_accfloat64290.00326825.26802611.840561
8pub_recfloat64290.0032680.1953070.582091
9acc_now_delinqfloat64290.0032680.0049910.077625
10revol_balfloat6400.00000016920.78753322426.791896
11revol_utilfloat645020.05657155.06769323.834344
12total_rev_hi_limfloat64702767.91950232068.62004537498.258326
13tot_coll_amtfloat64702767.919502225.70261010311.367195
14tot_cur_balfloat64702767.919502139458.189336153749.966885
15total_bal_ilfloat6486600797.59155936552.81138943103.833619
16open_acc_6mfloat6486600797.5915591.1090211.242675
17open_il_6mfloat6486600797.5915592.9288323.089987
18open_il_12mfloat6486600797.5915590.7616510.996035
19open_il_24mfloat6486600797.5915591.6745741.688725
20mths_since_rcnt_ilfloat6486656997.65489220.91268627.209081
21open_rv_12mfloat6486600797.5915591.3890601.520129
22open_rv_24mfloat6486600797.5915592.9754822.631886
23max_bal_bcfloat6486600797.5915595887.9797405284.701239
24all_utilfloat6486600797.59155960.83193920.013254
25inq_last_6mthsfloat64290.0032680.6946230.998448
26inq_last_12mfloat6486600797.5915591.9773072.874067
27inq_fifloat6486600797.5915590.9439451.446872
28collections_12_mths_ex_medfloat641450.0163400.0143800.134191
\n", + "
" + ], + "text/plain": [ + " column dtype n_missing missing_pct \\\n", + "0 dti float64 0 0.000000 \n", + "1 dti_joint float64 886870 99.942640 \n", + "2 delinq_2yrs float64 29 0.003268 \n", + "3 mths_since_last_delinq float64 454312 51.197065 \n", + "4 mths_since_last_record float64 750326 84.555303 \n", + "5 mths_since_last_major_derog float64 665676 75.015974 \n", + "6 open_acc float64 29 0.003268 \n", + "7 total_acc float64 29 0.003268 \n", + "8 pub_rec float64 29 0.003268 \n", + "9 acc_now_delinq float64 29 0.003268 \n", + "10 revol_bal float64 0 0.000000 \n", + "11 revol_util float64 502 0.056571 \n", + "12 total_rev_hi_lim float64 70276 7.919502 \n", + "13 tot_coll_amt float64 70276 7.919502 \n", + "14 tot_cur_bal float64 70276 7.919502 \n", + "15 total_bal_il float64 866007 97.591559 \n", + "16 open_acc_6m float64 866007 97.591559 \n", + "17 open_il_6m float64 866007 97.591559 \n", + "18 open_il_12m float64 866007 97.591559 \n", + "19 open_il_24m float64 866007 97.591559 \n", + "20 mths_since_rcnt_il float64 866569 97.654892 \n", + "21 open_rv_12m float64 866007 97.591559 \n", + "22 open_rv_24m float64 866007 97.591559 \n", + "23 max_bal_bc float64 866007 97.591559 \n", + "24 all_util float64 866007 97.591559 \n", + "25 inq_last_6mths float64 29 0.003268 \n", + "26 inq_last_12m float64 866007 97.591559 \n", + "27 inq_fi float64 866007 97.591559 \n", + "28 collections_12_mths_ex_med float64 145 0.016340 \n", + "\n", + " mean std \n", + "0 18.157039 17.190626 \n", + "1 18.310118 7.169233 \n", + "2 0.314442 0.862244 \n", + "3 34.063798 21.884940 \n", + "4 70.117903 28.127914 \n", + "5 44.104838 22.179841 \n", + "6 11.548469 5.317313 \n", + "7 25.268026 11.840561 \n", + "8 0.195307 0.582091 \n", + "9 0.004991 0.077625 \n", + "10 16920.787533 22426.791896 \n", + "11 55.067693 23.834344 \n", + "12 32068.620045 37498.258326 \n", + "13 225.702610 10311.367195 \n", + "14 139458.189336 153749.966885 \n", + "15 36552.811389 43103.833619 \n", + "16 1.109021 1.242675 
\n", + "17 2.928832 3.089987 \n", + "18 0.761651 0.996035 \n", + "19 1.674574 1.688725 \n", + "20 20.912686 27.209081 \n", + "21 1.389060 1.520129 \n", + "22 2.975482 2.631886 \n", + "23 5887.979740 5284.701239 \n", + "24 60.831939 20.013254 \n", + "25 0.694623 0.998448 \n", + "26 1.977307 2.874067 \n", + "27 0.943945 1.446872 \n", + "28 0.014380 0.134191 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Display at least:\n", + "report[\"structure_summary\"] # structure of all credit-history columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "a8d40f26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dtin_loansdefault_rate
0(-0.001, 10.62]1776930.060019
1(10.62, 15.4]1775200.063672
2(15.4, 19.97]1773230.070487
3(19.97, 25.53]1775190.077327
4(25.53, 9999.0]1773240.073211
\n", + "
" + ], + "text/plain": [ + " dti n_loans default_rate\n", + "0 (-0.001, 10.62] 177693 0.060019\n", + "1 (10.62, 15.4] 177520 0.063672\n", + "2 (15.4, 19.97] 177323 0.070487\n", + "3 (19.97, 25.53] 177519 0.077327\n", + "4 (25.53, 9999.0] 177324 0.073211" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"dti_buckets\"] # default rate by DTI bucket\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "6e8fc3d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
revol_utiln_loansdefault_rate
0(-0.001, 33.3]1775640.052978
1(33.3, 49.1]1782780.060271
2(49.1, 62.8]1770720.068243
3(62.8, 77.5]1767070.076115
4(77.5, 892.3]1772560.087021
\n", + "
" + ], + "text/plain": [ + " revol_util n_loans default_rate\n", + "0 (-0.001, 33.3] 177564 0.052978\n", + "1 (33.3, 49.1] 178278 0.060271\n", + "2 (49.1, 62.8] 177072 0.068243\n", + "3 (62.8, 77.5] 176707 0.076115\n", + "4 (77.5, 892.3] 177256 0.087021" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"revol_util_buckets\"] # default rate by revol_util bucket\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "02549940", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dti 0.008322\n", + "dti_joint -0.040340\n", + "delinq_2yrs -0.002174\n", + "mths_since_last_delinq -0.006165\n", + "mths_since_last_record 0.043935\n", + "mths_since_last_major_derog -0.014210\n", + "open_acc -0.017776\n", + "total_acc -0.019214\n", + "pub_rec -0.015158\n", + "acc_now_delinq -0.000019\n", + "revol_bal -0.020264\n", + "revol_util 0.046479\n", + "total_rev_hi_lim -0.037400\n", + "tot_coll_amt -0.001642\n", + "tot_cur_bal -0.038387\n", + "total_bal_il NaN\n", + "open_acc_6m NaN\n", + "open_il_6m NaN\n", + "open_il_12m NaN\n", + "open_il_24m NaN\n", + "mths_since_rcnt_il NaN\n", + "open_rv_12m NaN\n", + "open_rv_24m NaN\n", + "max_bal_bc NaN\n", + "all_util NaN\n", + "inq_last_6mths 0.082200\n", + "inq_last_12m NaN\n", + "inq_fi NaN\n", + "collections_12_mths_ex_med -0.007635\n", + "Name: correlation_with_default, dtype: float64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[\"correlation_with_default\"] # correlation of each credit feature with default" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
import pandas as pd


def normalize_cat_strings(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Lowercase and strip whitespace in the given categorical columns.

    NaNs are preserved: the previous ``astype(str)`` over the whole column
    turned missing values into the literal string ``"nan"``, which the
    sentinel-fill step then silently missed.
    """
    df = df.copy()
    for c in cols:
        mask = df[c].notna()
        # Only touch non-null cells so NaN stays NaN for the sentinel fill.
        df.loc[mask, c] = df.loc[mask, c].astype(str).str.lower().str.strip()
    return df


def fill_cat_missing_with_sentinel(
    df: pd.DataFrame,
    cols: list[str],
    label: str = "Missing",
) -> pd.DataFrame:
    """Replace NaNs and empty strings in categorical columns with a sentinel category."""
    df = df.copy()
    for c in cols:
        df[c] = df[c].fillna(label).replace("", label)
    return df


def group_rare_categories(
    df: pd.DataFrame,
    col: str,
    min_count: int = 100,
) -> pd.DataFrame:
    """Replace categories of *col* occurring fewer than *min_count* times by "Other"."""
    df = df.copy()
    counts = df[col].value_counts()
    rare = counts[counts < min_count].index
    df[col] = df[col].where(~df[col].isin(rare), "Other")
    return df


def add_desc_length(df: pd.DataFrame) -> pd.DataFrame:
    """Create "desc_len" as character length of "desc" (0 if NaN)."""
    df = df.copy()
    df["desc_len"] = df["desc"].fillna("").astype(str).str.len()
    return df


def add_title_word_count(df: pd.DataFrame) -> pd.DataFrame:
    """Create "title_word_count" as word count of "title" (0 if NaN)."""
    df = df.copy()
    df["title_word_count"] = df["title"].fillna("").astype(str).str.split().str.len()
    return df


def apply_cat_steps(
    df: pd.DataFrame,
    cols: list[str],
    steps: list,
) -> pd.DataFrame:
    """Apply a sequence of functions ``(df, cols) -> df`` to categorical columns.

    Higher-order composition helper: each *step* receives the output of the
    previous one.
    """
    for step in steps:
        df = step(df, cols)
    return df
import numpy as np
import pandas as pd
from typing import Any, Callable, Dict

# Numeric credit-history / balance / utilization / inquiry columns explored
# against the loan-status target.
CREDIT_NUMERIC_COLS = [
    "dti", "dti_joint", "delinq_2yrs", "mths_since_last_delinq",
    "mths_since_last_record", "mths_since_last_major_derog",
    "open_acc", "total_acc", "pub_rec", "acc_now_delinq",
    "revol_bal", "revol_util", "total_rev_hi_lim",
    "tot_coll_amt", "tot_cur_bal", "total_bal_il",
    "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m",
    "mths_since_rcnt_il", "open_rv_12m", "open_rv_24m",
    "max_bal_bc", "all_util",
    "inq_last_6mths", "inq_last_12m", "inq_fi",
    "collections_12_mths_ex_med",
]


class CreditHistoryEDA:
    """Explore how credit history, balances, utilization and inquiries
    relate to the target column (expected to be encoded 0/1)."""

    def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
        """Store the full DataFrame and the name of the target column."""
        self.df = df
        self.target_col = target_col

    def credit_structure_summary(self) -> pd.DataFrame:
        """One row per CREDIT_NUMERIC_COLS column with dtype, missing counts,
        missing percentage, mean and std."""
        rows = []
        for col in CREDIT_NUMERIC_COLS:
            series = self.df[col]
            n_missing = int(series.isna().sum())
            rows.append({
                "column": col,
                "dtype": str(series.dtype),
                "n_missing": n_missing,
                "missing_pct": n_missing / len(series) * 100,
                # All CREDIT_NUMERIC_COLS are numeric, so mean/std are always defined.
                "mean": series.mean(),
                "std": series.std(),
            })
        return pd.DataFrame(
            rows,
            columns=["column", "dtype", "n_missing", "missing_pct", "mean", "std"],
        )

    def default_rate_by_bucket(self, col: str, bins: int = 4) -> pd.DataFrame:
        """Quantile-bucket a numeric credit column (e.g. dti, revol_util) and
        compute loan count and default rate per bucket.

        Returns a DataFrame with columns: <col> (interval), n_loans, default_rate.
        """
        # duplicates="drop" keeps qcut from raising when quantile edges tie.
        buckets = pd.qcut(self.df[col], q=bins, duplicates="drop")
        # observed=False passed explicitly to silence the pandas FutureWarning
        # and keep today's behavior (empty buckets still reported).
        return (
            self.df.groupby(buckets, observed=False)[self.target_col]
            .agg(n_loans="count", default_rate="mean")
            .reset_index()
        )

    def correlation_with_default(self) -> pd.Series:
        """Correlation of each numeric credit column with the 0/1 target,
        as a Series indexed by column name."""
        corr = {
            col: self.df[col].corr(self.df[self.target_col])
            for col in CREDIT_NUMERIC_COLS
        }
        return pd.Series(corr, name="correlation_with_default")


def credit_history_report(eda: CreditHistoryEDA) -> Dict[str, Any]:
    """Run a fixed set of EDA steps (higher-order: callables stored in a dict
    and invoked later) and return {step_name: output}."""
    steps: Dict[str, Callable[[], Any]] = {
        "structure_summary": eda.credit_structure_summary,
        "dti_buckets": lambda: eda.default_rate_by_bucket("dti", bins=5),
        "revol_util_buckets": lambda: eda.default_rate_by_bucket("revol_util", bins=5),
        "correlation_with_default": eda.correlation_with_default,
    }
    return {name: func() for name, func in steps.items()}
# tests/test_subset2_transformer.py
import pandas as pd
from transformers import Subset2CategoricalPerformanceTransformer


def test_no_nans_after_transform():
    """Transformed output must contain no NaNs even when inputs have missing values."""
    df = pd.DataFrame({
        "pymnt_plan": ["y", None, "n"],
        "purpose": ["debt", "wedding", None],
        "title": ["abc def", None, "xxx"],
        "zip_code": ["123xx", "456xx", "123xx"],
        "addr_state": ["CA", "ZZ", "CA"],
    })

    tr = Subset2CategoricalPerformanceTransformer(
        cat_cols=list(df.columns), min_count=2
    )
    out = tr.fit(df).transform(df)

    assert not pd.isna(out).any()


def test_rare_category_grouping():
    """Every category below min_count must be recorded in rare_maps_."""
    df = pd.DataFrame({
        "purpose": ["a", "b", "c", "d", "e"],  # each count=1 < min_count=3 -> all rare
        "pymnt_plan": ["y"] * 5,
        "title": ["t"] * 5,
        "zip_code": ["111"] * 5,
        "addr_state": ["CA"] * 5,
    })

    tr = Subset2CategoricalPerformanceTransformer(
        cat_cols=df.columns.tolist(),
        min_count=3,
    )
    tr.fit(df)

    # rare_maps_ stores the *rare labels* themselves, never "Other" — the old
    # `"Other" in rare_maps_[...] or len(...) == 5` assertion was vacuous.
    assert tr.rare_maps_["purpose"] == {"a", "b", "c", "d", "e"}


def test_text_features_numeric():
    """The two appended text-feature columns must be numeric."""
    df = pd.DataFrame({
        "pymnt_plan": ["y", "n"],
        "purpose": ["debt", "car"],
        "title": ["abc def", "hello"],
        "zip_code": ["123", "456"],
        "addr_state": ["CA", "NY"],
    })

    tr = Subset2CategoricalPerformanceTransformer(cat_cols=df.columns.tolist())
    out = tr.fit(df).transform(df)

    # last 2 columns (desc_len, title_word_count) must be numeric
    assert out[:, -2:].dtype.kind in ("i", "f")
# src/transformers.py — Subset2CategoricalPerformanceTransformer:
# normalize -> sentinel-fill -> rare-category grouping -> one-hot encoding,
# plus two numeric text features (desc length, title word count).
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# These helpers were used but never imported (NameError at runtime).
from cleaning_categorical import normalize_cat_strings, fill_cat_missing_with_sentinel
from cleaning_text import add_desc_length, add_title_word_count


def _dense_one_hot_encoder() -> OneHotEncoder:
    """Build a dense-output OneHotEncoder across sklearn versions
    (`sparse` was renamed `sparse_output` in sklearn 1.2 and later removed)."""
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


class Subset2CategoricalPerformanceTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode a subset of categorical columns with rare-category
    grouping, and append desc-length / title-word-count numeric features."""

    def __init__(self, cat_cols: list[str], min_count: int = 100):
        self.cat_cols = cat_cols
        self.min_count = min_count
        self.encoder = _dense_one_hot_encoder()
        self.rare_maps_: dict = {}  # per-column set of rare category labels

    def _clean(self, X) -> pd.DataFrame:
        """Shared cleaning: frame the input, normalize strings, fill missing."""
        # Fix: was `pd.DataFrame(X, columns[self.cat_cols])` — a TypeError
        # (missing `=`), so fit() could never run.
        df = pd.DataFrame(X, columns=self.cat_cols).copy()
        df = normalize_cat_strings(df, self.cat_cols)
        return fill_cat_missing_with_sentinel(df, self.cat_cols)

    def fit(self, X, y=None):
        """Learn rare categories per column and fit the one-hot encoder."""
        df = self._clean(X)
        for col in self.cat_cols:
            counts = df[col].value_counts()
            self.rare_maps_[col] = set(counts[counts < self.min_count].index)
            df[col] = df[col].where(~df[col].isin(self.rare_maps_[col]), "Other")
        self.encoder.fit(df[self.cat_cols])
        return self

    def transform(self, X):
        """Return [one-hot columns | desc_len | title_word_count] as a dense array."""
        df = self._clean(X)
        for col in self.cat_cols:
            rare = self.rare_maps_.get(col, set())
            df[col] = df[col].where(~df[col].isin(rare), "Other")
        encoded = self.encoder.transform(df[self.cat_cols])

        # Text features: ensure the source columns exist even when they are
        # not part of cat_cols (matching the original .get(..., "") fallback).
        df_text = df.copy()
        if "desc" not in df_text.columns:
            df_text["desc"] = ""
        if "title" not in df_text.columns:
            df_text["title"] = ""
        df_text = add_desc_length(df_text)
        df_text = add_title_word_count(df_text)
        text_features = df_text[["desc_len", "title_word_count"]].values.astype(float)

        # final output: encoded OHE + text numeric features
        return np.hstack([encoded, text_features])