From 0e73e436f3e7c27fb58d4995678d9318685b1e45 Mon Sep 17 00:00:00 2001 From: Dee-M123 Date: Wed, 26 Nov 2025 15:07:11 +0100 Subject: [PATCH 1/3] both files created --- notebooks/eda_loan_terms_demo.ipynb | 0 src/eda_loan_terms.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 notebooks/eda_loan_terms_demo.ipynb create mode 100644 src/eda_loan_terms.py diff --git a/notebooks/eda_loan_terms_demo.ipynb b/notebooks/eda_loan_terms_demo.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/src/eda_loan_terms.py b/src/eda_loan_terms.py new file mode 100644 index 0000000..e69de29 From 2dbd952ffef1ceb8828cd5a7357e7cfbb4e8feab Mon Sep 17 00:00:00 2001 From: Dee-M123 Date: Wed, 26 Nov 2025 16:56:31 +0100 Subject: [PATCH 2/3] second file excercise added: eda_loan_terms_demo --- notebooks/eda_loan_terms_demo.ipynb | 50 +++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/notebooks/eda_loan_terms_demo.ipynb b/notebooks/eda_loan_terms_demo.ipynb index e69de29..1432517 100644 --- a/notebooks/eda_loan_terms_demo.ipynb +++ b/notebooks/eda_loan_terms_demo.ipynb @@ -0,0 +1,50 @@ +# Load the dataset: +!pip install gdown --quiet + +import gdown +import pandas as pd +from src.eda_loan_terms import LoanTermsEDA, summarize_numeric_terms +import matplotlib.pyplot as plt + +url = "https://drive.google.com/uc?id=1EKPQm63-_9P9eZsFeQsGI9O2GuAFUxdk" + +output = "credit_risk.csv" +gdown.download(url, output, quiet=False) + +df = pd.read_csv(output) + +# df = pd.read_csv("data/loan_sample.csv") + +# or the correct path +# + +df.head() + +#Instantiate the EDA class: + +eda = LoanTermsEDA(df, target_col="loan_status") + + +# Run EDA: + +numeric_summary = eda.numeric_terms_summary() +per_column_stats = summarize_numeric_terms(eda) +default_by_term = eda.default_rate_by_term() +default_by_grade = eda.default_rate_by_grade() + +#Display at least: + +numeric_summary # describe() table of numeric loan terms +per_column_stats # dict of stats per 
numeric column +default_by_term # default rate by term (e.g., 36 vs 60 months) +default_by_grade # default rate by grade/sub_grade + + +# Optionally add a histogram of int_rate or loan_amnt and a short markdown comment. +# (But no modeling, only EDA.) + +plt.hist(df["int_rate"].dropna(), bins=20) +plt.xlabel("Interest Rate") +plt.ylabel("Frequency") +plt.title("Distribution of Interest Rates") +plt.show() From fedd02a497470f2280a950ceeac813cfdb17ea0c Mon Sep 17 00:00:00 2001 From: Dee-M123 Date: Wed, 26 Nov 2025 17:07:22 +0100 Subject: [PATCH 3/3] first file added: eda_loan_terms.py --- src/eda_loan_terms.py | 118 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/src/eda_loan_terms.py b/src/eda_loan_terms.py index e69de29..212b340 100644 --- a/src/eda_loan_terms.py +++ b/src/eda_loan_terms.py @@ -0,0 +1,118 @@ + +#import the following + +import pandas as pd +import numpy as np + +from typing import Dict, Any, List + +# download the dataset as follows: + +!pip install gdown --quiet + +import gdown + +# shared file URL (can be /file/d/.../view) + +url = "https://drive.google.com/uc?id=1EKPQm63-_9P9eZsFeQsGI9O2GuAFUxdk" + +# Download it to a local file in Colab +output = "credit_risk.csv" +gdown.download(url, output, quiet=False) + +# Now read it with pandas to see if everything is as expected +df = pd.read_csv(output) +df.head(10) + +#_______________________________________________________ + +NUMERIC_TERM_COLS = [ +"loan_amnt", "funded_amnt", "funded_amnt_inv", +"int_rate", "installment", +"out_prncp", "out_prncp_inv", +"total_pymnt", "total_pymnt_inv", +"total_rec_prncp", "total_rec_int", +"total_rec_late_fee", "recoveries", "collection_recovery_fee", +"last_pymnt_amnt", +] + +CATEGORICAL_TERM_COLS = [ +"term", "grade", "sub_grade", "initial_list_status", +] + +# create a class to assess the numerical columns +class LoanTermsEDA: + def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"): + self.df = df + self.target_col = 
target_col + +#Store the full DataFrame and the name of the target column... + +def numeric_terms_summary(self) -> pd.DataFrame: + return self.df[NUMERIC_TERM_COLS].describe().T + +#(one row per numeric term/cashflow column) +#(one row per numeric term/cashflow column) +# once the DataFrame is stored via eda = LoanTermsEDA(df, target_col="loan_status") +#you can get the summary (describe) via eda.numeric_terms_summary() + + + +def distribution_stats_for_column(self, col: str) -> Dict[str, float]: + #For a given numeric column return stats in a dict: + numeric_col = self.df[col].dropna() + numeric_col = numeric_col + return {"mean": numeric_col.mean(), "std": numeric_col.std(), "min": numeric_col.min(), "max": numeric_col.max(), "q25": numeric_col.quantile(0.25), "q50": numeric_col.quantile(0.50), "q75": numeric_col.quantile(0.75)} + + # - std + # - min + # - max + # - q25 (25% quantile) + # - q50 (median) + # - q75 (75% quantile) + + # eda.distribution_stats_for_column()... takes a positional argument (the column name) inside (), because the distribution stats are computed per column + + + +def default_rate_by_term(self) -> pd.Series: + + + # Default rate (mean of target) per 'term' category. + + return self.df.groupby("term")[self.target_col].mean() + + #could not test this as it kept running forever + + +def default_rate_by_grade(self) -> pd.Series: + + # Default rate per 'grade' (or per 'sub_grade', if you prefer). + + + return self.df.groupby("grade")[self.target_col].mean() + +#_____________________________________________________________________ + +# define the function in a new cell outside the created class, otherwise it will be considered one of the class's methods. + + +# 2. 
Functional per-column numeric summary +# Add a function that uses functional programming over numeric columns: + +#***** did not work this way, because dict() accepts only one positional argument; dict(zip(NUMERIC_TERM_COLS, numeric_list)) would work: + +# def summarize_numeric_terms(eda: LoanTermsEDA) -> Dict[str, Dict[str, float]]: +# numeric_list = list(map(eda.distribution_stats_for_column, NUMERIC_TERM_COLS)) +# return dict(NUMERIC_TERM_COLS, numeric_list) + + +# works with the provided code: +def summarize_numeric_terms(eda: LoanTermsEDA) -> Dict[str, Dict[str, float]]: + return {col: eda.distribution_stats_for_column(col) for col in NUMERIC_TERM_COLS} + + + #eda = LoanTermsEDA(df, target_col="loan_status") + #per_column_stats = summarize_numeric_terms(eda) + #per_column_stats +