RigelAlgebar · Dee-M123 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025 · RamiSaad93
diff --git a/notebooks/eda_loan_terms_demo.ipynb b/notebooks/eda_loan_terms_demo.ipynb
@@ -0,0 +1,50 @@
+# Load the dataset: download the shared CSV with gdown and read it into pandas.
+!pip install gdown --quiet
+
+import gdown
+import pandas as pd
+from src.eda_loan_terms import LoanTermsEDA, summarize_numeric_terms
+import matplotlib.pyplot as plt
+
+url = "https://drive.google.com/uc?id=1EKPQm63-_9P9eZsFeQsGI9O2GuAFUxdk"
+
+output = "credit_risk.csv"
+gdown.download(url, output, quiet=False)
+
+df = pd.read_csv(output)
+
+# Alternative: read a local copy instead of downloading, e.g.
+# df = pd.read_csv("data/loan_sample.csv")
+# (adjust the path to wherever the file actually lives).
+#
+
+df.head()
+
+# Instantiate the EDA class; "loan_status" is passed as the target column.
+
+eda = LoanTermsEDA(df, target_col="loan_status")
+
+
+# Run the EDA and keep each result in a variable.
+
+numeric_summary = eda.numeric_terms_summary()
+per_column_stats = summarize_numeric_terms(eda)
+default_by_term = eda.default_rate_by_term()
+default_by_grade = eda.default_rate_by_grade()
+
+# Display the results. NOTE(review): a notebook cell only renders its last
+# bare expression; use display(...) if these lines share one cell - confirm.
+numeric_summary # describe() table of numeric loan terms
+per_column_stats # dict of stats per numeric column
+default_by_term # mean of the target column per term category
+default_by_grade # mean of the target column per grade category
+
+
+# Histogram of int_rate (missing values dropped, 20 bins).
+# EDA only - no modeling in this notebook.
+
+plt.hist(df["int_rate"].dropna(), bins=20)
+plt.xlabel("Interest Rate")
+plt.ylabel("Frequency")
+plt.title("Distribution of Interest Rates")
+plt.show()
diff --git a/src/eda_loan_terms.py b/src/eda_loan_terms.py
@@ -0,0 +1,118 @@
+
+#import the following 
+
+import pandas as pd
+import numpy as np
+
+from typing import Dict, Any, List
+
+# Dataset download helper. gdown must already be installed (`pip install gdown`):
+# the IPython-only `!pip ...` shell escape is a SyntaxError inside a .py module.
+import gdown
+
+# Shared Google Drive file URL and the local filename the download is saved to.
+DATA_URL = "https://drive.google.com/uc?id=1EKPQm63-_9P9eZsFeQsGI9O2GuAFUxdk"
+DATA_FILE = "credit_risk.csv"
+
+
+def load_dataset(url: str = DATA_URL, output: str = DATA_FILE) -> pd.DataFrame:
+  """Download the CSV at ``url`` to the local file ``output`` and load it.
+
+  Kept inside a function so that importing this module (for example to use
+  ``LoanTermsEDA``) no longer starts a network download as a side effect.
+  """
+  gdown.download(url, output, quiet=False)
+  return pd.read_csv(output)
+
+#_______________________________________________________
+
+# Numeric term / cash-flow columns covered by the numeric summaries.
+NUMERIC_TERM_COLS = [
+  "loan_amnt", "funded_amnt", "funded_amnt_inv", "int_rate", "installment",
+  "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv",
+  "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
+  "recoveries", "collection_recovery_fee", "last_pymnt_amnt",
+]
+
+# Categorical loan-term columns; "term" and "grade" are the grouping keys
+# used by the default-rate computations in this module.
+CATEGORICAL_TERM_COLS = [
+  "term", "grade", "sub_grade", "initial_list_status",
+]
+
+# Class that computes summaries of the loan-term columns of a DataFrame.
+class LoanTermsEDA:
+  """Exploratory summaries of the loan-term columns of a loans DataFrame.
+
+  Stores the full DataFrame and the name of the target column. Typical use:
+  ``eda = LoanTermsEDA(df, target_col="loan_status")`` followed by, for
+  example, ``eda.numeric_terms_summary()`` or
+  ``eda.distribution_stats_for_column("int_rate")``.
+  """
+
+  def __init__(self, df: pd.DataFrame, target_col: str = "loan_status"):
+    """Keep the DataFrame and the target column name for later summaries."""
+    self.df = df
+    self.target_col = target_col
+
+  # Methods must stay indented inside the class: at column 0 they become
+  # module-level functions and ``eda.numeric_terms_summary()`` fails.
+
+  def numeric_terms_summary(self) -> pd.DataFrame:
+    """Return ``describe()`` of NUMERIC_TERM_COLS transposed (one row per column)."""
+    return self.df[NUMERIC_TERM_COLS].describe().T
+
+  def distribution_stats_for_column(self, col: str) -> Dict[str, float]:
+    """Return distribution statistics for the column named ``col``.
+
+    Missing values are dropped first. The dict has the keys ``mean``, ``std``,
+    ``min``, ``max``, ``q25`` (25% quantile), ``q50`` (median) and ``q75``
+    (75% quantile).
+    """
+    values = self.df[col].dropna()
+    return {
+      "mean": values.mean(),
+      "std": values.std(),
+      "min": values.min(),
+      "max": values.max(),
+      "q25": values.quantile(0.25),
+      "q50": values.quantile(0.50),
+      "q75": values.quantile(0.75),
+    }
+
+  def default_rate_by_term(self) -> pd.Series:
+    """Return the mean of the target column per ``term`` category.
+
+    NOTE(review): the author reported this never finished on the full dataset;
+    presumably the target column holds text labels rather than 0/1 - confirm.
+    """
+    return self.df.groupby("term")[self.target_col].mean()
+
+  def default_rate_by_grade(self) -> pd.Series:
+    """Return the mean of the target column per ``grade`` category."""
+    return self.df.groupby("grade")[self.target_col].mean()
+
+#_____________________________________________________________________
+
+# Define the function below at module level (in a notebook: in a new cell), outside the class; otherwise it will be treated as one of the class's methods.
+
+
+#   2. Functional per-column numeric summary
+# Add a function that uses functional programming over numeric columns:
+
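+# ***** This version did not work: dict() cannot take keys and values as two separate arguments; it needs dict(zip(NUMERIC_TERM_COLS, numeric_list)):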
+
+# def summarize_numeric_terms(eda: LoanTermsEDA) -> Dict[str, Dict[str, float]]:
+#   numeric_list = list(map(eda.distribution_stats_for_column, NUMERIC_TERM_COLS))
+#   return dict(NUMERIC_TERM_COLS, numeric_list)
+
+
+# works with provided code: 
+def summarize_numeric_terms(eda: LoanTermsEDA) -> Dict[str, Dict[str, float]]:
+  """Map each NUMERIC_TERM_COLS name to its distribution statistics.
+
+  Usage: ``per_column_stats = summarize_numeric_terms(LoanTermsEDA(df))``.
+  """
+  per_column = map(eda.distribution_stats_for_column, NUMERIC_TERM_COLS)
+  return dict(zip(NUMERIC_TERM_COLS, per_column))
+