From 4da4ad396fc3fb978c3f41a9b923aa34ba306b33 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Mon, 8 Dec 2025 23:32:01 +0000 Subject: [PATCH 01/10] rebasing BZ changes --- chainladder/development/barnzehn.py | 23 +- chainladder/development/learning.py | 27 +- .../development/tests/test_barnzehn.py | 252 +++++++++++++++++- 3 files changed, 285 insertions(+), 17 deletions(-) diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py index 2975ba87..ec156ded 100644 --- a/chainladder/development/barnzehn.py +++ b/chainladder/development/barnzehn.py @@ -24,6 +24,24 @@ class BarnettZehnwirth(TweedieGLM): ---------- formula: formula-like A patsy formula describing the independent variables, X of the GLM + feat_eng: dict + A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') + (e.g. { + 'feature_1':{ + 'func': function_name for feature 1, + 'kwargs': keyword arguments for the function + }, + 'feature_2':{ + 'func': function_name for feature 2, + 'kwargs': keyword arguments for the function + } + } + ); + functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time + (e.g. this function adds 1 to every origin + def test_func(df) + return df['origin'] + 1 + ) response: str Column name for the reponse variable of the GLM. If ommitted, then the first column of the Triangle will be used. 
@@ -31,9 +49,10 @@ class BarnettZehnwirth(TweedieGLM): """ - def __init__(self, formula='C(origin) + development', response=None): + def __init__(self, formula='C(origin) + development', feat_eng=None, response=None): self.formula = formula self.response = response + self.feat_eng = feat_eng def fit(self, X, y=None, sample_weight=None): if max(X.shape[:2]) > 1: @@ -50,7 +69,7 @@ def fit(self, X, y=None, sample_weight=None): self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.formula)), ('model', LinearRegression(fit_intercept=False))]), - y_ml=response, fit_incrementals=False).fit(tri) + y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri) resid = tri - self.model_.triangle_ml_[ self.model_.triangle_ml_.valuation <= tri.valuation_date] self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / ( diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index 1705c696..b682d636 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -33,6 +33,24 @@ class DevelopmentML(DevelopmentBase): Time Series aspects of the model. Predictions from one development period get used as featues in the next development period. Lags should be negative integers. + feat_eng: dict + A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') + (e.g. { + 'feature_1':{ + 'func': function_name for feature 1, + 'kwargs': keyword arguments for the function + }, + 'feature_2':{ + 'func': function_name for feature 2, + 'kwargs': keyword arguments for the function + } + } + ); + functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time + (e.g. 
this function adds 1 to every origin + def test_func(df) + return df['origin'] + 1 + ) fit_incrementals: Whether the response variable should be converted to an incremental basis for fitting. @@ -48,12 +66,13 @@ class DevelopmentML(DevelopmentBase): """ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, fit_incrementals=True): + weight_ml=None, fit_incrementals=True, feat_eng=None): self.estimator_ml=estimator_ml self.y_ml=y_ml self.weight_ml = weight_ml self.autoregressive=autoregressive self.fit_incrementals = fit_incrementals + self.feat_eng = feat_eng def _get_y_names(self): """ private function to get the response column name""" @@ -112,6 +131,9 @@ def _get_triangle_ml(self, df, preds=None): if len(out) == 0: continue X_r.append(out.copy()) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + out[key] = item['func'](df=out,**item['kwargs']) preds = self.estimator_ml.predict(out) y_r.append(preds.copy()) X_r = pd.concat(X_r, axis=0).reset_index(drop=True) @@ -145,6 +167,9 @@ def _prep_X_ml(self, X): on=list(df_base.columns)).fillna(0) df['origin'] = df['origin'].map(self.origin_encoder_) df['valuation'] = df['valuation'].map(self.valuation_encoder_) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + df[key] = item['func'](df=df,**item['kwargs']) return df def fit(self, X, y=None, sample_weight=None): diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index 15110b89..3cf7e0bd 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -1,15 +1,239 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ import numpy as np -import chainladder as cl -import pytest - -def test_basic_bz(): - abc = cl.load_sample('abc') - assert np.all( - np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten() - == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428]) - ) - -def test_multiple_triangle_exception(): - d = cl.load_sample("usauto") - with pytest.raises(ValueError): - cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d) \ No newline at end of file +import pandas as pd + +from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures +from sklearn.compose import ColumnTransformer +from chainladder.development.base import DevelopmentBase +from chainladder import options + + +class DevelopmentML(DevelopmentBase): + """ A Estimator that interfaces with machine learning (ML) tools that implement + the scikit-learn API. + + The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from + the data. + + .. versionadded:: 0.8.1 + + + Parameters + ---------- + estimator_ml: skearn Estimator + Any sklearn compatible regression estimator, including Pipelines and + y_ml: list or str or sklearn_transformer + The response column(s) for the machine learning algorithm. It must be + present within the Triangle. + autoregressive: tuple, (autoregressive_col_name, lag, source_col_name) + The subset of response column(s) to use as lagged features for the + Time Series aspects of the model. Predictions from one development period + get used as featues in the next development period. Lags should be negative + integers. + feat_eng: dict + A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') + (e.g. 
{ + 'feature_1':{ + 'func': function_name for feature 1, + 'kwargs': keyword arguments for the function + }, + 'feature_2':{ + 'func': function_name for feature 2, + 'kwargs': keyword arguments for the function + } + } + ); + functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time + (e.g. this function adds 1 to every origin + def test_func(df) + return df['origin'] + 1 + ) + fit_incrementals: + Whether the response variable should be converted to an incremental basis + for fitting. + + Attributes + ---------- + estimator_ml: Estimator + An sklearn-style estimator to predict development patterns + ldf_: Triangle + The estimated loss development patterns. + cdf_: Triangle + The estimated cumulative development patterns. + """ + + def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, + weight_ml=None, fit_incrementals=True, feat_eng=None): + self.estimator_ml=estimator_ml + self.y_ml=y_ml + self.weight_ml = weight_ml + self.autoregressive=autoregressive + self.fit_incrementals = fit_incrementals + self.feat_eng = feat_eng + + def _get_y_names(self): + """ private function to get the response column name""" + if not self.y_ml: + y_names = self._columns + if hasattr(self.y_ml, '_columns'): + y_names = self.y_ml._columns + elif isinstance(self.y_ml, ColumnTransformer): + y_names = self.y_ml.transformers[0][-1] + if type(self.y_ml) is list: + y_names = self.y_ml + elif type(self.y_ml) is str: + y_names = [self.y_ml] + return y_names + + + @property + def y_ml_(self): + defaults = self._get_y_names() + transformer = self.y_ml + if not transformer: + return ColumnTransformer( + transformers=[('passthrough', 'passthrough', defaults)]) + elif type(transformer) is list: + return ColumnTransformer( + transformers=[('passthrough', 'passthrough', transformer)]) + elif type(transformer) is str: + return ColumnTransformer( + 
transformers=[('passthrough', 'passthrough', [transformer])]) + else: + return transformer + + def _get_triangle_ml(self, df, preds=None): + """ Create fitted Triangle """ + from chainladder.core import Triangle + if preds is None: + preds = self.estimator_ml.predict(df) + X_r = [df] + y_r = [preds] + dgrain = {'Y':12, 'Q':3, 'M': 1, 'S': 6}[self.development_grain_] + ograin = {'Y':1, 'Q':4, 'M': 12, 'S': 6}[self.origin_grain_] + latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain + latest_filter = latest_filter == latest_filter.max() + preds=pd.DataFrame(preds.copy())[latest_filter].values + out = df.loc[latest_filter].copy() + dev_lags = df['development'].drop_duplicates().sort_values() + for d in dev_lags[1:]: + out['development'] = out['development'] + dgrain + out['valuation'] = out['valuation'] + dgrain / 12 + if len(preds.shape) == 1: + preds = preds[:, None] + if self.autoregressive: + for num, col in enumerate(self.autoregressive): + out[col[0]]=preds[:, num] + out = out[out['development']<=dev_lags.max()] + if len(out) == 0: + continue + X_r.append(out.copy()) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + out[key] = item['func'](df=out,**item['kwargs']) + preds = self.estimator_ml.predict(out) + y_r.append(preds.copy()) + X_r = pd.concat(X_r, axis=0).reset_index(drop=True) + if True: + X_r = X_r.drop(self._get_y_names(), axis=1) + out = pd.concat((X_r, + pd.DataFrame(np.concatenate(y_r, 0), columns=self._get_y_names())), axis=1) + out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()}) + out['valuation'] = out['valuation'].map({v: k for k, v in self.valuation_encoder_.items()}) + return Triangle( + out, origin='origin', development='valuation', + index=self._key_labels, columns=self._get_y_names(), + cumulative=not self.fit_incrementals).dropna() + + def _prep_X_ml(self, X): + """ Preps Triangle data ahead of the pipeline """ + if self.fit_incrementals: + X_ = X.cum_to_incr() + 
else: + X_ = X.copy() + if self.autoregressive: + for i in self.autoregressive: + lag = X[i[2]].shift(i[1]) + X_[i[0]] = lag[lag.valuation<=X.valuation_date] + df_base = X.incr_to_cum().to_frame( + keepdims=True, implicit_axis=True, origin_as_datetime=True + ).reset_index().iloc[:, :-1] + df = df_base.merge(X.cum_to_incr().to_frame( + keepdims=True, implicit_axis=True, origin_as_datetime=True + ).reset_index(), how='left', + on=list(df_base.columns)).fillna(0) + df['origin'] = df['origin'].map(self.origin_encoder_) + df['valuation'] = df['valuation'].map(self.valuation_encoder_) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + df[key] = item['func'](df=df,**item['kwargs']) + return df + + def fit(self, X, y=None, sample_weight=None): + """Fit the model with X. + + Parameters + ---------- + X : Triangle-like + Set of LDFs to which the estimator will be applied. + y : None + Ignored, use y_ml to set a reponse variable for the ML algorithm + sample_weight : None + Ignored + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + + self._columns = list(X.columns) + self._key_labels = X.key_labels + self.origin_grain_ = X.origin_grain + self.development_grain_ = X.development_grain + self.origin_encoder_ = dict(zip( + X.origin.to_timestamp(how='s'), + (pd.Series(X.origin).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.origin_grain])) + val = X.valuation.sort_values().unique() + self.valuation_encoder_ = dict(zip( + val, + (pd.Series(val).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.development_grain])) + df = self._prep_X_ml(X) + self.df_ = df + # Fit model + self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) + #return selffit_incrementals + self.triangle_ml_ = self._get_triangle_ml(df) + return self + + @property + def ldf_(self): + ldf = self.triangle_ml_.incr_to_cum().link_ratio + ldf.valuation_date = pd.to_datetime(options.ULT_VAL) + return ldf + + def transform(self, X): + """ If X and self are of different shapes, align self to X, else + return self. + + Parameters + ---------- + X : Triangle + The triangle to be transformed + + Returns + ------- + X_new : New triangle with transformed attributes. 
+ """ + X_new = X.copy() + X_ml = self._prep_X_ml(X) + y_ml=self.estimator_ml.predict(X_ml) + triangle_ml = self._get_triangle_ml(X_ml, y_ml) + backend = "cupy" if X.array_backend == "cupy" else "numpy" + X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend) + X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) + X_new._set_slicers() + return X_new From 15039de386a7dfbc15a0a0bab2a3faf8ca2a9d92 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Mon, 8 Dec 2025 23:34:38 +0000 Subject: [PATCH 02/10] correcting an incorrect copy --- .../development/tests/test_barnzehn.py | 287 +++--------------- 1 file changed, 49 insertions(+), 238 deletions(-) diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index 3cf7e0bd..8dd17b69 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -1,239 +1,50 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - import numpy as np -import pandas as pd - -from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures -from sklearn.compose import ColumnTransformer -from chainladder.development.base import DevelopmentBase -from chainladder import options - - -class DevelopmentML(DevelopmentBase): - """ A Estimator that interfaces with machine learning (ML) tools that implement - the scikit-learn API. - - The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from - the data. - - .. versionadded:: 0.8.1 - - - Parameters - ---------- - estimator_ml: skearn Estimator - Any sklearn compatible regression estimator, including Pipelines and - y_ml: list or str or sklearn_transformer - The response column(s) for the machine learning algorithm. It must be - present within the Triangle. 
- autoregressive: tuple, (autoregressive_col_name, lag, source_col_name) - The subset of response column(s) to use as lagged features for the - Time Series aspects of the model. Predictions from one development period - get used as featues in the next development period. Lags should be negative - integers. - feat_eng: dict - A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') - (e.g. { - 'feature_1':{ - 'func': function_name for feature 1, - 'kwargs': keyword arguments for the function - }, - 'feature_2':{ - 'func': function_name for feature 2, - 'kwargs': keyword arguments for the function - } - } - ); - functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time - (e.g. this function adds 1 to every origin - def test_func(df) - return df['origin'] + 1 - ) - fit_incrementals: - Whether the response variable should be converted to an incremental basis - for fitting. - - Attributes - ---------- - estimator_ml: Estimator - An sklearn-style estimator to predict development patterns - ldf_: Triangle - The estimated loss development patterns. - cdf_: Triangle - The estimated cumulative development patterns. 
- """ - - def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, fit_incrementals=True, feat_eng=None): - self.estimator_ml=estimator_ml - self.y_ml=y_ml - self.weight_ml = weight_ml - self.autoregressive=autoregressive - self.fit_incrementals = fit_incrementals - self.feat_eng = feat_eng - - def _get_y_names(self): - """ private function to get the response column name""" - if not self.y_ml: - y_names = self._columns - if hasattr(self.y_ml, '_columns'): - y_names = self.y_ml._columns - elif isinstance(self.y_ml, ColumnTransformer): - y_names = self.y_ml.transformers[0][-1] - if type(self.y_ml) is list: - y_names = self.y_ml - elif type(self.y_ml) is str: - y_names = [self.y_ml] - return y_names - - - @property - def y_ml_(self): - defaults = self._get_y_names() - transformer = self.y_ml - if not transformer: - return ColumnTransformer( - transformers=[('passthrough', 'passthrough', defaults)]) - elif type(transformer) is list: - return ColumnTransformer( - transformers=[('passthrough', 'passthrough', transformer)]) - elif type(transformer) is str: - return ColumnTransformer( - transformers=[('passthrough', 'passthrough', [transformer])]) - else: - return transformer - - def _get_triangle_ml(self, df, preds=None): - """ Create fitted Triangle """ - from chainladder.core import Triangle - if preds is None: - preds = self.estimator_ml.predict(df) - X_r = [df] - y_r = [preds] - dgrain = {'Y':12, 'Q':3, 'M': 1, 'S': 6}[self.development_grain_] - ograin = {'Y':1, 'Q':4, 'M': 12, 'S': 6}[self.origin_grain_] - latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain - latest_filter = latest_filter == latest_filter.max() - preds=pd.DataFrame(preds.copy())[latest_filter].values - out = df.loc[latest_filter].copy() - dev_lags = df['development'].drop_duplicates().sort_values() - for d in dev_lags[1:]: - out['development'] = out['development'] + dgrain - out['valuation'] = out['valuation'] + dgrain / 12 - if len(preds.shape) 
== 1: - preds = preds[:, None] - if self.autoregressive: - for num, col in enumerate(self.autoregressive): - out[col[0]]=preds[:, num] - out = out[out['development']<=dev_lags.max()] - if len(out) == 0: - continue - X_r.append(out.copy()) - if self.feat_eng is not None: - for key, item in self.feat_eng.items(): - out[key] = item['func'](df=out,**item['kwargs']) - preds = self.estimator_ml.predict(out) - y_r.append(preds.copy()) - X_r = pd.concat(X_r, axis=0).reset_index(drop=True) - if True: - X_r = X_r.drop(self._get_y_names(), axis=1) - out = pd.concat((X_r, - pd.DataFrame(np.concatenate(y_r, 0), columns=self._get_y_names())), axis=1) - out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()}) - out['valuation'] = out['valuation'].map({v: k for k, v in self.valuation_encoder_.items()}) - return Triangle( - out, origin='origin', development='valuation', - index=self._key_labels, columns=self._get_y_names(), - cumulative=not self.fit_incrementals).dropna() - - def _prep_X_ml(self, X): - """ Preps Triangle data ahead of the pipeline """ - if self.fit_incrementals: - X_ = X.cum_to_incr() - else: - X_ = X.copy() - if self.autoregressive: - for i in self.autoregressive: - lag = X[i[2]].shift(i[1]) - X_[i[0]] = lag[lag.valuation<=X.valuation_date] - df_base = X.incr_to_cum().to_frame( - keepdims=True, implicit_axis=True, origin_as_datetime=True - ).reset_index().iloc[:, :-1] - df = df_base.merge(X.cum_to_incr().to_frame( - keepdims=True, implicit_axis=True, origin_as_datetime=True - ).reset_index(), how='left', - on=list(df_base.columns)).fillna(0) - df['origin'] = df['origin'].map(self.origin_encoder_) - df['valuation'] = df['valuation'].map(self.valuation_encoder_) - if self.feat_eng is not None: - for key, item in self.feat_eng.items(): - df[key] = item['func'](df=df,**item['kwargs']) - return df - - def fit(self, X, y=None, sample_weight=None): - """Fit the model with X. 
- - Parameters - ---------- - X : Triangle-like - Set of LDFs to which the estimator will be applied. - y : None - Ignored, use y_ml to set a reponse variable for the ML algorithm - sample_weight : None - Ignored - - Returns - ------- - self : object - Returns the instance itself. - """ - - self._columns = list(X.columns) - self._key_labels = X.key_labels - self.origin_grain_ = X.origin_grain - self.development_grain_ = X.development_grain - self.origin_encoder_ = dict(zip( - X.origin.to_timestamp(how='s'), - (pd.Series(X.origin).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.origin_grain])) - val = X.valuation.sort_values().unique() - self.valuation_encoder_ = dict(zip( - val, - (pd.Series(val).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.development_grain])) - df = self._prep_X_ml(X) - self.df_ = df - # Fit model - self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) - #return selffit_incrementals - self.triangle_ml_ = self._get_triangle_ml(df) - return self - - @property - def ldf_(self): - ldf = self.triangle_ml_.incr_to_cum().link_ratio - ldf.valuation_date = pd.to_datetime(options.ULT_VAL) - return ldf - - def transform(self, X): - """ If X and self are of different shapes, align self to X, else - return self. - - Parameters - ---------- - X : Triangle - The triangle to be transformed - - Returns - ------- - X_new : New triangle with transformed attributes. 
- """ - X_new = X.copy() - X_ml = self._prep_X_ml(X) - y_ml=self.estimator_ml.predict(X_ml) - triangle_ml = self._get_triangle_ml(X_ml, y_ml) - backend = "cupy" if X.array_backend == "cupy" else "numpy" - X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend) - X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) - X_new._set_slicers() - return X_new +import chainladder as cl +import pytest + +def test_basic_bz(): + abc = cl.load_sample('abc') + assert np.all( + np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten() + == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428]) + ) + +def test_multiple_triangle_exception(): + d = cl.load_sample("usauto") + with pytest.raises(ValueError): + cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d) + +def test_feat_eng_1(): + ''' + this function tests the passing in a basic engineered feature. Since test_func just returns development, C(development) and C(teatfeat) should yield identical results + ''' + def test_func(df): + return df["development"] + + abc = cl.load_sample('abc') + test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} + + assert np.all( + np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation',feat_eng = test_dict).fit(abc).coef_.T.values,3) + ) + +def test_feat_eng_2(): + ''' + this function tests more complex feature engineering. 
Since origin_onehot just replicates the one-hot encoding that's performed inside sklearn LinearRegression, the two BZ models should yield identical results + + this function also tests the BZ transformer + ''' + def origin_onehot(df,ori): + return [1 if x == ori else 0 for x in df["origin"]] + + abc = cl.load_sample('abc') + feat_dict = {f'origin_{x}':{'func':origin_onehot,'kwargs':{'ori':float(x+1)}} for x in range(10)} + assert np.all( + np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) + ) + assert np.all( + np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) + ) \ No newline at end of file From a219c24fa0af0227c76ffdf8136da23806571808 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 01:50:37 +0000 Subject: [PATCH 03/10] Adding sample weight to bz and parent sample weights enables dropping specific points from fitting, which is essential for recreating BZ results --- chainladder/development/barnzehn.py | 15 +++++-- chainladder/development/learning.py | 41 ++++++++++++++----- .../development/tests/test_barnzehn.py | 30 +++++++++++++- 3 files changed, 70 insertions(+), 16 deletions(-) diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py index ec156ded..7a2eb2e8 100644 --- a/chainladder/development/barnzehn.py +++ b/chainladder/development/barnzehn.py @@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM): Parameters ---------- + drop: tuple or list of tuples + Drops specific origin/development combination(s) + drop_valuation: str or list of str (default = None) + Drops specific valuation periods. str must be date convertible. 
formula: formula-like A patsy formula describing the independent variables, X of the GLM feat_eng: dict @@ -49,7 +53,9 @@ def test_func(df) """ - def __init__(self, formula='C(origin) + development', feat_eng=None, response=None): + def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None): + self.drop = drop + self.drop_valuation = drop_valuation self.formula = formula self.response = response self.feat_eng = feat_eng @@ -69,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None): self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.formula)), ('model', LinearRegression(fit_intercept=False))]), - y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri) + y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri) resid = tri - self.model_.triangle_ml_[ self.model_.triangle_ml_.valuation <= tri.valuation_date] self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / ( @@ -94,12 +100,13 @@ def transform(self, X): X_new : New triangle with transformed attributes. 
""" X_new = X.copy() - X_ml = self.model_._prep_X_ml(X.cum_to_incr().log()) + X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log()) y_ml = self.model_.estimator_ml.predict(X_ml) - triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml) + triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml) backend = "cupy" if X.array_backend == "cupy" else "numpy" triangle_ml.is_cumulative = False X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend) X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) X_new._set_slicers() + X_new.predicted_data_ = predicted_data return X_new diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index b682d636..0df1092b 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -33,6 +33,10 @@ class DevelopmentML(DevelopmentBase): Time Series aspects of the model. Predictions from one development period get used as featues in the next development period. Lags should be negative integers. + drop: tuple or list of tuples + Drops specific origin/development combination(s) + drop_valuation: str or list of str (default = None) + Drops specific valuation periods. str must be date convertible. feat_eng: dict A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') (e.g. 
{ @@ -66,11 +70,14 @@ def test_func(df) """ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, fit_incrementals=True, feat_eng=None): + weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None): self.estimator_ml=estimator_ml self.y_ml=y_ml self.weight_ml = weight_ml - self.autoregressive=autoregressive + self.weighted_step = weighted_step + self.autoregressive = autoregressive + self.drop = drop + self.drop_valuation = drop_valuation self.fit_incrementals = fit_incrementals self.feat_eng = feat_eng @@ -146,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None): return Triangle( out, origin='origin', development='valuation', index=self._key_labels, columns=self._get_y_names(), - cumulative=not self.fit_incrementals).dropna() + cumulative=not self.fit_incrementals).dropna(), out def _prep_X_ml(self, X): """ Preps Triangle data ahead of the pipeline """ @@ -170,7 +177,13 @@ def _prep_X_ml(self, X): if self.feat_eng is not None: for key, item in self.feat_eng.items(): df[key] = item['func'](df=df,**item['kwargs']) - return df + weight_base = (~np.isnan(X.values)).astype(float) + weight = weight_base.copy() + if self.drop is not None: + weight = weight * self._drop_func(X) + if self.drop_valuation is not None: + weight = weight * self._drop_valuation_func(X) + return df, weight.flatten()[weight_base.flatten()>0] def fit(self, X, y=None, sample_weight=None): """Fit the model with X. 
@@ -201,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None): self.valuation_encoder_ = dict(zip( val, (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain])) - df = self._prep_X_ml(X) + df, weight = self._prep_X_ml(X) self.df_ = df + self.weight_ = weight + if self.weighted_step == None: + sample_weights = {} + elif isinstance(self.weighted_step, list): + sample_weights = {x + '__sample_weight':weight for x in self.weighted_step} + else: + sample_weights = {self.weighted_step + '__sample_weight':weight} # Fit model - self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) + self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights) #return selffit_incrementals - self.triangle_ml_ = self._get_triangle_ml(df) + self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df) return self @property @@ -229,11 +249,12 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - X_ml = self._prep_X_ml(X) + X_ml, weight_ml = self._prep_X_ml(X) y_ml=self.estimator_ml.predict(X_ml) - triangle_ml = self._get_triangle_ml(X_ml, y_ml) + triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml) backend = "cupy" if X.array_backend == "cupy" else "numpy" X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend) X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) X_new._set_slicers() - return X_new + X_new.predicted_data_ = predicted_data + return X_new \ No newline at end of file diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index 8dd17b69..b056bd16 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -44,7 +44,33 @@ def origin_onehot(df,ori): np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) == 
np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) ) + +def test_bz_2008(): + ''' + this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1 + ''' + abc = cl.load_sample('abc') + exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]]) + abc_adj = abc/exposure + + def predictor_bins(df,pbin,axis): + return [int(x >= min(pbin)) for x in df[axis]] + + origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]} + + def trend_piece(df,piece,axis): + pmax = float(max(piece)) + increment=min(df[axis][df[axis]>0]) + pfirst = piece[0]-increment + return [(x-pfirst)/increment if x in piece else (0 if x Date: Wed, 31 Dec 2025 02:32:08 +0000 Subject: [PATCH 04/10] Adding a test for drop_valuation --- chainladder/development/tests/test_barnzehn.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index b056bd16..ed4e6e36 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -45,6 +45,21 @@ def origin_onehot(df,ori): == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) ) +def test_drops(): + ''' + this function tests the passing in a basic drop_valuation + ''' + def test_func(df): + return df["development"] + + abc = cl.load_sample('abc') + test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} + + assert np.all( + np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).coef_.T.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(testfeat)',drop = [('1977',36),('1978',24),('1979',12)],feat_eng = test_dict).fit(abc).coef_.T.values,3) + ) + def test_bz_2008(): ''' this function tests the drop parameter by 
recreating the example in the 2008 BZ paper, section 4.1 From 4f61b996e093b66ccd029a3ecf32c6aac28ac0c6 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 18:51:33 +0000 Subject: [PATCH 05/10] More fixes for ml methods - separating weight flattening into a separate method from _prep_X_ml - adding sample weight support to glm - cleaning up how weights are handled in each ml method. - various fixes per #533 --- chainladder/development/barnzehn.py | 4 +-- chainladder/development/glm.py | 25 ++++++++++++------- chainladder/development/learning.py | 30 ++++++++++++++--------- chainladder/development/tests/test_glm.py | 2 +- 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py index 7a2eb2e8..35d5c0e3 100644 --- a/chainladder/development/barnzehn.py +++ b/chainladder/development/barnzehn.py @@ -75,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None): self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.formula)), ('model', LinearRegression(fit_intercept=False))]), - y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri) + y_ml=response, fit_incrementals=True, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(X = tri, sample_weight = sample_weight) resid = tri - self.model_.triangle_ml_[ self.model_.triangle_ml_.valuation <= tri.valuation_date] self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / ( @@ -100,7 +100,7 @@ def transform(self, X): X_new : New triangle with transformed attributes.
""" X_new = X.copy() - X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log()) + X_ml = self.model_._prep_X_ml(X.cum_to_incr().log()) y_ml = self.model_.estimator_ml.predict(X_ml) triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml) backend = "cupy" if X.array_backend == "cupy" else "numpy" diff --git a/chainladder/development/glm.py b/chainladder/development/glm.py index 9b121367..c44bb0d5 100644 --- a/chainladder/development/glm.py +++ b/chainladder/development/glm.py @@ -22,15 +22,16 @@ class TweedieGLM(DevelopmentBase): Parameters ---------- + drop: tuple or list of tuples + Drops specific origin/development combination(s) + drop_valuation: str or list of str (default = None) + Drops specific valuation periods. str must be date convertible. design_matrix: formula-like A patsy formula describing the independent variables, X of the GLM response: str Column name for the reponse variable of the GLM. If ommitted, then the first column of the Triangle will be used. - weight: str - Column name of any weight to use in the GLM. If none specified, then an - unweighted regression will be performed. - power: float, default=0 + power: float, default=1 The power determines the underlying target distribution according to the following table: +-------+------------------------+ @@ -52,7 +53,7 @@ class TweedieGLM(DevelopmentBase): regularization strength. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). - link: {'auto', 'identity', 'log'}, default='auto' + link: {'auto', 'identity', 'log'}, default='log' The link function of the GLM, i.e. mapping from linear predictor `X @ coeff + intercept` to prediction `y_pred`. 
Option 'auto' sets the link depending on the chosen family as follows: @@ -78,10 +79,11 @@ class TweedieGLM(DevelopmentBase): """ def __init__(self, design_matrix='C(development) + C(origin)', - response=None, weight=None, power=1.0, alpha=1.0, link='log', - max_iter=100, tol=0.0001, warm_start=False, verbose=0): + response=None, power=1.0, alpha=1.0, link='log', + max_iter=100, tol=0.0001, warm_start=False, verbose=0, drop=None,drop_valuation=None): + self.drop = drop + self.drop_valuation = drop_valuation self.response=response - self.weight=weight self.design_matrix = design_matrix self.power=power self.alpha=alpha @@ -93,13 +95,18 @@ def __init__(self, design_matrix='C(development) + C(origin)', def fit(self, X, y=None, sample_weight=None): response = X.columns[0] if not self.response else self.response + if sample_weight is None: + weight = None + else: + weight = 'model' self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.design_matrix)), ('model', TweedieRegressor( link=self.link, power=self.power, max_iter=self.max_iter, tol=self.tol, warm_start=self.warm_start, verbose=self.verbose, fit_intercept=False))]), - y_ml=response, weight_ml=self.weight).fit(X) + y_ml=response, weighted_step = weight, + drop=self.drop, drop_valuation=self.drop_valuation).fit(X = X, sample_weight = sample_weight) return self @property diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index 0df1092b..2a14a640 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -33,6 +33,8 @@ class DevelopmentML(DevelopmentBase): Time Series aspects of the model. Predictions from one development period get used as featues in the next development period. Lags should be negative integers. 
+ weighted_step: str + Step name within estimator_ml that is weighted drop: tuple or list of tuples Drops specific origin/development combination(s) drop_valuation: str or list of str (default = None) @@ -56,8 +58,7 @@ def test_func(df) return df['origin'] + 1 ) fit_incrementals: - Whether the response variable should be converted to an incremental basis - for fitting. + Whether the response variable should be converted to an incremental basis for fitting. Attributes ---------- @@ -70,10 +71,9 @@ def test_func(df) """ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None): + weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None): self.estimator_ml=estimator_ml self.y_ml=y_ml - self.weight_ml = weight_ml self.weighted_step = weighted_step self.autoregressive = autoregressive self.drop = drop @@ -168,7 +168,7 @@ def _prep_X_ml(self, X): df_base = X.incr_to_cum().to_frame( keepdims=True, implicit_axis=True, origin_as_datetime=True ).reset_index().iloc[:, :-1] - df = df_base.merge(X.cum_to_incr().to_frame( + df = df_base.merge(X_.to_frame( keepdims=True, implicit_axis=True, origin_as_datetime=True ).reset_index(), how='left', on=list(df_base.columns)).fillna(0) @@ -177,13 +177,18 @@ def _prep_X_ml(self, X): if self.feat_eng is not None: for key, item in self.feat_eng.items(): df[key] = item['func'](df=df,**item['kwargs']) + return df + + def _prep_w_ml(self,X,sample_weight=None): weight_base = (~np.isnan(X.values)).astype(float) - weight = weight_base.copy() + weight = weight_base.copy() if self.drop is not None: weight = weight * self._drop_func(X) if self.drop_valuation is not None: - weight = weight * self._drop_valuation_func(X) - return df, weight.flatten()[weight_base.flatten()>0] + weight = weight * self._drop_valuation_func(X) + if sample_weight is not None: + weight = weight * sample_weight.values + return
weight.flatten()[weight_base.flatten()>0] def fit(self, X, y=None, sample_weight=None): """Fit the model with X. @@ -194,8 +199,8 @@ def fit(self, X, y=None, sample_weight=None): Set of LDFs to which the estimator will be applied. y : None Ignored, use y_ml to set a reponse variable for the ML algorithm - sample_weight : None - Ignored + sample_weight : Triangle-like + Weights to use in the regression Returns ------- @@ -214,8 +219,9 @@ def fit(self, X, y=None, sample_weight=None): self.valuation_encoder_ = dict(zip( val, (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain])) - df, weight = self._prep_X_ml(X) + df = self._prep_X_ml(X) self.df_ = df + weight = self._prep_w_ml(X,sample_weight) self.weight_ = weight if self.weighted_step == None: sample_weights = {} @@ -249,7 +255,7 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - X_ml, weight_ml = self._prep_X_ml(X) + X_ml = self._prep_X_ml(X) y_ml=self.estimator_ml.predict(X_ml) triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml) backend = "cupy" if X.array_backend == "cupy" else "numpy" diff --git a/chainladder/development/tests/test_glm.py b/chainladder/development/tests/test_glm.py index 0065d93e..c2c876df 100644 --- a/chainladder/development/tests/test_glm.py +++ b/chainladder/development/tests/test_glm.py @@ -4,4 +4,4 @@ def test_basic_odp_cl(genins): assert abs( (cl.Chainladder().fit(genins).ultimate_ - cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) / - genins.latest_diagonal).max()< 1e-2 + genins.latest_diagonal).max()< 1e-2 \ No newline at end of file From d7e093b318a8342e0599259211a82fde4ceae3b2 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 19:19:32 +0000 Subject: [PATCH 06/10] adding some tests for ml methods --- chainladder/development/tests/test_glm.py | 6 ++++++ chainladder/development/tests/test_learning.py | 11 +++++++++++ 2 files changed, 17 insertions(+) 
create mode 100644 chainladder/development/tests/test_learning.py diff --git a/chainladder/development/tests/test_glm.py b/chainladder/development/tests/test_glm.py index c2c876df..ce1a2e82 100644 --- a/chainladder/development/tests/test_glm.py +++ b/chainladder/development/tests/test_glm.py @@ -4,4 +4,10 @@ def test_basic_odp_cl(genins): assert abs( (cl.Chainladder().fit(genins).ultimate_ - cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) / + genins.latest_diagonal).max()< 1e-2 + +def test_sample_weight(genins): + assert abs( + (cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins,sample_weight=genins/genins)).ultimate_ - + cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) / genins.latest_diagonal).max()< 1e-2 \ No newline at end of file diff --git a/chainladder/development/tests/test_learning.py b/chainladder/development/tests/test_learning.py new file mode 100644 index 00000000..a6f02249 --- /dev/null +++ b/chainladder/development/tests/test_learning.py @@ -0,0 +1,11 @@ +import chainladder as cl +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from chainladder.utils.utility_functions import PatsyFormula + +def test_basic_odp_cl(genins): + model = cl.DevelopmentML(Pipeline(steps=[ + ('design_matrix', PatsyFormula('C(development)')), + ('model', LinearRegression(fit_intercept=False))]), + y_ml=response,fit_incrementals=False).fit(genins) + assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2 \ No newline at end of file From 02a088ff61f003cde785d55448a6654eec2f95f8 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 19:35:33 +0000 Subject: [PATCH 07/10] fixing tests --- chainladder/development/tests/test_glm.py | 4 ++-- chainladder/development/tests/test_learning.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/chainladder/development/tests/test_glm.py b/chainladder/development/tests/test_glm.py index 
ce1a2e82..34532871 100644 --- a/chainladder/development/tests/test_glm.py +++ b/chainladder/development/tests/test_glm.py @@ -8,6 +8,6 @@ def test_basic_odp_cl(genins): def test_sample_weight(genins): assert abs( - (cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins,sample_weight=genins/genins)).ultimate_ - - cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) / + (cl.Chainladder().fit(genins).ultimate_ - + cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins,sample_weight=genins/genins)).ultimate_) / genins.latest_diagonal).max()< 1e-2 \ No newline at end of file diff --git a/chainladder/development/tests/test_learning.py b/chainladder/development/tests/test_learning.py index a6f02249..f9d93e9d 100644 --- a/chainladder/development/tests/test_learning.py +++ b/chainladder/development/tests/test_learning.py @@ -3,7 +3,8 @@ from sklearn.pipeline import Pipeline from chainladder.utils.utility_functions import PatsyFormula -def test_basic_odp_cl(genins): +def test_incremental(genins): + response = [genins.columns[0]] model = cl.DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula('C(development)')), ('model', LinearRegression(fit_intercept=False))]), From 3c185b15ca1e5f6eff55b4ff996d32b1ce5d96dc Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 20:05:57 +0000 Subject: [PATCH 08/10] more ml tests --- chainladder/development/tests/test_barnzehn.py | 10 +++------- chainladder/development/tests/test_learning.py | 7 +++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index ed4e6e36..d9841e11 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -1,9 +1,9 @@ import numpy as np import chainladder as cl import pytest +abc = cl.load_sample('abc') def test_basic_bz(): - abc = cl.load_sample('abc') assert np.all( 
np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten() == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428]) @@ -21,7 +21,6 @@ def test_feat_eng_1(): def test_func(df): return df["development"] - abc = cl.load_sample('abc') test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} assert np.all( @@ -38,7 +37,6 @@ def test_feat_eng_2(): def origin_onehot(df,ori): return [1 if x == ori else 0 for x in df["origin"]] - abc = cl.load_sample('abc') feat_dict = {f'origin_{x}':{'func':origin_onehot,'kwargs':{'ori':float(x+1)}} for x in range(10)} assert np.all( np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) @@ -52,19 +50,17 @@ def test_drops(): def test_func(df): return df["development"] - abc = cl.load_sample('abc') test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} assert np.all( - np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).coef_.T.values,3) - == np.around(cl.BarnettZehnwirth(formula='C(testfeat)',drop = [('1977',36),('1978',24),('1979',12)],feat_eng = test_dict).fit(abc).coef_.T.values,3) + np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).triangle_ml_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(testfeat)',drop = [('1977',36),('1978',24),('1979',12)],feat_eng = test_dict).fit(abc).triangle_ml_.values,3) ) def test_bz_2008(): ''' this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1 ''' - abc = cl.load_sample('abc') exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]]) abc_adj = abc/exposure diff --git a/chainladder/development/tests/test_learning.py b/chainladder/development/tests/test_learning.py index f9d93e9d..0b411aca 100644 --- 
a/chainladder/development/tests/test_learning.py +++ b/chainladder/development/tests/test_learning.py @@ -9,4 +9,11 @@ def test_incremental(genins): ('design_matrix', PatsyFormula('C(development)')), ('model', LinearRegression(fit_intercept=False))]), y_ml=response,fit_incrementals=False).fit(genins) + assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2 + +def test_misc(genins): + model = cl.DevelopmentML(Pipeline(steps=[ + ('design_matrix', PatsyFormula('C(development)')), + ('model', LinearRegression(fit_intercept=False))]), + weighted_step = ['model'], fit_incrementals=False).fit(genins, sample_weight=genins/genins) assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2 \ No newline at end of file From 1ed162bded355cebdcd884ca26bf9504acb3a5c7 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 21:02:46 +0000 Subject: [PATCH 09/10] removing feat_eng --- chainladder/development/barnzehn.py | 23 +------- chainladder/development/learning.py | 27 +-------- .../development/tests/test_barnzehn.py | 59 +++---------------- 3 files changed, 11 insertions(+), 98 deletions(-) diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py index 35d5c0e3..243d4f30 100644 --- a/chainladder/development/barnzehn.py +++ b/chainladder/development/barnzehn.py @@ -28,24 +28,6 @@ class BarnettZehnwirth(TweedieGLM): Drops specific valuation periods. str must be date convertible. formula: formula-like A patsy formula describing the independent variables, X of the GLM - feat_eng: dict - A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') - (e.g. 
{ - 'feature_1':{ - 'func': function_name for feature 1, - 'kwargs': keyword arguments for the function - }, - 'feature_2':{ - 'func': function_name for feature 2, - 'kwargs': keyword arguments for the function - } - } - ); - functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time - (e.g. this function adds 1 to every origin - def test_func(df) - return df['origin'] + 1 - ) response: str Column name for the reponse variable of the GLM. If ommitted, then the first column of the Triangle will be used. @@ -53,12 +35,11 @@ def test_func(df) """ - def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None): + def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', response=None): self.drop = drop self.drop_valuation = drop_valuation self.formula = formula self.response = response - self.feat_eng = feat_eng def fit(self, X, y=None, sample_weight=None): if max(X.shape[:2]) > 1: @@ -75,7 +56,7 @@ def fit(self, X, y=None, sample_weight=None): self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.formula)), ('model', LinearRegression(fit_intercept=False))]), - y_ml=response, fit_incrementals=True, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(X = tri, sample_weight = sample_weight) + y_ml=response, fit_incrementals=True, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(X = tri, sample_weight = sample_weight) resid = tri - self.model_.triangle_ml_[ self.model_.triangle_ml_.valuation <= tri.valuation_date] self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / ( diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index 2a14a640..041a9abc 100644 --- a/chainladder/development/learning.py +++ 
b/chainladder/development/learning.py @@ -39,24 +39,6 @@ class DevelopmentML(DevelopmentBase): Drops specific origin/development combination(s) drop_valuation: str or list of str (default = None) Drops specific valuation periods. str must be date convertible. - feat_eng: dict - A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') - (e.g. { - 'feature_1':{ - 'func': function_name for feature 1, - 'kwargs': keyword arguments for the function - }, - 'feature_2':{ - 'func': function_name for feature 2, - 'kwargs': keyword arguments for the function - } - } - ); - functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time - (e.g. this function adds 1 to every origin - def test_func(df) - return df['origin'] + 1 - ) fit_incrementals: Whether the response variable should be converted to an incremental basis for fitting. 
@@ -71,7 +53,7 @@ def test_func(df) """ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None): + weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True): self.estimator_ml=estimator_ml self.y_ml=y_ml self.weighted_step = weighted_step @@ -79,7 +61,6 @@ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, self.drop = drop self.drop_valuation = drop_valuation self.fit_incrementals = fit_incrementals - self.feat_eng = feat_eng def _get_y_names(self): """ private function to get the response column name""" @@ -138,9 +119,6 @@ def _get_triangle_ml(self, df, preds=None): if len(out) == 0: continue X_r.append(out.copy()) - if self.feat_eng is not None: - for key, item in self.feat_eng.items(): - out[key] = item['func'](df=out,**item['kwargs']) preds = self.estimator_ml.predict(out) y_r.append(preds.copy()) X_r = pd.concat(X_r, axis=0).reset_index(drop=True) @@ -174,9 +152,6 @@ def _prep_X_ml(self, X): on=list(df_base.columns)).fillna(0) df['origin'] = df['origin'].map(self.origin_encoder_) df['valuation'] = df['valuation'].map(self.valuation_encoder_) - if self.feat_eng is not None: - for key, item in self.feat_eng.items(): - df[key] = item['func'](df=df,**item['kwargs']) return df def _prep_w_ml(self,X,sample_weight=None): diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index d9841e11..bdc72c13 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -14,47 +14,13 @@ def test_multiple_triangle_exception(): with pytest.raises(ValueError): cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d) -def test_feat_eng_1(): - ''' - this function tests the passing in a basic engineered feature. 
Since test_func just returns development, C(development) and C(teatfeat) should yield identical results - ''' - def test_func(df): - return df["development"] - - test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} - - assert np.all( - np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values,3) - == np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation',feat_eng = test_dict).fit(abc).coef_.T.values,3) - ) - -def test_feat_eng_2(): - ''' - this function tests more complex feature engineering. Since origin_onehot just replicates the one-hot encoding that's performed inside sklearn LinearRegression, the two BZ models should yield identical results - - this function also tests the BZ transformer - ''' - def origin_onehot(df,ori): - return [1 if x == ori else 0 for x in df["origin"]] - - feat_dict = {f'origin_{x}':{'func':origin_onehot,'kwargs':{'ori':float(x+1)}} for x in range(10)} - assert np.all( - np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) - == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) - ) - def test_drops(): ''' this function tests the passing in a basic drop_valuation ''' - def test_func(df): - return df["development"] - - test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} - assert np.all( np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).triangle_ml_.values,3) - == np.around(cl.BarnettZehnwirth(formula='C(testfeat)',drop = [('1977',36),('1978',24),('1979',12)],feat_eng = test_dict).fit(abc).triangle_ml_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).triangle_ml_.values,3) ) def test_bz_2008(): @@ -64,23 +30,14 @@ def test_bz_2008(): exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]]) abc_adj = abc/exposure - def 
predictor_bins(df,pbin,axis): - return [int(x >= min(pbin)) for x in df[axis]] - - origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]} - - def trend_piece(df,piece,axis): - pmax = float(max(piece)) - increment=min(df[axis][df[axis]>0]) - pfirst = piece[0]-increment - return [(x-pfirst)/increment if x in piece else (0 if x= {x})' for x in origin_buckets]) + dev_formula = '+'.join([f'I((np.minimum({x[1]-12},development) - np.minimum({x[0]-12},development))/12)' for x in dev_buckets]) + val_formula = '+'.join([f'I(np.minimum({x[1]-1},valuation) - np.minimum({x[0]-1},valuation))' for x in val_buckets]) + model=cl.BarnettZehnwirth(formula=origin_formula + '+' + dev_formula + '+' + val_formula, drop=('1982',72)).fit(abc_adj) assert np.all( np.around(model.coef_.values,4).flatten() == np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464]) From 89fd43c57de1eaf8d3ef37fd247d37e8f1222563 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 21:16:48 +0000 Subject: [PATCH 10/10] adding a test for the BZ transformer --- chainladder/development/tests/test_barnzehn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index bdc72c13..8ec0cbb6 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -18,6 +18,10 @@ def test_drops(): ''' this function tests the passing in a basic drop_valuation ''' + assert np.all( + np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit_transform(abc).ldf_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit_transform(abc).ldf_.values,3) + ) assert np.all( 
np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).triangle_ml_.values,3) == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).triangle_ml_.values,3)