From 4da4ad396fc3fb978c3f41a9b923aa34ba306b33 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Mon, 8 Dec 2025 23:32:01 +0000 Subject: [PATCH 1/3] rebasing BZ changes --- chainladder/development/barnzehn.py | 23 +- chainladder/development/learning.py | 27 +- .../development/tests/test_barnzehn.py | 252 +++++++++++++++++- 3 files changed, 285 insertions(+), 17 deletions(-) diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py index 2975ba87..ec156ded 100644 --- a/chainladder/development/barnzehn.py +++ b/chainladder/development/barnzehn.py @@ -24,6 +24,24 @@ class BarnettZehnwirth(TweedieGLM): ---------- formula: formula-like A patsy formula describing the independent variables, X of the GLM + feat_eng: dict + A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') + (e.g. { + 'feature_1':{ + 'func': function_name for feature 1, + 'kwargs': keyword arguments for the function + }, + 'feature_2':{ + 'func': function_name for feature 2, + 'kwargs': keyword arguments for the function + } + } + ); + functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time + (e.g. this function adds 1 to every origin + def test_func(df): + return df['origin'] + 1 + ) response: str Column name for the reponse variable of the GLM. If ommitted, then the first column of the Triangle will be used. 
@@ -31,9 +49,10 @@ class BarnettZehnwirth(TweedieGLM): """ - def __init__(self, formula='C(origin) + development', response=None): + def __init__(self, formula='C(origin) + development', feat_eng=None, response=None): self.formula = formula self.response = response + self.feat_eng = feat_eng def fit(self, X, y=None, sample_weight=None): if max(X.shape[:2]) > 1: @@ -50,7 +69,7 @@ def fit(self, X, y=None, sample_weight=None): self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.formula)), ('model', LinearRegression(fit_intercept=False))]), - y_ml=response, fit_incrementals=False).fit(tri) + y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri) resid = tri - self.model_.triangle_ml_[ self.model_.triangle_ml_.valuation <= tri.valuation_date] self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / ( diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index 1705c696..b682d636 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -33,6 +33,24 @@ class DevelopmentML(DevelopmentBase): Time Series aspects of the model. Predictions from one development period get used as featues in the next development period. Lags should be negative integers. + feat_eng: dict + A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') + (e.g. { + 'feature_1':{ + 'func': function_name for feature 1, + 'kwargs': keyword arguments for the function + }, + 'feature_2':{ + 'func': function_name for feature 2, + 'kwargs': keyword arguments for the function + } + } + ); + functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time + (e.g. 
this function adds 1 to every origin + def test_func(df): + return df['origin'] + 1 + ) fit_incrementals: Whether the response variable should be converted to an incremental basis for fitting. @@ -48,12 +66,13 @@ class DevelopmentML(DevelopmentBase): """ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, fit_incrementals=True): + weight_ml=None, fit_incrementals=True, feat_eng=None): self.estimator_ml=estimator_ml self.y_ml=y_ml self.weight_ml = weight_ml self.autoregressive=autoregressive self.fit_incrementals = fit_incrementals + self.feat_eng = feat_eng def _get_y_names(self): """ private function to get the response column name""" @@ -112,6 +131,9 @@ def _get_triangle_ml(self, df, preds=None): if len(out) == 0: continue X_r.append(out.copy()) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + out[key] = item['func'](df=out,**item['kwargs']) preds = self.estimator_ml.predict(out) y_r.append(preds.copy()) X_r = pd.concat(X_r, axis=0).reset_index(drop=True) @@ -145,6 +167,9 @@ def _prep_X_ml(self, X): on=list(df_base.columns)).fillna(0) df['origin'] = df['origin'].map(self.origin_encoder_) df['valuation'] = df['valuation'].map(self.valuation_encoder_) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + df[key] = item['func'](df=df,**item['kwargs']) return df def fit(self, X, y=None, sample_weight=None): diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index 15110b89..3cf7e0bd 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -1,15 +1,239 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ import numpy as np -import chainladder as cl -import pytest - -def test_basic_bz(): - abc = cl.load_sample('abc') - assert np.all( - np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten() - == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428]) - ) - -def test_multiple_triangle_exception(): - d = cl.load_sample("usauto") - with pytest.raises(ValueError): - cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d) \ No newline at end of file +import pandas as pd + +from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures +from sklearn.compose import ColumnTransformer +from chainladder.development.base import DevelopmentBase +from chainladder import options + + +class DevelopmentML(DevelopmentBase): + """ A Estimator that interfaces with machine learning (ML) tools that implement + the scikit-learn API. + + The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from + the data. + + .. versionadded:: 0.8.1 + + + Parameters + ---------- + estimator_ml: skearn Estimator + Any sklearn compatible regression estimator, including Pipelines and + y_ml: list or str or sklearn_transformer + The response column(s) for the machine learning algorithm. It must be + present within the Triangle. + autoregressive: tuple, (autoregressive_col_name, lag, source_col_name) + The subset of response column(s) to use as lagged features for the + Time Series aspects of the model. Predictions from one development period + get used as featues in the next development period. Lags should be negative + integers. + feat_eng: dict + A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') + (e.g. 
{ + 'feature_1':{ + 'func': function_name for feature 1, + 'kwargs': keyword arguments for the function + }, + 'feature_2':{ + 'func': function_name for feature 2, + 'kwargs': keyword arguments for the function + } + } + ); + functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time + (e.g. this function adds 1 to every origin + def test_func(df) + return df['origin'] + 1 + ) + fit_incrementals: + Whether the response variable should be converted to an incremental basis + for fitting. + + Attributes + ---------- + estimator_ml: Estimator + An sklearn-style estimator to predict development patterns + ldf_: Triangle + The estimated loss development patterns. + cdf_: Triangle + The estimated cumulative development patterns. + """ + + def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, + weight_ml=None, fit_incrementals=True, feat_eng=None): + self.estimator_ml=estimator_ml + self.y_ml=y_ml + self.weight_ml = weight_ml + self.autoregressive=autoregressive + self.fit_incrementals = fit_incrementals + self.feat_eng = feat_eng + + def _get_y_names(self): + """ private function to get the response column name""" + if not self.y_ml: + y_names = self._columns + if hasattr(self.y_ml, '_columns'): + y_names = self.y_ml._columns + elif isinstance(self.y_ml, ColumnTransformer): + y_names = self.y_ml.transformers[0][-1] + if type(self.y_ml) is list: + y_names = self.y_ml + elif type(self.y_ml) is str: + y_names = [self.y_ml] + return y_names + + + @property + def y_ml_(self): + defaults = self._get_y_names() + transformer = self.y_ml + if not transformer: + return ColumnTransformer( + transformers=[('passthrough', 'passthrough', defaults)]) + elif type(transformer) is list: + return ColumnTransformer( + transformers=[('passthrough', 'passthrough', transformer)]) + elif type(transformer) is str: + return ColumnTransformer( + 
transformers=[('passthrough', 'passthrough', [transformer])]) + else: + return transformer + + def _get_triangle_ml(self, df, preds=None): + """ Create fitted Triangle """ + from chainladder.core import Triangle + if preds is None: + preds = self.estimator_ml.predict(df) + X_r = [df] + y_r = [preds] + dgrain = {'Y':12, 'Q':3, 'M': 1, 'S': 6}[self.development_grain_] + ograin = {'Y':1, 'Q':4, 'M': 12, 'S': 6}[self.origin_grain_] + latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain + latest_filter = latest_filter == latest_filter.max() + preds=pd.DataFrame(preds.copy())[latest_filter].values + out = df.loc[latest_filter].copy() + dev_lags = df['development'].drop_duplicates().sort_values() + for d in dev_lags[1:]: + out['development'] = out['development'] + dgrain + out['valuation'] = out['valuation'] + dgrain / 12 + if len(preds.shape) == 1: + preds = preds[:, None] + if self.autoregressive: + for num, col in enumerate(self.autoregressive): + out[col[0]]=preds[:, num] + out = out[out['development']<=dev_lags.max()] + if len(out) == 0: + continue + X_r.append(out.copy()) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + out[key] = item['func'](df=out,**item['kwargs']) + preds = self.estimator_ml.predict(out) + y_r.append(preds.copy()) + X_r = pd.concat(X_r, axis=0).reset_index(drop=True) + if True: + X_r = X_r.drop(self._get_y_names(), axis=1) + out = pd.concat((X_r, + pd.DataFrame(np.concatenate(y_r, 0), columns=self._get_y_names())), axis=1) + out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()}) + out['valuation'] = out['valuation'].map({v: k for k, v in self.valuation_encoder_.items()}) + return Triangle( + out, origin='origin', development='valuation', + index=self._key_labels, columns=self._get_y_names(), + cumulative=not self.fit_incrementals).dropna() + + def _prep_X_ml(self, X): + """ Preps Triangle data ahead of the pipeline """ + if self.fit_incrementals: + X_ = X.cum_to_incr() + 
else: + X_ = X.copy() + if self.autoregressive: + for i in self.autoregressive: + lag = X[i[2]].shift(i[1]) + X_[i[0]] = lag[lag.valuation<=X.valuation_date] + df_base = X.incr_to_cum().to_frame( + keepdims=True, implicit_axis=True, origin_as_datetime=True + ).reset_index().iloc[:, :-1] + df = df_base.merge(X.cum_to_incr().to_frame( + keepdims=True, implicit_axis=True, origin_as_datetime=True + ).reset_index(), how='left', + on=list(df_base.columns)).fillna(0) + df['origin'] = df['origin'].map(self.origin_encoder_) + df['valuation'] = df['valuation'].map(self.valuation_encoder_) + if self.feat_eng is not None: + for key, item in self.feat_eng.items(): + df[key] = item['func'](df=df,**item['kwargs']) + return df + + def fit(self, X, y=None, sample_weight=None): + """Fit the model with X. + + Parameters + ---------- + X : Triangle-like + Set of LDFs to which the estimator will be applied. + y : None + Ignored, use y_ml to set a reponse variable for the ML algorithm + sample_weight : None + Ignored + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + + self._columns = list(X.columns) + self._key_labels = X.key_labels + self.origin_grain_ = X.origin_grain + self.development_grain_ = X.development_grain + self.origin_encoder_ = dict(zip( + X.origin.to_timestamp(how='s'), + (pd.Series(X.origin).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.origin_grain])) + val = X.valuation.sort_values().unique() + self.valuation_encoder_ = dict(zip( + val, + (pd.Series(val).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.development_grain])) + df = self._prep_X_ml(X) + self.df_ = df + # Fit model + self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) + #return selffit_incrementals + self.triangle_ml_ = self._get_triangle_ml(df) + return self + + @property + def ldf_(self): + ldf = self.triangle_ml_.incr_to_cum().link_ratio + ldf.valuation_date = pd.to_datetime(options.ULT_VAL) + return ldf + + def transform(self, X): + """ If X and self are of different shapes, align self to X, else + return self. + + Parameters + ---------- + X : Triangle + The triangle to be transformed + + Returns + ------- + X_new : New triangle with transformed attributes. 
+ """ + X_new = X.copy() + X_ml = self._prep_X_ml(X) + y_ml=self.estimator_ml.predict(X_ml) + triangle_ml = self._get_triangle_ml(X_ml, y_ml) + backend = "cupy" if X.array_backend == "cupy" else "numpy" + X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend) + X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) + X_new._set_slicers() + return X_new From 15039de386a7dfbc15a0a0bab2a3faf8ca2a9d92 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Mon, 8 Dec 2025 23:34:38 +0000 Subject: [PATCH 2/3] correcting an incorrect copy --- .../development/tests/test_barnzehn.py | 287 +++--------------- 1 file changed, 49 insertions(+), 238 deletions(-) diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index 3cf7e0bd..8dd17b69 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -1,239 +1,50 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - import numpy as np -import pandas as pd - -from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures -from sklearn.compose import ColumnTransformer -from chainladder.development.base import DevelopmentBase -from chainladder import options - - -class DevelopmentML(DevelopmentBase): - """ A Estimator that interfaces with machine learning (ML) tools that implement - the scikit-learn API. - - The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from - the data. - - .. versionadded:: 0.8.1 - - - Parameters - ---------- - estimator_ml: skearn Estimator - Any sklearn compatible regression estimator, including Pipelines and - y_ml: list or str or sklearn_transformer - The response column(s) for the machine learning algorithm. It must be - present within the Triangle. 
- autoregressive: tuple, (autoregressive_col_name, lag, source_col_name) - The subset of response column(s) to use as lagged features for the - Time Series aspects of the model. Predictions from one development period - get used as featues in the next development period. Lags should be negative - integers. - feat_eng: dict - A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') - (e.g. { - 'feature_1':{ - 'func': function_name for feature 1, - 'kwargs': keyword arguments for the function - }, - 'feature_2':{ - 'func': function_name for feature 2, - 'kwargs': keyword arguments for the function - } - } - ); - functions should be written with a input Dataframe named df; this is the DataFrame containing origin, development, and valuation that will passed into the function at run time - (e.g. this function adds 1 to every origin - def test_func(df) - return df['origin'] + 1 - ) - fit_incrementals: - Whether the response variable should be converted to an incremental basis - for fitting. - - Attributes - ---------- - estimator_ml: Estimator - An sklearn-style estimator to predict development patterns - ldf_: Triangle - The estimated loss development patterns. - cdf_: Triangle - The estimated cumulative development patterns. 
- """ - - def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, fit_incrementals=True, feat_eng=None): - self.estimator_ml=estimator_ml - self.y_ml=y_ml - self.weight_ml = weight_ml - self.autoregressive=autoregressive - self.fit_incrementals = fit_incrementals - self.feat_eng = feat_eng - - def _get_y_names(self): - """ private function to get the response column name""" - if not self.y_ml: - y_names = self._columns - if hasattr(self.y_ml, '_columns'): - y_names = self.y_ml._columns - elif isinstance(self.y_ml, ColumnTransformer): - y_names = self.y_ml.transformers[0][-1] - if type(self.y_ml) is list: - y_names = self.y_ml - elif type(self.y_ml) is str: - y_names = [self.y_ml] - return y_names - - - @property - def y_ml_(self): - defaults = self._get_y_names() - transformer = self.y_ml - if not transformer: - return ColumnTransformer( - transformers=[('passthrough', 'passthrough', defaults)]) - elif type(transformer) is list: - return ColumnTransformer( - transformers=[('passthrough', 'passthrough', transformer)]) - elif type(transformer) is str: - return ColumnTransformer( - transformers=[('passthrough', 'passthrough', [transformer])]) - else: - return transformer - - def _get_triangle_ml(self, df, preds=None): - """ Create fitted Triangle """ - from chainladder.core import Triangle - if preds is None: - preds = self.estimator_ml.predict(df) - X_r = [df] - y_r = [preds] - dgrain = {'Y':12, 'Q':3, 'M': 1, 'S': 6}[self.development_grain_] - ograin = {'Y':1, 'Q':4, 'M': 12, 'S': 6}[self.origin_grain_] - latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain - latest_filter = latest_filter == latest_filter.max() - preds=pd.DataFrame(preds.copy())[latest_filter].values - out = df.loc[latest_filter].copy() - dev_lags = df['development'].drop_duplicates().sort_values() - for d in dev_lags[1:]: - out['development'] = out['development'] + dgrain - out['valuation'] = out['valuation'] + dgrain / 12 - if len(preds.shape) 
== 1: - preds = preds[:, None] - if self.autoregressive: - for num, col in enumerate(self.autoregressive): - out[col[0]]=preds[:, num] - out = out[out['development']<=dev_lags.max()] - if len(out) == 0: - continue - X_r.append(out.copy()) - if self.feat_eng is not None: - for key, item in self.feat_eng.items(): - out[key] = item['func'](df=out,**item['kwargs']) - preds = self.estimator_ml.predict(out) - y_r.append(preds.copy()) - X_r = pd.concat(X_r, axis=0).reset_index(drop=True) - if True: - X_r = X_r.drop(self._get_y_names(), axis=1) - out = pd.concat((X_r, - pd.DataFrame(np.concatenate(y_r, 0), columns=self._get_y_names())), axis=1) - out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()}) - out['valuation'] = out['valuation'].map({v: k for k, v in self.valuation_encoder_.items()}) - return Triangle( - out, origin='origin', development='valuation', - index=self._key_labels, columns=self._get_y_names(), - cumulative=not self.fit_incrementals).dropna() - - def _prep_X_ml(self, X): - """ Preps Triangle data ahead of the pipeline """ - if self.fit_incrementals: - X_ = X.cum_to_incr() - else: - X_ = X.copy() - if self.autoregressive: - for i in self.autoregressive: - lag = X[i[2]].shift(i[1]) - X_[i[0]] = lag[lag.valuation<=X.valuation_date] - df_base = X.incr_to_cum().to_frame( - keepdims=True, implicit_axis=True, origin_as_datetime=True - ).reset_index().iloc[:, :-1] - df = df_base.merge(X.cum_to_incr().to_frame( - keepdims=True, implicit_axis=True, origin_as_datetime=True - ).reset_index(), how='left', - on=list(df_base.columns)).fillna(0) - df['origin'] = df['origin'].map(self.origin_encoder_) - df['valuation'] = df['valuation'].map(self.valuation_encoder_) - if self.feat_eng is not None: - for key, item in self.feat_eng.items(): - df[key] = item['func'](df=df,**item['kwargs']) - return df - - def fit(self, X, y=None, sample_weight=None): - """Fit the model with X. 
- - Parameters - ---------- - X : Triangle-like - Set of LDFs to which the estimator will be applied. - y : None - Ignored, use y_ml to set a reponse variable for the ML algorithm - sample_weight : None - Ignored - - Returns - ------- - self : object - Returns the instance itself. - """ - - self._columns = list(X.columns) - self._key_labels = X.key_labels - self.origin_grain_ = X.origin_grain - self.development_grain_ = X.development_grain - self.origin_encoder_ = dict(zip( - X.origin.to_timestamp(how='s'), - (pd.Series(X.origin).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.origin_grain])) - val = X.valuation.sort_values().unique() - self.valuation_encoder_ = dict(zip( - val, - (pd.Series(val).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.development_grain])) - df = self._prep_X_ml(X) - self.df_ = df - # Fit model - self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) - #return selffit_incrementals - self.triangle_ml_ = self._get_triangle_ml(df) - return self - - @property - def ldf_(self): - ldf = self.triangle_ml_.incr_to_cum().link_ratio - ldf.valuation_date = pd.to_datetime(options.ULT_VAL) - return ldf - - def transform(self, X): - """ If X and self are of different shapes, align self to X, else - return self. - - Parameters - ---------- - X : Triangle - The triangle to be transformed - - Returns - ------- - X_new : New triangle with transformed attributes. 
- """ - X_new = X.copy() - X_ml = self._prep_X_ml(X) - y_ml=self.estimator_ml.predict(X_ml) - triangle_ml = self._get_triangle_ml(X_ml, y_ml) - backend = "cupy" if X.array_backend == "cupy" else "numpy" - X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend) - X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) - X_new._set_slicers() - return X_new +import chainladder as cl +import pytest + +def test_basic_bz(): + abc = cl.load_sample('abc') + assert np.all( + np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten() + == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428]) + ) + +def test_multiple_triangle_exception(): + d = cl.load_sample("usauto") + with pytest.raises(ValueError): + cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d) + +def test_feat_eng_1(): + ''' + this function tests passing in a basic engineered feature. Since test_func just returns development, the development and testfeat terms should yield identical results + ''' + def test_func(df): + return df["development"] + + abc = cl.load_sample('abc') + test_dict = {'testfeat':{'func':test_func,'kwargs':{}}} + + assert np.all( + np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation',feat_eng = test_dict).fit(abc).coef_.T.values,3) + ) + +def test_feat_eng_2(): + ''' + this function tests more complex feature engineering. 
Since origin_onehot just replicates the categorical encoding that the patsy C(origin) term performs in the design matrix step ahead of sklearn LinearRegression, the two BZ models should yield identical results + + this function also tests the BZ transformer + ''' + def origin_onehot(df,ori): + return [1 if x == ori else 0 for x in df["origin"]] + + abc = cl.load_sample('abc') + feat_dict = {f'origin_{x}':{'func':origin_onehot,'kwargs':{'ori':float(x+1)}} for x in range(10)} + assert np.all( + np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) + ) + assert np.all( + np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) + == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) + ) \ No newline at end of file From a219c24fa0af0227c76ffdf8136da23806571808 Mon Sep 17 00:00:00 2001 From: "henrydingliu@gmail.com" Date: Wed, 31 Dec 2025 01:50:37 +0000 Subject: [PATCH 3/3] Adding sample weight to bz and parent sample weights enables dropping specific points from fitting, which is essential for recreating BZ results --- chainladder/development/barnzehn.py | 15 +++++-- chainladder/development/learning.py | 41 ++++++++++++++----- .../development/tests/test_barnzehn.py | 30 +++++++++++++- 3 files changed, 70 insertions(+), 16 deletions(-) diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py index ec156ded..7a2eb2e8 100644 --- a/chainladder/development/barnzehn.py +++ b/chainladder/development/barnzehn.py @@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM): Parameters ---------- + drop: tuple or list of tuples + Drops specific origin/development combination(s) + drop_valuation: str or list of str (default = None) + Drops specific valuation periods. str must be date convertible. 
formula: formula-like A patsy formula describing the independent variables, X of the GLM feat_eng: dict @@ -49,7 +53,9 @@ def test_func(df) """ - def __init__(self, formula='C(origin) + development', feat_eng=None, response=None): + def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None): + self.drop = drop + self.drop_valuation = drop_valuation self.formula = formula self.response = response self.feat_eng = feat_eng @@ -69,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None): self.model_ = DevelopmentML(Pipeline(steps=[ ('design_matrix', PatsyFormula(self.formula)), ('model', LinearRegression(fit_intercept=False))]), - y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri) + y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri) resid = tri - self.model_.triangle_ml_[ self.model_.triangle_ml_.valuation <= tri.valuation_date] self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / ( @@ -94,12 +100,13 @@ def transform(self, X): X_new : New triangle with transformed attributes. 
""" X_new = X.copy() - X_ml = self.model_._prep_X_ml(X.cum_to_incr().log()) + X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log()) y_ml = self.model_.estimator_ml.predict(X_ml) - triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml) + triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml) backend = "cupy" if X.array_backend == "cupy" else "numpy" triangle_ml.is_cumulative = False X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend) X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) X_new._set_slicers() + X_new.predicted_data_ = predicted_data return X_new diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index b682d636..0df1092b 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -33,6 +33,10 @@ class DevelopmentML(DevelopmentBase): Time Series aspects of the model. Predictions from one development period get used as featues in the next development period. Lags should be negative integers. + drop: tuple or list of tuples + Drops specific origin/development combination(s) + drop_valuation: str or list of str (default = None) + Drops specific valuation periods. str must be date convertible. feat_eng: dict A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') (e.g. 
{ @@ -66,11 +70,14 @@ def test_func(df) """ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, - weight_ml=None, fit_incrementals=True, feat_eng=None): + weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None): self.estimator_ml=estimator_ml self.y_ml=y_ml self.weight_ml = weight_ml - self.autoregressive=autoregressive + self.weighted_step = weighted_step + self.autoregressive = autoregressive + self.drop = drop + self.drop_valuation = drop_valuation self.fit_incrementals = fit_incrementals self.feat_eng = feat_eng @@ -146,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None): return Triangle( out, origin='origin', development='valuation', index=self._key_labels, columns=self._get_y_names(), - cumulative=not self.fit_incrementals).dropna() + cumulative=not self.fit_incrementals).dropna(), out def _prep_X_ml(self, X): """ Preps Triangle data ahead of the pipeline """ @@ -170,7 +177,13 @@ def _prep_X_ml(self, X): if self.feat_eng is not None: for key, item in self.feat_eng.items(): df[key] = item['func'](df=df,**item['kwargs']) - return df + weight_base = (~np.isnan(X.values)).astype(float) + weight = weight_base.copy() + if self.drop is not None: + weight = weight * self._drop_func(X) + if self.drop_valuation is not None: + weight = weight * self._drop_valuation_func(X) + return df, weight.flatten()[weight_base.flatten()>0] def fit(self, X, y=None, sample_weight=None): """Fit the model with X. 
@@ -201,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None): self.valuation_encoder_ = dict(zip( val, (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain])) - df = self._prep_X_ml(X) + df, weight = self._prep_X_ml(X) self.df_ = df + self.weight_ = weight + if self.weighted_step == None: + sample_weights = {} + elif isinstance(self.weighted_step, list): + sample_weights = {x + '__sample_weight':weight for x in self.weighted_step} + else: + sample_weights = {self.weighted_step + '__sample_weight':weight} # Fit model - self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) + self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights) #return selffit_incrementals - self.triangle_ml_ = self._get_triangle_ml(df) + self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df) return self @property @@ -229,11 +249,12 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - X_ml = self._prep_X_ml(X) + X_ml, weight_ml = self._prep_X_ml(X) y_ml=self.estimator_ml.predict(X_ml) - triangle_ml = self._get_triangle_ml(X_ml, y_ml) + triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml) backend = "cupy" if X.array_backend == "cupy" else "numpy" X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend) X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL) X_new._set_slicers() - return X_new + X_new.predicted_data_ = predicted_data + return X_new \ No newline at end of file diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py index 8dd17b69..b056bd16 100644 --- a/chainladder/development/tests/test_barnzehn.py +++ b/chainladder/development/tests/test_barnzehn.py @@ -44,7 +44,33 @@ def origin_onehot(df,ori): np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3) == 
np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3) ) + +def test_bz_2008(): + ''' + this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1 + ''' + abc = cl.load_sample('abc') + exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]]) + abc_adj = abc/exposure + + def predictor_bins(df,pbin,axis): + return [int(x >= min(pbin)) for x in df[axis]] + + origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]} + + def trend_piece(df,piece,axis): + pmax = float(max(piece)) + increment=min(df[axis][df[axis]>0]) + pfirst = piece[0]-increment + return [(x-pfirst)/increment if x in piece else (0 if x