34 changes: 30 additions & 4 deletions chainladder/development/barnzehn.py
@@ -22,18 +22,43 @@ class BarnettZehnwirth(TweedieGLM):

Parameters
----------
drop: tuple or list of tuples
Drops specific origin/development combination(s)
drop_valuation: str or list of str (default = None)
Drops specific valuation periods. str must be date convertible.
formula: formula-like
A patsy formula describing the independent variables, X of the GLM
feat_eng: dict
A dictionary with feature names as keys; each value is itself a dictionary holding the function that builds the feature (under the key 'func') and its keyword arguments (under the key 'kwargs'),
e.g.:
{
'feature_1': {
'func': function for feature 1,
'kwargs': keyword arguments for the function
},
'feature_2': {
'func': function for feature 2,
'kwargs': keyword arguments for the function
}
}
Functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time.
E.g., this function adds 1 to every origin:
def test_func(df):
    return df['origin'] + 1
response: str
Column name for the response variable of the GLM. If omitted, the
first column of the Triangle will be used.


"""

def __init__(self, formula='C(origin) + development', response=None):
def __init__(self, drop=None, drop_valuation=None, formula='C(origin) + development', feat_eng=None, response=None):
self.drop = drop
self.drop_valuation = drop_valuation
self.formula = formula
self.response = response
self.feat_eng = feat_eng

def fit(self, X, y=None, sample_weight=None):
if max(X.shape[:2]) > 1:
@@ -50,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None):
self.model_ = DevelopmentML(Pipeline(steps=[
('design_matrix', PatsyFormula(self.formula)),
('model', LinearRegression(fit_intercept=False))]),
y_ml=response, fit_incrementals=False).fit(tri)
y_ml=response, fit_incrementals=False, feat_eng=self.feat_eng, drop=self.drop, drop_valuation=self.drop_valuation, weighted_step='model').fit(tri)
resid = tri - self.model_.triangle_ml_[
self.model_.triangle_ml_.valuation <= tri.valuation_date]
self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -75,12 +100,13 @@ def transform(self, X):
X_new : New triangle with transformed attributes.
"""
X_new = X.copy()
X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
y_ml = self.model_.estimator_ml.predict(X_ml)
triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
backend = "cupy" if X.array_backend == "cupy" else "numpy"
triangle_ml.is_cumulative = False
X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
X_new._set_slicers()
X_new.predicted_data_ = predicted_data
return X_new
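
Taken together, the new parameters can be exercised as in the following minimal sketch, which mirrors the docstring example and the tests below. The feature name plus_one and the drop_valuation date are illustrative only, not part of this PR:

    import chainladder as cl

    def plus_one(df):
        # df is the origin/development/valuation frame passed in at run time
        return df['origin'] + 1

    abc = cl.load_sample('abc')
    model = cl.BarnettZehnwirth(
        formula='C(origin) + plus_one + valuation',
        feat_eng={'plus_one': {'func': plus_one, 'kwargs': {}}},
        drop=('1982', 72),       # drop one origin/development cell, as in test_bz_2008 below
        drop_valuation='1987',   # drop a whole valuation period (illustrative date)
    ).fit(abc)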
66 changes: 56 additions & 10 deletions chainladder/development/learning.py
@@ -33,6 +33,28 @@ class DevelopmentML(DevelopmentBase):
Time Series aspects of the model. Predictions from one development period
get used as features in the next development period. Lags should be negative
integers.
drop: tuple or list of tuples
Drops specific origin/development combination(s)
drop_valuation: str or list of str (default = None)
Drops specific valuation periods. str must be date convertible.
feat_eng: dict
A dictionary with feature names as keys; each value is itself a dictionary holding the function that builds the feature (under the key 'func') and its keyword arguments (under the key 'kwargs'),
e.g.:
{
'feature_1': {
'func': function for feature 1,
'kwargs': keyword arguments for the function
},
'feature_2': {
'func': function for feature 2,
'kwargs': keyword arguments for the function
}
}
Functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time.
E.g., this function adds 1 to every origin:
def test_func(df):
    return df['origin'] + 1
fit_incrementals: bool
Whether the response variable should be converted to an incremental basis
for fitting.
@@ -48,12 +70,16 @@ class DevelopmentML(DevelopmentBase):
"""

def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
weight_ml=None, fit_incrementals=True):
weight_ml=None, weighted_step=None, drop=None, drop_valuation=None, fit_incrementals=True, feat_eng=None):
self.estimator_ml=estimator_ml
self.y_ml=y_ml
self.weight_ml = weight_ml
self.autoregressive=autoregressive
self.weighted_step = weighted_step
self.autoregressive = autoregressive
self.drop = drop
self.drop_valuation = drop_valuation
self.fit_incrementals = fit_incrementals
self.feat_eng = feat_eng

def _get_y_names(self):
""" private function to get the response column name"""
@@ -112,6 +138,9 @@ def _get_triangle_ml(self, df, preds=None):
if len(out) == 0:
continue
X_r.append(out.copy())
if self.feat_eng is not None:
for key, item in self.feat_eng.items():
out[key] = item['func'](df=out, **item['kwargs'])
preds = self.estimator_ml.predict(out)
y_r.append(preds.copy())
X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
@@ -124,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None):
return Triangle(
out, origin='origin', development='valuation',
index=self._key_labels, columns=self._get_y_names(),
cumulative=not self.fit_incrementals).dropna()
cumulative=not self.fit_incrementals).dropna(), out

def _prep_X_ml(self, X):
""" Preps Triangle data ahead of the pipeline """
@@ -145,7 +174,16 @@ def _prep_X_ml(self, X):
on=list(df_base.columns)).fillna(0)
df['origin'] = df['origin'].map(self.origin_encoder_)
df['valuation'] = df['valuation'].map(self.valuation_encoder_)
return df
if self.feat_eng is not None:
for key, item in self.feat_eng.items():
df[key] = item['func'](df=df, **item['kwargs'])
weight_base = (~np.isnan(X.values)).astype(float)
weight = weight_base.copy()
if self.drop is not None:
weight = weight * self._drop_func(X)
if self.drop_valuation is not None:
weight = weight * self._drop_valuation_func(X)
return df, weight.flatten()[weight_base.flatten() > 0]
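
The weight handling above is a masked flatten: weight_base flags observed (non-NaN) cells, the drop masks zero out unwanted cells, and only the weights of observed cells are returned so they line up with the rows of df. A toy illustration of the last line:

    import numpy as np

    # 2x3 toy triangle; the last cell of the second origin is unobserved (NaN)
    weight_base = np.array([[1., 1., 1.],
                            [1., 1., 0.]])
    drop_mask = np.array([[1., 0., 1.],   # suppose drop=... zeroed out one observed cell
                          [1., 1., 1.]])
    weight = weight_base * drop_mask
    print(weight.flatten()[weight_base.flatten() > 0])   # [1. 0. 1. 1. 1.]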

def fit(self, X, y=None, sample_weight=None):
"""Fit the model with X.
@@ -176,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None):
self.valuation_encoder_ = dict(zip(
val,
(pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
df = self._prep_X_ml(X)
df, weight = self._prep_X_ml(X)
self.df_ = df
self.weight_ = weight
if self.weighted_step is None:
    sample_weights = {}
elif isinstance(self.weighted_step, list):
    sample_weights = {x + '__sample_weight': weight for x in self.weighted_step}
else:
    sample_weights = {self.weighted_step + '__sample_weight': weight}
# Fit model
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(), **sample_weights)
self.triangle_ml_ = self._get_triangle_ml(df)
self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
return self
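
The sample_weights dict leans on scikit-learn's fit-parameter routing: Pipeline.fit forwards keyword arguments named '<step>__<param>' to that step's fit method, so weighted_step='model' becomes model__sample_weight=weight for the final estimator. A self-contained sketch of the mechanism:

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    pipe = Pipeline(steps=[('scale', StandardScaler()),
                           ('model', LinearRegression())])
    X = np.arange(10, dtype=float).reshape(-1, 1)
    y = 2 * X.ravel()
    w = np.ones(10)
    w[3] = 0.0                                      # a dropped cell contributes nothing
    pipe.fit(X, y, **{'model__sample_weight': w})   # what fit() builds from weighted_step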

@property
@@ -204,11 +249,12 @@ def transform(self, X):
X_new : New triangle with transformed attributes.
"""
X_new = X.copy()
X_ml = self._prep_X_ml(X)
X_ml, weight_ml = self._prep_X_ml(X)
y_ml=self.estimator_ml.predict(X_ml)
triangle_ml = self._get_triangle_ml(X_ml, y_ml)
triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
backend = "cupy" if X.array_backend == "cupy" else "numpy"
X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
X_new._set_slicers()
return X_new
X_new.predicted_data_ = predicted_data
return X_new
63 changes: 62 additions & 1 deletion chainladder/development/tests/test_barnzehn.py
@@ -12,4 +12,65 @@ def test_basic_bz():
def test_multiple_triangle_exception():
d = cl.load_sample("usauto")
with pytest.raises(ValueError):
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)

def test_feat_eng_1():
'''
this function tests passing in a basic engineered feature. Since test_func just returns development, the development and testfeat terms should yield identical results
'''
def test_func(df):
    return df["development"]

abc = cl.load_sample('abc')
test_dict = {'testfeat': {'func': test_func, 'kwargs': {}}}

assert np.all(
np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values, 3)
== np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation', feat_eng=test_dict).fit(abc).coef_.T.values, 3)
)

def test_feat_eng_2():
'''
this function tests more complex feature engineering. Since origin_onehot just replicates the one-hot encoding that C(origin) produces in the patsy design matrix, the two BZ models should yield identical results

this function also tests the BZ transformer
'''
def origin_onehot(df, ori):
    return [1 if x == ori else 0 for x in df["origin"]]

abc = cl.load_sample('abc')
feat_dict = {f'origin_{x}': {'func': origin_onehot, 'kwargs': {'ori': float(x + 1)}} for x in range(10)}
assert np.all(
np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]), feat_eng=feat_dict).fit(abc).ldf_.values, 3)
== np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values, 3)
)

def test_bz_2008():
'''
this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
'''
abc = cl.load_sample('abc')
exposure = np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
abc_adj = abc/exposure

def predictor_bins(df, pbin, axis):
    return [int(x >= min(pbin)) for x in df[axis]]

origin_groups = {f'origin_{ori}'.replace('[', '').replace(']', '').replace(', ', ''): {'func': predictor_bins, 'kwargs': {'pbin': ori, 'axis': 'origin'}} for ori in [[2], [3, 4], [5, 6, 7, 8, 9, 10]]}

def trend_piece(df, piece, axis):
    pmax = float(max(piece))
    increment = min(df[axis][df[axis] > 0])
    pfirst = piece[0] - increment
    return [(x - pfirst) / increment if x in piece else (0 if x < pmax else (pmax - pfirst) / increment) for x in df[axis]]

development_groups = {f'development_{dev}'.replace('[', '').replace(']', '').replace(', ', ''): {'func': trend_piece, 'kwargs': {'piece': dev, 'axis': 'development'}} for dev in [[24], [36], [48, 60, 72], [84, 96], [108, 120, 132]]}

valuation_groups = {f'valuation_{val}'.replace('[', '').replace(']', '').replace(', ', ''): {'func': trend_piece, 'kwargs': {'piece': val, 'axis': 'valuation'}} for val in [[1, 2, 3, 4, 5, 6, 7], [8], [9, 10]]}

abc_dict = {**origin_groups, **development_groups, **valuation_groups}
model = cl.BarnettZehnwirth(formula='+'.join(abc_dict.keys()), feat_eng=abc_dict, drop=('1982', 72)).fit(abc_adj)
assert np.all(
np.around(model.coef_.values, 4).flatten()
== np.array([11.1579, 0.1989, 0.0703, 0.0919, 0.1871, -0.3771, -0.4465, -0.3727, -0.3154, 0.0432, 0.0858, 0.1464])
)
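
For reference, trend_piece builds a capped piecewise-linear ramp along one axis: cells before the piece score 0, cells inside the piece ramp up one unit per step, and cells past the piece stay at the cap, which is how the test reproduces the piecewise trends of the 2008 paper. A standalone check with an illustrative development axis at a 12-month grain:

    import pandas as pd

    def trend_piece(df, piece, axis):
        pmax = float(max(piece))
        increment = min(df[axis][df[axis] > 0])
        pfirst = piece[0] - increment
        return [(x - pfirst) / increment if x in piece
                else (0 if x < pmax else (pmax - pfirst) / increment)
                for x in df[axis]]

    df = pd.DataFrame({'development': [12, 24, 36, 48, 60, 72, 84, 96]})
    print(trend_piece(df, piece=[48, 60, 72], axis='development'))
    # [0, 0, 0, 1.0, 2.0, 3.0, 3.0, 3.0]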