diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py
index 2975ba87..7a2eb2e8 100644
--- a/chainladder/development/barnzehn.py
+++ b/chainladder/development/barnzehn.py
@@ -22,8 +22,30 @@ class BarnettZehnwirth(TweedieGLM):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s).
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
+    feat_eng: dict
+        A dictionary keyed by feature name, where each value is a dictionary
+        holding that feature's function (key 'func') and the function's
+        keyword arguments (key 'kwargs'), e.g.:
+
+            {'feature_1': {'func': feature_1_function,
+                           'kwargs': {...}},
+             'feature_2': {'func': feature_2_function,
+                           'kwargs': {...}}}
+
+        Each function should accept an input DataFrame named df -- the
+        DataFrame containing origin, development, and valuation that will
+        be passed into the function at run time. For example, this
+        function adds 1 to every origin:
+
+            def test_func(df):
+                return df['origin'] + 1
+
     response: str
         Column name for the reponse variable of the GLM. If ommitted, then
         the first column of the Triangle will be used.
@@ -31,9 +53,12 @@ class BarnettZehnwirth(TweedieGLM):
     """
 
-    def __init__(self, formula='C(origin) + development', response=None):
+    def __init__(self, drop=None, drop_valuation=None, formula='C(origin) + development', feat_eng=None, response=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.formula = formula
         self.response = response
+        self.feat_eng = feat_eng
 
     def fit(self, X, y=None, sample_weight=None):
         if max(X.shape[:2]) > 1:
@@ -50,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-            y_ml=response, fit_incrementals=False).fit(tri)
+            y_ml=response, fit_incrementals=False, feat_eng=self.feat_eng, drop=self.drop, drop_valuation=self.drop_valuation, weighted_step='model').fit(tri)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -75,12 +100,13 @@ def transform(self, X):
         X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
+        X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
         y_ml = self.model_.estimator_ml.predict(X_ml)
-        triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         triangle_ml.is_cumulative = False
         X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
+        X_new.predicted_data_ = predicted_data
         return X_new
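Note: the feat_eng contract above is shared by BarnettZehnwirth and DevelopmentML. A minimal, self-contained sketch of how an entry is consumed (the frame and the feature name here are illustrative, not part of the library):

    import numpy as np
    import pandas as pd

    # Stand-in for the DataFrame _prep_X_ml builds: one row per triangle cell.
    df = pd.DataFrame({'origin': [1.0, 2.0, 3.0],
                       'development': [12, 24, 36],
                       'valuation': [0.0, 1.0, 2.0]})

    def log_development(df, base):
        # Engineered feature: log of the development age in the chosen base.
        return np.log(df['development']) / np.log(base)

    feat_eng = {'log_dev': {'func': log_development, 'kwargs': {'base': 10}}}

    # This mirrors the loop the patch adds to _prep_X_ml and _get_triangle_ml:
    for key, item in feat_eng.items():
        df[key] = item['func'](df=df, **item['kwargs'])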
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
index 1705c696..0df1092b 100644
--- a/chainladder/development/learning.py
+++ b/chainladder/development/learning.py
@@ -33,6 +33,28 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development
         period get used as featues in the next development period. Lags should
         be negative integers.
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s).
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
+    feat_eng: dict
+        A dictionary keyed by feature name, where each value is a dictionary
+        holding that feature's function (key 'func') and the function's
+        keyword arguments (key 'kwargs'), e.g.:
+
+            {'feature_1': {'func': feature_1_function,
+                           'kwargs': {...}},
+             'feature_2': {'func': feature_2_function,
+                           'kwargs': {...}}}
+
+        Each function should accept an input DataFrame named df -- the
+        DataFrame containing origin, development, and valuation that will
+        be passed into the function at run time. For example, this
+        function adds 1 to every origin:
+
+            def test_func(df):
+                return df['origin'] + 1
+
     fit_incrementals:
         Whether the response variable should be converted to an incremental
         basis for fitting.
@@ -48,12 +70,16 @@ class DevelopmentML(DevelopmentBase):
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True):
+                 weight_ml=None, weighted_step=None, drop=None, drop_valuation=None, fit_incrementals=True, feat_eng=None):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
         self.weight_ml = weight_ml
-        self.autoregressive=autoregressive
+        self.weighted_step = weighted_step
+        self.autoregressive = autoregressive
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.fit_incrementals = fit_incrementals
+        self.feat_eng = feat_eng
 
     def _get_y_names(self):
         """ private function to get the response column name"""
@@ -112,6 +138,9 @@ def _get_triangle_ml(self, df, preds=None):
             if len(out) == 0:
                 continue
             X_r.append(out.copy())
+            if self.feat_eng is not None:
+                for key, item in self.feat_eng.items():
+                    out[key] = item['func'](df=out, **item['kwargs'])
             preds = self.estimator_ml.predict(out)
             y_r.append(preds.copy())
         X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
@@ -124,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None):
         return Triangle(
             out, origin='origin', development='valuation',
             index=self._key_labels, columns=self._get_y_names(),
-            cumulative=not self.fit_incrementals).dropna()
+            cumulative=not self.fit_incrementals).dropna(), out
 
     def _prep_X_ml(self, X):
         """ Preps Triangle data ahead of the pipeline """
@@ -145,7 +174,16 @@ def _prep_X_ml(self, X):
             on=list(df_base.columns)).fillna(0)
         df['origin'] = df['origin'].map(self.origin_encoder_)
         df['valuation'] = df['valuation'].map(self.valuation_encoder_)
-        return df
+        if self.feat_eng is not None:
+            for key, item in self.feat_eng.items():
+                df[key] = item['func'](df=df, **item['kwargs'])
+        weight_base = (~np.isnan(X.values)).astype(float)
+        weight = weight_base.copy()
+        if self.drop is not None:
+            weight = weight * self._drop_func(X)
+        if self.drop_valuation is not None:
+            weight = weight * self._drop_valuation_func(X)
+        return df, weight.flatten()[weight_base.flatten() > 0]
 
     def fit(self, X, y=None, sample_weight=None):
        """Fit the model with X.
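Note: the weight logic added to _prep_X_ml reduces to a binary mask over observed cells: start from 1 for every non-NaN cell, zero out dropped cells, then keep only the entries that correspond to rows of the design matrix. A NumPy sketch of that idea on a toy 3x3 triangle (the shapes and the dropped cell are illustrative):

    import numpy as np

    # Stand-in for X.values: a 1x1x3x3 incremental triangle with NaNs in
    # the unobserved lower-right corner.
    tri = np.array([[[[100., 50., 25.],
                      [110., 55., np.nan],
                      [120., np.nan, np.nan]]]])

    weight_base = (~np.isnan(tri)).astype(float)  # 1 where a cell is observed
    drop_mask = np.ones_like(weight_base)
    drop_mask[0, 0, 2, 0] = 0                     # e.g. drop the third origin's first age

    weight = weight_base * drop_mask
    # Keep weights only for cells present in the design matrix:
    sample_weight = weight.flatten()[weight_base.flatten() > 0]
    # -> array([1., 1., 1., 1., 1., 0.]); dropped cells get zero weight.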
@@ -176,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None):
         self.valuation_encoder_ = dict(zip(
             val,
             (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
-        df = self._prep_X_ml(X)
+        df, weight = self._prep_X_ml(X)
         self.df_ = df
+        self.weight_ = weight
+        if self.weighted_step is None:
+            sample_weights = {}
+        elif isinstance(self.weighted_step, list):
+            sample_weights = {x + '__sample_weight': weight for x in self.weighted_step}
+        else:
+            sample_weights = {self.weighted_step + '__sample_weight': weight}
         # Fit model
-        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(), **sample_weights)
         #return selffit_incrementals
-        self.triangle_ml_ = self._get_triangle_ml(df)
+        self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
         return self
 
     @property
@@ -204,11 +249,12 @@ def transform(self, X):
         X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        X_ml = self._prep_X_ml(X)
+        X_ml, weight_ml = self._prep_X_ml(X)
         y_ml=self.estimator_ml.predict(X_ml)
-        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
-        return X_new
+        X_new.predicted_data_ = predicted_data
+        return X_new
\ No newline at end of file
diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py
index 15110b89..b056bd16 100644
--- a/chainladder/development/tests/test_barnzehn.py
+++ b/chainladder/development/tests/test_barnzehn.py
@@ -12,4 +12,65 @@ def test_basic_bz():
 
 def test_multiple_triangle_exception():
     d = cl.load_sample("usauto")
     with pytest.raises(ValueError):
-        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
\ No newline at end of file
+        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+
+def test_feat_eng_1():
+    '''
+    Tests passing in a basic engineered feature. Since test_func just returns
+    development, formulas using development and testfeat should yield
+    identical results.
+    '''
+    def test_func(df):
+        return df["development"]
+
+    abc = cl.load_sample('abc')
+    test_dict = {'testfeat': {'func': test_func, 'kwargs': {}}}
+
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values, 3)
+        == np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation', feat_eng=test_dict).fit(abc).coef_.T.values, 3)
+    )
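Note: the sample_weights dict built in fit() above relies on scikit-learn's standard fit-parameter routing, where Pipeline.fit(**{'<step>__sample_weight': w}) forwards w to that step's fit method. A small sketch outside chainladder (the step name is illustrative):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    X = np.arange(10, dtype=float).reshape(-1, 1)
    y = 2 * X.ravel() + 1.0
    w = np.ones_like(y)
    w[-1] = 0.0  # a zero weight removes the last point from the fit

    pipe = Pipeline([('model', LinearRegression())])

    # weighted_step='model' produces {'model__sample_weight': w}:
    pipe.fit(X, y, **{'model' + '__sample_weight': w})

This is why BarnettZehnwirth passes weighted_step='model': it matches the step name it gives LinearRegression in its own pipeline.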
+
+def test_feat_eng_2():
+    '''
+    Tests more complex feature engineering. Since origin_onehot just
+    replicates the one-hot encoding that C(origin) performs in the patsy
+    design matrix, the two BZ models should yield identical results.
+
+    This test also exercises the BZ transformer.
+    '''
+    def origin_onehot(df, ori):
+        return [1 if x == ori else 0 for x in df["origin"]]
+
+    abc = cl.load_sample('abc')
+    feat_dict = {f'origin_{x}': {'func': origin_onehot, 'kwargs': {'ori': float(x + 1)}} for x in range(10)}
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]), feat_eng=feat_dict).fit(abc).ldf_.values, 3)
+        == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values, 3)
+    )
+
+def test_bz_2008():
+    '''
+    Tests the drop parameter by recreating the example in the 2008 BZ paper,
+    section 4.1.
+    '''
+    abc = cl.load_sample('abc')
+    exposure = np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
+    abc_adj = abc / exposure
+
+    def predictor_bins(df, pbin, axis):
+        return [int(x >= min(pbin)) for x in df[axis]]
+
+    origin_groups = {f'origin_{ori}'.replace('[', '').replace(']', '').replace(', ', ''): {'func': predictor_bins, 'kwargs': {'pbin': ori, 'axis': 'origin'}} for ori in [[2], [3, 4], [5, 6, 7, 8, 9, 10]]}
+
+    def trend_piece(df, piece, axis):
+        pmax = float(max(piece))
+        increment = min(df[axis][df[axis] > 0])
+        pfirst = piece[0] - increment
+        return [(x-pfirst)/increment if x in piece else (0 if x
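Note: taken together, the new parameters would be used along these lines. This is a sketch, assuming the abc sample triangle and the ('origin', development) drop convention used elsewhere in the library; log_dev and the dropped cells are illustrative, not taken from the patch:

    import numpy as np
    import chainladder as cl

    abc = cl.load_sample('abc')

    def log_dev(df):
        # Illustrative engineered feature on the development age.
        return np.log(df['development'])

    model = cl.BarnettZehnwirth(
        formula='C(origin) + log_dev + valuation',
        feat_eng={'log_dev': {'func': log_dev, 'kwargs': {}}},
        drop=('1977', 12),          # zero-weight one origin/development cell
        drop_valuation='1979',      # zero-weight a whole valuation period
    ).fit(abc)

    model.ldf_                      # fitted development pattern
    model.model_.predicted_data_    # new: the design matrix behind the fit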