diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py
index 2975ba87..243d4f30 100644
--- a/chainladder/development/barnzehn.py
+++ b/chainladder/development/barnzehn.py
@@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
     response: str
@@ -31,7 +35,9 @@ class BarnettZehnwirth(TweedieGLM):
 
     """
 
-    def __init__(self, formula='C(origin) + development', response=None):
+    def __init__(self, drop=None, drop_valuation=None, formula='C(origin) + development', response=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.formula = formula
         self.response = response
 
@@ -50,7 +56,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-            y_ml=response, fit_incrementals=False).fit(tri)
+            y_ml=response, fit_incrementals=True, drop=self.drop, drop_valuation=self.drop_valuation, weighted_step='model').fit(X=tri, sample_weight=sample_weight)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -77,10 +83,11 @@ def transform(self, X):
         X_new = X.copy()
         X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
         y_ml = self.model_.estimator_ml.predict(X_ml)
-        triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         triangle_ml.is_cumulative = False
         X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
+        X_new.predicted_data_ = predicted_data
         return X_new
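Note (illustration, not part of the patch): a minimal usage sketch of the two new BarnettZehnwirth parameters, mirroring the equivalence exercised in test_drops further down. `abc` is the sample triangle bundled with chainladder.

    import chainladder as cl

    abc = cl.load_sample('abc')

    # Drop the whole 1979 valuation diagonal...
    bz_val = cl.BarnettZehnwirth(formula='C(development)', drop_valuation='1979').fit(abc)

    # ...or drop the same cells explicitly as (origin, development) pairs.
    bz_cells = cl.BarnettZehnwirth(
        formula='C(development)',
        drop=[('1977', 36), ('1978', 24), ('1979', 12)],
    ).fit(abc)

Both fits see the same data, so their coefficients and LDFs should agree.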
diff --git a/chainladder/development/glm.py b/chainladder/development/glm.py
index 9b121367..c44bb0d5 100644
--- a/chainladder/development/glm.py
+++ b/chainladder/development/glm.py
@@ -22,15 +22,16 @@ class TweedieGLM(DevelopmentBase):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     design_matrix: formula-like
         A patsy formula describing the independent variables, X of the GLM
     response: str
         Column name for the reponse variable of the GLM. If ommitted, then the
         first column of the Triangle will be used.
-    weight: str
-        Column name of any weight to use in the GLM. If none specified, then an
-        unweighted regression will be performed.
-    power: float, default=0
+    power: float, default=1
         The power determines the underlying target distribution according
         to the following table:
         +-------+------------------------+
@@ -52,7 +53,7 @@ class TweedieGLM(DevelopmentBase):
         regularization strength. ``alpha = 0`` is equivalent to unpenalized
         GLMs. In this case, the design matrix `X` must have full column rank
         (no collinearities).
-    link: {'auto', 'identity', 'log'}, default='auto'
+    link: {'auto', 'identity', 'log'}, default='log'
         The link function of the GLM, i.e. mapping from linear predictor
         `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets
         the link depending on the chosen family as follows:
@@ -78,10 +79,11 @@ class TweedieGLM(DevelopmentBase):
     """
 
     def __init__(self, design_matrix='C(development) + C(origin)',
-                 response=None, weight=None, power=1.0, alpha=1.0, link='log',
-                 max_iter=100, tol=0.0001, warm_start=False, verbose=0):
+                 response=None, power=1.0, alpha=1.0, link='log',
+                 max_iter=100, tol=0.0001, warm_start=False, verbose=0, drop=None, drop_valuation=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.response=response
-        self.weight=weight
         self.design_matrix = design_matrix
         self.power=power
         self.alpha=alpha
@@ -93,13 +95,18 @@ def __init__(self, design_matrix='C(development) + C(origin)',
 
     def fit(self, X, y=None, sample_weight=None):
         response = X.columns[0] if not self.response else self.response
+        if sample_weight is None:
+            weight = None
+        else:
+            weight = 'model'
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.design_matrix)),
             ('model', TweedieRegressor(
                 link=self.link, power=self.power, max_iter=self.max_iter,
                 tol=self.tol, warm_start=self.warm_start,
                 verbose=self.verbose, fit_intercept=False))]),
-            y_ml=response, weight_ml=self.weight).fit(X)
+            y_ml=response, weighted_step=weight,
+            drop=self.drop, drop_valuation=self.drop_valuation).fit(X=X, sample_weight=sample_weight)
         return self
 
     @property
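Note (illustration, not part of the patch): with the `weight` constructor argument removed above, per-cell weighting now flows in through `sample_weight` at fit time rather than through a column name. A sketch using the bundled `genins` sample; a triangle of ones leaves every observed cell at weight 1.0, so the weighted fit should still reproduce the chainladder ultimates, which is what test_sample_weight below asserts.

    import chainladder as cl

    genins = cl.load_sample('genins')

    # Every observed cell gets weight 1.0, matching the unweighted GLM fit.
    dev = cl.TweedieGLM().fit_transform(genins, sample_weight=genins / genins)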
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
index 1705c696..041a9abc 100644
--- a/chainladder/development/learning.py
+++ b/chainladder/development/learning.py
@@ -33,9 +33,14 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development
         period get used as featues in the next development period. Lags should
         be negative integers.
+    weighted_step: str
+        Step name within estimator_ml that is weighted
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     fit_incrementals:
-        Whether the response variable should be converted to an incremental basis
-        for fitting.
+        Whether the response variable should be converted to an incremental basis for fitting.
 
     Attributes
     ----------
@@ -48,11 +53,13 @@ class DevelopmentML(DevelopmentBase):
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True):
+                 weighted_step=None, drop=None, drop_valuation=None, fit_incrementals=True):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
-        self.weight_ml = weight_ml
-        self.autoregressive=autoregressive
+        self.weighted_step = weighted_step
+        self.autoregressive = autoregressive
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.fit_incrementals = fit_incrementals
 
     def _get_y_names(self):
@@ -124,7 +131,7 @@ def _get_triangle_ml(self, df, preds=None):
         return Triangle(
             out, origin='origin', development='valuation',
             index=self._key_labels, columns=self._get_y_names(),
-            cumulative=not self.fit_incrementals).dropna()
+            cumulative=not self.fit_incrementals).dropna(), out
 
     def _prep_X_ml(self, X):
         """ Preps Triangle data ahead of the pipeline """
@@ -139,7 +146,7 @@ def _prep_X_ml(self, X):
         df_base = X.incr_to_cum().to_frame(
             keepdims=True, implicit_axis=True, origin_as_datetime=True
         ).reset_index().iloc[:, :-1]
-        df = df_base.merge(X.cum_to_incr().to_frame(
+        df = df_base.merge(X_.to_frame(
             keepdims=True, implicit_axis=True, origin_as_datetime=True
         ).reset_index(), how='left', on=list(df_base.columns)).fillna(0)
@@ -147,6 +154,17 @@ def _prep_X_ml(self, X):
         df['valuation'] = df['valuation'].map(self.valuation_encoder_)
         return df
 
+    def _prep_w_ml(self, X, sample_weight=None):
+        weight_base = (~np.isnan(X.values)).astype(float)
+        weight = weight_base.copy()
+        if self.drop is not None:
+            weight = weight * self._drop_func(X)
+        if self.drop_valuation is not None:
+            weight = weight * self._drop_valuation_func(X)
+        if sample_weight is not None:
+            weight = weight * sample_weight.values
+        return weight.flatten()[weight_base.flatten() > 0]
+
     def fit(self, X, y=None, sample_weight=None):
         """Fit the model with X.
 
@@ -156,8 +174,8 @@ def fit(self, X, y=None, sample_weight=None):
         Parameters
         ----------
         X : Triangle-like
            Set of LDFs to which the estimator will be applied.
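Note (illustration, not part of the patch): `weighted_step` relies on scikit-learn's fit-parameter routing, where `Pipeline.fit` forwards keyword arguments named `<step>__<param>` to the named step's `fit`. A standalone sketch with toy data:

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    X = np.arange(10, dtype=float).reshape(-1, 1)
    y = np.arange(10, dtype=float)
    w = np.ones(10)

    pipe = Pipeline([('model', LinearRegression())])
    # Equivalent to the kwarg dict that weighted_step='model' builds in fit()
    pipe.fit(X, y, model__sample_weight=w)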
        y : None
            Ignored, use y_ml to set a reponse variable for the ML algorithm
-       sample_weight : None
-           Ignored
+       sample_weight : Triangle-like
+           Weights to use in the regression
 
        Returns
        -------
@@ -178,10 +196,18 @@ def fit(self, X, y=None, sample_weight=None):
                 (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
         df = self._prep_X_ml(X)
         self.df_ = df
+        weight = self._prep_w_ml(X, sample_weight)
+        self.weight_ = weight
+        if self.weighted_step is None:
+            sample_weights = {}
+        elif isinstance(self.weighted_step, list):
+            sample_weights = {x + '__sample_weight': weight for x in self.weighted_step}
+        else:
+            sample_weights = {self.weighted_step + '__sample_weight': weight}
         # Fit model
-        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(), **sample_weights)
         #return self
-        self.triangle_ml_ = self._get_triangle_ml(df)
+        self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
         return self
 
     @property
@@ -206,9 +232,10 @@ def transform(self, X):
         X_new = X.copy()
         X_ml = self._prep_X_ml(X)
         y_ml=self.estimator_ml.predict(X_ml)
-        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
-        return X_new
+        X_new.predicted_data_ = predicted_data
+        return X_new
\ No newline at end of file
diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py
index 15110b89..8ec0cbb6 100644
--- a/chainladder/development/tests/test_barnzehn.py
+++ b/chainladder/development/tests/test_barnzehn.py
@@ -1,9 +1,9 @@
 import numpy as np
 import chainladder as cl
 import pytest
+abc = cl.load_sample('abc')
 
 def test_basic_bz():
-    abc = cl.load_sample('abc')
     assert np.all(
         np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten()
         == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428])
@@ -12,4 +12,37 @@ def test_multiple_triangle_exception():
     d = cl.load_sample("usauto")
     with pytest.raises(ValueError):
-        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
\ No newline at end of file
+        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+
+def test_drops():
+    '''
+    Tests that a basic drop_valuation is equivalent to dropping the same
+    cells via explicit (origin, development) pairs.
+    '''
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)', drop_valuation='1979').fit_transform(abc).ldf_.values, 3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)', drop=[('1977', 36), ('1978', 24), ('1979', 12)]).fit_transform(abc).ldf_.values, 3)
+    )
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)', drop_valuation='1979').fit(abc).triangle_ml_.values, 3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)', drop=[('1977', 36), ('1978', 24), ('1979', 12)]).fit(abc).triangle_ml_.values, 3)
+    )
+
+def test_bz_2008():
+    '''
+    Tests the drop parameter by recreating the example in section 4.1 of the
+    2008 Barnett & Zehnwirth paper.
+    '''
+    exposure = np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
+    abc_adj = abc / exposure
+
+    origin_buckets = [2, 3, 5]
+    dev_buckets = [(24, 36), (36, 48), (48, 84), (84, 108), (108, 9999)]
+    val_buckets = [(1, 8), (8, 9), (9, 999)]
+
+    origin_formula = '+'.join([f'I(origin >= {x})' for x in origin_buckets])
+    dev_formula = '+'.join([f'I((np.minimum({x[1]-12},development) - np.minimum({x[0]-12},development))/12)' for x in dev_buckets])
+    val_formula = '+'.join([f'I(np.minimum({x[1]-1},valuation) - np.minimum({x[0]-1},valuation))' for x in val_buckets])
+    model = cl.BarnettZehnwirth(formula=origin_formula + '+' + dev_formula + '+' + val_formula, drop=('1982', 72)).fit(abc_adj)
+    assert np.all(
+        np.around(model.coef_.values, 4).flatten()
+        == np.array([11.1579, 0.1989, 0.0703, 0.0919, 0.1871, -0.3771, -0.4465, -0.3727, -0.3154, 0.0432, 0.0858, 0.1464])
+    )
\ No newline at end of file
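Note (illustration, not part of the patch): for readers checking test_bz_2008, the joined fragments expand to piecewise-linear patsy terms. Evaluating the comprehensions above gives, for example:

    # origin_formula
    'I(origin >= 2)+I(origin >= 3)+I(origin >= 5)'
    # first dev_buckets entry, (24, 36)
    'I((np.minimum(24,development) - np.minimum(12,development))/12)'

Each such term ramps linearly inside its bucket and is constant outside it, which is how the paper's piecewise trend changes are parameterized.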
diff --git a/chainladder/development/tests/test_glm.py b/chainladder/development/tests/test_glm.py
index 0065d93e..34532871 100644
--- a/chainladder/development/tests/test_glm.py
+++ b/chainladder/development/tests/test_glm.py
@@ -5,3 +5,9 @@ def test_basic_odp_cl(genins):
         (cl.Chainladder().fit(genins).ultimate_ -
          cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) /
         genins.latest_diagonal).max()< 1e-2
+
+def test_sample_weight(genins):
+    assert abs(
+        (cl.Chainladder().fit(genins).ultimate_ -
+         cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins, sample_weight=genins/genins)).ultimate_) /
+        genins.latest_diagonal).max() < 1e-2
\ No newline at end of file
diff --git a/chainladder/development/tests/test_learning.py b/chainladder/development/tests/test_learning.py
new file mode 100644
index 00000000..0b411aca
--- /dev/null
+++ b/chainladder/development/tests/test_learning.py
@@ -0,0 +1,19 @@
+import chainladder as cl
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import Pipeline
+from chainladder.utils.utility_functions import PatsyFormula
+
+def test_incremental(genins):
+    response = [genins.columns[0]]
+    model = cl.DevelopmentML(Pipeline(steps=[
+        ('design_matrix', PatsyFormula('C(development)')),
+        ('model', LinearRegression(fit_intercept=False))]),
+        y_ml=response, fit_incrementals=False).fit(genins)
+    assert abs(model.triangle_ml_.loc[:, :, '2010', :] - genins.mean()).max() < 1e2
+
+def test_misc(genins):
+    model = cl.DevelopmentML(Pipeline(steps=[
+        ('design_matrix', PatsyFormula('C(development)')),
+        ('model', LinearRegression(fit_intercept=False))]),
+        weighted_step=['model'], fit_incrementals=False).fit(genins, sample_weight=genins/genins)
+    assert abs(model.triangle_ml_.loc[:, :, '2010', :] - genins.mean()).max() < 1e2
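Note (illustration, not part of the patch): the weight vector that `_prep_w_ml` hands to the estimator can be illustrated with plain numpy. Toy values below; the hard-coded 0/1 `drop_mask` stands in for what `_drop_func`/`_drop_valuation_func` produce.

    import numpy as np

    values = np.array([[1.0, 2.0, np.nan],
                       [3.0, np.nan, np.nan]])
    weight_base = (~np.isnan(values)).astype(float)   # observed cells -> 1.0
    drop_mask = np.array([[1.0, 0.0, 1.0],
                          [1.0, 1.0, 1.0]])           # one observed cell dropped
    weight = weight_base * drop_mask
    # Only weights for observed cells survive, flattened row-major:
    print(weight.flatten()[weight_base.flatten() > 0])   # [1. 0. 1.]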