diff --git a/biolearn/data/GP-age_model_10_cpgs.json.zip b/biolearn/data/GP-age_model_10_cpgs.json.zip new file mode 100644 index 0000000..8a37130 Binary files /dev/null and b/biolearn/data/GP-age_model_10_cpgs.json.zip differ diff --git a/biolearn/data/GP-age_model_30_cpgs.json.zip b/biolearn/data/GP-age_model_30_cpgs.json.zip new file mode 100644 index 0000000..f4e3b4e Binary files /dev/null and b/biolearn/data/GP-age_model_30_cpgs.json.zip differ diff --git a/biolearn/data/GP-age_model_71_cpgs.json.zip b/biolearn/data/GP-age_model_71_cpgs.json.zip new file mode 100644 index 0000000..2b4ad36 Binary files /dev/null and b/biolearn/data/GP-age_model_71_cpgs.json.zip differ diff --git a/biolearn/data/GP-age_model_a.json.zip b/biolearn/data/GP-age_model_a.json.zip new file mode 100644 index 0000000..4c6b3ab Binary files /dev/null and b/biolearn/data/GP-age_model_a.json.zip differ diff --git a/biolearn/data/GP-age_model_b.json.zip b/biolearn/data/GP-age_model_b.json.zip new file mode 100644 index 0000000..6865ff1 Binary files /dev/null and b/biolearn/data/GP-age_model_b.json.zip differ diff --git a/biolearn/data/GP-age_model_c.json.zip b/biolearn/data/GP-age_model_c.json.zip new file mode 100644 index 0000000..6dbe361 Binary files /dev/null and b/biolearn/data/GP-age_model_c.json.zip differ diff --git a/biolearn/model.py b/biolearn/model.py index d529e53..becb7da 100644 --- a/biolearn/model.py +++ b/biolearn/model.py @@ -102,6 +102,272 @@ def preprocess(df): "output": "Age (Years)", "model": {"type": "LinearMethylationModel", "file": "Hannum.csv"}, }, + "GPAge10": { + "year": 2023, + "species": "Human", + "tissue": "Blood", + "source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4", + "output": "Age (Years)", + "model": { + "type": "GPAgeModel", + "file": "GP-age_model_10_cpgs.json.zip", + "sites": [ + "cg16867657", + "cg06639320", + "cg04875128", + "cg19283806", + "cg07553761", + "cg08128734", + "cg12934382", + "cg00573770", + "cg23479922", + "cg10501210", + ], + "default_imputation": "none", + }, + }, + "GPAge30": { + "year": 2023, + "species": "Human", + "tissue": "Blood", + "source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4", + "output": "Age (Years)", + "model": { + "type": "GPAgeModel", + "file": "GP-age_model_30_cpgs.json.zip", + "sites": [ + "cg16867657", + "cg22454769", + "cg06639320", + "cg04875128", + "cg19283806", + "cg24724428", + "cg07553761", + "cg24079702", + "cg08128734", + "cg12934382", + "cg08468401", + "cg20816447", + "cg00573770", + "cg06335143", + "cg06155229", + "cg03032497", + "cg06619077", + "cg17804348", + "cg00329615", + "cg23479922", + "cg10501210", + "cg19991948", + "cg27312979", + "cg23186333", + "cg25413977", + "cg22078805", + "cg17621438", + "cg21878650", + "cg04503319", + "cg09809672", + ], + "default_imputation": "none", + }, + }, + "GPAge71": { + "year": 2023, + "species": "Human", + "tissue": "Blood", + "source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4", + "output": "Age (Years)", + "model": { + "type": "GPAgeModel", + "file": "GP-age_model_71_cpgs.json.zip", + "sites": [ + "cg16867657", + "cg22454769", + "cg06639320", + "cg04875128", + "cg19283806", + "cg24724428", + "cg07553761", + "cg24079702", + "cg14556683", + "cg07547549", + "cg08128734", + "cg23500537", + "cg12934382", + "cg08468401", + "cg05404236", + "cg20816447", + "cg17110586", + "cg24466241", + "cg18473521", + "cg00573770", + "cg06335143", + "cg06155229", + "cg03032497", + "cg12899747", + "cg06619077", + "cg17804348", + "cg00329615", + "cg23479922", + "cg09017434", + "cg10501210", + "cg19991948", + "cg03738025", + "cg27312979", + "cg14766700", + "cg23186333", + "cg21184711", + "cg22730004", + "cg19421125", + "cg15894389", + "cg10835286", + "cg25413977", + "cg11807280", + "cg22078805", + "cg02872426", + "cg00303541", + "cg04295144", + "cg19729744", + "cg24794228", + "cg17621438", + "cg05017994", + "cg21878650", + "cg15804973", + "cg07797372", + "cg27152890", + "cg17471939", + "cg18887458", + "cg15243034", + "cg04503319", + "cg03350900", + "cg09809672", + "cg14577707", + "cg24892069", + "cg00602811", + "cg05991454", + "cg23126342", + "cg09988805", + "cg19344626", + "cg20059012", + "cg21826784", + "cg07164639", + "cg11693709", + ], + "default_imputation": "none", + }, + }, + "GPAgeA": { + "year": 2023, + "species": "Human", + "tissue": "Blood", + "source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4", + "output": "Age (Years)", + "model": { + "type": "GPAgeModel", + "file": "GP-age_model_a.json.zip", + "sites": [ + "cg16867657", + "cg19283806", + "cg24724428", + "cg07547549", + "cg05404236", + "cg06155229", + "cg03738025", + "cg14766700", + "cg23186333", + "cg21184711", + "cg15894389", + "cg02872426", + "cg15804973", + "cg18887458", + "cg15243034", + "cg03350900", + "cg23126342", + "cg07164639", + ], + "default_imputation": "none", + }, + }, + "GPAgeB": { + "year": 2023, + "species": "Human", + "tissue": "Blood", + "source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4", + "output": "Age (Years)", + "model": { + "type": "GPAgeModel", + "file": "GP-age_model_b.json.zip", + "sites": [ + "cg22454769", + "cg06639320", + "cg07553761", + "cg24079702", + "cg08128734", + "cg12934382", + "cg08468401", + "cg20816447", + "cg24466241", + "cg00573770", + "cg06335143", + "cg03032497", + "cg12899747", + "cg06619077", + "cg17804348", + "cg00329615", + "cg10501210", + "cg22730004", + "cg10835286", + "cg25413977", + "cg11807280", + "cg22078805", + "cg00303541", + "cg19729744", + "cg07797372", + "cg17471939", + "cg09809672", + "cg14577707", + "cg00602811", + "cg05991454", + "cg09988805", + "cg21826784", + ], + "default_imputation": "none", + }, + }, + "GPAgeC": { + "year": 2023, + "species": "Human", + "tissue": "Blood", + "source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4", + "output": "Age (Years)", + "model": { + "type": "GPAgeModel", + "file": "GP-age_model_c.json.zip", + "sites": [ + "cg04875128", + "cg14556683", + "cg23500537", + "cg17110586", + "cg18473521", + "cg23479922", + "cg09017434", + "cg19991948", + "cg27312979", + "cg19421125", + "cg04295144", + "cg24794228", + "cg17621438", + "cg05017994", + "cg21878650", + "cg27152890", + "cg04503319", + "cg24892069", + "cg19344626", + "cg20059012", + "cg11693709", + ], + "default_imputation": "none", + }, + }, "Lin": { "year": 2016, "species": "Human", @@ -1930,6 +2196,56 @@ def methylation_sites(self): return self.required_cpgs if self.required_cpgs else [] +class GPAgeModel: + """Gaussian Process regression model for age prediction (GP-age clock).""" + + def __init__(self, model_file, sites): + self.model_file = model_file + self._sites = sites + self._model = None + self._training_means = None + + def _load_model(self): + if self._model is not None: + return + try: + import GPy + except ImportError: + raise ImportError( + "GPy is required for GP-age models. " + "Install with: pip install biolearn[gpage]" + ) + model_path = get_data_file(self.model_file) + self._model = GPy.models.GPRegression.load_model(model_path) + self._training_means = np.mean(self._model.X, axis=0) + + @classmethod + def from_definition(cls, clock_definition): + model_def = clock_definition["model"] + return cls(model_def["file"], model_def["sites"]) + + def predict(self, geo_data): + self._load_model() + dnam = geo_data.dnam + methylation_data = pd.DataFrame( + np.nan, index=dnam.columns, columns=self._sites + ) + existing = [s for s in self._sites if s in dnam.index] + for site in existing: + methylation_data[site] = dnam.loc[site].values + X = methylation_data.values + if np.isnan(X).any(): + for i, mean_val in enumerate(self._training_means): + X[np.isnan(X[:, i]), i] = mean_val + predictions = self._model.predict(X)[0].squeeze() + return pd.DataFrame( + predictions, index=dnam.columns, columns=["Predicted"] + ) + + def methylation_sites(self): + return self._sites + + class ImputationDecorator: def __init__(self, clock, imputation_method): self.clock = clock diff --git a/biolearn/model_gallery.py b/biolearn/model_gallery.py index 2037d35..2ca8971 100644 --- a/biolearn/model_gallery.py +++ b/biolearn/model_gallery.py @@ -13,6 +13,7 @@ PCLinearTransformationModel, MiAgeModel, HurdleAPIModel, + GPAgeModel, ) from biolearn.imputation import ( hybrid_impute, @@ -40,6 +41,7 @@ class ModelGallery: "PCLinearTransformationModel": PCLinearTransformationModel.from_definition, "MiAgeModel": MiAgeModel.from_definition, "HurdleAPIModel": HurdleAPIModel.from_definition, + "GPAgeModel": GPAgeModel.from_definition, } def __init__(self, models=model_definitions): diff --git a/biolearn/test/data/expected_model_outputs/GPAge10.csv b/biolearn/test/data/expected_model_outputs/GPAge10.csv new file mode 100644 index 0000000..0f34e5b --- /dev/null +++ b/biolearn/test/data/expected_model_outputs/GPAge10.csv @@ -0,0 +1,11 @@ +,Predicted +GSM1009660,31.89704727031237 +GSM1009661,23.903188943555232 +GSM1009662,74.73425675682917 +GSM1009663,18.578239549985653 +GSM1009664,26.700354717080252 +GSM1009665,25.511381170918273 +GSM1009666,15.411114544706832 +GSM1009667,24.447221724018352 +GSM1009668,20.506828510955785 +GSM1009669,19.82849328756421 diff --git a/biolearn/test/data/expected_model_outputs/GPAge30.csv b/biolearn/test/data/expected_model_outputs/GPAge30.csv new file mode 100644 index 0000000..83ab435 --- /dev/null +++ b/biolearn/test/data/expected_model_outputs/GPAge30.csv @@ -0,0 +1,11 @@ +,Predicted +GSM1009660,34.77582182754103 +GSM1009661,27.320792200007833 +GSM1009662,68.58154920240196 +GSM1009663,21.221924955458782 +GSM1009664,27.543776269924667 +GSM1009665,26.083311597312022 +GSM1009666,21.77038103306041 +GSM1009667,22.65136967579668 +GSM1009668,20.18206633237497 +GSM1009669,19.249568421988524 diff --git a/biolearn/test/data/expected_model_outputs/GPAge71.csv b/biolearn/test/data/expected_model_outputs/GPAge71.csv new file mode 100644 index 0000000..3db8323 --- /dev/null +++ b/biolearn/test/data/expected_model_outputs/GPAge71.csv @@ -0,0 +1,11 @@ +,Predicted +GSM1009660,33.397193679701225 +GSM1009661,27.301255786703674 +GSM1009662,70.7486930314503 +GSM1009663,22.43499389016172 +GSM1009664,28.47807323841766 +GSM1009665,27.82262163954521 +GSM1009666,21.118096364885393 +GSM1009667,20.766219965455534 +GSM1009668,21.09132109177609 +GSM1009669,20.959442794228888 diff --git a/biolearn/test/data/expected_model_outputs/GPAgeA.csv b/biolearn/test/data/expected_model_outputs/GPAgeA.csv new file mode 100644 index 0000000..500d58f --- /dev/null +++ b/biolearn/test/data/expected_model_outputs/GPAgeA.csv @@ -0,0 +1,11 @@ +,Predicted +GSM1009660,30.223491203897595 +GSM1009661,23.773177186897197 +GSM1009662,68.55257703888005 +GSM1009663,17.32071283788694 +GSM1009664,29.405484234863785 +GSM1009665,24.859859187152804 +GSM1009666,20.806872201548995 +GSM1009667,23.729867009949583 +GSM1009668,23.654339109005498 +GSM1009669,17.604339500089328 diff --git a/biolearn/test/data/expected_model_outputs/GPAgeB.csv b/biolearn/test/data/expected_model_outputs/GPAgeB.csv new file mode 100644 index 0000000..674e022 --- /dev/null +++ b/biolearn/test/data/expected_model_outputs/GPAgeB.csv @@ -0,0 +1,11 @@ +,Predicted +GSM1009660,38.053207193340114 +GSM1009661,26.838589826778367 +GSM1009662,65.40032765179201 +GSM1009663,21.929513088767344 +GSM1009664,28.204459426349594 +GSM1009665,28.490788190605667 +GSM1009666,18.970951609583576 +GSM1009667,22.634972677172925 +GSM1009668,16.882486073155313 +GSM1009669,21.025250055961784 diff --git a/biolearn/test/data/expected_model_outputs/GPAgeC.csv b/biolearn/test/data/expected_model_outputs/GPAgeC.csv new file mode 100644 index 0000000..f092065 --- /dev/null +++ b/biolearn/test/data/expected_model_outputs/GPAgeC.csv @@ -0,0 +1,11 @@ +,Predicted +GSM1009660,35.872956172522194 +GSM1009661,31.745596307927535 +GSM1009662,62.699845394737785 +GSM1009663,19.563242250310083 +GSM1009664,32.938776035415444 +GSM1009665,31.152218020521573 +GSM1009666,25.678496268631186 +GSM1009667,22.626146308618416 +GSM1009668,20.353462400398076 +GSM1009669,23.768585991739368 diff --git a/pyproject.toml b/pyproject.toml index 708291d..1f248cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,8 @@ test = [ plotting = ["matplotlib>=3.3.0", "seaborn>=0.13.0"] # Examples external dependencies examples = ["lifelines"] +# Required for GP-age clock (Gaussian Process regression) +gpage = ["gpy>=1.10.0"] # Requirements necessary for building the documentation doc = [ "biolearn[plotting,examples]", @@ -75,7 +77,7 @@ doc = [ ] # A combination of dependencies useful for developers dev = [ - "biolearn[test,plotting,examples,doc]", + "biolearn[test,plotting,examples,doc,gpage]", "jupyterlab", "scikit-learn", "torch",