Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added biolearn/data/GP-age_model_10_cpgs.json.zip
Binary file not shown.
Binary file added biolearn/data/GP-age_model_30_cpgs.json.zip
Binary file not shown.
Binary file added biolearn/data/GP-age_model_71_cpgs.json.zip
Binary file not shown.
Binary file added biolearn/data/GP-age_model_a.json.zip
Binary file not shown.
Binary file added biolearn/data/GP-age_model_b.json.zip
Binary file not shown.
Binary file added biolearn/data/GP-age_model_c.json.zip
Binary file not shown.
316 changes: 316 additions & 0 deletions biolearn/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,272 @@ def preprocess(df):
"output": "Age (Years)",
"model": {"type": "LinearMethylationModel", "file": "Hannum.csv"},
},
"GPAge10": {
"year": 2023,
"species": "Human",
"tissue": "Blood",
"source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4",
"output": "Age (Years)",
"model": {
"type": "GPAgeModel",
"file": "GP-age_model_10_cpgs.json.zip",
"sites": [
"cg16867657",
"cg06639320",
"cg04875128",
"cg19283806",
"cg07553761",
"cg08128734",
"cg12934382",
"cg00573770",
"cg23479922",
"cg10501210",
],
"default_imputation": "none",
},
},
"GPAge30": {
"year": 2023,
"species": "Human",
"tissue": "Blood",
"source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4",
"output": "Age (Years)",
"model": {
"type": "GPAgeModel",
"file": "GP-age_model_30_cpgs.json.zip",
"sites": [
"cg16867657",
"cg22454769",
"cg06639320",
"cg04875128",
"cg19283806",
"cg24724428",
"cg07553761",
"cg24079702",
"cg08128734",
"cg12934382",
"cg08468401",
"cg20816447",
"cg00573770",
"cg06335143",
"cg06155229",
"cg03032497",
"cg06619077",
"cg17804348",
"cg00329615",
"cg23479922",
"cg10501210",
"cg19991948",
"cg27312979",
"cg23186333",
"cg25413977",
"cg22078805",
"cg17621438",
"cg21878650",
"cg04503319",
"cg09809672",
],
"default_imputation": "none",
},
},
"GPAge71": {
"year": 2023,
"species": "Human",
"tissue": "Blood",
"source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4",
"output": "Age (Years)",
"model": {
"type": "GPAgeModel",
"file": "GP-age_model_71_cpgs.json.zip",
"sites": [
"cg16867657",
"cg22454769",
"cg06639320",
"cg04875128",
"cg19283806",
"cg24724428",
"cg07553761",
"cg24079702",
"cg14556683",
"cg07547549",
"cg08128734",
"cg23500537",
"cg12934382",
"cg08468401",
"cg05404236",
"cg20816447",
"cg17110586",
"cg24466241",
"cg18473521",
"cg00573770",
"cg06335143",
"cg06155229",
"cg03032497",
"cg12899747",
"cg06619077",
"cg17804348",
"cg00329615",
"cg23479922",
"cg09017434",
"cg10501210",
"cg19991948",
"cg03738025",
"cg27312979",
"cg14766700",
"cg23186333",
"cg21184711",
"cg22730004",
"cg19421125",
"cg15894389",
"cg10835286",
"cg25413977",
"cg11807280",
"cg22078805",
"cg02872426",
"cg00303541",
"cg04295144",
"cg19729744",
"cg24794228",
"cg17621438",
"cg05017994",
"cg21878650",
"cg15804973",
"cg07797372",
"cg27152890",
"cg17471939",
"cg18887458",
"cg15243034",
"cg04503319",
"cg03350900",
"cg09809672",
"cg14577707",
"cg24892069",
"cg00602811",
"cg05991454",
"cg23126342",
"cg09988805",
"cg19344626",
"cg20059012",
"cg21826784",
"cg07164639",
"cg11693709",
],
"default_imputation": "none",
},
},
"GPAgeA": {
"year": 2023,
"species": "Human",
"tissue": "Blood",
"source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4",
"output": "Age (Years)",
"model": {
"type": "GPAgeModel",
"file": "GP-age_model_a.json.zip",
"sites": [
"cg16867657",
"cg19283806",
"cg24724428",
"cg07547549",
"cg05404236",
"cg06155229",
"cg03738025",
"cg14766700",
"cg23186333",
"cg21184711",
"cg15894389",
"cg02872426",
"cg15804973",
"cg18887458",
"cg15243034",
"cg03350900",
"cg23126342",
"cg07164639",
],
"default_imputation": "none",
},
},
"GPAgeB": {
"year": 2023,
"species": "Human",
"tissue": "Blood",
"source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4",
"output": "Age (Years)",
"model": {
"type": "GPAgeModel",
"file": "GP-age_model_b.json.zip",
"sites": [
"cg22454769",
"cg06639320",
"cg07553761",
"cg24079702",
"cg08128734",
"cg12934382",
"cg08468401",
"cg20816447",
"cg24466241",
"cg00573770",
"cg06335143",
"cg03032497",
"cg12899747",
"cg06619077",
"cg17804348",
"cg00329615",
"cg10501210",
"cg22730004",
"cg10835286",
"cg25413977",
"cg11807280",
"cg22078805",
"cg00303541",
"cg19729744",
"cg07797372",
"cg17471939",
"cg09809672",
"cg14577707",
"cg00602811",
"cg05991454",
"cg09988805",
"cg21826784",
],
"default_imputation": "none",
},
},
"GPAgeC": {
"year": 2023,
"species": "Human",
"tissue": "Blood",
"source": "https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(23)00211-4",
"output": "Age (Years)",
"model": {
"type": "GPAgeModel",
"file": "GP-age_model_c.json.zip",
"sites": [
"cg04875128",
"cg14556683",
"cg23500537",
"cg17110586",
"cg18473521",
"cg23479922",
"cg09017434",
"cg19991948",
"cg27312979",
"cg19421125",
"cg04295144",
"cg24794228",
"cg17621438",
"cg05017994",
"cg21878650",
"cg27152890",
"cg04503319",
"cg24892069",
"cg19344626",
"cg20059012",
"cg11693709",
],
"default_imputation": "none",
},
},
"Lin": {
"year": 2016,
"species": "Human",
Expand Down Expand Up @@ -1930,6 +2196,56 @@ def methylation_sites(self):
return self.required_cpgs if self.required_cpgs else []


class GPAgeModel:
    """Gaussian Process regression model for age prediction (GP-age clock).

    The serialized GPy model is loaded lazily on the first call to
    ``predict`` so that constructing the clock does not require GPy to be
    installed.
    """

    def __init__(self, model_file, sites):
        """Store the model file name and the ordered list of CpG sites.

        Args:
            model_file: Name of the serialized GP model, resolved through
                ``get_data_file`` when the model is first needed.
            sites: CpG site identifiers, in the column order used to build
                the feature matrix passed to the model.
        """
        self.model_file = model_file
        self._sites = sites
        self._model = None
        self._training_means = None

    def _load_model(self):
        """Load the GPy model and cache per-feature training means.

        Does nothing if the model is already loaded.

        Raises:
            ImportError: If GPy is not installed.
        """
        if self._model is not None:
            return
        try:
            import GPy
        except ImportError as err:
            # Keep the original failure attached for easier debugging.
            raise ImportError(
                "GPy is required for GP-age models. "
                "Install with: pip install biolearn[gpage]"
            ) from err
        model_path = get_data_file(self.model_file)
        self._model = GPy.models.GPRegression.load_model(model_path)
        # Column means of the training inputs; used to fill missing values.
        self._training_means = np.mean(self._model.X, axis=0)

    @classmethod
    def from_definition(cls, clock_definition):
        """Build a model from a definition dict with a ``"model"`` entry.

        The ``"model"`` entry must provide ``"file"`` and ``"sites"``.
        """
        model_def = clock_definition["model"]
        return cls(model_def["file"], model_def["sites"])

    def predict(self, geo_data):
        """Predict a value for every sample (column) of ``geo_data.dnam``.

        Sites absent from ``dnam`` and NaN entries are replaced by the
        corresponding training mean before prediction.

        Args:
            geo_data: Object exposing ``dnam``, a DataFrame indexed by CpG
                site with one column per sample.

        Returns:
            DataFrame indexed by sample with a single ``"Predicted"`` column.
        """
        self._load_model()
        dnam = geo_data.dnam
        sample_ids = dnam.columns

        # Build the feature matrix in a freshly allocated (writable) array
        # rather than mutating a view obtained from a DataFrame, which is
        # read-only when pandas Copy-on-Write is enabled.
        features = np.full(
            (len(sample_ids), len(self._sites)), np.nan, dtype=float
        )
        for col, site in enumerate(self._sites):
            if site in dnam.index:
                features[:, col] = dnam.loc[site].to_numpy(dtype=float)

        missing = np.isnan(features)
        if missing.any():
            means = np.asarray(self._training_means, dtype=float)
            features = np.where(missing, means, features)

        mean_prediction = self._model.predict(features)[0]
        # Flatten to 1-D instead of squeeze(): squeeze() turns a single-sample
        # result into a 0-d array, which the DataFrame constructor rejects.
        predictions = np.asarray(mean_prediction).reshape(-1)
        return pd.DataFrame(
            predictions, index=sample_ids, columns=["Predicted"]
        )

    def methylation_sites(self):
        """Return the CpG sites this model reads, in feature order."""
        return self._sites


class ImputationDecorator:
def __init__(self, clock, imputation_method):
self.clock = clock
Expand Down
2 changes: 2 additions & 0 deletions biolearn/model_gallery.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
PCLinearTransformationModel,
MiAgeModel,
HurdleAPIModel,
GPAgeModel,
)
from biolearn.imputation import (
hybrid_impute,
Expand Down Expand Up @@ -40,6 +41,7 @@ class ModelGallery:
"PCLinearTransformationModel": PCLinearTransformationModel.from_definition,
"MiAgeModel": MiAgeModel.from_definition,
"HurdleAPIModel": HurdleAPIModel.from_definition,
"GPAgeModel": GPAgeModel.from_definition,
}

def __init__(self, models=model_definitions):
Expand Down
11 changes: 11 additions & 0 deletions biolearn/test/data/expected_model_outputs/GPAge10.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,Predicted
GSM1009660,31.89704727031237
GSM1009661,23.903188943555232
GSM1009662,74.73425675682917
GSM1009663,18.578239549985653
GSM1009664,26.700354717080252
GSM1009665,25.511381170918273
GSM1009666,15.411114544706832
GSM1009667,24.447221724018352
GSM1009668,20.506828510955785
GSM1009669,19.82849328756421
11 changes: 11 additions & 0 deletions biolearn/test/data/expected_model_outputs/GPAge30.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,Predicted
GSM1009660,34.77582182754103
GSM1009661,27.320792200007833
GSM1009662,68.58154920240196
GSM1009663,21.221924955458782
GSM1009664,27.543776269924667
GSM1009665,26.083311597312022
GSM1009666,21.77038103306041
GSM1009667,22.65136967579668
GSM1009668,20.18206633237497
GSM1009669,19.249568421988524
11 changes: 11 additions & 0 deletions biolearn/test/data/expected_model_outputs/GPAge71.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,Predicted
GSM1009660,33.397193679701225
GSM1009661,27.301255786703674
GSM1009662,70.7486930314503
GSM1009663,22.43499389016172
GSM1009664,28.47807323841766
GSM1009665,27.82262163954521
GSM1009666,21.118096364885393
GSM1009667,20.766219965455534
GSM1009668,21.09132109177609
GSM1009669,20.959442794228888
11 changes: 11 additions & 0 deletions biolearn/test/data/expected_model_outputs/GPAgeA.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,Predicted
GSM1009660,30.223491203897595
GSM1009661,23.773177186897197
GSM1009662,68.55257703888005
GSM1009663,17.32071283788694
GSM1009664,29.405484234863785
GSM1009665,24.859859187152804
GSM1009666,20.806872201548995
GSM1009667,23.729867009949583
GSM1009668,23.654339109005498
GSM1009669,17.604339500089328
Loading