From bc28bdd40f1fd6d9411ea7cd2bff1ff656e8ef03 Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Wed, 10 Oct 2018 16:43:43 -0500 Subject: [PATCH 1/6] alternative (easier?) way to define datasets --- MDAnalysisData/__init__.py | 2 +- MDAnalysisData/adk_equilibrium.py | 100 +++++++----------------------- MDAnalysisData/base.py | 57 ++++++++++++++++- MDAnalysisData/datasets.py | 3 +- 4 files changed, 80 insertions(+), 82 deletions(-) diff --git a/MDAnalysisData/__init__.py b/MDAnalysisData/__init__.py index d93867a..3aceae5 100644 --- a/MDAnalysisData/__init__.py +++ b/MDAnalysisData/__init__.py @@ -8,7 +8,7 @@ __all__ = ['datasets'] from . import datasets - +from .base import fetch, DATASET_NAMES diff --git a/MDAnalysisData/adk_equilibrium.py b/MDAnalysisData/adk_equilibrium.py index a274f7b..d68dcef 100644 --- a/MDAnalysisData/adk_equilibrium.py +++ b/MDAnalysisData/adk_equilibrium.py @@ -5,88 +5,32 @@ https://figshare.com/articles/Molecular_dynamics_trajectory_for_benchmarking_MDAnalysis/5108170/1 """ -from os.path import dirname, exists, join -from os import makedirs, remove -import codecs - import logging -from .base import get_data_home -from .base import _fetch_remote -from .base import RemoteFileMetadata -from .base import Bunch +from .base import RemoteFileMetadata, Dataset -NAME = "adk_equilibrium" -DESCRIPTION = "adk_equilibrium.rst" -# The original data can be found at the figshare URL. -# The SHA256 checksum of the zip file changes with every download so we -# cannot check its checksum. Instead we download individual files. -# separately. The keys of this dict are also going to be the keys in the -# Bunch that is returned. 
-ARCHIVE = { - 'topology': RemoteFileMetadata( - filename='adk4AKE.psf', - url='https://ndownloader.figshare.com/files/8672230', - checksum='1aa947d58fb41b6805dc1e7be4dbe65c6a8f4690f0bd7fc2ae03e7bd437085f4', - ), - 'trajectory': RemoteFileMetadata( - filename='1ake_007-nowater-core-dt240ps.dcd', - url='https://ndownloader.figshare.com/files/8672074', - checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf', - ), -} logger = logging.getLogger(__name__) -def fetch_adk_equilibrium(data_home=None, download_if_missing=True): - """Load the AdK 1us equilibrium trajectory (without water) - - Parameters - ---------- - data_home : optional, default: None - Specify another download and cache folder for the datasets. By default - all MDAnalysisData data is stored in '~/MDAnalysis_data' subfolders. - This dataset is stored in ``/adk_equilibrium``. - download_if_missing : optional, default=True - If ``False``, raise a :exc:`IOError` if the data is not locally available - instead of trying to download the data from the source site. - - Returns - ------- - dataset : dict-like object with the following attributes: - dataset.topology : filename - Filename of the topology file - dataset.trajectory : filename - Filename of the trajectory file - dataset.DESCR : string - Description of the trajectory. - - - See :ref:`adk-equilibrium-dataset` for description. 
- """ - name = NAME - data_location = join(get_data_home(data_home=data_home), - name) - if not exists(data_location): - makedirs(data_location) - - records = Bunch() - for file_type, meta in ARCHIVE.items(): - local_path = join(data_location, meta.filename) - records[file_type] = local_path - - if not exists(local_path): - if not download_if_missing: - raise IOError("Data {0}={1} not found and `download_if_missing` is " - "False".format(file_type, local_path)) - logger.info("Downloading {0}: {1} -> {2}...".format( - file_type, meta.url, local_path)) - archive_path = _fetch_remote(meta, dirname=data_location) - - module_path = dirname(__file__) - with codecs.open(join(module_path, 'descr', DESCRIPTION), - encoding="utf-8") as dfile: - records.DESCR = dfile.read() - - return records +class ADK_Equilibrium(Dataset): + NAME = "adk_equilibrium" + DESCRIPTION = "adk_equilibrium.rst" + + # The original data can be found at the figshare URL. + # The SHA256 checksum of the zip file changes with every download so we + # cannot check its checksum. Instead we download individual files. + # separately. The keys of this dict are also going to be the keys in the + # Bunch that is returned. 
+ ARCHIVE = { + 'topology': RemoteFileMetadata( + filename='adk4AKE.psf', + url='https://ndownloader.figshare.com/files/8672230', + checksum='1aa947d58fb41b6805dc1e7be4dbe65c6a8f4690f0bd7fc2ae03e7bd437085f4', + ), + 'trajectory': RemoteFileMetadata( + filename='1ake_007-nowater-core-dt240ps.dcd', + url='https://ndownloader.figshare.com/files/8672074', + checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf', + ), + } diff --git a/MDAnalysisData/base.py b/MDAnalysisData/base.py index a90fe5a..594fa9c 100644 --- a/MDAnalysisData/base.py +++ b/MDAnalysisData/base.py @@ -31,9 +31,11 @@ import shutil from collections import namedtuple -from os import environ, listdir, makedirs +from os import environ, listdir, makedirs, remove from os.path import dirname, exists, expanduser, isdir, join, splitext import hashlib +import codecs + @@ -94,6 +96,59 @@ def __setstate__(self, state): RemoteFileMetadata = namedtuple('RemoteFileMetadata', ['filename', 'url', 'checksum']) +DATASET_NAMES = {} + +class _DatasetRegister(type): + def __new__(meta, name, bases, class_dict): + cls = type.__new__(meta, name, bases, class_dict) + if not cls.NAME is None: + DATASET_NAMES[cls.NAME] = cls + return cls + + +class Dataset(Bunch, metaclass=_DatasetRegister): + NAME = None + DESCRIPTION = None + ARCHIVE = None + + def __init__(self, data_home=None, download_if_missing=True): + data_location = join(get_data_home(data_home=data_home), + self.NAME) + + if not exists(data_location): + makedirs(data_location) + + contents = {} + for file_type, meta in self.ARCHIVE.items(): + local_path = join(data_location, meta.filename) + contents[file_type] = local_path + + if not exists(local_path): + if not download_if_missing: + raise IOError("Data {0}={1} not found and `download_if_missing` is " + "False".format(file_type, local_path)) + logger.info("Downloading {0}: {1} -> {2}...".format( + file_type, meta.url, local_path)) + archive_path = _fetch_remote(meta, dirname=data_location) + + 
module_path = dirname(__file__) + with codecs.open(join(module_path, 'descr', self.DESCRIPTION), + encoding="utf-8") as dfile: + contents['DESCR'] = dfile.read() + + + # finally, init the Bunch object + super().__init__(**contents) + + +def fetch(dataset, data_home=None, download_if_missing=True): + """Grab a named dataset""" + try: + return DATASET_NAMES[dataset](data_home=data_home, + download_if_missing=True) + except KeyError: + raise KeyError("unknown dataset: {}".format(dataset)) + def get_data_home(data_home=None): """Return the path of the MDAnalysisData data dir. diff --git a/MDAnalysisData/datasets.py b/MDAnalysisData/datasets.py index 4888cef..1385211 100644 --- a/MDAnalysisData/datasets.py +++ b/MDAnalysisData/datasets.py @@ -7,7 +7,7 @@ from .base import get_data_home, clear_data_home -from .adk_equilibrium import fetch_adk_equilibrium +from . import adk_equilibrium from .adk_transitions import (fetch_adk_transitions_DIMS, fetch_adk_transitions_FRODA) from .ifabp_water import fetch_ifabp_water @@ -16,7 +16,6 @@ __all__ = [ 'get_data_home', 'clear_data_home', - 'fetch_adk_equilibrium', 'fetch_adk_transitions_DIMS', 'fetch_adk_transitions_FRODA', 'fetch_ifabp_water', From 3b5dc82056222e255df9df615ed47e83827db4ff Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Wed, 10 Oct 2018 16:51:04 -0500 Subject: [PATCH 2/6] added adk_description --- MDAnalysisData/adk_equilibrium.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/MDAnalysisData/adk_equilibrium.py b/MDAnalysisData/adk_equilibrium.py index d68dcef..3741495 100644 --- a/MDAnalysisData/adk_equilibrium.py +++ b/MDAnalysisData/adk_equilibrium.py @@ -13,7 +13,18 @@ logger = logging.getLogger(__name__) -class ADK_Equilibrium(Dataset): +class AdK_Equilibrium(Dataset): + """AdK 1us equilibrium trajectory (without water) + + Attributes + ---------- + topology : filename + Filename of the topology file + trajectory : filename + Filename of the trajectory file + DESCR : string 
+ Description of the trajectory. + """ NAME = "adk_equilibrium" DESCRIPTION = "adk_equilibrium.rst" From 7e1cbd3bce6c2f2062b82fb587437a41f47b91cd Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Wed, 10 Oct 2018 16:53:07 -0500 Subject: [PATCH 3/6] fancy repr --- MDAnalysisData/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MDAnalysisData/base.py b/MDAnalysisData/base.py index 594fa9c..1303c63 100644 --- a/MDAnalysisData/base.py +++ b/MDAnalysisData/base.py @@ -140,6 +140,9 @@ def __init__(self, data_home=None, download_if_missing=True): # finally, init the Bunch object super().__init__(**contents) + def __repr__(self): + print(self.DESCR) + def fetch(dataset, data_home=None, download_if_missing=True): """Grab a named dataset""" From 665517b01eb835dcac957dcae23386b2f5b7d789 Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Thu, 11 Oct 2018 09:47:59 -0500 Subject: [PATCH 4/6] readded explicit fetch_x functions --- MDAnalysisData/adk_equilibrium.py | 7 ++++++- MDAnalysisData/base.py | 2 +- MDAnalysisData/datasets.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/MDAnalysisData/adk_equilibrium.py b/MDAnalysisData/adk_equilibrium.py index 3741495..20be55c 100644 --- a/MDAnalysisData/adk_equilibrium.py +++ b/MDAnalysisData/adk_equilibrium.py @@ -7,7 +7,7 @@ import logging -from .base import RemoteFileMetadata, Dataset +from .base import RemoteFileMetadata, Dataset, fetch logger = logging.getLogger(__name__) @@ -45,3 +45,8 @@ class AdK_Equilibrium(Dataset): checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf', ), } + + +def fetch_adk_equilibrium(data_home=None, download_if_missing=True): + return fetch(AdK_Equilibrium.NAME, data_home=data_home, + download_if_missing=download_if_missing) diff --git a/MDAnalysisData/base.py b/MDAnalysisData/base.py index 1303c63..c0ac791 100644 --- a/MDAnalysisData/base.py +++ b/MDAnalysisData/base.py @@ -141,7 +141,7 @@ def __init__(self, data_home=None, 
download_if_missing=True): super().__init__(**contents) def __repr__(self): - print(self.DESCR) + return self.__doc__ def fetch(dataset, data_home=None, download_if_missing=True): diff --git a/MDAnalysisData/datasets.py b/MDAnalysisData/datasets.py index 1385211..33c45df 100644 --- a/MDAnalysisData/datasets.py +++ b/MDAnalysisData/datasets.py @@ -7,7 +7,7 @@ from .base import get_data_home, clear_data_home -from . import adk_equilibrium +from . adk_equilibrium import fetch_adk_equilibrium from .adk_transitions import (fetch_adk_transitions_DIMS, fetch_adk_transitions_FRODA) from .ifabp_water import fetch_ifabp_water From 369266a6d9436a70b6dbbc8b769e6e3c1c42ce4c Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Thu, 11 Oct 2018 09:51:52 -0500 Subject: [PATCH 5/6] shared docstring --- MDAnalysisData/adk_equilibrium.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/MDAnalysisData/adk_equilibrium.py b/MDAnalysisData/adk_equilibrium.py index 20be55c..8c69722 100644 --- a/MDAnalysisData/adk_equilibrium.py +++ b/MDAnalysisData/adk_equilibrium.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -class AdK_Equilibrium(Dataset): +def fetch_adk_equilibrium(data_home=None, download_if_missing=True): """AdK 1us equilibrium trajectory (without water) Attributes @@ -25,6 +25,11 @@ class AdK_Equilibrium(Dataset): DESCR : string Description of the trajectory.
""" + return fetch(AdK_Equilibrium.NAME, data_home=data_home, + download_if_missing=download_if_missing) + +class AdK_Equilibrium(Dataset): + __doc__ = fetch_adk_equilibrium.__doc__ NAME = "adk_equilibrium" DESCRIPTION = "adk_equilibrium.rst" @@ -46,7 +51,3 @@ class AdK_Equilibrium(Dataset): ), } - -def fetch_adk_equilibrium(data_home=None, download_if_missing=True): - return fetch(AdK_Equilibrium.NAME, data_home=data_home, - download_if_missing=download_if_missing) From bfd2c88c057e5e3600436a99ef1496276c1388cd Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Fri, 12 Oct 2018 10:23:14 -0500 Subject: [PATCH 6/6] added fetch docstring --- MDAnalysisData/adk_equilibrium.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/MDAnalysisData/adk_equilibrium.py b/MDAnalysisData/adk_equilibrium.py index 8c69722..5dd3383 100644 --- a/MDAnalysisData/adk_equilibrium.py +++ b/MDAnalysisData/adk_equilibrium.py @@ -14,16 +14,27 @@ def fetch_adk_equilibrium(data_home=None, download_if_missing=True): - """AdK 1us equilibrium trajectory (without water) + """Load AdK 1us equilibrium trajectory (without water) - Attributes + Parameters ---------- - topology : filename - Filename of the topology file - trajectory : filename - Filename of the trajectory file - DESCR : string - Description of the trajectory. + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all MDAnalysisData data is stored in '~/MDAnalysis_data' subfolders. + This dataset is stored in ``/adk_equilibrium``. + download_if_missing : optional, default=True + If ``False``, raise a :exc:`IOError` if the data is not locally available + instead of trying to download the data from the source site.
+ + Returns + ------- + dataset : dict-like with following attributes: + topology : filename + Filename of the topology file + trajectory : filename + Filename of the trajectory file + DESCR : string + Description of the trajectory. """ return fetch(AdK_Equilibrium.NAME, data_home=data_home, download_if_missing=download_if_missing)