diff --git a/MDAnalysisData/__init__.py b/MDAnalysisData/__init__.py
index d93867a..3aceae5 100644
--- a/MDAnalysisData/__init__.py
+++ b/MDAnalysisData/__init__.py
@@ -8,7 +8,7 @@
 __all__ = ['datasets']
 
 from . import datasets
-
+from .base import fetch, DATASET_NAMES
diff --git a/MDAnalysisData/adk_equilibrium.py b/MDAnalysisData/adk_equilibrium.py
index a274f7b..5dd3383 100644
--- a/MDAnalysisData/adk_equilibrium.py
+++ b/MDAnalysisData/adk_equilibrium.py
@@ -5,88 +5,60 @@
 https://figshare.com/articles/Molecular_dynamics_trajectory_for_benchmarking_MDAnalysis/5108170/1
 """
 
-from os.path import dirname, exists, join
-from os import makedirs, remove
-import codecs
-
 import logging
 
-from .base import get_data_home
-from .base import _fetch_remote
-from .base import RemoteFileMetadata
-from .base import Bunch
+from .base import RemoteFileMetadata, Dataset, fetch
 
-NAME = "adk_equilibrium"
-DESCRIPTION = "adk_equilibrium.rst"
-# The original data can be found at the figshare URL.
-# The SHA256 checksum of the zip file changes with every download so we
-# cannot check its checksum. Instead we download individual files.
-# separately. The keys of this dict are also going to be the keys in the
-# Bunch that is returned.
-ARCHIVE = {
-    'topology': RemoteFileMetadata(
-        filename='adk4AKE.psf',
-        url='https://ndownloader.figshare.com/files/8672230',
-        checksum='1aa947d58fb41b6805dc1e7be4dbe65c6a8f4690f0bd7fc2ae03e7bd437085f4',
-    ),
-    'trajectory': RemoteFileMetadata(
-        filename='1ake_007-nowater-core-dt240ps.dcd',
-        url='https://ndownloader.figshare.com/files/8672074',
-        checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf',
-    ),
-}
 
 logger = logging.getLogger(__name__)
 
 
 def fetch_adk_equilibrium(data_home=None, download_if_missing=True):
-    """Load the AdK 1us equilibrium trajectory (without water)
+    """Load AdK 1us equilibrium trajectory (without water)
 
     Parameters
     ----------
     data_home : optional, default: None
         Specify another download and cache folder for the datasets. By default
         all MDAnalysisData data is stored in '~/MDAnalysis_data' subfolders.
         This dataset is stored in ``/adk_equilibrium``.
 
     download_if_missing : optional, default=True
         If ``False``, raise a :exc:`IOError` if the data is not locally
        available instead of trying to download the data from the source site.
 
     Returns
     -------
-    dataset : dict-like object with the following attributes:
-        dataset.topology : filename
-            Filename of the topology file
-        dataset.trajectory : filename
-            Filename of the trajectory file
-        dataset.DESCR : string
-            Description of the trajectory.
-
-
-    See :ref:`adk-equilibrium-dataset` for description.
+    dataset : dict-like with the following attributes:
+        topology : filename
+            Filename of the topology file
+        trajectory : filename
+            Filename of the trajectory file
+        DESCR : string
+            Description of the trajectory.
     """
-    name = NAME
-    data_location = join(get_data_home(data_home=data_home),
-                         name)
-    if not exists(data_location):
-        makedirs(data_location)
-
-    records = Bunch()
-    for file_type, meta in ARCHIVE.items():
-        local_path = join(data_location, meta.filename)
-        records[file_type] = local_path
-
-        if not exists(local_path):
-            if not download_if_missing:
-                raise IOError("Data {0}={1} not found and `download_if_missing` is "
-                              "False".format(file_type, local_path))
-            logger.info("Downloading {0}: {1} -> {2}...".format(
-                file_type, meta.url, local_path))
-            archive_path = _fetch_remote(meta, dirname=data_location)
-
-    module_path = dirname(__file__)
-    with codecs.open(join(module_path, 'descr', DESCRIPTION),
-                     encoding="utf-8") as dfile:
-        records.DESCR = dfile.read()
+    return fetch(AdK_Equilibrium.NAME, data_home=data_home,
+                 download_if_missing=download_if_missing)
+
+
+class AdK_Equilibrium(Dataset):
+    __doc__ = fetch_adk_equilibrium.__doc__
+    NAME = "adk_equilibrium"
+    DESCRIPTION = "adk_equilibrium.rst"
+
+    # The original data can be found at the figshare URL.
+    # The SHA256 checksum of the zip file changes with every download so we
+    # cannot check its checksum. Instead we download individual files
+    # separately. The keys of this dict are also going to be the keys in the
+    # Bunch that is returned.
+    ARCHIVE = {
+        'topology': RemoteFileMetadata(
+            filename='adk4AKE.psf',
+            url='https://ndownloader.figshare.com/files/8672230',
+            checksum='1aa947d58fb41b6805dc1e7be4dbe65c6a8f4690f0bd7fc2ae03e7bd437085f4',
+        ),
+        'trajectory': RemoteFileMetadata(
+            filename='1ake_007-nowater-core-dt240ps.dcd',
+            url='https://ndownloader.figshare.com/files/8672074',
+            checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf',
+        ),
+    }
-    return records
diff --git a/MDAnalysisData/base.py b/MDAnalysisData/base.py
index a90fe5a..c0ac791 100644
--- a/MDAnalysisData/base.py
+++ b/MDAnalysisData/base.py
@@ -31,9 +31,11 @@
 import shutil
 
 from collections import namedtuple
-from os import environ, listdir, makedirs
+from os import environ, listdir, makedirs, remove
 from os.path import dirname, exists, expanduser, isdir, join, splitext
 import hashlib
+import codecs
+
@@ -94,6 +96,62 @@ def __setstate__(self, state):
 
 RemoteFileMetadata = namedtuple('RemoteFileMetadata',
                                 ['filename', 'url', 'checksum'])
 
+DATASET_NAMES = {}
+
+class _DatasetRegister(type):
+    def __new__(meta, name, bases, class_dict):
+        cls = type.__new__(meta, name, bases, class_dict)
+        if cls.NAME is not None:
+            DATASET_NAMES[cls.NAME] = cls
+        return cls
+
+
+class Dataset(Bunch, metaclass=_DatasetRegister):
+    NAME = None
+    DESCRIPTION = None
+    ARCHIVE = None
+
+    def __init__(self, data_home=None, download_if_missing=True):
+        data_location = join(get_data_home(data_home=data_home),
+                             self.NAME)
+
+        if not exists(data_location):
+            makedirs(data_location)
+
+        contents = {}
+        for file_type, meta in self.ARCHIVE.items():
+            local_path = join(data_location, meta.filename)
+            contents[file_type] = local_path
+
+            if not exists(local_path):
+                if not download_if_missing:
+                    raise IOError("Data {0}={1} not found and `download_if_missing` is "
+                                  "False".format(file_type, local_path))
+                logger.info("Downloading {0}: {1} -> {2}...".format(
+                    file_type, meta.url, local_path))
+                archive_path = _fetch_remote(meta, dirname=data_location)
+
+        module_path = dirname(__file__)
+        with codecs.open(join(module_path, 'descr', self.DESCRIPTION),
+                         encoding="utf-8") as dfile:
+            contents['DESCR'] = dfile.read()
+
+
+        # finally, init the Bunch object
+        super().__init__(**contents)
+
+    def __repr__(self):
+        return self.__doc__
+
+
+def fetch(dataset, data_home=None, download_if_missing=True):
+    """Grab a named dataset"""
+    try:
+        return DATASET_NAMES[dataset](data_home=data_home,
+                                      download_if_missing=download_if_missing)
+    except KeyError:
+        raise KeyError("unknown dataset: {}".format(dataset))
+
 
 def get_data_home(data_home=None):
     """Return the path of the MDAnalysisData data dir.
diff --git a/MDAnalysisData/datasets.py b/MDAnalysisData/datasets.py
index 4888cef..33c45df 100644
--- a/MDAnalysisData/datasets.py
+++ b/MDAnalysisData/datasets.py
@@ -7,7 +7,7 @@
 from .base import get_data_home, clear_data_home
 
-from .adk_equilibrium import fetch_adk_equilibrium
+from . adk_equilibrium import fetch_adk_equilibrium
 from .adk_transitions import (fetch_adk_transitions_DIMS,
                               fetch_adk_transitions_FRODA)
 from .ifabp_water import fetch_ifabp_water
@@ -16,7 +16,6 @@
 __all__ = [
     'get_data_home',
     'clear_data_home',
-    'fetch_adk_equilibrium',
    'fetch_adk_transitions_DIMS',
    'fetch_adk_transitions_FRODA',
    'fetch_ifabp_water',
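
A minimal usage sketch of the registry-based interface this patch introduces (not part of the diff; the dataset name, returned keys, and dict-style access are taken from the code above, and only datasets already ported to the new Dataset class will show up in DATASET_NAMES):

    import MDAnalysisData
    from MDAnalysisData.adk_equilibrium import fetch_adk_equilibrium

    # every Dataset subclass registers itself under its NAME via _DatasetRegister
    print(sorted(MDAnalysisData.DATASET_NAMES))

    # generic entry point; downloads into ~/MDAnalysis_data/adk_equilibrium on first use
    adk = MDAnalysisData.fetch('adk_equilibrium')

    # the legacy per-dataset function now delegates to the same machinery
    legacy = fetch_adk_equilibrium()

    assert adk['topology'] == legacy['topology']
    assert adk['trajectory'] == legacy['trajectory']
    print(adk['DESCR'][:80])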