From a9f17e644ffdf139b6b47e1aa107a4ea7a5b1e2f Mon Sep 17 00:00:00 2001 From: Tyler Sutterley Date: Tue, 7 Oct 2025 13:57:17 -0700 Subject: [PATCH 1/3] refactor: GFZ ISDC ftp server is being retired --- ...ing_ftp.py => gfz_isdc_dealiasing_sync.py} | 159 +++++++++--- ...dc_grace_ftp.py => gfz_isdc_grace_sync.py} | 238 +++++++++++------- access/podaac_cumulus.py | 1 + ...g_ftp.rst => gfz_isdc_dealiasing_sync.rst} | 12 +- ..._grace_ftp.rst => gfz_isdc_grace_sync.rst} | 12 +- doc/source/index.rst | 4 +- setup.py | 2 +- 7 files changed, 282 insertions(+), 146 deletions(-) rename access/{gfz_isdc_dealiasing_ftp.py => gfz_isdc_dealiasing_sync.py} (62%) rename access/{gfz_isdc_grace_ftp.py => gfz_isdc_grace_sync.py} (68%) rename doc/source/api_reference/access/{gfz_isdc_dealiasing_ftp.rst => gfz_isdc_dealiasing_sync.rst} (65%) rename doc/source/api_reference/access/{gfz_isdc_grace_ftp.rst => gfz_isdc_grace_sync.rst} (75%) diff --git a/access/gfz_isdc_dealiasing_ftp.py b/access/gfz_isdc_dealiasing_sync.py similarity index 62% rename from access/gfz_isdc_dealiasing_ftp.py rename to access/gfz_isdc_dealiasing_sync.py index 73394d0d..f51f9b78 100644 --- a/access/gfz_isdc_dealiasing_ftp.py +++ b/access/gfz_isdc_dealiasing_sync.py @@ -1,13 +1,14 @@ #!/usr/bin/env python u""" -gfz_isdc_dealiasing_ftp.py -Written by Tyler Sutterley (05/2023) +gfz_isdc_dealiasing_sync.py +Written by Tyler Sutterley (10/2025) Syncs GRACE Level-1b dealiasing products from the GFZ Information System and Data Center (ISDC) + Optionally outputs as monthly tar files CALLING SEQUENCE: - python gfz_isdc_dealiasing_ftp.py --year=2015 --release=RL06 --tar + python gfz_isdc_dealiasing_sync.py --year=2015 --release=RL06 --tar COMMAND LINE OPTIONS: -D X, --directory X: working data directory @@ -30,6 +31,7 @@ utilities.py: download and management utilities for syncing files UPDATE HISTORY: + Updated 10/2025: switch to https as ftp server is being retired Updated 05/2023: use pathlib to define and operate on paths Updated 03/2023: increase default year range to sync Updated 12/2022: single implicit import of gravity toolkit @@ -51,8 +53,9 @@ import sys import os import re +import ssl import time -import ftplib +import shutil import logging import pathlib import tarfile @@ -62,7 +65,7 @@ # PURPOSE: syncs GRACE Level-1b dealiasing products from the GFZ data server # and optionally outputs as monthly tar files -def gfz_isdc_dealiasing_ftp(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, +def gfz_isdc_dealiasing_sync(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, TIMEOUT=None, LOG=False, CLOBBER=False, MODE=None): # check if directory exists and recursively create if not base_dir = pathlib.Path(base_dir).expanduser().absolute() @@ -81,10 +84,8 @@ def gfz_isdc_dealiasing_ftp(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, # standard output (terminal output) logging.basicConfig(level=logging.INFO) - # remote HOST for DREL on GFZ data server - # connect and login to GFZ ftp server - ftp = ftplib.FTP('isdcftp.gfz-potsdam.de', timeout=TIMEOUT) - ftp.login() + # GFZ ISDC https host + HOST = 'https://isdc-data.gfz.de/' # compile regular expression operator for years to sync if YEAR is None: @@ -97,9 +98,8 @@ def gfz_isdc_dealiasing_ftp(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, SUFFIX = dict(RL04='tar.gz', RL05='tar.gz', RL06='tgz') # find remote yearly directories for DREL - YRS,_ = gravtk.utilities.ftp_list([ftp.host,'grace', - 'Level-1B', 'GFZ','AOD',DREL], timeout=TIMEOUT, basename=True, - pattern=R1, sort=True) + YRS,_ = http_list([HOST,'grace','Level-1B', 'GFZ','AOD',DREL], + timeout=TIMEOUT, basename=True, pattern=R1, sort=True) # for each year for Y in YRS: # for each month of interest @@ -114,8 +114,8 @@ def gfz_isdc_dealiasing_ftp(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, # will extract year and month and calendar day from the ascii file regex_pattern = r'AOD1B_({0})-({1:02d})-(\d+)_X_\d+.asc.gz$' R2 = re.compile(regex_pattern.format(Y,M), re.VERBOSE) - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,'grace','Level-1B','GFZ','AOD',DREL,Y], + remote_files,remote_mtimes = http_list( + [HOST,'grace','Level-1B','GFZ','AOD',DREL,Y], timeout=TIMEOUT, basename=True, pattern=R2, sort=True) file_count = len(remote_files) # if compressing into monthly tar files @@ -124,10 +124,10 @@ def gfz_isdc_dealiasing_ftp(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, tar = tarfile.open(name=local_tar_file, mode='w:gz') for fi,remote_mtime in zip(remote_files,remote_mtimes): # remote version of each input file - remote = [ftp.host,'grace','Level-1B','GFZ','AOD',DREL,Y,fi] - logging.info(posixpath.join('ftp://',*remote)) + remote = [HOST,'grace','Level-1B','GFZ','AOD',DREL,Y,fi] + logging.info(posixpath.join(*remote)) # retrieve bytes from remote file - remote_buffer = gravtk.utilities.from_ftp(remote, + remote_buffer = gravtk.utilities.from_sync(remote, timeout=TIMEOUT) # add file to tar tar_info = tarfile.TarInfo(name=fi) @@ -142,23 +142,96 @@ def gfz_isdc_dealiasing_ftp(base_dir, DREL, YEAR=None, MONTHS=None, TAR=False, # copy each gzip file and keep as individual daily files for fi,remote_mtime in zip(remote_files,remote_mtimes): # remote and local version of each input file - remote = [ftp.host,'grace','Level-1B','GFZ','AOD',DREL,Y,fi] + remote = [HOST,'grace','Level-1B','GFZ','AOD',DREL,Y,fi] local_file = grace_dir.joinpath(fi) - ftp_mirror_file(ftp,remote,remote_mtime,local_file, + http_pull_file(remote,remote_mtime,local_file, CLOBBER=CLOBBER, MODE=MODE) - # close the ftp connection - ftp.quit() # close log file and set permissions level to MODE if LOG: LOGFILE.chmod(mode=MODE) +# PURPOSE: list a directory on the GFZ https server +def http_list( + HOST: str | list, + timeout: int | None = None, + context: ssl.SSLContext = gravtk.utilities._default_ssl_context, + pattern: str | re.Pattern = '', + sort: bool = False + ): + """ + List a directory on the GFZ https Server + + Parameters + ---------- + HOST: str or list + remote http host path + timeout: int or NoneType, default None + timeout in seconds for blocking operations + context: obj, default gravity_toolkit.utilities._default_ssl_context + SSL context for ``urllib`` opener object + pattern: str, default '' + regular expression pattern for reducing list + sort: bool, default False + sort output list + + Returns + ------- + colnames: list + column names in a directory + collastmod: list + last modification times for items in the directory + """ + # verify inputs for remote http host + if isinstance(HOST, str): + HOST = gravtk.utilities.url_split(HOST) + # regular expression pattern for finding files and modification times + parser = r'\(.*?)\<\/a\>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}\:\d{2})' + rx = re.compile(parser, re.VERBOSE) + # try listing from http + try: + # Create and submit request. + request = gravtk.utilities.urllib2.Request(posixpath.join(*HOST)) + response = gravtk.utilities.urllib2.urlopen(request, + timeout=timeout, context=context) + except Exception as exc: + raise Exception('List error from {0}'.format(posixpath.join(*HOST))) + # read the directory listing + contents = response.readlines() + # read and parse request for files (column names and modified times) + lines = [l for l in contents if rx.search(l.decode('utf-8'))] + # column names and last modified times + colnames = [None]*len(lines) + collastmod = [None]*len(lines) + for i, l in enumerate(lines): + colnames[i], lastmod = rx.findall(l.decode('utf-8')).pop() + # get the Unix timestamp value for a modification time + collastmod[i] = gravtk.utilities.get_unix_time(lastmod, + format='%Y-%m-%d %H:%M') + # reduce using regular expression pattern + if pattern: + i = [i for i,f in enumerate(colnames) if re.search(pattern, f)] + # reduce list of column names and last modified times + colnames = [colnames[indice] for indice in i] + collastmod = [collastmod[indice] for indice in i] + # sort the list + if sort: + i = [i for i,j in sorted(enumerate(colnames), key=lambda i: i[1])] + # sort list of column names and last modified times + colnames = [colnames[indice] for indice in i] + collastmod = [collastmod[indice] for indice in i] + # return the list of column names and last modified times + return (colnames, collastmod) + # PURPOSE: pull file from a remote host checking if file exists locally # and if the remote file is newer than the local file -def ftp_mirror_file(ftp,remote_path,remote_mtime,local_file, - CLOBBER=False,MODE=0o775): - # path to remote file - remote_file = posixpath.join(*remote_path[1:]) +def http_pull_file(remote_path, remote_mtime, local_file, + TIMEOUT=0, LIST=False, CLOBBER=False, MODE=0o775): + # verify inputs for remote http host + if isinstance(remote_path, str): + remote_path = gravtk.utilities.url_split(remote_path) + # construct remote file path + remote_file = posixpath.join(*remote_path) # if file exists in file system: check if remote file is newer TEST = False OVERWRITE = ' (clobber)' @@ -178,15 +251,24 @@ def ftp_mirror_file(ftp,remote_path,remote_mtime,local_file, # if file does not exist locally, is to be overwritten, or CLOBBER is set if TEST or CLOBBER: # Printing files transferred - remote_ftp_url = posixpath.join('ftp://',*remote_path) - logging.info(f'{remote_ftp_url} -->') - logging.info(f'\t{local_file}{OVERWRITE}\n') - # copy remote file contents to local file - with local_file.open(mode='wb') as f: - ftp.retrbinary(f'RETR {remote_file}', f.write) - # keep remote modification time of file and local access time - os.utime(local_file, (local_file.stat().st_atime, remote_mtime)) - local_file.chmod(mode=MODE) + logging.info(f'{remote_file} --> ') + logging.info(f'\t{str(local_file)}{OVERWRITE}\n') + # if executing copy command (not only printing the files) + if not LIST: + # Create and submit request. There are a wide range of exceptions + # that can be thrown here, including HTTPError and URLError. + request = gravtk.utilities.urllib2.Request(remote_file) + response = gravtk.utilities.urllib2.urlopen(request, + timeout=TIMEOUT) + # chunked transfer encoding size + CHUNK = 16 * 1024 + # copy contents to local file using chunked transfer encoding + # transfer should work properly with ascii and binary data formats + with local_file.open(mode='wb') as f: + shutil.copyfileobj(response, f, CHUNK) + # keep remote modification time of file and local access time + os.utime(local_file, (local_file.stat().st_atime, remote_mtime)) + local_file.chmod(mode=MODE) # PURPOSE: create argument parser def arguments(): @@ -243,14 +325,17 @@ def main(): parser = arguments() args,_ = parser.parse_known_args() + # GFZ ISDC https host + HOST = 'https://isdc-data.gfz.de/' # check internet connection before attempting to run program - HOST = 'isdcftp.gfz-potsdam.de' - if gravtk.utilities.check_ftp_connection(HOST): + if gravtk.utilities.check_connection(HOST): for DREL in args.release: - gfz_isdc_dealiasing_ftp(args.directory, DREL=DREL, + gfz_isdc_dealiasing_sync(args.directory, DREL=DREL, YEAR=args.year, MONTHS=args.month, TAR=args.tar, TIMEOUT=args.timeout, LOG=args.log, CLOBBER=args.clobber, MODE=args.mode) + else: + raise RuntimeError('Check internet connection') # run main program if __name__ == '__main__': diff --git a/access/gfz_isdc_grace_ftp.py b/access/gfz_isdc_grace_sync.py similarity index 68% rename from access/gfz_isdc_grace_ftp.py rename to access/gfz_isdc_grace_sync.py index f8a9187e..b641e5aa 100644 --- a/access/gfz_isdc_grace_ftp.py +++ b/access/gfz_isdc_grace_sync.py @@ -1,15 +1,11 @@ #!/usr/bin/env python u""" -gfz_isdc_grace_ftp.py -Written by Tyler Sutterley (09/2023) +gfz_isdc_grace_sync.py +Written by Tyler Sutterley (10/2025) Syncs GRACE/GRACE-FO data from the GFZ Information System and Data Center (ISDC) -Syncs CSR/GFZ/JPL files for RL06 GAA/GAB/GAC/GAD/GSM - GAA and GAB are GFZ/JPL only -Gets the latest technical note (TN) files -Gets the monthly GRACE/GRACE-FO newsletters CALLING SEQUENCE: - python gfz_isdc_grace_ftp.py + python gfz_isdc_grace_sync.py OUTPUTS: CSR RL06: GAC/GAD/GSM @@ -27,7 +23,6 @@ -L, --list: print files to be transferred, but do not execute transfer -l, --log: output log of files downloaded -C, --clobber: Overwrite existing data in transfer - --checksum: compare hashes to check if overwriting existing data -M X, --mode X: Local permissions mode of the directories and files synced PYTHON DEPENDENCIES: @@ -40,6 +35,7 @@ utilities.py: download and management utilities for syncing files UPDATE HISTORY: + Updated 10/2025: switch to https as ftp server is being retired Updated 09/2023: don't restrict version number to a set list Updated 05/2023: use pathlib to define and operate on paths Updated 12/2022: single implicit import of gravity toolkit @@ -68,11 +64,10 @@ import sys import os import re +import ssl import copy import time -import ftplib import shutil -import hashlib import logging import pathlib import argparse @@ -80,14 +75,16 @@ import gravity_toolkit as gravtk # PURPOSE: sync local GRACE/GRACE-FO files with GFZ ISDC server -def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], +def gfz_isdc_grace_sync(DIRECTORY, PROC=[], DREL=[], VERSION=[], NEWSLETTERS=False, TIMEOUT=None, LOG=False, LIST=False, - CLOBBER=False, CHECKSUM=False, MODE=None): + CLOBBER=False, MODE=None): # check if directory exists and recursively create if not DIRECTORY = pathlib.Path(DIRECTORY).expanduser().absolute() DIRECTORY.mkdir(mode=MODE, parents=True, exist_ok=True) + # GFZ ISDC https host + HOST = 'https://isdc-data.gfz.de/' # mission shortnames shortname = {'grace':'GRAC', 'grace-fo':'GRFO'} # datasets for each processing center @@ -110,10 +107,6 @@ def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], # standard output (terminal output) logging.basicConfig(level=logging.INFO) - # connect and login to GFZ ISDC ftp server - ftp = ftplib.FTP('isdcftp.gfz-potsdam.de', timeout=TIMEOUT) - ftp.login() - # Degree 1 (geocenter) coefficients logging.info('Degree 1 Coefficients:') local_dir = DIRECTORY.joinpath('geocenter') @@ -123,51 +116,51 @@ def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], # compile regular expression operator for remote files R1 = re.compile(r'TN-13_GEOC_(CSR|GFZ|JPL)_(.*?).txt$', re.VERBOSE) # get filenames from remote directory - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,'grace-fo','DOCUMENTS','TECHNICAL_NOTES'], - timeout=TIMEOUT, basename=True, pattern=R1, sort=True) + remote_files,remote_mtimes = http_list( + [HOST,'grace-fo','DOCUMENTS','TECHNICAL_NOTES'], + timeout=TIMEOUT, pattern=R1, sort=True) # for each file on the remote server for fi,remote_mtime in zip(remote_files,remote_mtimes): # extract filename from regex object - remote_path = [ftp.host,'grace-fo','DOCUMENTS','TECHNICAL_NOTES',fi] + remote_path = [HOST,'grace-fo','DOCUMENTS','TECHNICAL_NOTES',fi] local_file = local_dir.joinpath(fi) - ftp_mirror_file(ftp, remote_path, remote_mtime, + http_pull_file(remote_path, remote_mtime, local_file, TIMEOUT=TIMEOUT, LIST=LIST, - CLOBBER=CLOBBER, CHECKSUM=CHECKSUM, MODE=MODE) + CLOBBER=CLOBBER, MODE=MODE) # SLR C2,0 coefficients logging.info('C2,0 Coefficients:') # compile regular expression operator for remote files R1 = re.compile(r'TN-(05|07|11)_C20_SLR_RL(.*?).txt$', re.VERBOSE) # get filenames from remote directory - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,'grace','DOCUMENTS','TECHNICAL_NOTES'], - timeout=TIMEOUT, basename=True, pattern=R1, sort=True) + remote_files,remote_mtimes = http_list( + [HOST,'grace','DOCUMENTS','TECHNICAL_NOTES'], + timeout=TIMEOUT, pattern=R1, sort=True) # for each file on the remote server for fi,remote_mtime in zip(remote_files,remote_mtimes): # extract filename from regex object - remote_path = [ftp.host,'grace','DOCUMENTS','TECHNICAL_NOTES',fi] + remote_path = [HOST,'grace','DOCUMENTS','TECHNICAL_NOTES',fi] local_file = DIRECTORY.joinpath(re.sub(r'(_RL.*?).txt','.txt',fi)) - ftp_mirror_file(ftp, remote_path, remote_mtime, + http_pull_file(remote_path, remote_mtime, local_file, TIMEOUT=TIMEOUT, LIST=LIST, - CLOBBER=CLOBBER, CHECKSUM=CHECKSUM, MODE=MODE) + CLOBBER=CLOBBER, MODE=MODE) # SLR C3,0 coefficients logging.info('C3,0 Coefficients:') # compile regular expression operator for remote files R1 = re.compile(r'TN-(14)_C30_C20_SLR_GSFC.txt$', re.VERBOSE) # get filenames from remote directory - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,'grace-fo','DOCUMENTS','TECHNICAL_NOTES'], - timeout=TIMEOUT, basename=True, pattern=R1, sort=True) + remote_files,remote_mtimes = http_list( + [HOST,'grace-fo','DOCUMENTS','TECHNICAL_NOTES'], + timeout=TIMEOUT, pattern=R1, sort=True) # for each file on the remote server for fi,remote_mtime in zip(remote_files,remote_mtimes): # extract filename from regex object - remote_path = [ftp.host,'grace-fo','DOCUMENTS','TECHNICAL_NOTES',fi] + remote_path = [HOST,'grace-fo','DOCUMENTS','TECHNICAL_NOTES',fi] local_file = DIRECTORY.joinpath(re.sub(r'(SLR_GSFC)','GSFC_SLR',fi)) - ftp_mirror_file(ftp, remote_path, remote_mtime, + http_pull_file(remote_path, remote_mtime, local_file, TIMEOUT=TIMEOUT, LIST=LIST, - CLOBBER=CLOBBER, CHECKSUM=CHECKSUM, MODE=MODE) + CLOBBER=CLOBBER, MODE=MODE) # TN-08 GAE, TN-09 GAF and TN-10 GAG ECMWF atmosphere correction products logging.info('TN-08 GAE, TN-09 GAF and TN-10 GAG products:') @@ -178,17 +171,17 @@ def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], # compile regular expression operator for remote files R1 = re.compile(r'({0}|{1}|{2})'.format(*ECMWF_files), re.VERBOSE) # get filenames from remote directory - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,'grace','DOCUMENTS','TECHNICAL_NOTES'], - timeout=TIMEOUT, basename=True, pattern=R1, sort=True) + remote_files,remote_mtimes = http_list( + [HOST,'grace','DOCUMENTS','TECHNICAL_NOTES'], + timeout=TIMEOUT, pattern=R1, sort=True) # for each file on the remote server for fi,remote_mtime in zip(remote_files,remote_mtimes): # extract filename from regex object - remote_path = [ftp.host,'grace','DOCUMENTS','TECHNICAL_NOTES',fi] + remote_path = [HOST,'grace','DOCUMENTS','TECHNICAL_NOTES',fi] local_file = DIRECTORY.joinpath(fi) - ftp_mirror_file(ftp, remote_path, remote_mtime, + http_pull_file(remote_path, remote_mtime, local_file, TIMEOUT=TIMEOUT, LIST=LIST, - CLOBBER=CLOBBER, CHECKSUM=CHECKSUM, MODE=MODE) + CLOBBER=CLOBBER, MODE=MODE) # GRACE and GRACE-FO newsletters if NEWSLETTERS: @@ -203,25 +196,24 @@ def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], NAME = mi.upper().replace('-','_') R1 = re.compile(rf'{NAME}_SDS_NL_(\d+).pdf', re.VERBOSE) # find years for GRACE/GRACE-FO newsletters - years,_ = gravtk.utilities.ftp_list( - [ftp.host,mi,'DOCUMENTS','NEWSLETTER'], - timeout=TIMEOUT, basename=True, pattern=r'\d+', + years,_ = http_list([HOST,mi,'DOCUMENTS','NEWSLETTER'], + timeout=TIMEOUT, pattern=r'\d+', sort=True) # for each year of GRACE/GRACE-FO newsletters for Y in years: # find GRACE/GRACE-FO newsletters - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,mi,'DOCUMENTS','NEWSLETTER',Y], - timeout=TIMEOUT, basename=True, pattern=R1, + remote_files,remote_mtimes = http_list( + [HOST,mi,'DOCUMENTS','NEWSLETTER',Y], + timeout=TIMEOUT, pattern=R1, sort=True) # for each file on the remote server for fi,remote_mtime in zip(remote_files,remote_mtimes): # extract filename from regex object - remote_path = [ftp.host,mi,'DOCUMENTS','NEWSLETTER',Y,fi] + remote_path = [HOST,mi,'DOCUMENTS','NEWSLETTER',Y,fi] local_file = local_dir.joinpath(fi) - ftp_mirror_file(ftp, remote_path, remote_mtime, + http_pull_file(remote_path, remote_mtime, local_file, TIMEOUT=TIMEOUT, LIST=LIST, - CLOBBER=CLOBBER, CHECKSUM=CHECKSUM, MODE=MODE) + CLOBBER=CLOBBER, MODE=MODE) # GRACE/GRACE-FO level-2 spherical harmonic products logging.info('GRACE/GRACE-FO L2 Global Spherical Harmonics:') @@ -249,16 +241,16 @@ def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], # compile the regular expression operator to find files R1 = re.compile(rf'({ds}-(.*?)(gz|txt|dif))') # get filenames from remote directory - remote_files,remote_mtimes = gravtk.utilities.ftp_list( - [ftp.host,mi,'Level-2',pr,drel_str], timeout=TIMEOUT, - basename=True, pattern=R1, sort=True) + remote_files,remote_mtimes = http_list( + [HOST,mi,'Level-2',pr,drel_str], timeout=TIMEOUT, + pattern=R1, sort=True) for fi,remote_mtime in zip(remote_files,remote_mtimes): # extract filename from regex object - remote_path = [ftp.host,mi,'Level-2',pr,drel_str,fi] + remote_path = [HOST,mi,'Level-2',pr,drel_str,fi] local_file = local_dir.joinpath(fi) - ftp_mirror_file(ftp, remote_path, remote_mtime, + http_pull_file(remote_path, remote_mtime, local_file, TIMEOUT=TIMEOUT, LIST=LIST, - CLOBBER=CLOBBER, CHECKSUM=CHECKSUM, MODE=MODE) + CLOBBER=CLOBBER, MODE=MODE) # regular expression operator for data product rx = gravtk.utilities.compile_regex_pattern( pr, rl, ds, mission=shortname[mi]) @@ -278,35 +270,97 @@ def gfz_isdc_grace_ftp(DIRECTORY, PROC=[], DREL=[], VERSION=[], # change permissions of index file index_file.chmod(mode=MODE) - # close the ftp connection - ftp.quit() # close log file and set permissions level to MODE if LOG: LOGFILE.chmod(mode=MODE) +# PURPOSE: list a directory on the GFZ https server +def http_list( + HOST: str | list, + timeout: int | None = None, + context: ssl.SSLContext = gravtk.utilities._default_ssl_context, + pattern: str | re.Pattern = '', + sort: bool = False + ): + """ + List a directory on the GFZ https Server + + Parameters + ---------- + HOST: str or list + remote http host path + timeout: int or NoneType, default None + timeout in seconds for blocking operations + context: obj, default gravity_toolkit.utilities._default_ssl_context + SSL context for ``urllib`` opener object + pattern: str, default '' + regular expression pattern for reducing list + sort: bool, default False + sort output list + + Returns + ------- + colnames: list + column names in a directory + collastmod: list + last modification times for items in the directory + """ + # verify inputs for remote http host + if isinstance(HOST, str): + HOST = gravtk.utilities.url_split(HOST) + # regular expression pattern for finding files and modification times + parser = r'\(.*?)\<\/a\>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}\:\d{2})' + rx = re.compile(parser, re.VERBOSE) + # try listing from http + try: + # Create and submit request. + request = gravtk.utilities.urllib2.Request(posixpath.join(*HOST)) + response = gravtk.utilities.urllib2.urlopen(request, + timeout=timeout, context=context) + except Exception as exc: + raise Exception('List error from {0}'.format(posixpath.join(*HOST))) + # read the directory listing + contents = response.readlines() + # read and parse request for files (column names and modified times) + lines = [l for l in contents if rx.search(l.decode('utf-8'))] + # column names and last modified times + colnames = [None]*len(lines) + collastmod = [None]*len(lines) + for i, l in enumerate(lines): + colnames[i], lastmod = rx.findall(l.decode('utf-8')).pop() + # get the Unix timestamp value for a modification time + collastmod[i] = gravtk.utilities.get_unix_time(lastmod, + format='%Y-%m-%d %H:%M') + # reduce using regular expression pattern + if pattern: + i = [i for i,f in enumerate(colnames) if re.search(pattern, f)] + # reduce list of column names and last modified times + colnames = [colnames[indice] for indice in i] + collastmod = [collastmod[indice] for indice in i] + # sort the list + if sort: + i = [i for i,j in sorted(enumerate(colnames), key=lambda i: i[1])] + # sort list of column names and last modified times + colnames = [colnames[indice] for indice in i] + collastmod = [collastmod[indice] for indice in i] + # return the list of column names and last modified times + return (colnames, collastmod) + # PURPOSE: pull file from a remote host checking if file exists locally # and if the remote file is newer than the local file -def ftp_mirror_file(ftp,remote_path,remote_mtime,local_file, - TIMEOUT=None,LIST=False,CLOBBER=False,CHECKSUM=False,MODE=0o775): +def http_pull_file(remote_path, remote_mtime, local_file, + TIMEOUT=0, LIST=False, CLOBBER=False, MODE=0o775): + # verify inputs for remote http host + if isinstance(remote_path, str): + remote_path = gravtk.utilities.url_split(remote_path) + # construct remote file path + remote_file = posixpath.join(*remote_path) # if file exists in file system: check if remote file is newer TEST = False OVERWRITE = ' (clobber)' # check if local version of file exists local_file = pathlib.Path(local_file).expanduser().absolute() - if CHECKSUM and local_file.exists(): - # generate checksum hash for local file - # open the local_file in binary read mode - local_hash = gravtk.utilities.get_hash(local_file) - # copy remote file contents to bytesIO object - remote_buffer = gravtk.utilities.from_ftp(remote_path, - timeout=TIMEOUT) - # generate checksum hash for remote file - remote_hash = hashlib.md5(remote_buffer.getvalue()).hexdigest() - # compare checksums - if (local_hash != remote_hash): - TEST = True - OVERWRITE = f' (checksums: {local_hash} {remote_hash})' - elif local_file.exists(): + if local_file.exists(): # check last modification time of local file local_mtime = local_file.stat().st_mtime # if remote file is newer: overwrite the local file @@ -320,23 +374,21 @@ def ftp_mirror_file(ftp,remote_path,remote_mtime,local_file, # if file does not exist locally, is to be overwritten, or CLOBBER is set if TEST or CLOBBER: # Printing files transferred - remote_ftp_url = posixpath.join('ftp://',*remote_path) - logging.info(f'{remote_ftp_url} -->') + logging.info(f'{remote_file} --> ') logging.info(f'\t{str(local_file)}{OVERWRITE}\n') # if executing copy command (not only printing the files) if not LIST: - # copy file from ftp server or from bytesIO object - if CHECKSUM and local_file.exists(): - # store bytes to file using chunked transfer encoding - remote_buffer.seek(0) - with local_file.open(mode='wb') as f: - shutil.copyfileobj(remote_buffer, f, 16 * 1024) - else: - # path to remote file - remote_file = posixpath.join(*remote_path[1:]) - # copy remote file contents to local file - with local_file.open(mode='wb') as f: - ftp.retrbinary(f'RETR {remote_file}', f.write) + # Create and submit request. There are a wide range of exceptions + # that can be thrown here, including HTTPError and URLError. + request = gravtk.utilities.urllib2.Request(remote_file) + response = gravtk.utilities.urllib2.urlopen(request, + timeout=TIMEOUT) + # chunked transfer encoding size + CHUNK = 16 * 1024 + # copy contents to local file using chunked transfer encoding + # transfer should work properly with ascii and binary data formats + with local_file.open(mode='wb') as f: + shutil.copyfileobj(response, f, CHUNK) # keep remote modification time of file and local access time os.utime(local_file, (local_file.stat().st_atime, remote_mtime)) local_file.chmod(mode=MODE) @@ -385,9 +437,6 @@ def arguments(): parser.add_argument('--list','-L', default=False, action='store_true', help='Only print files that could be transferred') - parser.add_argument('--checksum', - default=False, action='store_true', - help='Compare hashes to check for overwriting existing data') parser.add_argument('--clobber','-C', default=False, action='store_true', help='Overwrite existing data in transfer') @@ -404,14 +453,15 @@ def main(): parser = arguments() args,_ = parser.parse_known_args() + # GFZ ISDC https host + HOST = 'https://isdc-data.gfz.de/' # check internet connection before attempting to run program - HOST = 'isdcftp.gfz-potsdam.de' - if gravtk.utilities.check_ftp_connection(HOST): - gfz_isdc_grace_ftp(args.directory, PROC=args.center, + if gravtk.utilities.check_connection(HOST): + gfz_isdc_grace_sync(args.directory, PROC=args.center, DREL=args.release, VERSION=args.version, NEWSLETTERS=args.newsletters, TIMEOUT=args.timeout, LIST=args.list, LOG=args.log, CLOBBER=args.clobber, - CHECKSUM=args.checksum, MODE=args.mode) + MODE=args.mode) else: raise RuntimeError('Check internet connection') diff --git a/access/podaac_cumulus.py b/access/podaac_cumulus.py index 71a85905..dcae0784 100644 --- a/access/podaac_cumulus.py +++ b/access/podaac_cumulus.py @@ -4,6 +4,7 @@ Written by Tyler Sutterley (11/2024) Syncs GRACE/GRACE-FO data from NASA JPL PO.DAAC Cumulus AWS S3 bucket + S3 Cumulus syncs are only available in AWS instances in us-west-2 Register with NASA Earthdata Login system: diff --git a/doc/source/api_reference/access/gfz_isdc_dealiasing_ftp.rst b/doc/source/api_reference/access/gfz_isdc_dealiasing_sync.rst similarity index 65% rename from doc/source/api_reference/access/gfz_isdc_dealiasing_ftp.rst rename to doc/source/api_reference/access/gfz_isdc_dealiasing_sync.rst index ca9af1e0..da1867d6 100644 --- a/doc/source/api_reference/access/gfz_isdc_dealiasing_ftp.rst +++ b/doc/source/api_reference/access/gfz_isdc_dealiasing_sync.rst @@ -1,20 +1,20 @@ -========================== -gfz_isdc_dealiasing_ftp.py -========================== +=========================== +gfz_isdc_dealiasing_sync.py +=========================== - Syncs GRACE Level-1b dealiasing products from the `GFZ Information System and Data Center (ISDC) `_ - Optionally outputs as monthly tar files `Source code`__ -.. __: https://github.com/tsutterley/gravity-toolkit/blob/main/access/gfz_isdc_dealiasing_ftp.py +.. __: https://github.com/tsutterley/gravity-toolkit/blob/main/access/gfz_isdc_dealiasing_sync.py Calling Sequence ################ .. argparse:: - :filename: gfz_isdc_dealiasing_ftp.py + :filename: gfz_isdc_dealiasing_sync.py :func: arguments - :prog: gfz_isdc_dealiasing_ftp.py + :prog: gfz_isdc_dealiasing_sync.py :nodescription: :nodefault: diff --git a/doc/source/api_reference/access/gfz_isdc_grace_ftp.rst b/doc/source/api_reference/access/gfz_isdc_grace_sync.rst similarity index 75% rename from doc/source/api_reference/access/gfz_isdc_grace_ftp.rst rename to doc/source/api_reference/access/gfz_isdc_grace_sync.rst index 033e5cc5..303391fa 100644 --- a/doc/source/api_reference/access/gfz_isdc_grace_ftp.rst +++ b/doc/source/api_reference/access/gfz_isdc_grace_sync.rst @@ -1,6 +1,6 @@ -===================== -gfz_isdc_grace_ftp.py -===================== +====================== +gfz_isdc_grace_sync.py +====================== - Syncs GRACE/GRACE-FO and auxiliary data from the `GFZ Information System and Data Center (ISDC) `_ - Syncs CSR/GFZ/JPL Level-2 spherical harmonic files @@ -10,14 +10,14 @@ gfz_isdc_grace_ftp.py `Source code`__ -.. __: https://github.com/tsutterley/gravity-toolkit/blob/main/access/gfz_isdc_grace_ftp.py +.. __: https://github.com/tsutterley/gravity-toolkit/blob/main/access/gfz_isdc_grace_sync.py Calling Sequence ################ .. argparse:: - :filename: gfz_isdc_grace_ftp.py + :filename: gfz_isdc_grace_sync.py :func: arguments - :prog: gfz_isdc_grace_ftp.py + :prog: gfz_isdc_grace_sync.py :nodescription: :nodefault: diff --git a/doc/source/index.rst b/doc/source/index.rst index 8a32f1fa..de8eb843 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -159,8 +159,8 @@ Contribute api_reference/access/cnes_grace_sync.rst api_reference/access/esa_costg_swarm_sync.rst api_reference/access/gfz_icgem_costg_ftp.rst - api_reference/access/gfz_isdc_dealiasing_ftp.rst - api_reference/access/gfz_isdc_grace_ftp.rst + api_reference/access/gfz_isdc_dealiasing_sync.rst + api_reference/access/gfz_isdc_grace_sync.rst api_reference/access/itsg_graz_grace_sync.rst api_reference/access/podaac_cumulus.rst diff --git a/setup.py b/setup.py index 855306c3..40ede658 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ import os -from setuptools import setup, find_packages +from setuptools import setup # list of all scripts to be included with package scripts = [] From e004166a5d43907fe59e20fa739081fcf0e7c8ff Mon Sep 17 00:00:00 2001 From: Tyler Sutterley Date: Tue, 7 Oct 2025 14:01:06 -0700 Subject: [PATCH 2/3] Update test_download_and_read.py --- test/test_download_and_read.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_download_and_read.py b/test/test_download_and_read.py index 700b7084..3980841a 100644 --- a/test/test_download_and_read.py +++ b/test/test_download_and_read.py @@ -28,6 +28,19 @@ def test_podaac_cumulus_download_and_read(username,password): assert all((Ylms[key] == val) for key,val in test.items()) assert (Ylms['clm'][2,0] == -0.484169355584e-03) +# PURPOSE: Download a GRACE file from GFZ and check that read program runs +def test_gfz_http_download_and_read(): + HOST=['https://isdc-data.gfz.de','grace','Level-2','CSR','RL06', + 'GSM-2_2002095-2002120_GRAC_UTCSR_BA01_0600.gz'] + # download and read as virtual file object + FILE = gravtk.utilities.from_http(HOST,verbose=True) + Ylms = gravtk.read_GRACE_harmonics(FILE, 60) + keys = ['time', 'start', 'end', 'clm', 'slm', 'eclm', 'eslm', 'header'] + test = dict(start=2452369.5, end=2452394.5) + assert all((key in Ylms.keys()) for key in keys) + assert all((Ylms[key] == val) for key,val in test.items()) + assert (Ylms['clm'][2,0] == -0.484169355584e-03) + # PURPOSE: Download a GRACE file from GFZ and check that read program runs def test_gfz_ftp_download_and_read(): HOST=['isdcftp.gfz-potsdam.de','grace','Level-2','CSR','RL06', From 55d2a214209437e68f3a6734ee87c9e44f7d7ffd Mon Sep 17 00:00:00 2001 From: Tyler Sutterley Date: Tue, 7 Oct 2025 14:02:16 -0700 Subject: [PATCH 3/3] Update pixi.lock --- pixi.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pixi.lock b/pixi.lock index 3c318e56..a289b9e1 100644 --- a/pixi.lock +++ b/pixi.lock @@ -5031,7 +5031,7 @@ packages: - pypi: ./ name: gravity-toolkit version: 1.2.4 - sha256: 78349d849cc9f0f33250239d7f0eb3bfe0e4c387f07127875a7c8b1409e3f645 + sha256: 3f8afea7f56123b97f283b73aba8217cd7af40580e34057759f2e877d5ff6c88 requires_dist: - boto3 - future