From dec27d4fd514c36cbfb2b1e4323cb39a5111739c Mon Sep 17 00:00:00 2001 From: Bruce Flynn Date: Thu, 2 Jan 2025 22:10:21 +0000 Subject: [PATCH] support multiple collections --- scripts/catgen | 166 +++++++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 76 deletions(-) diff --git a/scripts/catgen b/scripts/catgen index a04b0e0..b8921d9 100755 --- a/scripts/catgen +++ b/scripts/catgen @@ -12,7 +12,8 @@ from pathlib import Path import fnmeta import pystac -from pystac import Asset, Catalog, CatalogType, Item, ItemCollection, Link, RelType +from pystac import (Asset, Catalog, CatalogType, Item, ItemCollection, Link, + RelType) LOG = logging.getLogger("catgen") @@ -30,45 +31,45 @@ def get_times(fpath: Path) -> "tuple[datetime, datetime] | None": def generate_collection( - collection_id: str, - pattern: str, + inputs: list[tuple[str, Path]], basedir: Path, ) -> ItemCollection: - dirpath, pattern = os.path.split(pattern) items: list[Item] = [] - for fpath in Path(dirpath).glob(pattern): - LOG.debug("adding datafile %s", fpath) - times = get_times(fpath) - if not times: - LOG.warning("failed to determine file times for %s; skipping", fpath.name) - continue - start, end = times - - try: - item = Item( - id=fpath.name.rsplit(".", 1)[0], - geometry=None, - bbox=None, - properties={}, - datetime=start, - start_datetime=start, - end_datetime=end, - # The href will alway automatically be update to be absolute which will - # likely break consumers of this collection, so don't set it. Let consumers - # assume the asset paths are relateive to basedir. - # href=str(basedir), - assets={ - fpath.name: Asset( - href=str(fpath.relative_to(basedir)), roles=["data"] - ), - }, - **({"collection": collection_id} if collection_id else {}), # type: ignore - ) - items.append(item) - except pystac.STACError: - LOG.exception("failed to create or add item for %s", fpath.name) - continue + for collection_id, pattern in inputs: + dirpath, pattern = os.path.split(pattern) + for fpath in Path(dirpath).glob(pattern): + LOG.debug("adding datafile %s", fpath) + times = get_times(fpath) + if not times: + LOG.warning("failed to determine file times for %s; skipping", fpath.name) + continue + start, end = times + + try: + item = Item( + id=fpath.name.rsplit(".", 1)[0], + geometry=None, + bbox=None, + properties={}, + datetime=start, + start_datetime=start, + end_datetime=end, + # The href will alway automatically be update to be absolute which will + # likely break consumers of this collection, so don't set it. Let consumers + # assume the asset paths are relateive to basedir. + # href=str(basedir), + assets={ + fpath.name: Asset( + href=str(fpath.relative_to(basedir)), roles=["data"] + ), + }, + collection=collection_id + ) + items.append(item) + except pystac.STACError: + LOG.exception("failed to create or add item for %s", fpath.name) + continue if len(items) == 0: raise ValueError("(catgen) Failed to catalog any files") @@ -79,46 +80,48 @@ def generate_collection( def generate_catalog( - collection_id: str, - pattern: str, + inputs: list[tuple[str, Path]], basedir: Path, ) -> Catalog: - dirpath, pattern = os.path.split(pattern) - catalog = Catalog(collection_id, description="STAC Catalog") + + catalog = Catalog("catgen", description="STAC Catalog generated by catgen") catalog.add_link(Link(RelType.SELF, str(basedir / "catalog.json"))) - for fpath in Path(dirpath).glob(pattern): - LOG.debug("adding datafile %s", fpath) - times = get_times(fpath) - if not times: - LOG.warning("failed to determine file times for %s; skipping", fpath.name) - continue - start, end = times - try: - item = Item( - id=fpath.name.rsplit(".", 1)[0], - geometry=None, - bbox=None, - properties={}, - datetime=start, - start_datetime=start, - end_datetime=end, - assets={ - fpath.name: Asset(href=str(fpath), roles=["data"]), - }, - **({"collection": collection_id} if collection_id else {}), # type: ignore - ) - meta_path = fpath.with_suffix(".json") - LOG.debug("adding metadata %s", meta_path) - item.add_asset( - meta_path.name, Asset(href=str(meta_path), roles=["metadata"]) - ) - json.dump(item.to_dict(), open(meta_path, "w")) - - catalog.add_link(Link(RelType.ITEM, str(meta_path), "application/json")) - except pystac.STACError: - LOG.exception("failed to create or add item for %s", fpath.name) - continue + for collection_id, pattern in inputs: + dirpath, pattern = os.path.split(pattern) + + for fpath in Path(dirpath).glob(pattern): + LOG.debug("adding datafile %s", fpath) + times = get_times(fpath) + if not times: + LOG.warning("failed to determine file times for %s; skipping", fpath.name) + continue + start, end = times + try: + item = Item( + id=fpath.name.rsplit(".", 1)[0], + geometry=None, + bbox=None, + properties={}, + datetime=start, + start_datetime=start, + end_datetime=end, + assets={ + fpath.name: Asset(href=str(fpath), roles=["data"]), + }, + collection=collection_id, + ) + meta_path = fpath.with_suffix(".json") + LOG.debug("adding metadata %s", meta_path) + item.add_asset( + meta_path.name, Asset(href=str(meta_path), roles=["metadata"]) + ) + json.dump(item.to_dict(), open(meta_path, "w")) + + catalog.add_link(Link(RelType.ITEM, str(meta_path), "application/json")) + except pystac.STACError: + LOG.exception("failed to create or add item for %s", fpath.name) + continue if len(list(catalog.get_all_items())) == 0: raise ValueError("(catgen) Failed to catalog any files") @@ -140,18 +143,29 @@ if __name__ == "__main__": default="catalog", help="Write either a Catalog or FeatureCollection", ) - parser.add_argument("collection_id") + def inputtype(v: str) -> tuple[str, Path]: + r = [s.strip() for s in v.split(",")] + return r[0], Path(r[1]).absolute() + parser.add_argument( - "file_pattern", help="Glob style file pattern to include in catalog" + "input", + nargs="+", + metavar="SPEC", + type=inputtype, + help=( + "Where SPEC is ,; file-pattern is a shell style " + "glob pattern. Each file found matching file-pattern will be assigned a collection " + "id of collection. One or more items must be specified."), ) args = parser.parse_args() + print(args.input) + logging.basicConfig(level=logging.INFO, format="%(name)s -- %(message)s") LOG.setLevel(logging.DEBUG if args.verbose else logging.INFO) catalog = globals()[f"generate_{args.type}"]( - args.collection_id, - Path(args.file_pattern).absolute(), + args.input, basedir=Path(".").absolute(), ) if args.verbose: