From 96c4d9c3ee21ad24a4f8d9f6acfd91a39fb11dae Mon Sep 17 00:00:00 2001
From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com>
Date: Wed, 3 Feb 2021 11:40:56 -0600
Subject: [PATCH 01/23] Create gediCombine.py

This program can be used to download GEDI files using a text file of urls
provided by EarthData Search. Combined with the gediFinder.py program, one
should be able to gather data from a large bounding box using Earthdata
Search without a lot of computer storage space and in a timely manner.
---
 gedi/gediCombine.py | 416 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 416 insertions(+)
 create mode 100644 gedi/gediCombine.py

diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py
new file mode 100644
index 0000000..376a0c2
--- /dev/null
+++ b/gedi/gediCombine.py
@@ -0,0 +1,416 @@
+"""
+This program goes through the following process:
+    Download GEDI HDF5 files
+    Retrieve appropriate data from files
+    Remove GEDI HDF5 files
+    Write data to file
+The following layers are retrieved from the raw GEDI data:
+    shot_number, lat_lowestmode, lon_lowestmode, elev_lowestmode, elev_highestreturn,
+    sensitivity, quality_flag, rh
+rh is converted to the following canopy height metrics:
+    canopy height max, min, std, avg
+    canopy height percentiles 10, 25, 50, 75, 90
+    number of canopy height measurements in the ranges h<5m, 5m<h<10m,
+    10m<h<20m, and 20m<h<30m
+"""
+
+import io
+import os
+import h5py
+import zipfile
+import argparse
+import requests
+
+import numpy as np
+import pandas as pd
+import geopandas as gp
+
+from tqdm import tqdm
+from typing import List
+from shutil import rmtree
+from shapely.geometry import Point
+
+
+def gedi_L2A_to_df(
+    file_paths: List[str],
+    bbox: List[float],
+    layers: List[str] = [
+        "shot_number",
+        "lat_lowestmode",
+        "lon_lowestmode",
+        "elev_lowestmode",
+        "elev_highestreturn",
+        "sensitivity",
+        "quality_flag",
+        "rh",
+    ],
+    latlayer: str = "lat_lowestmode",
+    lonlayer: str = "lon_lowestmode",
+) -> pd.DataFrame:
+    """
+    Given the paths to a set of GEDI Level 2A h5 files and information about
+    the desired output, ingest the h5 files and output a pd.DataFrame
+    containing data for all the valid shots within the bounding box.
+    The output will have an array-valued column that is designed to be handled
+    by append_canopy_metrics and then deleted from the DataFrame.
+    Parameters
+    ----------
+    file_paths : List[str]
+        The paths of the h5 files to process.
+    bbox : List[float]
+        The coordinates of the bounding box, formatted as
+        [ul_lat, ul_lon, lr_lat, lr_lon].
+    layers : List[str]
+        The columns for the output DataFrame.
+    latlayer : str
+        The name of the latitude layer.
+    lonlayer : str
+        The name of the longitude layer.
+    Returns
+    -------
+    pd.DataFrame containing the user-specified contents of the h5 files for the
+    valid shots within the specified bounding box.
+    """
+    df = pd.DataFrame()
+    for _f_name in file_paths:
+        try:
+            _f = h5py.File(f"{_f_name}", "r")
+            print(f"Processing file {os.path.basename(_f_name)}")
+            [ul_lat, ul_lon, lr_lat, lr_lon] = bbox
+            for beam in [
+                "BEAM0000",
+                "BEAM0001",
+                "BEAM0010",
+                "BEAM0011",
+                "BEAM0101",
+                "BEAM0110",
+                "BEAM1000",
+                "BEAM1011",
+            ]:
+                tmp_df = pd.DataFrame()
+                x = _f[beam][latlayer][()]
+                y = _f[beam][lonlayer][()]
+                qual = _f[beam]["quality_flag"][()]
+
+                mask = (
+                    (qual == 1)
+                    & (x <= ul_lat)
+                    & (x >= lr_lat)
+                    & (y <= lr_lon)
+                    & (y >= ul_lon)
+                )
+
+                if np.count_nonzero(mask) == 0:
+                    print(f"{beam} in {os.path.basename(_f_name)} does not contain any usable data")
+                    continue
+
+                else:
+                    tmp_df["beam"] = [beam] * np.count_nonzero(mask)
+
+                    for layer in layers:
+                        tmp_df[layer] = _f[beam][layer][()][mask].tolist()
+
+                    append_canopy_metrics(tmp_df, canopy_threshold=2)
+                    del tmp_df["rh"]
+
+                    df = df.append(tmp_df)
+
+            print(f"Finished processing file {os.path.basename(_f_name)}")
+
+        except KeyError as e:
+            print(f"Encountered file read error with file {_f_name}: {e}")
+
+
+    return df.reset_index(drop=True)
+
+
+def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array:
+    """
+    This code is taken directly from StackOverflow at:
+    https://stackoverflow.com/questions/60015245/numpy-nanpercentile-is-extremely-slow
+    For a 2d masked array of roughly 10^5 x 100, this performs ~3x faster than
+    np.nanpercentile (~10s vs. 3s) and produces results which are identical.
+    Would be good to add two tests to verify: (1) the results of this function are
+    the same as np.nanpercentile, and (2) that this function takes less time on a
+    reasonably sized array, in case of future performance gains in nanpercentile.
+    Parameters
+    ----------
+    a : array_like
+        2d array for which to compute a row-wise percentile.
+    q : float
+        Percentile to compute -- allowed values are [0, 100]
+    Returns
+    -------
+    np.array of the q'th percentile for each row of a. Dimensions of [len(a), 1].
+    """
+    if q < 0 or q > 100:
+        raise ValueError(f"Expected a value between 0 and 100; received {q} instead.")
+    mask = (a >= np.nanmin(a)).astype(int)
+
+    count = mask.sum(axis=1)
+    groups = np.unique(count)
+    groups = groups[groups > 0]
+
+    p = np.zeros((a.shape[0]))
+    for g in range(len(groups)):
+        pos = np.where(count == groups[g])
+        values = a[pos]
+        values = np.nan_to_num(values, nan=(np.nanmin(a) - 1))
+        values = np.sort(values, axis=1)
+        values = values[:, -groups[g] :]
+        p[pos] = np.percentile(values, q, axis=1)
+    return p
+
+
+def append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
+    """
+    This function takes a pd.DataFrame object and a numerical value corresponding
+    to the height threshold to consider a return as coming from the forest canopy.
+    It extracts the "rh" column from the provided DataFrame as an np.ndarray,
+    applies a mask, and computes the relevant canopy metrics which are then
+    appended to the provided DataFrame.
+    Note: the least bad option for computing these canopy metrics seemed to be
+    using the np.ma module to work with masked arrays. Not converting out of Pandas
+    is untenable due to computation time, and the varying dimension of rows when
+    subset to canopy-only observations means np converts the result of applying
+    the mask to the 2d array to a 1d array.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame, assumed to be the output of gedi_L2A_to_df, for which
+        canopy metrics should be calculated. The provided df must contain a column
+        labeled "rh" which is array-valued.
+    canopy_threshold : float
+        The minimum value for a return to be considered a "canopy" return.
+    Side Effects
+    ------------
+    Modifies the provided pd.DataFrame in-place to add canopy metrics.
+    Returns
+    -------
+    None
+    """
+    rh = np.stack(df.rh.to_numpy())
+    canopy_returns = np.ma.masked_less(rh, canopy_threshold)
+
+    df["canopy_max"] = pd.Series(np.max(canopy_returns, axis=1))
+    df["canopy_min"] = pd.Series(np.min(canopy_returns, axis=1))
+    df["canopy_std"] = pd.Series(np.std(canopy_returns, axis=1))
+    df["canopy_avg"] = pd.Series(np.average(canopy_returns, axis=1))
+    df["dns"] = pd.Series(
+        canopy_returns.count(axis=1)
+    )  # count of returns >= canopy threshold
+
+    canopy_returns = np.ma.filled(canopy_returns, np.nan)
+    df["canopy_p10"] = pd.Series(_compute_nan_percentile(canopy_returns, 10))
+    df["canopy_p25"] = pd.Series(_compute_nan_percentile(canopy_returns, 25))
+    df["canopy_p50"] = pd.Series(_compute_nan_percentile(canopy_returns, 50))
+    df["canopy_p75"] = pd.Series(_compute_nan_percentile(canopy_returns, 75))
+    df["canopy_p90"] = pd.Series(_compute_nan_percentile(canopy_returns, 90))
+
+    df["d01"] = pd.Series(
+        np.ma.masked_outside(rh, canopy_threshold, 5).count(axis=1)
+    )  # count of returns >= canopy threshold, <= 5m
+    df["d02"] = pd.Series(
+        np.ma.masked_outside(rh, 5, 10).count(axis=1)
+    )  # count of returns >= 5m, <= 10m
+    df["d03"] = pd.Series(
+        np.ma.masked_outside(rh, 10, 20).count(axis=1)
+    )  # count of returns >= 10m, <= 20m
+    df["d04"] = pd.Series(
+        np.ma.masked_outside(rh, 20, 30).count(axis=1)
+    )  # count of returns >= 20m, <= 30m
+
+
+def df_to_geojson(df : pd.DataFrame, outfile : str) -> None:
+    """
+    Convert pandas dataframe to GeoJSON file and save as given file name and path.
+    This file type can be useful when working locally in GIS software.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame, assumed to be the output of gedi_L2A_to_df, with or without
+        canopy height metrics appended.
+    outfile : str
+        The location and filename of output json file.
+    Side Effects
+    ------------
+    Writes dataframe to geojson filetype
+    Returns
+    -------
+    None
+    """
+    df["geometry"] = df.apply(
+        lambda row: Point(row.lon_lowestmode, row.lat_lowestmode), axis=1
+    )
+    GeoDF = gp.GeoDataFrame(df)
+    GeoDF = GeoDF.drop(columns=["lat_lowestmode", "lon_lowestmode"])
+    GeoDF.to_file(outfile, driver="GeoJSON")
+
+def download_url(url : str, dir : str) -> bool:
+    """
+    Download the zip file at the given url and extract its contents into dir.
+    Parameters
+    ----------
+    url : str
+        The url of a zip file supplied by EarthData Search.
+    dir : str
+        The directory into which the contents of the zip file are extracted.
+    Side Effects
+    ------------
+    Downloads a zip file and extracts its contents to disk.
+    Returns
+    -------
+    bool indicating whether the download and extraction succeeded.
+    """
+    r = requests.get(url)
+
+    if r.status_code == requests.codes.ok:
+        print(f"Downloading files from {url}")
+        z = zipfile.ZipFile(io.BytesIO(r.content))
+        z.extractall(dir)
+
+        print(f"Completed download of files from {url}.")
+
+        del r, z
+        return True
+
+    else:
+        print(
+            f'Error {r.status_code} has occurred.\n'
+            f'{url} cannot be downloaded at this time.'
+        )
+
+        return False
+
+if __name__ == "__main__":
+    # things I would like to see: displays 'arg help' if no arguments are given, docstrings with proper usage
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-d",
+        "--dir",
+        help=(
+            "The directory containing the url txt file supplied by the "
+            "--textfile argument."
+        ),
+    )
+    parser.add_argument(
+        "-t",
+        "--textfile",
+        help=(
+            "The txt file containing the urls for the zip files, supplied by "
+            "EarthData Search."
+        ),
+    )
+    parser.add_argument(
+        "-b",
+        "--bbox",
+        help=(
+            "The bounding box of the region of interest. 
In format" + "ul_lat,ul_lon,lr_lat,lr_lon" + ) + ) + parser.add_argument( + "-o", + "--outfile", + help=( + "The stem of the name of the output file, without file extension," + "optional. Default value is 'gedi_output'." + ), + default="gedi_output", + ) + parser.add_argument( + "-f", + "--filetype", + help=( + "The type of file to output. Acceptable formats are: csv," + "parquet, GeoJSON. Default value is csv." + ), + default="csv", + ) + args = parser.parse_args() + + print("------------------------------------------") + + bbox = [float(b) for b in args.bbox.split(',')] + + df = pd.DataFrame() + + with open(str(args.textfile), 'r') as texturls: + for url in texturls: + status = download_url(url, args.dir) + if status: + continue + else: + print("------------------------------------------") + break + + # Make list of filepaths + filepaths = [ + os.path.join(root, file) + for root, dir, files in os.walk(args.dir, topdown = False) + for file in files + if file.endswith('.h5') + ] + + tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) + + # append data to dataframe + df = df.append(tmp_df) + print(f"Appended {len(tmp_df)} lines to dataframe") + + # Delete hdf5 files + for filepath in filepaths: + rmtree(os.path.dirname(filepath)) + + print("------------------------------------------") + + if df.empty: + print("DataFrame is empty. Not writing data to file") + print("------------------------------------------") + else: + if args.filetype.lower() == "csv": + filename = os.path.join(args.dir, args.outfile + ".csv") + print(f'Writing to file {filename}') + df.to_csv(filename, index=False) + elif args.filetype.lower() == "parquet": + filename = os.path.join(args.dir, args.outfile + ".parquet.gzip") + print(f'Writing to file {filename}') + df.to_parquet(filename, compression="gzip") + elif args.filetype.lower() == "geojson": + filename = os.path.join(args.dir, args.outfile + ".geojson") + print(f'Writing to file {filename}') + df_to_geojson(df, filename) + else: + raise ValueError( + f"Received unsupported file type {args.filetype}. Please provide one of: csv, parquet, or GeoJSON." + ) + print("Job Complete!") + print("------------------------------------------") From ffb27e9cf1eaa5dfcbf3c23f0314e098776c2efa Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 11:55:28 -0600 Subject: [PATCH 02/23] Update README.md --- gedi/README.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 6bdbcb3..2f141e4 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -1,4 +1,53 @@ # GEDI -This folder is intended for code relating to data from the [GEDI](https://gedi.umd.edu/) mission. The Level 1B, Level 2A, and Level 2B data from GEDI are of particular interest for our work here. +This folder is intended for code relating to data from the [GEDI](https://gedi.umd.edu/) mission. The Level 2A data from GEDI are of particular interest for our work here. This dataset allows us to measure canopy heights for large swaths of land at approximately 30m resolution. 
-Currently, we are using the rGEDI package (see: [CRAN](https://cran.r-project.org/web/packages/rGEDI/index.html), [tutorial](https://cran.r-project.org/web/packages/rGEDI/vignettes/tutorial.html), [paper](https://www.researchgate.net/publication/339971349_rGEDI_An_R_Package_for_NASA%27s_Global_Ecosystem_Dynamics_Investigation_GEDI_Data_Visualizing_and_Processing), [source](https://github.com/carlos-alberto-silva/rGEDI)) to download and process the data from GEDI. As a result, a considerable amount of code here will likely be in R.
+
+# GEDI Retrieval
+The GEDI mission is a relatively new mission to the story of Earth Observation, and is still currently taking data of the Earth's surface while aboard the International Space Station. Thus the GEDI data is still in a rudimentary form. When downloaded, the entire Level 2A dataset available as of 3 February 2021 is nearly 40 TB. This is too big for many individuals to work with, and time needed to download that much data to an individual machine would be too great for the highly adaptive work we are doing.
+
+We must therefore develop a methodology that can quickly find, retrieve, download, and gather the appropriate data for our purpose. Here, we present such a methodology that utilizes the NASA LP DAAC GEDI Data Finder service (https://lpdaac.usgs.gov/news/release-gedi-finder-web-service/) and the NASA EarthData Search application (https://earthdata.nasa.gov) to accomplish this task.
+
+## GEDI Finder
+open terminal
+
+In terminal:
+conda activate pyGEDI
+cd /Users/ryancarlson/Earthshot_Labs/Carbon/GEDI/Data/L2A_PNW/Test
+python gediFinder.py -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a
+
+open granule_list.txt
+copy contents
+
+## EarthData Search
+open safari
+go to https://search.earthdata.nasa.gov/search
+Search collections for gedi 2a
+Click on GEDI L2A Elevation and Height Metrics Data Global Footprint Level V001
+Paste granule list in granule search on left side and press enter
+Click download all button
+click edit options on left side
+Select customize
+Select "Click to enable" in "Spatial subsetting"
+Enter in appropriate bounds for the bounding box (see the worked example at the end of this section)
+    North -> ul_lat
+    West -> ul_lon
+    East -> lr_lon
+    South -> lr_lat
+Click Done
+Click Download Data
+
+Wait until data has been processed by NASA. This could take anywhere from a few minutes to multiple days depending on the size of the bounding box and other variables on the server side.
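+
+For example, with a hypothetical bounding box of 44.2,-122.8,43.5,-121.9
+(given as ul_lat,ul_lon,lr_lat,lr_lon), the fields would be filled in as:
+    North -> 44.2 (ul_lat)
+    West -> -122.8 (ul_lon)
+    East -> -121.9 (lr_lon)
+    South -> 43.5 (lr_lat)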
+
+# GEDI Combine
+open safari
+navigate to download page in Earthdata Search
+open html file (first file listed in download links)
+Download first zip file
+
+open Finder
+Open README in unzipped file
+Create new text file to store zip urls
+
+open terminal (again)
+
+In terminal:
+python gediCombine_individual.py -d DirectoryPath -t FilePath -o gedi_output -f csv -b ul_lat,ul_lon,lr_lat,lr_lon

From d5a63240a902a92e35708acc5ec8c0f99f732626 Mon Sep 17 00:00:00 2001
From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com>
Date: Wed, 3 Feb 2021 11:56:38 -0600
Subject: [PATCH 03/23] Update README.md

---
 gedi/README.md | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/gedi/README.md b/gedi/README.md
index 2f141e4..8a5382c 100644
--- a/gedi/README.md
+++ b/gedi/README.md
@@ -10,44 +10,63 @@ We must therefore develop a methodology that can quickly find, retrieve, downloa
 open terminal
 
 In terminal:
+
 conda activate pyGEDI
-cd /Users/ryancarlson/Earthshot_Labs/Carbon/GEDI/Data/L2A_PNW/Test
+
+cd loc
 python gediFinder.py -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a
 
 open granule_list.txt
+
 copy contents
 
 ## EarthData Search
 open safari
+
 go to https://search.earthdata.nasa.gov/search
+
 Search collections for gedi 2a
+
 Click on GEDI L2A Elevation and Height Metrics Data Global Footprint Level V001
+
 Paste granule list in granule search on left side and press enter
+
 Click download all button
+
 click edit options on left side
+
 Select customize
+
 Select "Click to enable" in "Spatial subsetting"
+
 Enter in appropriate bounds for the bounding box (see the worked example at the end of this section)
     North -> ul_lat
     West -> ul_lon
     East -> lr_lon
     South -> lr_lat
+
 Click Done
+
 Click Download Data
 
 Wait until data has been processed by NASA. This could take anywhere from a few minutes to multiple days depending on the size of the bounding box and other variables on the server side.
 
 # GEDI Combine
 open safari
+
 navigate to download page in Earthdata Search
+
 open html file (first file listed in download links)
+
 Download first zip file
 
 open Finder
 Open README in unzipped file
+
 Create new text file to store zip urls
 
-open terminal (again)
+open terminal
 
 In terminal:
+
 python gediCombine_individual.py -d DirectoryPath -t FilePath -o gedi_output -f csv -b ul_lat,ul_lon,lr_lat,lr_lon

From 183fd1364ab365c1e426b4513789035db05b7ae9 Mon Sep 17 00:00:00 2001
From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com>
Date: Wed, 3 Feb 2021 11:57:58 -0600
Subject: [PATCH 04/23] Create gediFinder.py

---
 gedi/gediFinder.py | 134 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 gedi/gediFinder.py

diff --git a/gedi/gediFinder.py b/gedi/gediFinder.py
new file mode 100644
index 0000000..3f87cd4
--- /dev/null
+++ b/gedi/gediFinder.py
@@ -0,0 +1,134 @@
+"""
+This script is meant to create a GediFinder URL and get the corresponding list of granules within a
+user-defined bounding box. This list can then be used in the Earthdata Search Tool to pull data from
+within a bounding box.
+
+To Run
+----------
+
+
+Arguments
+-------
+    -b, --bbox : str
+        The bounding box of the region of interest. In format
+        ul_lat,ul_lon,lr_lat,lr_lon
+    -l,--level : str (optional)
+        The data product level of interest. Acceptable answers at this time
+        are 1B, 2A, or 2B with default of 2A
+        default = 2A
+    -o,--outfile : str (optional)
+        The stem of the name of the output file, without file extension,
+        optional. Default value is 'granule_list'.
+        default = granule_list
+
+Outputs
+-------
+Writes granule list to a txt file in
+"""
+
+import os
+import requests
+import argparse
+
+from urllib.parse import urlencode
+from typing import List
+
+def gediFinder(
+    level: str,
+    bbox: str,
+    version: str = "001",
+    output: str = "json",
+    lpdaac: str = "https://lpdaacsvc.cr.usgs.gov/services/gedifinder",
+) -> str:
+    """
+    Build a GEDI Finder request for the given product and bounding box and
+    return the names of the matching granules.
+    Parameters
+    ----------
+    level : str
+        The GEDI data product to search for, e.g. GEDI02_A.
+    version : str
+        The version of the GEDI data product.
+    bbox : str
+        The bounding box of the region of interest, in format
+        ul_lat,ul_lon,lr_lat,lr_lon.
+    output : str
+        The response format requested from the GEDI Finder service.
+    lpdaac : str
+        The base url of the LP DAAC GEDI Finder service.
+    Returns
+    -------
+    str of all HDF5 granules separated by commas
+    """
+
+    payload = {
+        'product': f'{level}',
+        'version': f'{version}',
+        'bbox': f'{bbox}',
+        'output': f'{output}'
+    }
+    payload_str = urlencode(payload, safe=',')
+
+    r = requests.get(lpdaac, params = payload_str)
+
+    if r.status_code == requests.codes.ok:
+        print('Success!')
+
+        granules = [g.split('/')[-1] for g in r.json()['data']]  # take filename from url
+        return ','.join(granules)
+
+    else:
+        print(f'Error {r.status_code} has occurred.')
+
+        error = str(
+            f'URL below failed to retrieve GEDI data. '
+            f'Error {r.status_code} has occurred. \n {r.url}'
+        )
+        return error
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-d",
+        "--dir",
+        help=(
+            "The directory containing url txt file, formatted with a trailing slash,"
+            " such that {dir}{fname} is a valid path, for fname a valid file name."
+        ),
+    )
+    parser.add_argument(
+        "-b",
+        "--bbox",
+        help=(
+            "The bounding box of the region of interest. In format "
+            "ul_lat,ul_lon,lr_lat,lr_lon"
+        )
+    )
+    parser.add_argument(
+        "-l",
+        "--level",
+        help=(
+            "The data product level of interest. Acceptable answers at this time "
+            "are 1B, 2A, or 2B with default of 2A"
+        ),
+        default = "2A"
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        help=(
+            "The stem of the name of the output file, without file extension, "
+            "optional. Default value is 'granule_list'."
+        ),
+        default="granule_list",
+    )
+    args = parser.parse_args()
+
+    if args.level.upper() == '1B':
+        level = 'GEDI01_B'
+    elif args.level.upper() == '2A':
+        level = 'GEDI02_A'
+    elif args.level.upper() == '2B':
+        level = 'GEDI02_B'
+    else:
+        raise ValueError(
+            f"Received unsupported data level {args.level}. 
Please provide 1B, 2A, or 2B" + ) + + with open(os.path.join(args.dir, args.outfile + ".txt"), 'w+') as file: + file.write(gediFinder(level, args.bbox)) From 183fd1364ab365c1e426b4513789035db05b7ae9 Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:04:14 -0600 Subject: [PATCH 05/23] Optimized _compute_nan_percentile function --- gedi/gediCombine.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py index 376a0c2..aec947c 100644 --- a/gedi/gediCombine.py +++ b/gedi/gediCombine.py @@ -175,20 +175,18 @@ def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array: """ if q < 0 or q > 100: raise ValueError(f"Expected a value between 0 and 100; received {q} instead.") - mask = (a >= np.nanmin(a)).astype(int) - - count = mask.sum(axis=1) - groups = np.unique(count) - groups = groups[groups > 0] - + a = np.sort(a, axis =1) + count = (~np.isnan(a)).sum(axis=1) # count number of non-nans in row + groups = np.unique(count) # returns sorted unique values + groups = groups[groups > 0] # only returns groups with at least 1 non-nan value\n", + p = np.zeros((a.shape[0])) - for g in range(len(groups)): - pos = np.where(count == groups[g]) + for group in groups: + pos = np.where(count == group) values = a[pos] - values = np.nan_to_num(values, nan=(np.nanmin(a) - 1)) - values = np.sort(values, axis=1) - values = values[:, -groups[g] :] + values = values[:, :group] p[pos] = np.percentile(values, q, axis=1) + return p From 1f5cf2b52cb2023ce44607146f8edd8c1429daeb Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:05:28 -0600 Subject: [PATCH 06/23] Deleted unused module import --- gedi/gediCombine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py index aec947c..1f8a61e 100644 --- a/gedi/gediCombine.py +++ b/gedi/gediCombine.py @@ -51,7 +51,6 @@ import pandas as pd import geopandas as gp -from tqdm import tqdm from typing import List from shutil import rmtree from shapely.geometry import Point From 96a3e524ab56c31b8c2151452683480258621ecd Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:09:49 -0600 Subject: [PATCH 07/23] Update gediFinder.py --- gedi/gediFinder.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gedi/gediFinder.py b/gedi/gediFinder.py index 3f87cd4..49b3c90 100644 --- a/gedi/gediFinder.py +++ b/gedi/gediFinder.py @@ -5,10 +5,13 @@ To Run ---------- - +python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l level -o granule_list Arguments ------- + -d,--dir : str + The directory containing url txt file, formatted with a trailing slash, + such that {dir}{fname} is a valid path, for fname a valid file name. -b, --bbox : str The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon @@ -23,14 +26,13 @@ Outputs ------- -Writes granule list to a txt file in +Writes granule list to a txt file in directory """ import requests import argparse from urllib.parse import urlencode -from typing import List def gediFinder( level: str, @@ -65,7 +67,6 @@ def gediFinder( payload_str = urlencode(payload, safe=',') r = requests.get(lpdaac, params = payload_str) - if r.status_code == requests.codes.ok: print('Success!') From 1a5323dcd6112bcfca75bd068aec5b9a3ade4c64 Mon Sep 17 00:00:00 2001 From: "Ryan M. 
Carlson" Date: Wed, 3 Feb 2021 16:00:17 -0600 Subject: [PATCH 08/23] Update GEDI Finder section of README --- gedi/README.md | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 8a5382c..79052ed 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -1,24 +1,31 @@ # GEDI -This folder is intended for code relating to data from the [GEDI](https://gedi.umd.edu/) mission. The Level 2A data from GEDI are of particular interest for our work here. This dataset allows us to measure canopy heights for large swaths of land at approximately 30m resolution. +This folder is intended for code relating to data from the [GEDI](https://gedi.umd.edu/) mission. +The Level 2A data from GEDI are of particular interest for our work here. +This dataset allows us to measure canopy heights for large swaths of land at approximately 30m resolution. # GEDI Retrieval -The GEDI mission is a relatively new mission to the story of Earth Observation, and is still currently taking data of the Earth's surface while aboard the International Space Station. Thus the GEDI data is still in a rudimentary form. When downloaded, the entire Level 2A dataset available as of 3 February 2021 is nearly 40 TB. This is too big for many individuals to work with, and time needed to download that much data to an individual machine would be too great for the highly adaptive work we are doing. +The GEDI mission is a relatively new mission to the story of Earth Observation, and is still currently taking data of the Earth's surface while aboard the International Space Station. +Thus the GEDI data is still in a rudimentary form. +When downloaded, the entire Level 2A dataset available as of 3 February 2021 is nearly 40 TB. +This is too big for many individuals to work with, and time needed to download that much data to an individual machine would be too great for the highly adaptive work we are doing. -We must therefore develop a methodology that can quickly find, retrieve, download, and gather the appropriate data for our purpose. Here, we present such a methodology that utilizes the NASA LP DAAC GEDI Data Finder service (https://lpdaac.usgs.gov/news/release-gedi-finder-web-service/) and the NASA EarthData Search application (https://earthdata.nasa.gov) to accomplish this task. +We must therefore develop a methodology that can quickly find, retrieve, download, and gather the appropriate data for our purpose. +Here, we present such a methodology that utilizes the NASA LP DAAC GEDI Data Finder service (https://lpdaac.usgs.gov/news/release-gedi-finder-web-service/) and the NASA EarthData Search application (https://earthdata.nasa.gov) to accomplish this task. ## GEDI Finder -open terminal -In terminal: +The GEDI Finder program (gediFinder.py) builds off the NASA service, the LP DAAC GEDI Finder, and allows for a fast and easy way to determine the appropriate GEDI data for a specific bounding box. -conda activate pyGEDI +The user supplies the output directory and the bounding box of interest, and the user has the option to supply the GEDI data level and the name of the file created. +If the optional arguments for the GEDI data level and the name of the output file are not supplied, the program uses the default parameters of L2A and granule_list, respectively. -cd loc -python gediFinder.py -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a +A comma-separated list of each GEDI granule that intersects the bounding box of interest is written to the output file as a .txt file. 
+The list of granules is used to select the appropriate granules to download in the NASA EarthData Search application. +***To use this list in future steps, the user must open the file and copy the entire contents.*** -open granule_list.txt +The program can be run, once the user has navigated to the appropriate directory, as follows: -copy contents +`python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` ## EarthData Search open safari @@ -44,7 +51,7 @@ Enter in appropriate bounds for the boundign box West -> ul_lon East -> lr_lon South -> lr_lat - + Click Done Click Download Data From 33c16fa6e3509999b1ffb1c07991b096a5fa48e6 Mon Sep 17 00:00:00 2001 From: "Ryan M. Carlson" Date: Wed, 3 Feb 2021 17:08:33 -0600 Subject: [PATCH 09/23] Update GEDI Combine header in README --- gedi/README.md | 53 +++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 79052ed..234c511 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -3,16 +3,16 @@ This folder is intended for code relating to data from the [GEDI](https://gedi.u The Level 2A data from GEDI are of particular interest for our work here. This dataset allows us to measure canopy heights for large swaths of land at approximately 30m resolution. -# GEDI Retrieval +## GEDI Retrieval The GEDI mission is a relatively new mission to the story of Earth Observation, and is still currently taking data of the Earth's surface while aboard the International Space Station. Thus the GEDI data is still in a rudimentary form. When downloaded, the entire Level 2A dataset available as of 3 February 2021 is nearly 40 TB. This is too big for many individuals to work with, and time needed to download that much data to an individual machine would be too great for the highly adaptive work we are doing. We must therefore develop a methodology that can quickly find, retrieve, download, and gather the appropriate data for our purpose. -Here, we present such a methodology that utilizes the NASA LP DAAC GEDI Data Finder service (https://lpdaac.usgs.gov/news/release-gedi-finder-web-service/) and the NASA EarthData Search application (https://earthdata.nasa.gov) to accomplish this task. +Here, we present such a methodology that utilizes the [NASA LP DAAC GEDI Data Finder service](https://lpdaac.usgs.gov/news/release-gedi-finder-web-service/) and the [NASA EarthData Search application](https://earthdata.nasa.gov/search) to accomplish this task. -## GEDI Finder +### GEDI Finder The GEDI Finder program (gediFinder.py) builds off the NASA service, the LP DAAC GEDI Finder, and allows for a fast and easy way to determine the appropriate GEDI data for a specific bounding box. @@ -23,11 +23,19 @@ A comma-separated list of each GEDI granule that intersects the bounding box of The list of granules is used to select the appropriate granules to download in the NASA EarthData Search application. ***To use this list in future steps, the user must open the file and copy the entire contents.*** -The program can be run, once the user has navigated to the appropriate directory, as follows: +#### To Run +`python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` [source](gediFinder.py) -`python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` +#### Arguments +1. ***-d,--dir*** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. 
***-b, --bbox*** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. +3. ***-l,--level*** *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. +4. ***-o,--outfile*** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. -## EarthData Search +#### Output +Text file with a comma-separated list of GEDI granules that intersect user-defined bounding box + +### EarthData Search open safari go to https://search.earthdata.nasa.gov/search @@ -58,22 +66,27 @@ Click Download Data Wait until data has been processed by Nasa. This could take anywhere from a few minutes to multiple days depending on the size of the bounding box and other variables on the server side. -# GEDI Combine -open safari - -navigate to download page in Earthdata Search - -open html file (first file listed in download links) - -Download first zip file +### GEDI Combine +The NASA EarthData Search will collect, clip, and process the GEDI granules, creating a number of zip files with the processed data to download. +The gediCombine program we present here takes the list of urls necessary to download the zip files and extracts the data from the files after downloading them. -open Finder -Open README in unzipped file +This program cycles through the list of urls supplied, conducting the following operations to extract the data. +* Download zip file and unzip contents +* Extracts useful data from files +* Delete HDF5 files +* Write data to a pandas DataFrame -Create new text file to store zip urls +After each zip file has been processed and deleted, the DataFrame is written to a csv, parquet, or GeoJSON file, depending on inputs from the user. -open terminal +#### To Run +`python gediCombine_individual.py -d DirectoryPath -t FilePath -o gedi_output -f csv -b ul_lat,ul_lon,lr_lat,lr_lon` [source](gediCombine.py) -In terminal: +#### Arguments +1. ***-d,--dir*** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. ***-t,--textfile*** : The txt file containing the urls for the zip files, supplied by EarthData Search. +3. ***-b,--bbox*** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon +4. ***-o,--outfile*** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. +5. ***-f,--filetype*** *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. -python gediCombine_individual.py -d DirectoryPath -t FilePath -o gedi_output -f csv -b ul_lat,ul_lon,lr_lat,lr_lon +#### Output +Generates file of input filetype with Level 2B data retrieved from GEDI servers. From 6bc69ac024804f99989d9a2032e49b2ac7105cf0 Mon Sep 17 00:00:00 2001 From: "Ryan M. 
Carlson" Date: Wed, 3 Feb 2021 20:15:06 -0600 Subject: [PATCH 10/23] README.md --- gedi/README.md | 63 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 234c511..43a4986 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -7,11 +7,26 @@ This dataset allows us to measure canopy heights for large swaths of land at app The GEDI mission is a relatively new mission to the story of Earth Observation, and is still currently taking data of the Earth's surface while aboard the International Space Station. Thus the GEDI data is still in a rudimentary form. When downloaded, the entire Level 2A dataset available as of 3 February 2021 is nearly 40 TB. -This is too big for many individuals to work with, and time needed to download that much data to an individual machine would be too great for the highly adaptive work we are doing. +This is too big for many individuals to work with on their personal machines, and time needed to download that much data would be too great for the highly adaptive workflow we are implement. We must therefore develop a methodology that can quickly find, retrieve, download, and gather the appropriate data for our purpose. Here, we present such a methodology that utilizes the [NASA LP DAAC GEDI Data Finder service](https://lpdaac.usgs.gov/news/release-gedi-finder-web-service/) and the [NASA EarthData Search application](https://earthdata.nasa.gov/search) to accomplish this task. +## Dependencies +* `io` +* `os` +* `h5py` +* `numpy` +* `pandas` +* `zipfile` +* `argparse` +* `requests` +* `geopandas ` +* `List from typing` +* `remtree from shutil` +* `Point from shapely.geometry` +* `urlencode from urllib.parse` + ### GEDI Finder The GEDI Finder program (gediFinder.py) builds off the NASA service, the LP DAAC GEDI Finder, and allows for a fast and easy way to determine the appropriate GEDI data for a specific bounding box. @@ -36,35 +51,43 @@ The list of granules is used to select the appropriate granules to download in t Text file with a comma-separated list of GEDI granules that intersect user-defined bounding box ### EarthData Search -open safari -go to https://search.earthdata.nasa.gov/search +Pages 2-4 in the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf) provides a great overview of the process we implement in relation to data collection from [NASA EarthData Search application](https://earthdata.nasa.gov/search). +The following is pasted ***directly*** from the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf). Some modifications have been made for readability and context and are *emphasized*. + +1. ***Access Earthdata Search*** + +After *obtaining a comma-separated list of GEDI granules with [GEDIFinder](#gedi finder)*, open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. + +Note: Currently there are no spatial searching capabilities for the GEDI Version 1 datasets in Earthdata Search. + +2. ***Search for Dataset*** -Search collections for gedi 2a +Search for a collection by entering the dataset short name *(e.g. GEDI02_A)* into the search box then select the desired product from the list of matching collections. +All available granules for the product will be included in the list of matching granules. 
-Click on GEDI L2A Elevation and Height Metrics Data Global Footprint Level V001 +3. ***Search for Granules*** +Copy the list of comma-separated granule IDs *obtained with [GEDIFinder](#gedi finder)* and paste it into the Granule Search box in Earthdata Search. Use the Enter key to initiate the search. -Paste granule list in granule search on left side and press enter +4. ***Select spatial and/or layer parameters for GEDI granules*** -Click download all button +Click on the green Download All button to open the download and order menu. Under “Select Data Access Method,” select Customize. -click edit options on left side +To set up the parameters for clipping out a smaller area of the granule, scroll down to the Spatial Subsetting section. +Check the box next to Click to Enable and enter coordinates of the bounding box for the ROI. -Select customize +To select specific science dataset layers, scroll down to the Band Subsetting section. Expand the directories and select the desired layers. Additional information for each of the data layers can be found on the [GEDI01_B](https://doi.org/10.5067/GEDI/GEDI01_B.001), [GEDI02_A](https://doi.org/10.5067/GEDI/GEDI02_A.001), or [GEDI02_B](https://doi.org/10.5067/GEDI/GEDI02_B.001) Digital Object Identifier (DOI) landing page. -Select "Click to enable" in spatial subsetting" +5. ***Place Order*** -Enter in appropriate bounds for the boundign box - North -> ul_lat - West -> ul_lon - East -> lr_lon - South -> lr_lat +After the desired parameters for spatial and/or layer subsetting have been selected, click Done to complete the custom order form then click Download Data to initiate the order. +When the data request is submitted, an order confirmation email is sent to the email address associated with the Earthdata login credentials or specified in the custom order form. -Click Done +6. ***Retrieve Data*** -Click Download Data +A status update email for the data processing request will be delivered when the order has completed. The order completion email contains URLs for accessing the data outputs. +Please note that the URLs have an expiration date and are only valid for one week. -Wait until data has been processed by Nasa. This could take anywhere from a few minutes to multiple days depending on the size of the bounding box and other variables on the server side. ### GEDI Combine The NASA EarthData Search will collect, clip, and process the GEDI granules, creating a number of zip files with the processed data to download. @@ -79,11 +102,11 @@ This program cycles through the list of urls supplied, conducting the following After each zip file has been processed and deleted, the DataFrame is written to a csv, parquet, or GeoJSON file, depending on inputs from the user. #### To Run -`python gediCombine_individual.py -d DirectoryPath -t FilePath -o gedi_output -f csv -b ul_lat,ul_lon,lr_lat,lr_lon` [source](gediCombine.py) +`python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv ` [source](gediCombine.py) #### Arguments 1. ***-d,--dir*** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. ***-t,--textfile*** : The txt file containing the urls for the zip files, supplied by EarthData Search. +2. ***-t,--textfile*** : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. 3. ***-b,--bbox*** : The bounding box of the region of interest. 
In format ul_lat,ul_lon,lr_lat,lr_lon 4. ***-o,--outfile*** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. 5. ***-f,--filetype*** *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. From 13ca22a10880ec268653ecb4bbc03cdca58303ef Mon Sep 17 00:00:00 2001 From: "Ryan M. Carlson" Date: Wed, 3 Feb 2021 20:26:59 -0600 Subject: [PATCH 11/23] README.md --- gedi/README.md | 80 ++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 43a4986..3e831e9 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -36,16 +36,16 @@ If the optional arguments for the GEDI data level and the name of the output fil A comma-separated list of each GEDI granule that intersects the bounding box of interest is written to the output file as a .txt file. The list of granules is used to select the appropriate granules to download in the NASA EarthData Search application. -***To use this list in future steps, the user must open the file and copy the entire contents.*** +**To use this list in future steps, the user must open the file and copy the entire contents.** #### To Run `python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` [source](gediFinder.py) #### Arguments -1. ***-d,--dir*** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. ***-b, --bbox*** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. -3. ***-l,--level*** *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. -4. ***-o,--outfile*** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. +1. **-d,--dir** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. **-b, --bbox** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. +3. **-l,--level** *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. +4. **-o,--outfile** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. #### Output Text file with a comma-separated list of GEDI granules that intersect user-defined bounding box @@ -53,40 +53,42 @@ Text file with a comma-separated list of GEDI granules that intersect user-defin ### EarthData Search Pages 2-4 in the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf) provides a great overview of the process we implement in relation to data collection from [NASA EarthData Search application](https://earthdata.nasa.gov/search). -The following is pasted ***directly*** from the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf). Some modifications have been made for readability and context and are *emphasized*. - -1. 
***Access Earthdata Search*** - -After *obtaining a comma-separated list of GEDI granules with [GEDIFinder](#gedi finder)*, open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. - -Note: Currently there are no spatial searching capabilities for the GEDI Version 1 datasets in Earthdata Search. - -2. ***Search for Dataset*** - -Search for a collection by entering the dataset short name *(e.g. GEDI02_A)* into the search box then select the desired product from the list of matching collections. -All available granules for the product will be included in the list of matching granules. - -3. ***Search for Granules*** -Copy the list of comma-separated granule IDs *obtained with [GEDIFinder](#gedi finder)* and paste it into the Granule Search box in Earthdata Search. Use the Enter key to initiate the search. - -4. ***Select spatial and/or layer parameters for GEDI granules*** - -Click on the green Download All button to open the download and order menu. Under “Select Data Access Method,” select Customize. - -To set up the parameters for clipping out a smaller area of the granule, scroll down to the Spatial Subsetting section. -Check the box next to Click to Enable and enter coordinates of the bounding box for the ROI. - -To select specific science dataset layers, scroll down to the Band Subsetting section. Expand the directories and select the desired layers. Additional information for each of the data layers can be found on the [GEDI01_B](https://doi.org/10.5067/GEDI/GEDI01_B.001), [GEDI02_A](https://doi.org/10.5067/GEDI/GEDI02_A.001), or [GEDI02_B](https://doi.org/10.5067/GEDI/GEDI02_B.001) Digital Object Identifier (DOI) landing page. - -5. ***Place Order*** - -After the desired parameters for spatial and/or layer subsetting have been selected, click Done to complete the custom order form then click Download Data to initiate the order. -When the data request is submitted, an order confirmation email is sent to the email address associated with the Earthdata login credentials or specified in the custom order form. - -6. ***Retrieve Data*** - -A status update email for the data processing request will be delivered when the order has completed. The order completion email contains URLs for accessing the data outputs. -Please note that the URLs have an expiration date and are only valid for one week. +The following is pasted **directly** from the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf). Some modifications have been made for readability and context and are *emphasized*. + +> 1. **Access Earthdata Search** +> +> After *obtaining a comma-separated list of GEDI granules with [GEDIFinder](#gedi finder)*, open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. +> +> Note: Currently there are no spatial searching capabilities for the GEDI Version 1 datasets in Earthdata Search. +> +> 2. **Search for Dataset** +> +> Search for a collection by entering the dataset short name *(e.g. GEDI02_A)* into the search box then select the desired product from the list of matching collections. +> All available granules for the product will be included in the list of matching granules. +> +> 3. 
**Search for Granules** +> Copy the list of comma-separated granule IDs *obtained with [GEDIFinder](#gedi finder)* and paste it into the Granule Search box in Earthdata Search. Use the Enter key to initiate the search. +> +> 4. **Select spatial and/or layer parameters for GEDI granules** +> +> Click on the green Download All button to open the download and order menu. Under “Select Data Access Method,” select Customize. +> +> To set up the parameters for clipping out a smaller area of the granule, scroll down to the Spatial Subsetting section. +> Check the box next to Click to Enable and enter coordinates of the bounding box for the ROI. +> +> To select specific science dataset layers, scroll down to the Band Subsetting section. +> Expand the directories and select the desired layers. +> Additional information for each of the data layers can be found on the [GEDI01_B](https://doi.org/10.5067/GEDI/GEDI01_B.001), [GEDI02_A](https://doi.org/10.5067/GEDI/GEDI02_A.001), or [GEDI02_B](https://doi.org/10.5067/GEDI/GEDI02_B.001) Digital Object Identifier (DOI) landing page. +> +> 5. **Place Order** +> +> After the desired parameters for spatial and/or layer subsetting have been selected, click Done to complete the custom order form then click Download Data to initiate the order. +> When the data request is submitted, an order confirmation email is sent to the email address associated with the Earthdata login credentials or specified in the custom order form. +> +> 6. **Retrieve Data** +> +> A status update email for the data processing request will be delivered when the order has completed. The order completion email contains URLs for accessing the data outputs. +> Please note that the URLs have an expiration date and are only valid for one week. ### GEDI Combine From 338974de5eaad0be80ca7fd25512f6442e7bb582 Mon Sep 17 00:00:00 2001 From: "Ryan M. Carlson" Date: Wed, 3 Feb 2021 20:28:18 -0600 Subject: [PATCH 12/23] README.md --- gedi/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 3e831e9..3dafff4 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -57,7 +57,7 @@ The following is pasted **directly** from the [GEDI Spatial Querying and Subsett > 1. **Access Earthdata Search** > -> After *obtaining a comma-separated list of GEDI granules with [GEDIFinder](#gedi finder)*, open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. +> After *obtaining a comma-separated list of GEDI granules with [GEDIFinder](###gedi finder)*, open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. > > Note: Currently there are no spatial searching capabilities for the GEDI Version 1 datasets in Earthdata Search. > @@ -67,7 +67,8 @@ The following is pasted **directly** from the [GEDI Spatial Querying and Subsett > All available granules for the product will be included in the list of matching granules. > > 3. **Search for Granules** -> Copy the list of comma-separated granule IDs *obtained with [GEDIFinder](#gedi finder)* and paste it into the Granule Search box in Earthdata Search. Use the Enter key to initiate the search. +> +> Copy the list of comma-separated granule IDs *obtained with [GEDIFinder](###gedi finder)* and paste it into the Granule Search box in Earthdata Search. 
Use the Enter key to initiate the search. > > 4. **Select spatial and/or layer parameters for GEDI granules** > From 90ac975b308e627538b7ab01072a16194e799c54 Mon Sep 17 00:00:00 2001 From: "Ryan M. Carlson" Date: Wed, 3 Feb 2021 20:52:44 -0600 Subject: [PATCH 13/23] README.md --- gedi/README.md | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index 3dafff4..e00dcfa 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -56,8 +56,7 @@ Pages 2-4 in the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpda The following is pasted **directly** from the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf). Some modifications have been made for readability and context and are *emphasized*. > 1. **Access Earthdata Search** -> -> After *obtaining a comma-separated list of GEDI granules with [GEDIFinder](###gedi finder)*, open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. +> After *obtaining a comma-separated list of GEDI granules with GEDI Finder* , open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. > > Note: Currently there are no spatial searching capabilities for the GEDI Version 1 datasets in Earthdata Search. > @@ -68,7 +67,7 @@ The following is pasted **directly** from the [GEDI Spatial Querying and Subsett > > 3. **Search for Granules** > -> Copy the list of comma-separated granule IDs *obtained with [GEDIFinder](###gedi finder)* and paste it into the Granule Search box in Earthdata Search. Use the Enter key to initiate the search. +> Copy the list of comma-separated granule IDs *obtained with GEDI Finder* and paste it into the Granule Search box in Earthdata Search. Use the Enter key to initiate the search. > > 4. **Select spatial and/or layer parameters for GEDI granules** > @@ -91,6 +90,14 @@ The following is pasted **directly** from the [GEDI Spatial Querying and Subsett > A status update email for the data processing request will be delivered when the order has completed. The order completion email contains URLs for accessing the data outputs. > Please note that the URLs have an expiration date and are only valid for one week. +There are a few more steps we implement to prepare the data for download and extraction with GEDI Combine. + +7. **Get List of Zip File URLs** + +Copy the url ending with `.zip?1` and paste into the web browser of your choosing. +The ZIP file should begin downloading. +Once the file has completed downloading, unzip the file and access the README file in the unzipped folder. +This README file will be used as the `--textfile` argument when running the GEDI Combine script. ### GEDI Combine The NASA EarthData Search will collect, clip, and process the GEDI granules, creating a number of zip files with the processed data to download. @@ -105,14 +112,14 @@ This program cycles through the list of urls supplied, conducting the following After each zip file has been processed and deleted, the DataFrame is written to a csv, parquet, or GeoJSON file, depending on inputs from the user. 
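
As a rough sketch of how that output might be loaded back for analysis (the file names below assume the default *gedi_output* stem and are otherwise hypothetical; adjust them to the `--dir` and `--outfile` values actually used):

```python
import pandas as pd
import geopandas as gp

# Load whichever of the three output formats was requested with --filetype.
df = pd.read_csv("gedi_output.csv")                 # --filetype csv
# df = pd.read_parquet("gedi_output.parquet.gzip")  # --filetype parquet
# gdf = gp.read_file("gedi_output.geojson")         # --filetype GeoJSON

# Summarize a few of the appended canopy height metrics.
print(df[["canopy_max", "canopy_p90", "dns"]].describe())
```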
#### To Run -`python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv ` [source](gediCombine.py) +`python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` [source](gediCombine.py) #### Arguments -1. ***-d,--dir*** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. ***-t,--textfile*** : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. -3. ***-b,--bbox*** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon -4. ***-o,--outfile*** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. -5. ***-f,--filetype*** *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. +1. **-d,--dir** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. **-t,--textfile** : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. +3. **-b,--bbox** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon +4. **-o,--outfile** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. +5. **-f,--filetype** *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. #### Output Generates file of input filetype with Level 2B data retrieved from GEDI servers. From bd55b199940b35863723db3e582644b311ccb340 Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 20:55:22 -0600 Subject: [PATCH 14/23] Update README.md --- gedi/README.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index e00dcfa..c7df58f 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -39,13 +39,13 @@ The list of granules is used to select the appropriate granules to download in t **To use this list in future steps, the user must open the file and copy the entire contents.** #### To Run -`python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` [source](gediFinder.py) +`python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` ([source](gediFinder.py)) #### Arguments -1. **-d,--dir** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. **-b, --bbox** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. -3. **-l,--level** *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. -4. **-o,--outfile** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. +1. `**-d,--dir**` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. 
`**-b, --bbox**` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. +3. `**-l,--level**` *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. +4. `**-o,--outfile**` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. #### Output Text file with a comma-separated list of GEDI granules that intersect user-defined bounding box @@ -56,6 +56,7 @@ Pages 2-4 in the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpda The following is pasted **directly** from the [GEDI Spatial Querying and Subsetting Quick Guide](https://lpdaac.usgs.gov/documents/635/GEDI_Quick_Guide.pdf). Some modifications have been made for readability and context and are *emphasized*. > 1. **Access Earthdata Search** +> > After *obtaining a comma-separated list of GEDI granules with GEDI Finder* , open [NASA Earthdata Search](https://search.earthdata.nasa.gov/). Sign in with Earthdata Login credentials or [register](https://urs.earthdata.nasa.gov/users/new) for a new account. > > Note: Currently there are no spatial searching capabilities for the GEDI Version 1 datasets in Earthdata Search. @@ -112,14 +113,14 @@ This program cycles through the list of urls supplied, conducting the following After each zip file has been processed and deleted, the DataFrame is written to a csv, parquet, or GeoJSON file, depending on inputs from the user. #### To Run -`python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` [source](gediCombine.py) +`python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` ([source](gediCombine.py)) #### Arguments -1. **-d,--dir** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. **-t,--textfile** : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. -3. **-b,--bbox** : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon -4. **-o,--outfile** *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. -5. **-f,--filetype** *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. +1. `**-d,--dir**` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. `**-t,--textfile**` : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. +3. `**-b,--bbox**` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon +4. `**-o,--outfile**` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. +5. `**-f,--filetype**` *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. #### Output Generates file of input filetype with Level 2B data retrieved from GEDI servers. 
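As a concrete end-to-end example of how the two scripts chain together (the directory, bounding box, and file names here are illustrative only, not values from the repository): `python gediFinder.py -d ./data/ -b 44.5,-122.3,44.1,-121.9 -l 2A -o granule_list` produces the granule list to paste into Earthdata Search, and once the order's URL text file is saved as `./data/urls.txt`, `python gediCombine.py -d ./data/ -t ./data/urls.txt -b 44.5,-122.3,44.1,-121.9 -o gedi_output -f csv` downloads, subsets, and writes the combined output.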
From a7784a7f99029f485c85b76e284dcb168c9adcbc Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 20:56:09 -0600 Subject: [PATCH 15/23] Update README.md --- gedi/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index c7df58f..c03320c 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -42,10 +42,10 @@ The list of granules is used to select the appropriate granules to download in t `python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` ([source](gediFinder.py)) #### Arguments -1. `**-d,--dir**` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. `**-b, --bbox**` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. -3. `**-l,--level**` *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. -4. `**-o,--outfile**` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. +1. `-d,--dir` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. `-b, --bbox` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. +3. `-l,--level` *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. +4. `-o,--outfile` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. #### Output Text file with a comma-separated list of GEDI granules that intersect user-defined bounding box @@ -116,11 +116,11 @@ After each zip file has been processed and deleted, the DataFrame is written to `python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` ([source](gediCombine.py)) #### Arguments -1. `**-d,--dir**` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. `**-t,--textfile**` : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. -3. `**-b,--bbox**` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon -4. `**-o,--outfile**` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. -5. `**-f,--filetype**` *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. +1. `-d,--dir` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +2. `-t,--textfile` : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. +3. `-b,--bbox` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon +4. `-o,--outfile` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. +5. 
`-f,--filetype` *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. #### Output Generates file of input filetype with Level 2B data retrieved from GEDI servers. From f2a91ad85b3cd212b8d8abbd7d044b98bab40cc5 Mon Sep 17 00:00:00 2001 From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com> Date: Wed, 3 Feb 2021 20:56:57 -0600 Subject: [PATCH 16/23] Update README.md --- gedi/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index c03320c..9eed7d9 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -42,10 +42,10 @@ The list of granules is used to select the appropriate granules to download in t `python gediFinder.py -d DirectoryPath -b ul_lat,ul_lon,lr_lat,lr_lon -l 2a` ([source](gediFinder.py)) #### Arguments -1. `-d,--dir` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. `-b, --bbox` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. -3. `-l,--level` *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. -4. `-o,--outfile` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. +- `-d,--dir` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +- `-b, --bbox` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon. +- `-l,--level` *(optional)* : The data product level of interest. Acceptable answers at this time are 1B, 2A, or 2B with default of 2A. The default argument if none is given is *2A*. +- `-o,--outfile` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *granule_list*. #### Output Text file with a comma-separated list of GEDI granules that intersect user-defined bounding box @@ -116,11 +116,11 @@ After each zip file has been processed and deleted, the DataFrame is written to `python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` ([source](gediCombine.py)) #### Arguments -1. `-d,--dir` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. -2. `-t,--textfile` : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. -3. `-b,--bbox` : The bounding box of the region of interest. In format ul_lat,ul_lon,lr_lat,lr_lon -4. `-o,--outfile` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*. -5. `-f,--filetype` *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*. +- `-d,--dir` : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. +- `-t,--textfile` : The file path for the txt file containing the downloaded urls of the zip files, supplied by EarthData Search. +- `-b,--bbox` : The bounding box of the region of interest. 
In format ul_lat,ul_lon,lr_lat,lr_lon.
+- `-o,--outfile` *(optional)* : The stem of the name of the output file, without file extension, optional. The default argument if none is given is *gedi_output*.
+- `-f,--filetype` *(optional)* : The type of file to output. Acceptable formats are: csv, parquet, GeoJSON. The default argument if none is given is *csv*.
 
 #### Output
 Generates file of input filetype with Level 2B data retrieved from GEDI servers.
From 632db4df32c0ef27e36c981230dd1f5da7206a9d Mon Sep 17 00:00:00 2001
From: Ryan Carlson <60898723+rcarlson89@users.noreply.github.com>
Date: Wed, 3 Feb 2021 20:57:48 -0600
Subject: [PATCH 17/23] Update README.md

---
 gedi/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gedi/README.md b/gedi/README.md
index 9eed7d9..3c66aff 100644
--- a/gedi/README.md
+++ b/gedi/README.md
@@ -91,7 +91,7 @@ The following is pasted **directly** from the [GEDI Spatial Querying and Subsett
 > A status update email for the data processing request will be delivered when the order has completed. The order completion email contains URLs for accessing the data outputs.
 > Please note that the URLs have an expiration date and are only valid for one week.
 
-There are a few more steps we implement to prepare the data for download and extraction with GEDI Combine.
+There is another step we implement to prepare the data for download and extraction with GEDI Combine.
 
 7. **Get List of Zip File URLs**
 
From 053bf9cff81a62e44377df28b4e08139bbb256c4 Mon Sep 17 00:00:00 2001
From: "Ryan M. Carlson" 
Date: Thu, 4 Feb 2021 09:38:46 -0600
Subject: [PATCH 18/23] gediCombine.py

---
 gedi/gediCombine.py | 54 ++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py
index 1f8a61e..6342c24 100644
--- a/gedi/gediCombine.py
+++ b/gedi/gediCombine.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 This program goes through the following process:
     Download GEDI HDF5 files
@@ -41,6 +42,7 @@ Generates file of input filetype with Level 2B data retrieved from GEDI servers.
 
 """
+import validators
 import argparse
 import requests
 import zipfile
@@ -139,7 +141,7 @@ def gedi_L2A_to_df(
             for layer in layers:
                 tmp_df[layer] = _f[beam][layer][()][mask].tolist()
 
-            append_canopy_metrics(tmp_df, canopy_threshold=2)
+            _append_canopy_metrics(tmp_df, canopy_threshold=2)
             del tmp_df["rh"]
 
             df = df.append(tmp_df)
@@ -178,18 +180,18 @@ def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array:
     count = (~np.isnan(a)).sum(axis=1) # count number of non-nans in row
     groups = np.unique(count) # returns sorted unique values
     groups = groups[groups > 0] # only returns groups with at least 1 non-nan value
-
+
     p = np.zeros((a.shape[0]))
     for group in groups:
         pos = np.where(count == group)
         values = a[pos]
         values = values[:, :group]
         p[pos] = np.percentile(values, q, axis=1)
-
+
     return p
 
 
-def append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
+def _append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
     """
     This function takes a pd.DataFrame object and a numerical value corresponding
     to the height threshold to consider a return as coming from the forest canopy.
@@ -362,32 +364,34 @@ def download_url(url : str, dir : str) -> None: with open(str(args.textfile), 'r') as texturls: for url in texturls: - status = download_url(url, args.dir) - if status: - continue - else: - print("------------------------------------------") - break + if validators.url(url): - # Make list of filepaths - filepaths = [ - os.path.join(root, file) - for root, dir, files in os.walk(args.dir, topdown = False) - for file in files - if file.endswith('.h5') - ] + status = download_url(url, args.dir) + if status: + continue + else: + print("------------------------------------------") + break + + # Make list of filepaths + filepaths = [ + os.path.join(root, file) + for root, dir, files in os.walk(args.dir, topdown = False) + for file in files + if file.endswith('.h5') + ] - tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) + tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) - # append data to dataframe - df = df.append(tmp_df) - print(f"Appended {len(tmp_df)} lines to dataframe") + # append data to dataframe + df = df.append(tmp_df) + print(f"Appended {len(tmp_df)} lines to dataframe") - # Delete hdf5 files - for filepath in filepaths: - rmtree(os.path.dirname(filepath)) + # Delete hdf5 files + for filepath in filepaths: + rmtree(os.path.dirname(filepath)) - print("------------------------------------------") + print("------------------------------------------") if df.empty: print("DataFrame is empty. Not writing data to file") From ef02e0bc79012f9a67de030dcd28762fc6ce764c Mon Sep 17 00:00:00 2001 From: "Ryan M. Carlson" Date: Thu, 4 Feb 2021 12:27:30 -0600 Subject: [PATCH 19/23] README.md --- gedi/README.md | 2 +- gedi/gediCombine.py | 44 +++++++++++++++++++++++++++----------------- gedi/gediFinder.py | 8 +++++++- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/gedi/README.md b/gedi/README.md index e00dcfa..04be4dd 100644 --- a/gedi/README.md +++ b/gedi/README.md @@ -112,7 +112,7 @@ This program cycles through the list of urls supplied, conducting the following After each zip file has been processed and deleted, the DataFrame is written to a csv, parquet, or GeoJSON file, depending on inputs from the user. #### To Run -`python gediCombine_individual.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` [source](gediCombine.py) +`python gediCombine.py -d DirectoryPath -t FilePath -b ul_lat,ul_lon,lr_lat,lr_lon -o gedi_output -f csv` [source](gediCombine.py) #### Arguments 1. **-d,--dir** : The directory containing url txt file, formatted with a trailing slash, such that {dir}{fname} is a valid path, for fname a valid file name. diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py index 6342c24..35dd062 100644 --- a/gedi/gediCombine.py +++ b/gedi/gediCombine.py @@ -146,8 +146,6 @@ def gedi_L2A_to_df( df = df.append(tmp_df) - print(f"Finished processing file {os.path.basename(_f_name)}") - except KeyError as e: print(f"Encountered file read error with file {_f_name}") @@ -275,13 +273,17 @@ def df_to_geojson(df : pd.DataFrame, outfile : str) -> None: GeoDF = GeoDF.drop(columns=["lat_lowestmode", "lon_lowestmode"]) GeoDF.to_file(outfile, driver="GeoJSON") -def download_url(url : str, dir : str) -> None: +def download_url(url : str, dir : str, chunk_size : int = 128) -> None: """ - "Description of func" + Download the zip files generated by NASA EarthData Search. 
Parameters ---------- url : str "description of parameter" + dir : str + Directory used to store + chunk_size : int + Side Effects ------- Downloads @@ -289,16 +291,26 @@ def download_url(url : str, dir : str) -> None: ------- None """ - r = requests.get(url) + filepath = os.path.join(dir, "granuleData.zip") + r = requests.get(url, stream = True) if r.status_code == requests.codes.ok: print(f"Downloading files from {url}") - z = zipfile.ZipFile(io.BytesIO(r.content)) - z.extractall(dir) - print(f"Completed download of files from {url}.") + # Download zip file and save to dir + with open(filepath, 'wb') as fd: + for chunk in r.iter_content(chunk_size = chunk_size): + fd.write(chunk) + + print(f"Unzipping files") + # Unzip contents + with zipfile.ZipFile(filepath, 'r') as zipObj: + # Extract all the contents of zip file in current directory + zipObj.extractall(path = dir) + + # Remove zip file + rmtree(filepath) - del r, z return True else: @@ -366,13 +378,11 @@ def download_url(url : str, dir : str) -> None: for url in texturls: if validators.url(url): - status = download_url(url, args.dir) - if status: - continue - else: + status = download_url(url.rstrip('\n'), args.dir) + if not status: print("------------------------------------------") - break - + continue + print(" ---------------------- ") # Make list of filepaths filepaths = [ os.path.join(root, file) @@ -382,10 +392,10 @@ def download_url(url : str, dir : str) -> None: ] tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) - + print(" ---------------------- ") # append data to dataframe df = df.append(tmp_df) - print(f"Appended {len(tmp_df)} lines to dataframe") + print(f"Appended {len(tmp_df)} lines to dataframe; Dataframe has {len(df)} lines") # Delete hdf5 files for filepath in filepaths: diff --git a/gedi/gediFinder.py b/gedi/gediFinder.py index 49b3c90..d4ae69d 100644 --- a/gedi/gediFinder.py +++ b/gedi/gediFinder.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ This script is meant to create a GediFinder URL and get the corresponding list of granules within a user-defined bounding box. This list can then be used in the Earthdata Search Tool to pull data from @@ -29,6 +30,7 @@ Writes granule list to a txt file in directory """ +import os import requests import argparse @@ -68,7 +70,7 @@ def gediFinder( r = requests.get(lpdaac, params = payload_str) if r.status_code == requests.codes.ok: - print('Success!') + print('Granules successfully gathered!') granules = [g.split('/')[-1] for g in r.json()['data']] # take filename from url return ','.join(granules) @@ -131,5 +133,9 @@ def gediFinder( f"Recieved unsupported data level {args.level}. Please provide 1B, 2A, or 2B" ) + print("------------------------------------------") + with open(os.path.join(args.dir, args.outfile + ".txt"), 'w+') as file: file.write(gediFinder(level, args.bbox)) + + print("------------------------------------------") From 727dc7fbc2d86050f2eca3805567284117c193e9 Mon Sep 17 00:00:00 2001 From: "Ryan M. 
Carlson" Date: Sat, 6 Feb 2021 11:05:42 -0600 Subject: [PATCH 20/23] gediCombine.py --- gedi/gediCombine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py index 35dd062..d4c2be6 100644 --- a/gedi/gediCombine.py +++ b/gedi/gediCombine.py @@ -279,14 +279,14 @@ def download_url(url : str, dir : str, chunk_size : int = 128) -> None: Parameters ---------- url : str - "description of parameter" + URL used to download data zip package dir : str - Directory used to store + Directory used to store the zip file and unzipped contents chunk_size : int - + Determines the size of each chunk written to file during data streaming Side Effects ------- - Downloads + Downloads and unpacks the zip file from the provide url Returns ------- None From e154224e65013133506d47edb7a98d31a54d11f7 Mon Sep 17 00:00:00 2001 From: "Ryan M. Carlson" Date: Tue, 9 Feb 2021 10:52:18 -0600 Subject: [PATCH 21/23] process_l2a.py --- gedi/process_l2a.py | 427 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 gedi/process_l2a.py diff --git a/gedi/process_l2a.py b/gedi/process_l2a.py new file mode 100644 index 0000000..d4c2be6 --- /dev/null +++ b/gedi/process_l2a.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python +""" +This program goes through the following process: + Download GEDI HDF5 files + Retrieve approproate data from files + Remove GEDI HDF5 files + Write data to file +The following layers are retrieved from the raw GEDI data: + shot_number, lat_lowestmode, lon_lowestmode, elev_lowestmode, elev_highestreturn, + sensitivity, quality_flag, rh +rh is converted to the following canopy height metrics: + canopy height max, min, std, avg + canopy height percentiles 10, 25, 50, 75, 90 + number of canopy height measurements in range h<5m, 5m pd.DataFrame: + """ + Given the absolute path to a directory of GEDI Level 2A h5 files, the + names of the h5 files, and information about the desired output, ingest + the h5 files and output a pd.DataFrame containing data for all the valid + shots within the bounding box. + The output will have an array-valued column that is designed to be handled + by append_canopy_metrics and then deleted from the DataFrame. + Parameters + ---------- + file_dir : str + Path to directory containing the h5 files, including a trailing slash + such that {file_dir}{fname} is a valid path to a file, given fname a + valid file name. + file_paths : List[str] + The names of the h5 files to process. + bbox : List[float] + The coordinates of the bounding box, formatted as + [ul_lat, ul_lon, lr_lat, lr_lon]. + layers : List[str] + The columns for the output DataFrame. + latlayer : str + The name of the latitude layer. + lonlayer : str + The name of the longitude layer. + Returns + ------- + pd.DataFrame containing the user-specified contents of the h5 files for the + valid shots within the specified bounding box. 
+ """ + df = pd.DataFrame() + for _f_name in file_paths: + try: + _f = h5py.File(f"{_f_name}", "r") + print(f"Processing file {os.path.basename(_f_name)}") + [ul_lat, ul_lon, lr_lat, lr_lon] = bbox + for beam in [ + "BEAM0000", + "BEAM0001", + "BEAM0010", + "BEAM0011", + "BEAM0101", + "BEAM0110", + "BEAM1000", + "BEAM1011", + ]: + tmp_df = pd.DataFrame() + x = _f[beam][latlayer][()] + y = _f[beam][lonlayer][()] + qual = _f[beam]["quality_flag"][()] + + mask = ( + (qual == 1) + & (x <= ul_lat) + & (x >= lr_lat) + & (y <= lr_lon) + & (y >= ul_lon) + ) + + if np.count_nonzero(mask) == 0: + continue + print(f"{beam} in {os.path.basename(_f_name)} does not contain any usable data") + + else: + tmp_df["beam"] = [beam] * np.count_nonzero(mask) + + for layer in layers: + tmp_df[layer] = _f[beam][layer][()][mask].tolist() + + _append_canopy_metrics(tmp_df, canopy_threshold=2) + del tmp_df["rh"] + + df = df.append(tmp_df) + + except KeyError as e: + print(f"Encountered file read error with file {_f_name}") + + + return df.reset_index(drop=True) + + +def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array: + """ + This code is taken directly from StackOverflow at: + https://stackoverflow.com/questions/60015245/numpy-nanpercentile-is-extremely-slow + For 2d masked array of roughly 10^5 x 100, this performs ~3x faster than + np.nanpercentile (~10s vs. 3s) and produces results which are identical. + Would be good to add two tests to verify: (1) the results of this function are + the same as np.nanpercentile, and (2) that this function takes less time on a + reasonably sized array, in case of future performance gains in nanpercentile. + Parameters + ---------- + a : array_like + 2d array for which to compute a row-wise percentile. + q : float + Percentile to compute -- allowed values are [0, 100] + Returns + ------- + np.array of the q'th percentile for each row of a. Dimensions of [len(a), 1]. + """ + if q < 0 or q > 100: + raise ValueError(f"Expected a value between 0 and 100; received {q} instead.") + a = np.sort(a, axis =1) + count = (~np.isnan(a)).sum(axis=1) # count number of non-nans in row + groups = np.unique(count) # returns sorted unique values + groups = groups[groups > 0] # only returns groups with at least 1 non-nan value\n", + + p = np.zeros((a.shape[0])) + for group in groups: + pos = np.where(count == group) + values = a[pos] + values = values[:, :group] + p[pos] = np.percentile(values, q, axis=1) + + return p + + +def _append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None: + """ + This function takes a pd.DataFrame object and a numerical value corresponding + to the height threshold to consider a return as coming from the forest canopy. + It extracts the "rh" column from the provided DataFrame as an np.ndarray, + applies a mask, and computes the relevant canopy metrics which are then + appended to the provided DataFrame. + Note: the least bad option for computing these canopy metrics seemed to be + using np.ma module to work with masked arrays. Not converting out of Pandas is + untenable due to computation time, and the varying dimension of rows when + subset to canopy-only observations means np converts the result of applying + the mask to the 2d array to a 1d array. + Parameters + ---------- + df : pd.DataFrame + The DataFrame, assumed to be the output of gedi_L2A_to_df, for which + canopy metrics should be calculated. The provided df must contain a column + labeled "rh" which is array-valued. 
+ canopy_threshold : float + The minimum value for a return to be considered a "canopy" return. + Side Effects + ------------ + Modifies the provided pd.DataFrame in-place to add canopy metrics. + Returns + ------- + None + """ + rh = np.stack(df.rh.to_numpy()) + canopy_returns = np.ma.masked_less(rh, canopy_threshold) + + df["canopy_max"] = pd.Series(np.max(canopy_returns, axis=1)) + df["canopy_min"] = pd.Series(np.min(canopy_returns, axis=1)) + df["canopy_std"] = pd.Series(np.std(canopy_returns, axis=1)) + df["canopy_avg"] = pd.Series(np.average(canopy_returns, axis=1)) + df["dns"] = pd.Series( + canopy_returns.count(axis=1) + ) # % all returns >= canopy threshold + + canopy_returns = np.ma.filled(canopy_returns, np.nan) + df["canopy_p10"] = pd.Series(_compute_nan_percentile(canopy_returns, 10)) + df["canopy_p25"] = pd.Series(_compute_nan_percentile(canopy_returns, 25)) + df["canopy_p50"] = pd.Series(_compute_nan_percentile(canopy_returns, 50)) + df["canopy_p75"] = pd.Series(_compute_nan_percentile(canopy_returns, 75)) + df["canopy_p90"] = pd.Series(_compute_nan_percentile(canopy_returns, 90)) + + df["d01"] = pd.Series( + np.ma.masked_outside(rh, canopy_threshold, 5).count(axis=1) + ) # % returns >= canopy, <=5m + df["d02"] = pd.Series( + np.ma.masked_outside(rh, 5, 10).count(axis=1) + ) # % >= 5m, <=10m + df["d03"] = pd.Series( + np.ma.masked_outside(rh, 10, 20).count(axis=1) + ) # % >= 10m, <=20m + df["d04"] = pd.Series( + np.ma.masked_outside(rh, 20, 30).count(axis=1) + ) # % >= 20m, <=30m + + +def df_to_geojson(df : pd.DataFrame, outfile : str) -> None: + """ + Convert pandas dataframe to GeoJSON file and save as given file name and path. + This file type can be useful when working locally in GIS software. + Parameters + ---------- + df : pd.DataFrame + The DataFrame, assumed to be the output of gedi_L2A_to_df, with or without + canopy height metrics appended. + outfile : str + The location and filename of output json file. + Side Effects + ------- + Writes dataframe to geojson filetype + Returns + ------- + None + """ + df["geometry"] = df.apply( + lambda row: Point(row.lon_lowestmode, row.lat_lowestmode), axis=1 + ) + GeoDF = gp.GeoDataFrame(df) + GeoDF = GeoDF.drop(columns=["lat_lowestmode", "lon_lowestmode"]) + GeoDF.to_file(outfile, driver="GeoJSON") + +def download_url(url : str, dir : str, chunk_size : int = 128) -> None: + """ + Download the zip files generated by NASA EarthData Search. + Parameters + ---------- + url : str + URL used to download data zip package + dir : str + Directory used to store the zip file and unzipped contents + chunk_size : int + Determines the size of each chunk written to file during data streaming + Side Effects + ------- + Downloads and unpacks the zip file from the provide url + Returns + ------- + None + """ + filepath = os.path.join(dir, "granuleData.zip") + r = requests.get(url, stream = True) + + if r.status_code == requests.codes.ok: + print(f"Downloading files from {url}") + + # Download zip file and save to dir + with open(filepath, 'wb') as fd: + for chunk in r.iter_content(chunk_size = chunk_size): + fd.write(chunk) + + print(f"Unzipping files") + # Unzip contents + with zipfile.ZipFile(filepath, 'r') as zipObj: + # Extract all the contents of zip file in current directory + zipObj.extractall(path = dir) + + # Remove zip file + rmtree(filepath) + + return True + + else: + print( + f'Error {r.status_code} has occurred.\n' + f'{url} cannot be downloaded at this time.' 
+ ) + + return False + +if __name__ == "__main__": + # things I would like to see: displays 'arg help' if no arguments are given, docstrings with proper usage + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dir", + help=( + "The directory containing url txt file supplied by the" + "--textfile argument" + ), + ) + parser.add_argument( + "-t", + "--textfile", + help=( + "The txt file containing the urls for the zip files, supplied by" + "EarthData Search." + ), + ) + parser.add_argument( + "-b", + "--bbox", + help=( + "The bounding box of the region of interest. In format" + "ul_lat,ul_lon,lr_lat,lr_lon" + ) + ) + parser.add_argument( + "-o", + "--outfile", + help=( + "The stem of the name of the output file, without file extension," + "optional. Default value is 'gedi_output'." + ), + default="gedi_output", + ) + parser.add_argument( + "-f", + "--filetype", + help=( + "The type of file to output. Acceptable formats are: csv," + "parquet, GeoJSON. Default value is csv." + ), + default="csv", + ) + args = parser.parse_args() + + print("------------------------------------------") + + bbox = [float(b) for b in args.bbox.split(',')] + + df = pd.DataFrame() + + with open(str(args.textfile), 'r') as texturls: + for url in texturls: + if validators.url(url): + + status = download_url(url.rstrip('\n'), args.dir) + if not status: + print("------------------------------------------") + continue + print(" ---------------------- ") + # Make list of filepaths + filepaths = [ + os.path.join(root, file) + for root, dir, files in os.walk(args.dir, topdown = False) + for file in files + if file.endswith('.h5') + ] + + tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) + print(" ---------------------- ") + # append data to dataframe + df = df.append(tmp_df) + print(f"Appended {len(tmp_df)} lines to dataframe; Dataframe has {len(df)} lines") + + # Delete hdf5 files + for filepath in filepaths: + rmtree(os.path.dirname(filepath)) + + print("------------------------------------------") + + if df.empty: + print("DataFrame is empty. Not writing data to file") + print("------------------------------------------") + else: + if args.filetype.lower() == "csv": + filename = os.path.join(args.dir, args.outfile + ".csv") + print(f'Writing to file {filename}') + df.to_csv(filename, index=False) + elif args.filetype.lower() == "parquet": + filename = os.path.join(args.dir, args.outfile + ".parquet.gzip") + print(f'Writing to file {filename}') + df.to_parquet(filename, compression="gzip") + elif args.filetype.lower() == "geojson": + filename = os.path.join(args.dir, args.outfile + ".geojson") + print(f'Writing to file {filename}') + df_to_geojson(df, filename) + else: + raise ValueError( + f"Received unsupported file type {args.filetype}. Please provide one of: csv, parquet, or GeoJSON." 
+ ) + print("Job Complete!") + print("------------------------------------------") From 5591e68240f3f3bfdcf58d8939f1d1dde937a03c Mon Sep 17 00:00:00 2001 From: Connor Richards Date: Tue, 9 Feb 2021 10:04:27 -0800 Subject: [PATCH 22/23] FIX: remove process_l2a for fastforward --- gedi/process_l2a.py | 427 -------------------------------------------- 1 file changed, 427 deletions(-) delete mode 100644 gedi/process_l2a.py diff --git a/gedi/process_l2a.py b/gedi/process_l2a.py deleted file mode 100644 index d4c2be6..0000000 --- a/gedi/process_l2a.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python -""" -This program goes through the following process: - Download GEDI HDF5 files - Retrieve approproate data from files - Remove GEDI HDF5 files - Write data to file -The following layers are retrieved from the raw GEDI data: - shot_number, lat_lowestmode, lon_lowestmode, elev_lowestmode, elev_highestreturn, - sensitivity, quality_flag, rh -rh is converted to the following canopy height metrics: - canopy height max, min, std, avg - canopy height percentiles 10, 25, 50, 75, 90 - number of canopy height measurements in range h<5m, 5m pd.DataFrame: - """ - Given the absolute path to a directory of GEDI Level 2A h5 files, the - names of the h5 files, and information about the desired output, ingest - the h5 files and output a pd.DataFrame containing data for all the valid - shots within the bounding box. - The output will have an array-valued column that is designed to be handled - by append_canopy_metrics and then deleted from the DataFrame. - Parameters - ---------- - file_dir : str - Path to directory containing the h5 files, including a trailing slash - such that {file_dir}{fname} is a valid path to a file, given fname a - valid file name. - file_paths : List[str] - The names of the h5 files to process. - bbox : List[float] - The coordinates of the bounding box, formatted as - [ul_lat, ul_lon, lr_lat, lr_lon]. - layers : List[str] - The columns for the output DataFrame. - latlayer : str - The name of the latitude layer. - lonlayer : str - The name of the longitude layer. - Returns - ------- - pd.DataFrame containing the user-specified contents of the h5 files for the - valid shots within the specified bounding box. 
- """ - df = pd.DataFrame() - for _f_name in file_paths: - try: - _f = h5py.File(f"{_f_name}", "r") - print(f"Processing file {os.path.basename(_f_name)}") - [ul_lat, ul_lon, lr_lat, lr_lon] = bbox - for beam in [ - "BEAM0000", - "BEAM0001", - "BEAM0010", - "BEAM0011", - "BEAM0101", - "BEAM0110", - "BEAM1000", - "BEAM1011", - ]: - tmp_df = pd.DataFrame() - x = _f[beam][latlayer][()] - y = _f[beam][lonlayer][()] - qual = _f[beam]["quality_flag"][()] - - mask = ( - (qual == 1) - & (x <= ul_lat) - & (x >= lr_lat) - & (y <= lr_lon) - & (y >= ul_lon) - ) - - if np.count_nonzero(mask) == 0: - continue - print(f"{beam} in {os.path.basename(_f_name)} does not contain any usable data") - - else: - tmp_df["beam"] = [beam] * np.count_nonzero(mask) - - for layer in layers: - tmp_df[layer] = _f[beam][layer][()][mask].tolist() - - _append_canopy_metrics(tmp_df, canopy_threshold=2) - del tmp_df["rh"] - - df = df.append(tmp_df) - - except KeyError as e: - print(f"Encountered file read error with file {_f_name}") - - - return df.reset_index(drop=True) - - -def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array: - """ - This code is taken directly from StackOverflow at: - https://stackoverflow.com/questions/60015245/numpy-nanpercentile-is-extremely-slow - For 2d masked array of roughly 10^5 x 100, this performs ~3x faster than - np.nanpercentile (~10s vs. 3s) and produces results which are identical. - Would be good to add two tests to verify: (1) the results of this function are - the same as np.nanpercentile, and (2) that this function takes less time on a - reasonably sized array, in case of future performance gains in nanpercentile. - Parameters - ---------- - a : array_like - 2d array for which to compute a row-wise percentile. - q : float - Percentile to compute -- allowed values are [0, 100] - Returns - ------- - np.array of the q'th percentile for each row of a. Dimensions of [len(a), 1]. - """ - if q < 0 or q > 100: - raise ValueError(f"Expected a value between 0 and 100; received {q} instead.") - a = np.sort(a, axis =1) - count = (~np.isnan(a)).sum(axis=1) # count number of non-nans in row - groups = np.unique(count) # returns sorted unique values - groups = groups[groups > 0] # only returns groups with at least 1 non-nan value\n", - - p = np.zeros((a.shape[0])) - for group in groups: - pos = np.where(count == group) - values = a[pos] - values = values[:, :group] - p[pos] = np.percentile(values, q, axis=1) - - return p - - -def _append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None: - """ - This function takes a pd.DataFrame object and a numerical value corresponding - to the height threshold to consider a return as coming from the forest canopy. - It extracts the "rh" column from the provided DataFrame as an np.ndarray, - applies a mask, and computes the relevant canopy metrics which are then - appended to the provided DataFrame. - Note: the least bad option for computing these canopy metrics seemed to be - using np.ma module to work with masked arrays. Not converting out of Pandas is - untenable due to computation time, and the varying dimension of rows when - subset to canopy-only observations means np converts the result of applying - the mask to the 2d array to a 1d array. - Parameters - ---------- - df : pd.DataFrame - The DataFrame, assumed to be the output of gedi_L2A_to_df, for which - canopy metrics should be calculated. The provided df must contain a column - labeled "rh" which is array-valued. 
- canopy_threshold : float - The minimum value for a return to be considered a "canopy" return. - Side Effects - ------------ - Modifies the provided pd.DataFrame in-place to add canopy metrics. - Returns - ------- - None - """ - rh = np.stack(df.rh.to_numpy()) - canopy_returns = np.ma.masked_less(rh, canopy_threshold) - - df["canopy_max"] = pd.Series(np.max(canopy_returns, axis=1)) - df["canopy_min"] = pd.Series(np.min(canopy_returns, axis=1)) - df["canopy_std"] = pd.Series(np.std(canopy_returns, axis=1)) - df["canopy_avg"] = pd.Series(np.average(canopy_returns, axis=1)) - df["dns"] = pd.Series( - canopy_returns.count(axis=1) - ) # % all returns >= canopy threshold - - canopy_returns = np.ma.filled(canopy_returns, np.nan) - df["canopy_p10"] = pd.Series(_compute_nan_percentile(canopy_returns, 10)) - df["canopy_p25"] = pd.Series(_compute_nan_percentile(canopy_returns, 25)) - df["canopy_p50"] = pd.Series(_compute_nan_percentile(canopy_returns, 50)) - df["canopy_p75"] = pd.Series(_compute_nan_percentile(canopy_returns, 75)) - df["canopy_p90"] = pd.Series(_compute_nan_percentile(canopy_returns, 90)) - - df["d01"] = pd.Series( - np.ma.masked_outside(rh, canopy_threshold, 5).count(axis=1) - ) # % returns >= canopy, <=5m - df["d02"] = pd.Series( - np.ma.masked_outside(rh, 5, 10).count(axis=1) - ) # % >= 5m, <=10m - df["d03"] = pd.Series( - np.ma.masked_outside(rh, 10, 20).count(axis=1) - ) # % >= 10m, <=20m - df["d04"] = pd.Series( - np.ma.masked_outside(rh, 20, 30).count(axis=1) - ) # % >= 20m, <=30m - - -def df_to_geojson(df : pd.DataFrame, outfile : str) -> None: - """ - Convert pandas dataframe to GeoJSON file and save as given file name and path. - This file type can be useful when working locally in GIS software. - Parameters - ---------- - df : pd.DataFrame - The DataFrame, assumed to be the output of gedi_L2A_to_df, with or without - canopy height metrics appended. - outfile : str - The location and filename of output json file. - Side Effects - ------- - Writes dataframe to geojson filetype - Returns - ------- - None - """ - df["geometry"] = df.apply( - lambda row: Point(row.lon_lowestmode, row.lat_lowestmode), axis=1 - ) - GeoDF = gp.GeoDataFrame(df) - GeoDF = GeoDF.drop(columns=["lat_lowestmode", "lon_lowestmode"]) - GeoDF.to_file(outfile, driver="GeoJSON") - -def download_url(url : str, dir : str, chunk_size : int = 128) -> None: - """ - Download the zip files generated by NASA EarthData Search. - Parameters - ---------- - url : str - URL used to download data zip package - dir : str - Directory used to store the zip file and unzipped contents - chunk_size : int - Determines the size of each chunk written to file during data streaming - Side Effects - ------- - Downloads and unpacks the zip file from the provide url - Returns - ------- - None - """ - filepath = os.path.join(dir, "granuleData.zip") - r = requests.get(url, stream = True) - - if r.status_code == requests.codes.ok: - print(f"Downloading files from {url}") - - # Download zip file and save to dir - with open(filepath, 'wb') as fd: - for chunk in r.iter_content(chunk_size = chunk_size): - fd.write(chunk) - - print(f"Unzipping files") - # Unzip contents - with zipfile.ZipFile(filepath, 'r') as zipObj: - # Extract all the contents of zip file in current directory - zipObj.extractall(path = dir) - - # Remove zip file - rmtree(filepath) - - return True - - else: - print( - f'Error {r.status_code} has occurred.\n' - f'{url} cannot be downloaded at this time.' 
- ) - - return False - -if __name__ == "__main__": - # things I would like to see: displays 'arg help' if no arguments are given, docstrings with proper usage - parser = argparse.ArgumentParser() - parser.add_argument( - "-d", - "--dir", - help=( - "The directory containing url txt file supplied by the" - "--textfile argument" - ), - ) - parser.add_argument( - "-t", - "--textfile", - help=( - "The txt file containing the urls for the zip files, supplied by" - "EarthData Search." - ), - ) - parser.add_argument( - "-b", - "--bbox", - help=( - "The bounding box of the region of interest. In format" - "ul_lat,ul_lon,lr_lat,lr_lon" - ) - ) - parser.add_argument( - "-o", - "--outfile", - help=( - "The stem of the name of the output file, without file extension," - "optional. Default value is 'gedi_output'." - ), - default="gedi_output", - ) - parser.add_argument( - "-f", - "--filetype", - help=( - "The type of file to output. Acceptable formats are: csv," - "parquet, GeoJSON. Default value is csv." - ), - default="csv", - ) - args = parser.parse_args() - - print("------------------------------------------") - - bbox = [float(b) for b in args.bbox.split(',')] - - df = pd.DataFrame() - - with open(str(args.textfile), 'r') as texturls: - for url in texturls: - if validators.url(url): - - status = download_url(url.rstrip('\n'), args.dir) - if not status: - print("------------------------------------------") - continue - print(" ---------------------- ") - # Make list of filepaths - filepaths = [ - os.path.join(root, file) - for root, dir, files in os.walk(args.dir, topdown = False) - for file in files - if file.endswith('.h5') - ] - - tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) - print(" ---------------------- ") - # append data to dataframe - df = df.append(tmp_df) - print(f"Appended {len(tmp_df)} lines to dataframe; Dataframe has {len(df)} lines") - - # Delete hdf5 files - for filepath in filepaths: - rmtree(os.path.dirname(filepath)) - - print("------------------------------------------") - - if df.empty: - print("DataFrame is empty. Not writing data to file") - print("------------------------------------------") - else: - if args.filetype.lower() == "csv": - filename = os.path.join(args.dir, args.outfile + ".csv") - print(f'Writing to file {filename}') - df.to_csv(filename, index=False) - elif args.filetype.lower() == "parquet": - filename = os.path.join(args.dir, args.outfile + ".parquet.gzip") - print(f'Writing to file {filename}') - df.to_parquet(filename, compression="gzip") - elif args.filetype.lower() == "geojson": - filename = os.path.join(args.dir, args.outfile + ".geojson") - print(f'Writing to file {filename}') - df_to_geojson(df, filename) - else: - raise ValueError( - f"Received unsupported file type {args.filetype}. Please provide one of: csv, parquet, or GeoJSON." 
- ) - print("Job Complete!") - print("------------------------------------------") From 3d78c0e51117e5051d2ac2b3c9a50a9a60070fa4 Mon Sep 17 00:00:00 2001 From: Connor Richards Date: Tue, 9 Feb 2021 10:05:19 -0800 Subject: [PATCH 23/23] FIX: move gediCombine to process_l2a --- gedi/gediCombine.py | 427 -------------------------------------------- gedi/process_l2a.py | 272 +++++++++++++++++++++------- 2 files changed, 208 insertions(+), 491 deletions(-) delete mode 100644 gedi/gediCombine.py diff --git a/gedi/gediCombine.py b/gedi/gediCombine.py deleted file mode 100644 index d4c2be6..0000000 --- a/gedi/gediCombine.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python -""" -This program goes through the following process: - Download GEDI HDF5 files - Retrieve approproate data from files - Remove GEDI HDF5 files - Write data to file -The following layers are retrieved from the raw GEDI data: - shot_number, lat_lowestmode, lon_lowestmode, elev_lowestmode, elev_highestreturn, - sensitivity, quality_flag, rh -rh is converted to the following canopy height metrics: - canopy height max, min, std, avg - canopy height percentiles 10, 25, 50, 75, 90 - number of canopy height measurements in range h<5m, 5m pd.DataFrame: - """ - Given the absolute path to a directory of GEDI Level 2A h5 files, the - names of the h5 files, and information about the desired output, ingest - the h5 files and output a pd.DataFrame containing data for all the valid - shots within the bounding box. - The output will have an array-valued column that is designed to be handled - by append_canopy_metrics and then deleted from the DataFrame. - Parameters - ---------- - file_dir : str - Path to directory containing the h5 files, including a trailing slash - such that {file_dir}{fname} is a valid path to a file, given fname a - valid file name. - file_paths : List[str] - The names of the h5 files to process. - bbox : List[float] - The coordinates of the bounding box, formatted as - [ul_lat, ul_lon, lr_lat, lr_lon]. - layers : List[str] - The columns for the output DataFrame. - latlayer : str - The name of the latitude layer. - lonlayer : str - The name of the longitude layer. - Returns - ------- - pd.DataFrame containing the user-specified contents of the h5 files for the - valid shots within the specified bounding box. 
- """ - df = pd.DataFrame() - for _f_name in file_paths: - try: - _f = h5py.File(f"{_f_name}", "r") - print(f"Processing file {os.path.basename(_f_name)}") - [ul_lat, ul_lon, lr_lat, lr_lon] = bbox - for beam in [ - "BEAM0000", - "BEAM0001", - "BEAM0010", - "BEAM0011", - "BEAM0101", - "BEAM0110", - "BEAM1000", - "BEAM1011", - ]: - tmp_df = pd.DataFrame() - x = _f[beam][latlayer][()] - y = _f[beam][lonlayer][()] - qual = _f[beam]["quality_flag"][()] - - mask = ( - (qual == 1) - & (x <= ul_lat) - & (x >= lr_lat) - & (y <= lr_lon) - & (y >= ul_lon) - ) - - if np.count_nonzero(mask) == 0: - continue - print(f"{beam} in {os.path.basename(_f_name)} does not contain any usable data") - - else: - tmp_df["beam"] = [beam] * np.count_nonzero(mask) - - for layer in layers: - tmp_df[layer] = _f[beam][layer][()][mask].tolist() - - _append_canopy_metrics(tmp_df, canopy_threshold=2) - del tmp_df["rh"] - - df = df.append(tmp_df) - - except KeyError as e: - print(f"Encountered file read error with file {_f_name}") - - - return df.reset_index(drop=True) - - -def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array: - """ - This code is taken directly from StackOverflow at: - https://stackoverflow.com/questions/60015245/numpy-nanpercentile-is-extremely-slow - For 2d masked array of roughly 10^5 x 100, this performs ~3x faster than - np.nanpercentile (~10s vs. 3s) and produces results which are identical. - Would be good to add two tests to verify: (1) the results of this function are - the same as np.nanpercentile, and (2) that this function takes less time on a - reasonably sized array, in case of future performance gains in nanpercentile. - Parameters - ---------- - a : array_like - 2d array for which to compute a row-wise percentile. - q : float - Percentile to compute -- allowed values are [0, 100] - Returns - ------- - np.array of the q'th percentile for each row of a. Dimensions of [len(a), 1]. - """ - if q < 0 or q > 100: - raise ValueError(f"Expected a value between 0 and 100; received {q} instead.") - a = np.sort(a, axis =1) - count = (~np.isnan(a)).sum(axis=1) # count number of non-nans in row - groups = np.unique(count) # returns sorted unique values - groups = groups[groups > 0] # only returns groups with at least 1 non-nan value\n", - - p = np.zeros((a.shape[0])) - for group in groups: - pos = np.where(count == group) - values = a[pos] - values = values[:, :group] - p[pos] = np.percentile(values, q, axis=1) - - return p - - -def _append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None: - """ - This function takes a pd.DataFrame object and a numerical value corresponding - to the height threshold to consider a return as coming from the forest canopy. - It extracts the "rh" column from the provided DataFrame as an np.ndarray, - applies a mask, and computes the relevant canopy metrics which are then - appended to the provided DataFrame. - Note: the least bad option for computing these canopy metrics seemed to be - using np.ma module to work with masked arrays. Not converting out of Pandas is - untenable due to computation time, and the varying dimension of rows when - subset to canopy-only observations means np converts the result of applying - the mask to the 2d array to a 1d array. - Parameters - ---------- - df : pd.DataFrame - The DataFrame, assumed to be the output of gedi_L2A_to_df, for which - canopy metrics should be calculated. The provided df must contain a column - labeled "rh" which is array-valued. 
- canopy_threshold : float - The minimum value for a return to be considered a "canopy" return. - Side Effects - ------------ - Modifies the provided pd.DataFrame in-place to add canopy metrics. - Returns - ------- - None - """ - rh = np.stack(df.rh.to_numpy()) - canopy_returns = np.ma.masked_less(rh, canopy_threshold) - - df["canopy_max"] = pd.Series(np.max(canopy_returns, axis=1)) - df["canopy_min"] = pd.Series(np.min(canopy_returns, axis=1)) - df["canopy_std"] = pd.Series(np.std(canopy_returns, axis=1)) - df["canopy_avg"] = pd.Series(np.average(canopy_returns, axis=1)) - df["dns"] = pd.Series( - canopy_returns.count(axis=1) - ) # % all returns >= canopy threshold - - canopy_returns = np.ma.filled(canopy_returns, np.nan) - df["canopy_p10"] = pd.Series(_compute_nan_percentile(canopy_returns, 10)) - df["canopy_p25"] = pd.Series(_compute_nan_percentile(canopy_returns, 25)) - df["canopy_p50"] = pd.Series(_compute_nan_percentile(canopy_returns, 50)) - df["canopy_p75"] = pd.Series(_compute_nan_percentile(canopy_returns, 75)) - df["canopy_p90"] = pd.Series(_compute_nan_percentile(canopy_returns, 90)) - - df["d01"] = pd.Series( - np.ma.masked_outside(rh, canopy_threshold, 5).count(axis=1) - ) # % returns >= canopy, <=5m - df["d02"] = pd.Series( - np.ma.masked_outside(rh, 5, 10).count(axis=1) - ) # % >= 5m, <=10m - df["d03"] = pd.Series( - np.ma.masked_outside(rh, 10, 20).count(axis=1) - ) # % >= 10m, <=20m - df["d04"] = pd.Series( - np.ma.masked_outside(rh, 20, 30).count(axis=1) - ) # % >= 20m, <=30m - - -def df_to_geojson(df : pd.DataFrame, outfile : str) -> None: - """ - Convert pandas dataframe to GeoJSON file and save as given file name and path. - This file type can be useful when working locally in GIS software. - Parameters - ---------- - df : pd.DataFrame - The DataFrame, assumed to be the output of gedi_L2A_to_df, with or without - canopy height metrics appended. - outfile : str - The location and filename of output json file. - Side Effects - ------- - Writes dataframe to geojson filetype - Returns - ------- - None - """ - df["geometry"] = df.apply( - lambda row: Point(row.lon_lowestmode, row.lat_lowestmode), axis=1 - ) - GeoDF = gp.GeoDataFrame(df) - GeoDF = GeoDF.drop(columns=["lat_lowestmode", "lon_lowestmode"]) - GeoDF.to_file(outfile, driver="GeoJSON") - -def download_url(url : str, dir : str, chunk_size : int = 128) -> None: - """ - Download the zip files generated by NASA EarthData Search. - Parameters - ---------- - url : str - URL used to download data zip package - dir : str - Directory used to store the zip file and unzipped contents - chunk_size : int - Determines the size of each chunk written to file during data streaming - Side Effects - ------- - Downloads and unpacks the zip file from the provide url - Returns - ------- - None - """ - filepath = os.path.join(dir, "granuleData.zip") - r = requests.get(url, stream = True) - - if r.status_code == requests.codes.ok: - print(f"Downloading files from {url}") - - # Download zip file and save to dir - with open(filepath, 'wb') as fd: - for chunk in r.iter_content(chunk_size = chunk_size): - fd.write(chunk) - - print(f"Unzipping files") - # Unzip contents - with zipfile.ZipFile(filepath, 'r') as zipObj: - # Extract all the contents of zip file in current directory - zipObj.extractall(path = dir) - - # Remove zip file - rmtree(filepath) - - return True - - else: - print( - f'Error {r.status_code} has occurred.\n' - f'{url} cannot be downloaded at this time.' 
- ) - - return False - -if __name__ == "__main__": - # things I would like to see: displays 'arg help' if no arguments are given, docstrings with proper usage - parser = argparse.ArgumentParser() - parser.add_argument( - "-d", - "--dir", - help=( - "The directory containing url txt file supplied by the" - "--textfile argument" - ), - ) - parser.add_argument( - "-t", - "--textfile", - help=( - "The txt file containing the urls for the zip files, supplied by" - "EarthData Search." - ), - ) - parser.add_argument( - "-b", - "--bbox", - help=( - "The bounding box of the region of interest. In format" - "ul_lat,ul_lon,lr_lat,lr_lon" - ) - ) - parser.add_argument( - "-o", - "--outfile", - help=( - "The stem of the name of the output file, without file extension," - "optional. Default value is 'gedi_output'." - ), - default="gedi_output", - ) - parser.add_argument( - "-f", - "--filetype", - help=( - "The type of file to output. Acceptable formats are: csv," - "parquet, GeoJSON. Default value is csv." - ), - default="csv", - ) - args = parser.parse_args() - - print("------------------------------------------") - - bbox = [float(b) for b in args.bbox.split(',')] - - df = pd.DataFrame() - - with open(str(args.textfile), 'r') as texturls: - for url in texturls: - if validators.url(url): - - status = download_url(url.rstrip('\n'), args.dir) - if not status: - print("------------------------------------------") - continue - print(" ---------------------- ") - # Make list of filepaths - filepaths = [ - os.path.join(root, file) - for root, dir, files in os.walk(args.dir, topdown = False) - for file in files - if file.endswith('.h5') - ] - - tmp_df = gedi_L2A_to_df(filepaths, bbox = bbox) - print(" ---------------------- ") - # append data to dataframe - df = df.append(tmp_df) - print(f"Appended {len(tmp_df)} lines to dataframe; Dataframe has {len(df)} lines") - - # Delete hdf5 files - for filepath in filepaths: - rmtree(os.path.dirname(filepath)) - - print("------------------------------------------") - - if df.empty: - print("DataFrame is empty. Not writing data to file") - print("------------------------------------------") - else: - if args.filetype.lower() == "csv": - filename = os.path.join(args.dir, args.outfile + ".csv") - print(f'Writing to file {filename}') - df.to_csv(filename, index=False) - elif args.filetype.lower() == "parquet": - filename = os.path.join(args.dir, args.outfile + ".parquet.gzip") - print(f'Writing to file {filename}') - df.to_parquet(filename, compression="gzip") - elif args.filetype.lower() == "geojson": - filename = os.path.join(args.dir, args.outfile + ".geojson") - print(f'Writing to file {filename}') - df_to_geojson(df, filename) - else: - raise ValueError( - f"Received unsupported file type {args.filetype}. Please provide one of: csv, parquet, or GeoJSON." 
diff --git a/gedi/process_l2a.py b/gedi/process_l2a.py
index 54a2ce2..d4c2be6 100644
--- a/gedi/process_l2a.py
+++ b/gedi/process_l2a.py
@@ -1,16 +1,63 @@
+#!/usr/bin/env python
+"""
+This program goes through the following process:
+    Download GEDI HDF5 files
+    Retrieve appropriate data from files
+    Remove GEDI HDF5 files
+    Write data to file
+The following layers are retrieved from the raw GEDI data:
+    shot_number, lat_lowestmode, lon_lowestmode, elev_lowestmode, elev_highestreturn,
+    sensitivity, quality_flag, rh
+rh is converted to the following canopy height metrics:
+    canopy height max, min, std, avg
+    canopy height percentiles 10, 25, 50, 75, 90
+    number of canopy height measurements in range h<5m, 5m<h<10m, 10m<h<20m, 20m<h<30m
+"""
                 & (y >= ul_lon)
             )
 
-                tmp_df["beam"] = [beam] * np.count_nonzero(mask)
-                for layer in layers:
-                    tmp_df[layer] = _f[beam][layer][()][mask].tolist()
+                if np.count_nonzero(mask) == 0:
+                    print(f"{beam} in {os.path.basename(_f_name)} does not contain any usable data")
+                    continue
 
-                df = df.append(tmp_df)
+                else:
+                    tmp_df["beam"] = [beam] * np.count_nonzero(mask)
+
+                    for layer in layers:
+                        tmp_df[layer] = _f[beam][layer][()][mask].tolist()
+
+                    _append_canopy_metrics(tmp_df, canopy_threshold=2)
+                    del tmp_df["rh"]
+
+                    df = df.append(tmp_df)
 
         except KeyError as e:
             print(f"Encountered file read error with file {_f_name}")
 
+    return df.reset_index(drop=True)
 
 
 def _compute_nan_percentile(a: np.ndarray, q: float) -> np.array:
-    """This code is taken directly from StackOverflow at:
+    """
+    This code is taken directly from StackOverflow at:
     https://stackoverflow.com/questions/60015245/numpy-nanpercentile-is-extremely-slow
-
     For 2d masked array of roughly 10^5 x 100, this performs ~3x faster than
     np.nanpercentile (~10s vs. 3s) and produces results which are identical.
-
     Would be good to add two tests to verify: (1) the results of this function are
     the same as np.nanpercentile, and (2) that this function takes less time on a
     reasonably sized array, in case of future performance gains in nanpercentile.
-
     Parameters
     ----------
     a : array_like
         2d array for which to compute a row-wise percentile.
     q : float
         Percentile to compute -- allowed values are [0, 100]
-
     Returns
     -------
     np.array of the q'th percentile for each row of a. Dimensions of [len(a), 1].
     """
     if q < 0 or q > 100:
         raise ValueError(f"Expected a value between 0 and 100; received {q} instead.")
-    mask = (a >= np.nanmin(a)).astype(int)
-
-    count = mask.sum(axis=1)
-    groups = np.unique(count)
-    groups = groups[groups > 0]
+    a = np.sort(a, axis=1)
+    count = (~np.isnan(a)).sum(axis=1)  # count number of non-nans in each row
+    groups = np.unique(count)  # returns sorted unique values
+    groups = groups[groups > 0]  # only keep groups with at least 1 non-nan value
 
     p = np.zeros((a.shape[0]))
-    for g in range(len(groups)):
-        pos = np.where(count == groups[g])
+    for group in groups:
+        pos = np.where(count == group)
         values = a[pos]
-        values = np.nan_to_num(values, nan=(np.nanmin(a) - 1))
-        values = np.sort(values, axis=1)
-        values = values[:, -groups[g] :]
+        values = values[:, :group]
         p[pos] = np.percentile(values, q, axis=1)
+
     return p
 
 
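To see why the rewritten percentile routine can drop the nan_to_num and re-sort steps: np.sort pushes NaNs to the end of each row, so the first count entries of a sorted row are exactly its non-NaN values. A toy walk-through, with values chosen purely for illustration:

    import numpy as np

    a = np.array([[3.0, np.nan, 1.0],
                  [2.0, 5.0, 4.0]])
    a = np.sort(a, axis=1)               # [[1., 3., nan], [2., 4., 5.]]
    count = (~np.isnan(a)).sum(axis=1)   # [2, 3]
    # Rows sharing a count can be sliced to their non-NaN prefix and handed
    # to np.percentile in one vectorized call per group:
    for group in np.unique(count):
        pos = np.where(count == group)
        print(np.percentile(a[pos][:, :group], 50, axis=1))  # [2.], then [4.]

Grouping rows by their non-NaN count is what keeps the slice rectangular, which is the whole reason this outperforms a row-by-row np.nanpercentile.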
-def append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
+def _append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
     """
     This function takes a pd.DataFrame object and a numerical value corresponding
     to the height threshold to consider a return as coming from the forest canopy.
     It extracts the "rh" column from the provided DataFrame as an np.ndarray,
     applies a mask, and computes the relevant canopy metrics which are then
     appended to the provided DataFrame.
-
     Note: the least bad option for computing these canopy metrics seemed to be
     using np.ma module to work with masked arrays. Not converting out of Pandas is
     untenable due to computation time, and the varying dimension of rows when
     subset to canopy-only observations means np converts the result of applying
     the mask to the 2d array to a 1d array.
-
     Parameters
     ----------
     df : pd.DataFrame
@@ -162,11 +209,9 @@ def append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
         labeled "rh" which is array-valued.
     canopy_threshold : float
         The minimum value for a return to be considered a "canopy" return.
-
     Side Effects
     ------------
     Modifies the provided pd.DataFrame in-place to add canopy metrics.
-
     Returns
     -------
     None
@@ -203,11 +248,10 @@ def append_canopy_metrics(df: pd.DataFrame, canopy_threshold: float) -> None:
     )  # % >= 20m, <=30m
 
 
-def df_to_geojson(df, outfile):
+def df_to_geojson(df : pd.DataFrame, outfile : str) -> None:
     """
     Convert pandas dataframe to GeoJSON file and save as given file name and path.
     This file type can be useful when working locally in GIS software.
-
     Parameters
     ----------
     df : pd.DataFrame
@@ -215,7 +259,9 @@ def df_to_geojson(df, outfile):
         canopy height metrics appended.
     outfile : str
         The location and filename of output json file.
-
+    Side Effects
+    ------------
+    Writes dataframe to geojson filetype
     Returns
     -------
     None
@@ -227,22 +273,86 @@ def df_to_geojson(df, outfile):
     GeoDF = GeoDF.drop(columns=["lat_lowestmode", "lon_lowestmode"])
     GeoDF.to_file(outfile, driver="GeoJSON")
 
+def download_url(url : str, dir : str, chunk_size : int = 128) -> bool:
+    """
+    Download the zip files generated by NASA EarthData Search.
+    Parameters
+    ----------
+    url : str
+        URL used to download data zip package
+    dir : str
+        Directory used to store the zip file and unzipped contents
+    chunk_size : int
+        Determines the size of each chunk written to file during data streaming
+    Side Effects
+    ------------
+    Downloads and unpacks the zip file from the provided url
+    Returns
+    -------
+    bool
+        True if the download and extraction succeeded, otherwise False.
+    """
+    filepath = os.path.join(dir, "granuleData.zip")
+    r = requests.get(url, stream = True)
+
+    if r.status_code == requests.codes.ok:
+        print(f"Downloading files from {url}")
+
+        # Download zip file and save to dir
+        with open(filepath, 'wb') as fd:
+            for chunk in r.iter_content(chunk_size = chunk_size):
+                fd.write(chunk)
+
+        print("Unzipping files")
+        # Unzip contents
+        with zipfile.ZipFile(filepath, 'r') as zipObj:
+            # Extract all the contents of zip file into dir
+            zipObj.extractall(path = dir)
+
+        # Remove zip file (a single file, so os.remove rather than rmtree)
+        os.remove(filepath)
+
+        return True
+
+    else:
+        print(
+            f'Error {r.status_code} has occurred.\n'
+            f'{url} cannot be downloaded at this time.'
+        )
+
+        return False
 
 if __name__ == "__main__":
+    # things I would like to see: displays 'arg help' if no arguments are given, docstrings with proper usage
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-d",
         "--dir",
         help=(
-            "The directory containing hd5 files, formatted with a trailing slash,"
-            " such that {dir}{fname} is a valid path, for fname a valid file name."
+            "The directory containing the url txt file supplied by the "
+            "--textfile argument."
+        ),
+    )
+    parser.add_argument(
+        "-t",
+        "--textfile",
+        help=(
+            "The txt file containing the urls for the zip files, supplied by "
+            "EarthData Search."
         ),
     )
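The inline comment above asks for the argument help to be displayed when the script is run with no arguments. One conventional argparse pattern for that, sketched here under the assumption that it would sit just before parse_args (it is not part of this patch):

    import argparse
    import sys

    parser = argparse.ArgumentParser()
    # ... add_argument calls as in the diff ...
    if len(sys.argv) == 1:
        # No arguments supplied: show usage and exit nonzero.
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()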
+    parser.add_argument(
+        "-b",
+        "--bbox",
+        help=(
+            "The bounding box of the region of interest. In format "
+            "ul_lat,ul_lon,lr_lat,lr_lon"
+        ),
+    )
     parser.add_argument(
         "-o",
         "--outfile",
         help=(
-            "The stem of the name of the output file, without file extension, "
+            "The stem of the name of the output file, without file extension, "
             "optional. Default value is 'gedi_output'."
         ),
         default="gedi_output",
@@ -250,34 +360,68 @@ def df_to_geojson(df, outfile):
     parser.add_argument(
         "-f",
         "--filetype",
-        help="The type of file to output. Acceptable formats are: csv, parquet, GeoJSON.",
-        default="parquet",
+        help=(
+            "The type of file to output. Acceptable formats are: csv, "
+            "parquet, GeoJSON. Default value is csv."
+        ),
+        default="csv",
     )
     args = parser.parse_args()
 
-    ul_lat = 43.38
-    ul_lon = -124.1
-    lr_lat = 42.88
-    lr_lon = -123.6
-    bbox = [ul_lat, ul_lon, lr_lat, lr_lon]
-
-    files = [
-        _f
-        for _f in os.listdir(args.dir)
-        if os.path.isfile(os.path.join(args.dir, _f)) and _f.endswith(".h5")
-    ]
-
-    df = gedi_L2A_to_df(args.dir, files, bbox)
-    append_canopy_metrics(df, canopy_threshold=2)
-    del df["rh"]
-    if args.filetype.lower() == "csv":
-        df.to_csv(f"{args.dir}{args.outfile}.csv", index=False)
-    elif args.filetype.lower() == "parquet":
-        df.to_parquet(f"{args.dir}{args.outfile}.parquet.gzip", compression="gzip")
-    elif args.filetype.lower() == "geojson":
-        df_to_geojson(df, f"{args.dir}{args.outfile}.geojson")
+    print("------------------------------------------")
+
+    bbox = [float(b) for b in args.bbox.split(',')]
+
+    df = pd.DataFrame()
+
+    with open(str(args.textfile), 'r') as texturls:
+        for url in texturls:
+            if validators.url(url):
+
+                status = download_url(url.rstrip('\n'), args.dir)
+                if not status:
+                    print("------------------------------------------")
+                    continue
+                print(" ---------------------- ")
+                # Make list of filepaths
+                filepaths = [
+                    os.path.join(root, dirs, file)[0:0] or os.path.join(root, file)
+                    for root, dirs, files in os.walk(args.dir, topdown=False)
+                    for file in files
+                    if file.endswith('.h5')
+                ]
+
+                tmp_df = gedi_L2A_to_df(filepaths, bbox=bbox)
+                print(" ---------------------- ")
+                # append data to dataframe
+                df = df.append(tmp_df)
+                print(f"Appended {len(tmp_df)} lines to dataframe; Dataframe has {len(df)} lines")
+
+                # Delete hdf5 files
+                for filepath in filepaths:
+                    rmtree(os.path.dirname(filepath))
+
+    print("------------------------------------------")
+
+    if df.empty:
+        print("DataFrame is empty. Not writing data to file")
+        print("------------------------------------------")
     else:
-        raise ValueError(
-            f"Received unsupported file type {args.filetype}. Please provide one "
-            "of: csv, parquet, or GeoJSON."
-        )
+        if args.filetype.lower() == "csv":
+            filename = os.path.join(args.dir, args.outfile + ".csv")
+            print(f'Writing to file {filename}')
+            df.to_csv(filename, index=False)
+        elif args.filetype.lower() == "parquet":
+            filename = os.path.join(args.dir, args.outfile + ".parquet.gzip")
+            print(f'Writing to file {filename}')
+            df.to_parquet(filename, compression="gzip")
+        elif args.filetype.lower() == "geojson":
+            filename = os.path.join(args.dir, args.outfile + ".geojson")
+            print(f'Writing to file {filename}')
+            df_to_geojson(df, filename)
+        else:
+            raise ValueError(
+                f"Received unsupported file type {args.filetype}. Please provide one of: csv, parquet, or GeoJSON."
+            )
+    print("Job Complete!")
+    print("------------------------------------------")
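One forward-looking note on the accumulation loop above: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a newer pandas the per-URL concatenation would need pd.concat instead. A minimal sketch of the equivalent pattern (the frames list is illustrative, not part of the patch):

    import pandas as pd

    frames = []  # collect per-URL results instead of repeated .append
    # inside the url loop:
    #     frames.append(tmp_df)
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Collecting the pieces and concatenating once is also quadratically cheaper than growing the DataFrame inside the loop, which matters when a large bounding box spans many granules.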