Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
da745cc
continuing SWC development
burlingamet Jan 26, 2026
04c77fa
WIP threshold loader. Needs to handle different sets of thresholds go…
burlingamet Feb 2, 2026
4c434de
example using multiple different sets of contexts.
burlingamet Feb 3, 2026
1bd62c2
test update to threshold module to accept multiple contexts.
burlingamet Feb 3, 2026
b138894
update context filter
burlingamet Feb 3, 2026
d58e46b
req update for docker image
burlingamet Feb 3, 2026
49bd86a
updating requirements to stay with older version (for now) Could upda…
burlingamet Feb 3, 2026
f9a3091
too much filtering in deduplication checks. also was not doing an exa…
burlingamet Feb 4, 2026
775f6ec
removing indexing in deduplication checks
burlingamet Feb 4, 2026
e728cde
req update and testing to see if uuid works for the unique key
burlingamet Feb 9, 2026
efbad02
changing back key
burlingamet Feb 9, 2026
c949a13
testing thresh_select_ts_pad
burlingamet Feb 9, 2026
bf850a8
continued swc pipe development
burlingamet Feb 10, 2026
055c7c5
testing out new flow module to run temperature test on enviroscans
burlingamet Feb 17, 2026
5c44dc2
Merge branch 'master' of github.com:NEONScience/NEON-IS-data-processi…
burlingamet Feb 17, 2026
ed16787
updates to functions to harden pipeline
burlingamet Feb 17, 2026
4690a65
continued hardening
burlingamet Feb 17, 2026
0102e07
handling for limited PRT data and not grabbing sensors that are too f…
burlingamet Feb 17, 2026
f71f758
dynamic distance parameter calculations
burlingamet Feb 17, 2026
15620f0
updates to fix errors discovered in unit testing.
burlingamet Feb 18, 2026
1c7973f
prepping to load to pachyderm
burlingamet Feb 18, 2026
d814c14
clean up of threshold tests
burlingamet Feb 18, 2026
5f59d3a
comments
burlingamet Feb 18, 2026
b7da28c
change image_name
burlingamet Feb 19, 2026
42b7594
minor code changes discovered in unit testing and creation of unit te…
burlingamet Feb 20, 2026
fca90a7
delete extra test data
burlingamet Feb 20, 2026
c0ed10c
remove more test data
burlingamet Feb 20, 2026
7cf97bf
updating unit tests to work with multiple contexts.
burlingamet Feb 20, 2026
eaf4408
Merge branch 'master' of github.com:NEONScience/NEON-IS-data-processi…
burlingamet Feb 23, 2026
183d608
renv lock update
burlingamet Feb 23, 2026
ef49226
typo in flow script.
burlingamet Feb 23, 2026
ae444b5
Merge branch 'master' of github.com:NEONScience/NEON-IS-data-processi…
burlingamet Feb 23, 2026
f1f30ef
testing new temp functions
burlingamet Feb 23, 2026
695ca48
attempted pipeline specs for custom envsc script
burlingamet Feb 23, 2026
b61d89e
still erroring, but at least right number of datums...
burlingamet Feb 24, 2026
2244855
logging messages for debugging.
burlingamet Feb 24, 2026
c054a9d
add filter joiner to envscn module
burlingamet Feb 24, 2026
0736d51
dockerfile update
burlingamet Feb 25, 2026
f1fa951
interim pipeline to test temp pipe
burlingamet Feb 26, 2026
45e3836
delete test pipeline
burlingamet Feb 26, 2026
13332cf
updating flow script to not need a DirTemp, but still handle one if i…
burlingamet Feb 26, 2026
1f92c45
remove debug line
burlingamet Feb 26, 2026
94324c7
combined module with filter joiner so temp data is in with the base q…
burlingamet Feb 26, 2026
6f154b8
add tests and data
burlingamet Feb 26, 2026
830e95d
one more test using a NULL tempdir
burlingamet Feb 26, 2026
68aa895
minor adjustment to flow of data for dirIn and dirTemp, documentation…
burlingamet Feb 26, 2026
c97793e
better null handling and test data update
burlingamet Feb 26, 2026
07e10e6
repo clean up
burlingamet Feb 26, 2026
2eb2add
update pipeline to use new schema
burlingamet Feb 26, 2026
c797c08
update unit tests for threshold_loader
burlingamet Mar 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_push_analyze_pad_qaqc_envscn.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: "Build-push_analyze_pad_qaqc_envscn"
on:
push:
branches:
- 'master'
- 'swc.dev'
paths:
- 'modules_combined/qaqc_plau_and_radiation_custom/**'
- 'modules/padded_timeseries_analyzer/**'
Expand Down
358 changes: 358 additions & 0 deletions flow/flow.envscn.temp.flags/TEST_SUMMARY.txt

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions flow/flow.envscn.temp.flags/def.apply.temp.flags.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
##############################################################################################
#' @title Apply temperature flags to high-frequency data

#' @author
#' Teresa Burlingame \email{tburlingame@battelleecology.org}

#' @description
#' Join minute-interval temperature flags to high-frequency (e.g., 10-second) soil moisture data
#' using time-based overlap matching. Each reading receives the flag of the interval that contains
#' its readout_time. The overlap join treats intervals as closed on both ends, so a reading that
#' falls exactly on a boundary shared by two consecutive intervals matches both; in that case the
#' flag of the later-starting interval is applied and the reading is counted as matched only once.
#' Readings with no matching interval (or whose only matches have an NA flag) keep their current
#' QF value.

#' @param dataSm Data frame containing soil moisture data with readout_time column
#' @param tempData Data frame with temperature flags (startDateTime, endDateTime, temp_flag).
#' May be NULL or have zero rows, in which case no flags are applied.
#' @param qfColName Character. Name of the QF column to update in dataSm. If the column does not
#' exist it is created and initialized to NA before flags are applied.
#' @param log A logger object. Defaults to NULL.

#' @return Updated dataSm data frame with qfColName column populated with temperature flags

#' @references
#' License: GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007

#' @seealso Currently none

#' @export

# changelog and author contributions / copyrights
#   Teresa Burlingame (2025-02-17)
#'     original creation
##############################################################################################
def.apply.temp.flags <- function(dataSm,
                                 tempData,
                                 qfColName,
                                 log = NULL) {

  # Initialize log if not provided
  if (base::is.null(log)) {
    log <- NEONprocIS.base::def.log.init()
  }

  # The join is driven entirely by readout_time; fail with a clear message if it is absent
  if (!('readout_time' %in% base::names(dataSm))) {
    msg <- 'dataSm must contain a readout_time column'
    log$error(msg)
    stop(msg)
  }

  numTotal <- base::nrow(dataSm)

  # Get current QF values. If the column is absent, start from an all-NA column so that the
  # indexed assignment below always yields a vector with one entry per row of dataSm.
  newQf <- dataSm[[qfColName]]
  if (base::is.null(newQf)) {
    log$warn(base::paste0('QF column ', qfColName, ' not found in soil moisture data. Creating it.'))
    newQf <- base::rep(NA_integer_, numTotal)
  }

  # Nothing to join: return the data with the QF column untouched
  numTemp <- if (base::is.null(tempData)) 0L else base::nrow(tempData)
  if (numTotal == 0L || numTemp == 0L) {
    dataSm[[qfColName]] <- newQf
    log$debug('No soil moisture rows or no temperature flag intervals. No temperature flags applied.')
    return(dataSm)
  }

  colsTemp <- c('startDateTime', 'endDateTime', 'temp_flag')
  if (!base::all(colsTemp %in% base::names(tempData))) {
    msg <- base::paste0('tempData must contain columns: ', base::paste(colsTemp, collapse = ', '))
    log$error(msg)
    stop(msg)
  }

  # Ensure proper datetime format
  readoutTime <- base::as.POSIXct(dataSm[['readout_time']], tz = "UTC")
  timeStart <- base::as.POSIXct(tempData[['startDateTime']], tz = "UTC")
  timeEnd <- base::as.POSIXct(tempData[['endDateTime']], tz = "UTC")
  flagTemp <- base::as.integer(tempData[['temp_flag']])

  # foverlaps rejects NA range values and intervals whose start is after their end,
  # so exclude those rows up front. Original row positions are carried in .rows.
  idxSmOk <- !base::is.na(readoutTime)
  idxTempOk <- !base::is.na(timeStart) & !base::is.na(timeEnd) & timeStart <= timeEnd
  idxTempOk[base::is.na(idxTempOk)] <- FALSE
  if (!base::all(idxTempOk)) {
    log$warn(base::paste0('Ignoring ', base::sum(!idxTempOk),
                          ' temperature flag interval(s) with missing or inverted start/end times'))
  }

  numMatched <- 0L
  if (base::any(idxSmOk) && base::any(idxTempOk)) {

    # Build only the columns needed for the join (avoids copying all of dataSm).
    # Each reading is a zero-width interval [readout_time, readout_time].
    dtSm <- data.table::data.table(
      readout_start = readoutTime[idxSmOk],
      readout_end = readoutTime[idxSmOk],
      .rows = base::which(idxSmOk)
    )
    dtTemp <- data.table::data.table(
      startDateTime = timeStart[idxTempOk],
      endDateTime = timeEnd[idxTempOk],
      temp_flag = flagTemp[idxTempOk]
    )

    # Set key for foverlaps (required on the interval table)
    data.table::setkeyv(dtTemp, c("startDateTime", "endDateTime"))

    # Perform overlap join - find which minute interval each point falls within
    joined <- data.table::foverlaps(
      x = dtSm,
      y = dtTemp,
      by.x = c("readout_start", "readout_end"),
      by.y = c("startDateTime", "endDateTime"),
      type = "within",        # point must fall within interval
      nomatch = NA_integer_
    )

    # Keep only matches that carry a flag value
    hasFlag <- !base::is.na(joined[['temp_flag']])
    matchRow <- joined[['.rows']][hasFlag]
    matchFlag <- joined[['temp_flag']][hasFlag]
    matchStart <- base::as.numeric(joined[['startDateTime']][hasFlag])

    # A reading on a shared interval boundary matches two intervals. Resolve explicitly to
    # the later-starting interval and keep a single match per reading so that the matched
    # count cannot exceed the number of rows.
    setOrd <- base::order(matchRow, matchStart)
    matchRow <- matchRow[setOrd]
    matchFlag <- matchFlag[setOrd]
    idxKeep <- !base::duplicated(matchRow, fromLast = TRUE)

    # Update QF values where we found a matching interval
    newQf[matchRow[idxKeep]] <- matchFlag[idxKeep]
    numMatched <- base::sum(idxKeep)
  }

  # Assign back to original data frame
  dataSm[[qfColName]] <- newQf

  log$debug(base::paste0('Applied temperature flags: ', numMatched, ' / ', numTotal,
                         ' rows matched (', base::round(100 * numMatched / numTotal, 1), '%)'))

  return(dataSm)
}
159 changes: 159 additions & 0 deletions flow/flow.envscn.temp.flags/def.calc.temp.flags.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
##############################################################################################
#' @title Calculate temperature test flags for soil moisture data

#' @author
#' Teresa Burlingame \email{tburlingame@battelleecology.org}

#' @description
#' Calculate temperature-based quality flags by comparing soil temperature to uncertainty.
#' For intervals where the primary (closest) sensor has finalQF == 0, the flag is raised when
#' soilTempMean is less than soilTempExpUncert. For all other intervals, if both a higher and a
#' lower neighbor sensor are available (and within distThreshold of targetDepth when both of those
#' are supplied), the flag is derived from the average of (soilTempMean - soilTempExpUncert)
#' across the neighbors that have unflagged data for that interval. Intervals that cannot be
#' evaluated by either route are set to -1.

#' @param sensorInfo List containing 'closest' sensor and 'neighbors' list (from def.find.temp.sensor)
#' @param targetDepth Numeric. The target depth (in meters) for distance validation
#' @param distThreshold Numeric. Maximum allowed distance (in meters) between sensor and target depth
#' @param log A logger object. Defaults to NULL.

#' @return Data frame with columns:
#' \describe{
#'   \item{startDateTime}{Start time of measurement interval}
#'   \item{endDateTime}{End time of measurement interval}
#'   \item{temp_flag}{Flag value: 0=pass, 1=fail, -1=test not run}
#' }

#' @references
#' License: GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007

#' @seealso Currently none

#' @export

# changelog and author contributions / copyrights
#   Teresa Burlingame (2025-02-17)
#'     original creation
##############################################################################################
def.calc.temp.flags <- function(sensorInfo,
                                targetDepth = NULL,
                                distThreshold = NULL,
                                log = NULL) {

  # Initialize log if not provided
  if (base::is.null(log)) {
    log <- NEONprocIS.base::def.log.init()
  }

  # Constants
  QF_PASS <- 0L
  QF_FAIL <- 1L
  QF_NA <- -1L
  TEMP_DIFF_THRESHOLD <- 1  # degrees

  # Read primary sensor data
  closestSensor <- sensorInfo$closest
  tempDataClose <- NEONprocIS.base::def.read.parq(closestSensor$data_path)

  # Initialize output data frame
  tempData <- tempDataClose[, c('startDateTime', 'endDateTime')]
  tempData$temp_flag <- NA_integer_

  # Check if primary sensor has good data (finalQF == 0). NA finalQF counts as not good.
  idxGood <- !base::is.na(tempDataClose$finalQF) & tempDataClose$finalQF == QF_PASS

  if (base::all(idxGood)) {
    # All primary sensor data is good - use it directly
    log$debug(base::paste0('Using primary sensor ', closestSensor$sensor_id, ' (all data good)'))
    tempData$temp_flag <- base::as.integer(
      tempDataClose$soilTempMean < tempDataClose$soilTempExpUncert
    )

  } else {
    # Some primary data is flagged - need to use neighbors for those intervals
    log$debug(base::paste0('Primary sensor has flagged data, attempting neighbor average'))

    # Use primary sensor where data is good
    tempData$temp_flag[idxGood] <- base::as.integer(
      tempDataClose$soilTempMean[idxGood] < tempDataClose$soilTempExpUncert[idxGood]
    )

    # Try to use neighbor average for flagged intervals
    nextHigher <- sensorInfo$neighbors$higher
    nextLower <- sensorInfo$neighbors$lower

    # Neighbor averaging requires a sensor on both sides. The distance check is applied only
    # when both targetDepth and distThreshold are supplied.
    haveNeighbors <- !base::is.null(nextHigher) && !base::is.null(nextLower)
    haveDistInfo <- !base::is.null(targetDepth) && !base::is.null(distThreshold)

    useNeighbors <- FALSE
    if (haveNeighbors && haveDistInfo) {
      distHigher <- base::abs(targetDepth - nextHigher$depth_m)
      distLower <- base::abs(targetDepth - nextLower$depth_m)

      if (distHigher > distThreshold || distLower > distThreshold) {
        log$warn(base::paste0('Neighbor sensors exceed ', distThreshold, 'm distance threshold. ',
                              'Higher: ', base::round(distHigher, 3), 'm, ',
                              'Lower: ', base::round(distLower, 3), 'm. ',
                              'Skipping neighbor averaging.'))
      } else {
        useNeighbors <- TRUE
      }
    } else if (haveNeighbors) {
      # targetDepth or distThreshold not provided, proceed without distance check (legacy behavior)
      useNeighbors <- TRUE
    }

    if (useNeighbors) {
      # Read neighbor data
      tempDataHigher <- NEONprocIS.base::def.read.parq(nextHigher$data_path)
      tempDataLower <- NEONprocIS.base::def.read.parq(nextLower$data_path)

      # Filter for good data only (finalQF < 1). The NA guard matters: indexing a data frame
      # with an NA logical yields all-NA rows, and merge() would then pair the NA keys of one
      # neighbor with the NA keys of the other.
      idxGoodHigher <- !base::is.na(tempDataHigher$finalQF) & tempDataHigher$finalQF < QF_FAIL
      idxGoodLower <- !base::is.na(tempDataLower$finalQF) & tempDataLower$finalQF < QF_FAIL
      tempDataHigher <- tempDataHigher[idxGoodHigher, ]
      tempDataLower <- tempDataLower[idxGoodLower, ]

      # Calculate test statistic for each neighbor
      tempDataLower$zeroCheckLow <- tempDataLower$soilTempMean - tempDataLower$soilTempExpUncert
      tempDataHigher$zeroCheckHigh <- tempDataHigher$soilTempMean - tempDataHigher$soilTempExpUncert

      # Join neighbor data (full outer join so an interval with only one good neighbor is kept)
      tempDataJoin <- base::merge(
        tempDataLower[, c("startDateTime", "endDateTime", "zeroCheckLow")],
        tempDataHigher[, c("startDateTime", "endDateTime", "zeroCheckHigh")],
        by = c("startDateTime", "endDateTime"),
        all = TRUE
      )

      # Calculate average of neighbor checks. With na.rm = TRUE a single available neighbor
      # is used on its own; rows with no neighbor value produce NaN, converted to NA.
      tempDataJoin$avgZeroCheck <- base::rowMeans(
        base::cbind(tempDataJoin$zeroCheckLow, tempDataJoin$zeroCheckHigh),
        na.rm = TRUE
      )
      tempDataJoin$avgZeroCheck[base::is.nan(tempDataJoin$avgZeroCheck)] <- NA_real_

      # Test if average is less than threshold
      # NOTE(review): the primary-sensor test is (mean - uncertainty) < 0, while this neighbor
      # test is avg(mean - uncertainty) < TEMP_DIFF_THRESHOLD (1). Confirm the difference in
      # criteria is intended.
      tempDataJoin$zeroCheck <- base::ifelse(
        base::is.na(tempDataJoin$avgZeroCheck),
        NA_integer_,
        base::as.integer(tempDataJoin$avgZeroCheck < TEMP_DIFF_THRESHOLD)
      )

      # Merge with primary data and fill in gaps (left join keeps every primary interval)
      tempData <- base::merge(
        tempData,
        tempDataJoin[, c("startDateTime", "endDateTime", "zeroCheck")],
        by = c("startDateTime", "endDateTime"),
        all.x = TRUE
      )

      # Use neighbor check where primary flag is NA
      idxNeedNeighbor <- base::is.na(tempData$temp_flag)
      tempData$temp_flag[idxNeedNeighbor] <- tempData$zeroCheck[idxNeedNeighbor]
      tempData$zeroCheck <- NULL

      log$debug(base::paste0('Filled ', base::sum(idxNeedNeighbor & !base::is.na(tempData$temp_flag)),
                             ' intervals using neighbor average'))
    } else {
      log$warn('Insufficient neighbor sensors to calculate backup flags')
    }
  }

  # Set remaining NA values to -1 (test could not be run)
  tempData$temp_flag[base::is.na(tempData$temp_flag)] <- QF_NA

  return(tempData)
}
90 changes: 90 additions & 0 deletions flow/flow.envscn.temp.flags/def.find.temp.sensor.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
##############################################################################################
#' @title Find closest temperature sensor to target depth

#' @author
#' Teresa Burlingame \email{tburlingame@battelleecology.org}

#' @description
#' Identify the temperature sensor with depth closest to the target depth.
#' In case of tie, prefers shallower (less negative/more positive) depth.
#' Distances are compared after rounding to 1e-6 m so that depths which are equidistant in
#' decimal terms (e.g. -0.1 and -0.3 around a target of -0.2) are treated as a tie despite
#' floating-point representation error.

#' @param targetDepth Numeric value. Target depth in meters (negative = below surface)
#' @param sensorDepthDf Data frame with columns sensor_id, depth_m, data_path, location_path
#' @param log A logger object. Defaults to NULL.

#' @return A list with two elements, or NULL if no sensor has a valid depth or the target depth
#' is missing:
#' \describe{
#'   \item{closest}{Data frame row for the closest sensor}
#'   \item{neighbors}{List with elements 'higher' and 'lower' containing neighbor sensor info.
#'   'higher' is the nearest sensor shallower than the closest sensor and 'lower' the nearest
#'   sensor deeper than it; each is NULL when no such sensor exists.}
#' }

#' @references
#' License: GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007

#' @seealso Currently none

#' @export

# changelog and author contributions / copyrights
#   Teresa Burlingame (2025-02-17)
#'     original creation
##############################################################################################
def.find.temp.sensor <- function(targetDepth,
                                 sensorDepthDf,
                                 log = NULL) {

  # Initialize log if not provided
  if (base::is.null(log)) {
    log <- NEONprocIS.base::def.log.init()
  }

  # Number of decimal places (in meters) at which two distances are considered equal
  DEPTH_TOL_DIGITS <- 6L
  colsOut <- c("sensor_id", "depth_m", "data_path", "location_path")

  # A single, non-missing target depth is required to rank sensors
  if (base::is.null(targetDepth) || base::length(targetDepth) != 1 || base::is.na(targetDepth)) {
    log$warn('target depth is missing or not a single value, cannot select a temperature sensor')
    return(NULL)
  }

  # Filter out sensors with missing depths
  validSensors <- NULL
  if (!base::is.null(sensorDepthDf)) {
    validSensors <- sensorDepthDf[!base::is.na(sensorDepthDf$depth_m), ]
  }

  if (base::is.null(validSensors) || base::nrow(validSensors) == 0) {
    log$warn('no valid temperature depths found, all data will be flagged -1')
    return(NULL)
  }

  # Calculate absolute difference from target depth. Rounded so that the tie-break on depth
  # below is not defeated by floating-point noise in the subtraction.
  validSensors$abs_diff <- base::round(base::abs(validSensors$depth_m - targetDepth),
                                       DEPTH_TOL_DIGITS)

  # Sort by: 1) smallest difference, 2) shallowest depth, 3) sensor_id (for stability)
  validSensors <- validSensors[base::order(validSensors$abs_diff,
                                           -validSensors$depth_m,
                                           validSensors$sensor_id), ]

  # Select closest sensor
  closestSensor <- validSensors[1, colsOut]
  closestDepth <- closestSensor$depth_m

  # Find neighbor sensors (one shallower, one deeper) relative to the closest sensor
  # Shallower = greater depth_m (less negative)
  higherSensors <- validSensors[validSensors$depth_m > closestDepth, ]
  nextHigher <- NULL
  if (base::nrow(higherSensors) > 0) {
    # Ascending depth puts the shallower sensor nearest to the closest sensor first
    higherSensors <- higherSensors[base::order(higherSensors$depth_m, higherSensors$sensor_id), ]
    nextHigher <- higherSensors[1, colsOut]
  }

  # Deeper = smaller depth_m (more negative)
  lowerSensors <- validSensors[validSensors$depth_m < closestDepth, ]
  nextLower <- NULL
  if (base::nrow(lowerSensors) > 0) {
    # Descending depth puts the deeper sensor nearest to the closest sensor first
    lowerSensors <- lowerSensors[base::order(-lowerSensors$depth_m, lowerSensors$sensor_id), ]
    nextLower <- lowerSensors[1, colsOut]
  }

  log$debug(base::paste0('Closest sensor to depth ', targetDepth, 'm: ',
                         closestSensor$sensor_id, ' (', closestDepth, 'm)'))

  return(base::list(
    closest = closestSensor,
    neighbors = base::list(
      higher = nextHigher,
      lower = nextLower
    )
  ))
}
Loading