From 96eeb336c49b1378e2bba3bebf7870bee8f12fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 10:13:29 +0800 Subject: [PATCH 01/14] Update NAMESPACE --- NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index d9bba6b..9879beb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(getEdges) export(hmm) export(intensity) export(labelUtterances) +export(llh) export(loudness) export(lpc) export(magPhase) @@ -29,6 +30,7 @@ export(pipeline) export(play) export(segment.utterances) export(selectParams) +export(standardizeFeatures) export(turnDetector) export(vectorPreemphasis) export(winFrame) From f3cd71ad0738017bd236c227cb683187789dbd6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 09:52:56 +0800 Subject: [PATCH 02/14] compatibility make extract.features.R compatible with hmm and llh add an option to call standardizeFeatures with extractFeatures --- R/extract.features.R | 178 +++++++------ R/hmm.R | 25 +- R/hmm_autocorr.R | 549 ++++++++++++++++++++++++++++++++++++++++ R/llh.R | 6 +- R/standardizeFeatures.R | 332 ++++++++++++++++++++++++ 5 files changed, 1000 insertions(+), 90 deletions(-) create mode 100644 R/hmm_autocorr.R create mode 100644 R/standardizeFeatures.R diff --git a/R/extract.features.R b/R/extract.features.R index c276beb..bf3abad 100644 --- a/R/extract.features.R +++ b/R/extract.features.R @@ -6,52 +6,82 @@ #' @param config \code{audio_config}. An object of class 'audio_config' with parameters for extraction. #' @param use.full.names \code{boolean}. Whether to use full file path as names of returned list elements #' @param use.exts \code{boolean}. Whether to use file extensions in names of returned list elements. +#' @param raw.data \code{boolean}. Whether to include raw data in the output list. +#' @param timestamp \code{boolean}. Whether to include raw data in the output list. 
+#' @param standardize \code{boolean}. Whether to standardize the extracted features. Only works when both raw.data and timestamp are set to false. #' Only applicable if use.full.names is FALSE #' @return audio class #' @export -#' -extractFeatures <- function(files, config = loudness(createConfig()), use.full.names=T, use.exts=T) { - features <- purrr::map(files, function(x) { - result <- extractFeature(x, config) - attr(result, "filename") <- x - result - }) - - list_names <- files - if (!use.full.names) list_names <- basename(list_names) - if (!use.exts) list_names <- tools::file_path_sans_ext(list_names) - - purrr::set_names(features, list_names) + +extractFeatures <- function(files, config = loudness(createConfig()), + use.full.names=T, + use.exts=T, + raw.data = F, + timestamp = F, + standardize = F +) { + features <- purrr::map(files, function(x) { + result <- extractFeature(x, config) + attr(result, "filename") <- x + result + }) + + out = NULL + + list_names <- files + if (!use.full.names) list_names <- basename(list_names) + if (!use.exts) list_names <- tools::file_path_sans_ext(list_names) + + out$data <- purrr::set_names(features, list_names) + + if (standardize) { + if (raw.data | timestamp) + stop("standardizeFeatures can only be used when both raw.data and timestamp are set to false.") + else + out$data <- out$data %>% standardizeFeatures + } + + out$files$fname <- list_names + out$files$duration <- sapply(out$data, nrow) + + return(out) } -extractFeature <- function(filename, config = config) { - audio_nfeatures <- create_audio_object(filename, config) - audio <- audio_nfeatures$audio - n_features <- audio_nfeatures$n_features +extractFeature <- function(filename, config = config, + raw.data = F, + timestamp = F) { + + audio_nfeatures <- create_audio_object(filename, config, raw.data, timestamp) + audio <- audio_nfeatures$audio %>% as.matrix + + n_features <- audio_nfeatures$n_features + + if(raw.data){ raw_data <- add_raw_data(filename, 
audio$timestamps) # For some reasons, length of audio$timestamps is not the same as raw_data (returning # more data - I have to check it) audio$raw_data <- raw_data$cut_raw_data[1:length(audio$timestamps)] - attr(audio, "header") <- raw_data$header - - # If the no. of columns supplied by the config object is not the same as the no. of features - # we got, then most likely the config does not know how many features are created - # In this case, we name the feature columns based on the output name of the components - # But we can do so reliably only if there is a single component that connects to the sink. - column_names <- strsplit(attr(config, "columns"), ":")[[1]] - if (length(column_names) < n_features) { - edges <- attr(config, 'edges') - outputs <- subset(edges, 'explicit_output')$output - if (length(outputs) > 1) { - warning('Unable to determine column names for features. Please correctly specify the no. of features of each component.') - } else { - column_names <- paste0(outputs[1], '_', seq(n_features)) - } + attr(audio, "header") <- raw_data$header + } + + # If the no. of columns supplied by the config object is not the same as the no. of features + # we got, then most likely the config does not know how many features are created + # In this case, we name the feature columns based on the output name of the components + # But we can do so reliably only if there is a single component that connects to the sink. + column_names <- strsplit(attr(config, "columns"), ":")[[1]] + if (length(column_names) < n_features) { + edges <- attr(config, 'edges') + outputs <- subset(edges, 'explicit_output')$output + if (length(outputs) > 1) { + warning('Unable to determine column names for features. Please correctly specify the no. 
of features of each component.') + } else { + column_names <- paste0(outputs[1], '_', seq(n_features)) } - colnames(audio) <- c(column_names, "timestamps", "raw_data") - audio + } + colnames(audio) <- c(column_names) + audio } @@ -62,13 +92,13 @@ extractFeature <- function(filename, config = config) { #' @return speech object #' @export labelUtterances <- function(features, labels) { - if ( length(features) != length(labels) ) - stop("Verify the number of labels and different speech units") - - purrr::map2(features, labels, function(x, y) { - attr(x, "label") <- y - x - }) + if ( length(features) != length(labels) ) + stop("Verify the number of labels and different speech units") + + purrr::map2(features, labels, function(x, y) { + attr(x, "label") <- y + x + }) } @@ -79,63 +109,67 @@ labelUtterances <- function(features, labels) { #' @return speech class #' @export play <- function(x, ...) { - UseMethod("play", x) + UseMethod("play", x) } play.speech <- function(x, start = NULL, end = NULL) { - n <- 1:nrow(x) - if ( is.null(start) || is.null(end)) { - rcpp_playWavFile(purrr::as_vector(x$raw_data[n]), - attr(x, "header")) - } - else { - rcpp_playWavFileSubset(purrr::as_vector(x$raw_data[n]), - attr(x, "header"), start, end) - } + n <- 1:nrow(x) + if ( is.null(start) || is.null(end)) { + rcpp_playWavFile(purrr::as_vector(x$raw_data[n]), + attr(x, "header")) + } + else { + rcpp_playWavFileSubset(purrr::as_vector(x$raw_data[n]), + attr(x, "header"), start, end) + } } #' @export print.speech <- function(x, ...) { - base::print(format(x[1:(ncol(x)-1)])) -# invisible(x) + base::print(format(x[1:(ncol(x)-1)])) + # invisible(x) } head.speech <- function(x, ...) { - head(print(x)) + head(print(x)) } tail.speech <- function(x, ...) 
{ - tail(print(x)) + tail(print(x)) } -create_audio_object <- function(filename, config) { - config_string <- generate_config_string(config) - extracted_data <- rcpp_openSmileGetFeatures(filename, config_string_in = config_string) - audio <- as.data.frame(extracted_data$audio_features_0) - n_features <- ncol(audio) +create_audio_object <- function(filename, config, + raw.data = F, + timestamp = F) { + config_string <- communication:::generate_config_string(config) + extracted_data <- communication:::rcpp_openSmileGetFeatures(filename, config_string_in = config_string) + audio <- as.data.frame(extracted_data$audio_features_0) + n_features <- ncol(audio) + + if(timestamp) { timestamps <- as.vector(extracted_data$audio_timestamps_0) audio$timestamps <- timestamps - class(audio) <- c("speech", "data.frame") - - list(audio=audio, n_features=n_features) + } + + list(audio=audio, n_features=n_features) } add_raw_data <- function(filename, timestamps) { - raw_data <- rcpp_parseAudioFile(filename) - # Timemust must be cut in time intervals of same size - # We need to create a test for that - timestamp_interval <- timestamps[2] - - # Split raw_data in intervals of sampleRate * timestamp_interval - cut_raw_data <- split(raw_data[[2]], - ceiling(seq_along(raw_data[[2]])/(raw_data[[1]]$sampleRate*timestamp_interval))) - list(cut_raw_data = cut_raw_data, header = raw_data[[1]]) + raw_data <- rcpp_parseAudioFile(filename) + # Timemust must be cut in time intervals of same size + # We need to create a test for that + timestamp_interval <- timestamps[2] + + # Split raw_data in intervals of sampleRate * timestamp_interval + cut_raw_data <- split(raw_data[[2]], + ceiling(seq_along(raw_data[[2]])/(raw_data[[1]]$sampleRate*timestamp_interval))) + list(cut_raw_data = cut_raw_data, header = raw_data[[1]]) } diff --git a/R/hmm.R b/R/hmm.R index 8f4e3d4..3a57fe2 100644 --- a/R/hmm.R +++ b/R/hmm.R @@ -1,7 +1,3 @@ -# dk when derivatives > 0 delete first leading rows from each Xs before -# 
calculating Ts, passing to hmm -# then add leading rows of NAs back into results - ######################### # R wrapper for hmm_cpp # ######################### @@ -112,23 +108,20 @@ #' Defaults to 1. #' } #' } -#' @export -hmm <- function(Xs, # data +#' +hmm = function(Xs, # data weights=NULL, # weight on each element of Xs nstates, # number of states par=list(), # initialization control=list(), # EM control parameters labels=list() # labels for supervised training -) - { - - Xs +){ if (class(Xs) == 'matrix'){ Xs = list(Xs) } - if (any(sapply(Xs, function(X) class(X)!='matrix'))){ + if (any(sapply(Xs, function(X) class(X) != c("matrix", "array")))){ stop('Xs must be matrix or list of matrices') } @@ -136,9 +129,9 @@ hmm <- function(Xs, # data stop('all observation sequences must have same number of observed features') } - ## if (is.null(weights)){ - ## weights <- rep(weights, length(Xs)) - ## } + if (is.null(weights)){ + weights <- rep(weights, length(Xs)) + } # indices @@ -265,7 +258,7 @@ hmm <- function(Xs, # data if (control$standardize){ - Xs = standardizeFeatures(Xs) + Xs = standardizeFeatures(Xs, verbose=control$verbose) feature_means = attr(Xs[[1]], 'scaled:center') feature_sds = attr(Xs[[1]], 'scaled:scale') @@ -519,7 +512,7 @@ hmm <- function(Xs, # data if (iters >= control$maxiter){ if (diff(out$llh_seq[-1:0 + length(out$llh_seq)]) >= control$tol){ converged = FALSE - warning('failed to converge by maxiter = ', control$maxiter, ' iterations (tol = ', control$tol, ')') + if (control$verbose > 0) warning('failed to converge by maxiter = ', control$maxiter, ' iterations (tol = ', control$tol, ')') } } diff --git a/R/hmm_autocorr.R b/R/hmm_autocorr.R new file mode 100644 index 0000000..5a8504c --- /dev/null +++ b/R/hmm_autocorr.R @@ -0,0 +1,549 @@ +######################### +# R wrapper for hmm_cpp # +######################### + +#' Train a hidden Markov model with multivariate normal state distributions. 
+#' +#' @param Xs List of nsequences matrices; each matrix represents one observation +#' sequence and is of dimension nobs x nfeatures. For a single observation +#' sequence, a single matrix can be provided +#' @param weights Optional vector of weights, one for each observation sequence +#' @param nstates Integer; number of states +#' @param par List of initialization parameters; see 'Details' +#' @param control List of control parameters for EM steps +#' @param labels List of observation labels for supervised training, with each +#' element corresponding to an observation sequence. Element i can either be +#' an vector of integer state labels in \code{1:nstates} or a matrix of +#' dimension \code{nstates} x \code{nrow(Xs[[i]])} with columns summing to 1. +#' If labels are supplied, E-step is suppressed. +#' +#' @details The \code{par} argument is a list of initialization parameters. +#' Can supply any of the following components: +#' \itemize{ +#' \item{\code{method}}{ +#' Name of method used to automatically initialize EM run. Currently only +#' \code{'dirichlet'} and \code{'random-spherical'} are implemented. If +#' provided, user-specified state distributions are ignored. +#' \code{'dirichlet'} randomly generates responsibilities which are in turn +#' used to calculate starting distributions. \code{'random-spherical'} randomly draws +#' nstates observations and uses their features as state means; all state covariance matrices are set to a +#' diagonal matrix with entries \code{method_arg} (default=1). +#' } +#' \item{\code{method_arg}}{ +#' Argument to supply to \code{method}. For \code{method='dirichlet'}, this +#' is a scalar concentration \code{alpha} (same value used for all states). For \code{method='random-spherical'}, this is a +#' scalar for diagonal entries of the spherical covariance matrices of the +#' starting distributions (after features are standardized). +#' \code{'dirichlet'} is implemented. If provided, all other arguments are ignored. 
+#' } +#' \item{\code{resp}}{ +#' Matrix or list of nsequences matrices with rows summing to 1; each matrix +#' represents one observation sequence and is of dimension nobs x nstates, +#' with the (t,k)-th entry giving the initial probability that the t-th +#' observation belongs to state k. If either \code{resp} or both \code{mus} +#' and \code{Sigmas} are not provided, responsibilities are randomly +#' initialized using \code{rdirichlet} with all shape parameters set to 10. +#' } +#' \item{\code{mus}}{ +#' List of nstates vectors with length nfeatures, each corresponding to the mean of +#' a state distribution +#' } +#' \item{\code{Sigmas}}{ +#' List of nstates matrices with dimension nfeatures x nfeatures, each +#' corresponding to the covariance matrix of a state distribution +#' } +#' \item{\code{Gamma}}{ +#' Matrix of transition probabilities with dimension nstates x nstates, with +#' row k representing the probabilities of each transition out of k and +#' summing to 1. If not supplied, each row is randomly drawn from +#' \code{rdirichlet} with all shape parameters set to 10. +#' } +#' \item{\code{delta}}{ +#' Vector of initial state probabilities, of length nstates and summing to +#' 1. If not supplied, \code{delta} is set to the stationary distribution of +#' \code{Gamma}, i.e. the normalized first left eigenvector. +#' } +#' } +#' +#' The \code{control} argument is a list of EM control parameters that can supply +#' any of the following components +#' \itemize{ +#' \item{\code{lambda}}{ +#' Ridge-like regularization parameter. \code{lambda} is added to each +#' \code{diag(Sigmas[[k]])} to stabilize each state's covariance matrix, +#' which might otherwise be numerically singular, before inverting to +#' calculate multivariate normal densities. Note that regularization is +#' applied after all features are standardized, so \code{diag(Sigmas[[k]])} +#' is unlikely to contain elements greater than 1. 
This parameter should be +#' selected through cross-validation. +#' } +#' \item{\code{tol}}{ +#' EM terminates when the improvement in the log-likelihood between +#' successive steps is \code{< tol}. Defaults to 1e-6. +#' } +#' \item{\code{maxiter}}{ +#' EM terminates with a warning if \code{maxiter} iterations are reached +#' without convergence as defined by \code{tol}. Defaults to 100. +#' } +#' \item{\code{uncollapse}}{ +#' Threshold for detecting and resetting state distribution when they +#' collapse on a single point. State distributions are uncollapsed by +#' re-drawing \code{mus[[k]]} from a standard multivariate normal and +#' setting \code{Sigmas[[k]]} to the nfeatures-dimensional identity matrix. +#' Note that this distribution is with respect to the standardized features. +#' } +#' \item{\code{standardize}}{ +#' Whether features should be standardized. Defaults to \code{TRUE}. This +#' option also adds a small amount of noise , equal to .01 x feature +#' standard deviation, to observation-features that have been zeroed out +#' (e.g. f0 during unvoiced periods). If set to \code{FALSE}, it is assumed +#' that features have been externally standardized and zeroed-out values +#' handled. scaling$feature_means and scaling$feature_sds are set to 0s and +#' 1s, respectively, and no check is done to ensure this is correct. If +#' features are in fact not standardized and zeroes handled, bad things will +#' happen and nobody will feel sorry for you. +#' } +#' \item{\code{verbose}}{ +#' Integer in 0:1. If \code{1}, information on the EM process is reported. +#' Defaults to 1. 
+#' } +#' } +#' +hmm_autocorr = function( + Xs, # data + weights=NULL, # weight on each element of Xs + nstates, # number of states + par=list(), # initialization + control=list(), # EM control parameters + labels=list() # labels for supervised training + ){ + + if (class(Xs) == 'matrix') + Xs = list(Xs) + + if (any(sapply(Xs, function(X) class(X)!='matrix'))) + stop('Xs must be matrix or list of matrices') + + if (any(unique(diff(sapply(Xs, ncol))) != 0)) + stop('all observation sequences must have same number of observed features') + + if (is.null(weights)){ + weights <- rep(weights, length(Xs)) + } + + # indices + + K = floor(nstates) + N = length(Xs) + M = ncol(Xs[[1]]) # number of features + Ts = sapply(Xs, nrow) # number of obs in each sequence + + K_digits = ceiling(log(K+1, 10)) # for verbose + N_digits = ceiling(log(N+1, 10)) + + if (K != nstates | nstates < 2){ + stop('nstates must be integer >= 2') + } + + if (is.null(weights)){ + weights = rep(1, N) + } + + # try to read derivatives off attr, otherwise use names _d* + + if (is.null(attr(Xs[[1]], 'derivatives'))){ + derivatives = 0 + ind_d1 = grep('_d1$', colnames(Xs[[1]])) # 1st deriv + ind_d2 = grep('_d2$', colnames(Xs[[1]])) # 2nd deriv + if (length(ind_d1) > 0) + derivatives = 1 + if (length(ind_d2) > 0) + derivatives = 2 + } else { + derivatives = attr(Xs[[1]], 'derivatives') + } + + if (derivatives > 0){ + warning('derivative features not tested with autocorrelation model') + } + + if (is.null(attr(Xs[[1]], 'lags'))){ + lags = 0 + ind_lag = grep('_lag', colnames(Xs[[1]])) + if (length(ind_lag) > 0){ + lags = max(as.numeric(gsub('.*(_lag)?', '', colnames(Xs[[1]])[ind_lag]))) + ind_lag0 = (1:M)[-ind_lag] + } + } else { + lags = attr(Xs[[1]], 'lags') + } + + # set aside leading obs where derivatives cannot be calculated + + if (derivatives > 0){ + for (i in 1:N){ + Xs[[i]] = Xs[[i]][-(1:derivatives),] + } + } + + Ts = Ts - derivatives + + # defaults + + if (!'lambda' %in% ls(control)){ + 
control$lambda = 0 + } else { + if (control$lambda < 0) + stop('regularization parameter lambda must be nonnegative') + } + + if (!'tol' %in% ls(control)) + control$tol = 1e-6 + + if (!'maxiter' %in% ls(control)) + control$maxiter = 100 + + if (!'uncollapse' %in% ls(control)) + control$uncollapse = .01^M + + if (!'verbose' %in% ls(control)) + control$verbose = 1 + + # process labels for supervised hmm + + supervised = FALSE + + if (length(labels) > 0){ + supervised = TRUE + + if (N==1 & is.atomic(labels)) + labels = list(labels) + + if (all(sapply(labels, is.numeric))){ + + if (length(labels) != N) + stop('length(labels) != length(Xs)') + + if (all(sapply(labels, is.matrix))){ + + if (any(sapply(labels, ncol) != K)) + stop('ncol of labels[[i]] must equal nstates for all i') + if (any(sapply(labels, nrow) != Ts)) + stop('nrow(labels[[i]]) must equal nrow(Xs[[i]]) for all i') + if (any(unlist(sapply(labels, rowSums)) - 1 > 4 * .Machine$double.eps)) + stop('labels for each observation must sum to 1') + + par$resp = labels + + } else if (all(sapply(labels, is.vector))){ + + if (any(sapply(labels, length) != Ts)) + stop('length of labels[[i]] must equal nrow(Xs[[i]]) for all i') + if (any(!unique(unlist(labels)) %in% 1:K)) + stop('all integer labels must be in 1:nstates') + + par$resp = lapply(labels, function(lab){ + out = matrix(0, length(lab), nstates) + out[cbind(1:length(lab), lab)] = 1 + return(out) + }) + + } else { + stop('labels for supervised training must be either (a) list of integer vectors with ', + 'length(labels)==length(Xs), length(labels[[i]])==nrow(Xs[[i]]), and labels[[i]][t] %in% 1:nstates; ', + 'or (b) list of matrices with length(labels)==length(Xs), nrow(labels[[i]])==nstates, ', + 'ncol(labels[[i]])==nrow(Xs[[i]]), and all(colSums(labels[[i]]) == 1)') + } + } else { + stop('labels for supervised training must be list of numeric vectors or matrices') + } + + if (derivatives > 0){ + for (i in 1:N){ + par$resp = par$resp[-(1:derivatives),] + } + } 
+ + } + + ### standardize covariates - working in-place to conserve ry + + if (!'standardize' %in% ls(control)) + control$standardize = TRUE + + if (control$standardize){ + + Xs = standardizeFeatures(Xs, verbose=control$verbose) + + feature_means = attr(Xs[[1]], 'scaled:center') + feature_sds = attr(Xs[[1]], 'scaled:scale') + + nonmissing = attr(Xs, 'nonmissing') + missingness_labels = attr(Xs, 'missingness_labels') + nonmissing_features = attr(Xs, 'nonmissing_features') + + } else { + + if (is.null(attr(Xs[[1]], 'scaled:center'))){ + feature_means = rep(0, M) + } else { + feature_means = attr(Xs[[1]], 'scaled:center') + } + + if (is.null(attr(Xs[[1]], 'scaled:scale'))){ + feature_sds = rep(1, M) + } else { + feature_sds = attr(Xs[[1]], 'scaled:scale') + } + + if (is.null(attr(Xs, 'nonmissing'))){ + nonmissing = lapply(Ts, function(x) 1:x) # assume nothing is missing + } else { + nonmissing = attr(Xs, 'nonmissing') + } + + if (is.null(attr(Xs, 'missingness_labels'))){ + missingness_labels = lapply(Ts, function(x) rep(1,x)) # all obs have same pattern of missingness (none) + } else { + missingness_labels = attr(Xs, 'missingness_labels') + } + + if (is.null(attr(Xs, 'nonmissing_features'))){ + nonmissing_features = list(1:M) # nothing is missing + } else { + nonmissing_features = attr(Xs, 'nonmissing_features') + } + + } + + ### initialize state distributions by either + # (1) providing (near-uniform) obs-level responsibilities (recommended for small datasets < 1e6 obs), or + # (2) providing (dispersed and high-variance) starting state distributions (faster to initialize) + + if ('resp' %in% ls(par)){ + if (!is.null(par$method)) + warning('responsibilities supplied; ignoring "par$method"') + par$method = 'user-responsibilities' + } else if (all(c('mus', 'Sigmas') %in% ls(par))){ + par$method = 'user-states' + } else if (!is.null(par$method)) { + if (par$method != 'random-spherical') + warning('"par$method" unknown or improper inputs supplied; ignoring') + } else { + 
par$method = 'random-dirichlet' + } + + if (par$method == 'random-spherical'){ + Ts_running = cumsum(Ts) + seq.draws = table(sample.int(N, K, replace=TRUE, prob=sapply(nonmissing, length))) + par$mus = do.call(cbind, lapply(names(seq.draws), function(seq){ + mus.ind = sample(nonmissing[[as.numeric(seq)]], seq.draws[seq]) + t(Xs[[as.numeric(seq)]][mus.ind, , drop=FALSE]) + })) + + if (is.null(par$method_arg)) + par$method_arg = 1 # entries on diagonal of initial state covariance matrix + + par$Sigmas = replicate(K, par$method_arg*diag(M), simplify=FALSE) + + } + + # if no initialization whatsoever, assign responsibilities + if (par$method == 'random-dirichlet'){ + if (is.null(par$method_arg)) + par$method_arg = 1 # dirichlet concentration for initial responsibilities + + par$resp = list() + for (i in 1:N){ + par$resp[[i]] = rdirichlet(Ts[[i]], rep(par$method_arg, K)) + } + + } + + # if responsibilities provided + if (par$method %in% c('user-responsibilities', 'random-dirichlet')){ + + if (any(sapply(par$resp, ncol) != K)){ + stop('all responsibility matrices must have nstates columns') + } + + if (!all(sapply(par$resp, nrow) == Ts)){ + stop('each responsibility matrix must have same nrow as corresponding element of Xs') + } + + if (any(unlist(sapply(par$resp, rowSums)) - 1 > 4 * .Machine$double.eps)){ + stop('rows of all responsibility matrices must sum to 1') + } + + if (any(c('mus', 'Sigmas') %in% ls(par))){ + warning('state responsibilities provided; state distribution parameters (mus/Sigmas) ignored') + } + + state_sizes = rowSums(sapply(1:N, function(i) colSums(par$resp[[i]][nonmissing[[i]],]))) + + # for each state, get mean of obs (weighted by responsibility) + par$mus = matrix(0, K, M) + for (i in 1:N){ + par$mus = par$mus + crossprod(par$resp[[i]][nonmissing[[i]],], Xs[[i]][nonmissing[[i]],]) + } + par$mus = t(par$mus) + par$mus = sweep(par$mus, 2, state_sizes, '/') + + # for each state, get weighted cov matrix (weighted by responsibility) + par$Sigmas = 
replicate(K, matrix(0, M, M), simplify=F) + if (control$verbose == 1) + cat('\n initializing cluster ', rep(' ', K_digits + N_digits + 9), sep='') + for (k in 1:K){ + if (control$verbose == 1){ + cat(rep('\b', K_digits + N_digits + 9), sprintf(paste('%', K_digits, 'd', sep=''), k), sep='') + cat(' obs seq ', rep(' ', N_digits), sep='') + } + for (i in 1:N){ + if (control$verbose == 1) + cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') + par$Sigmas[[k]] = par$Sigmas[[k]] + + t(Xs[[i]][nonmissing[[i]],]) %*% diag(par$resp[[i]][nonmissing[[i]], k]) %*% Xs[[i]][nonmissing[[i]],] + } + par$Sigmas[[k]] = par$Sigmas[[k]] / state_sizes[k] + } + + # if starting state distributions provided, shift and scale to account for standardization + } + if (control$verbose == 1){ + cat('\n') + } + + + if (par$method == 'user-states') { + + if (all(c('mus', 'Sigmas') %in% ls(par)) & is.null(par$method)){ + + par$mus = lapply(par$mus, function(mu){ + mu = mu - feature_means + mu = mu / feature_sds + }) + par$mus = do.call(cbind, par$mus) + + for (k in 1:K){ # standardize and check Sigmas are PD + par$Sigmas[[k]] = sweep(par$Sigmas[[k]], 1, feature_sds, '/') + par$Sigmas[[k]] = sweep(par$Sigmas[[k]], 2, feature_sds, '/') + chol(par$Sigmas[[k]] + control$lambda*diag(M)) + } + + } + } + + # initialize transition matrix + if ('Gamma' %in% ls(par)){ + + if (class(par$Gamma) != 'matrix' | !all.equal(dim(par$Gamma), c(K, K))) + stop('Gamma must be nstates x nstates numeric matrix') + + if (any(rowSums(par$Gamma) - 1 > 4 * .Machine$double.eps)){ + stop('rows of Gamma must sum to 1') + } + + } else { + + par$Gamma = rdirichlet(K, rep(10,K)) + + } + + # initialize starting distribution + if ('delta' %in% ls(par)){ + + if (class(par$delta) != 'numeric' | length(par$delta) != K) + stop('delta must be numeric vector of length nstates') + + if (sum(delta) > 4 * .Machine$double.eps) + stop('delta must sum to 1') + + par$delta = t(par$delta) + + } else { + par$delta = 
solve(t(diag(K) - par$Gamma + 1), rep(1, K)) + } + + # c++ indexing + nonmissing = lapply(nonmissing, function(x) x - 1) + missingness_labels = lapply(missingness_labels, function(x) x - 1) + nonmissing_features = lapply(nonmissing_features, function(x) x - 1) + + out = hmm_autocorr_cpp( + Xs, + weights, + par$delta, + par$mus, + par$Sigmas, + par$Gamma, + if (supervised) lapply(par$resp, t), + labels_t=ind_lag0-1, + labels_y=ind_lag-1, + control$lambda, + control$tol, + control$maxiter, + control$uncollapse, + control$verbose==1, + supervised + ) + + out$weights = weights + + # rescale mus, Sigmas from output + for (k in 1:K){ + out$mus[[k]] = out$mus[[k]] * feature_sds + out$mus[[k]] = out$mus[[k]] + feature_means + out$Sigmas[[k]] = sweep(out$Sigmas[[k]], 1, feature_sds, '*') + out$Sigmas[[k]] = sweep(out$Sigmas[[k]], 2, feature_sds, '*') + } + + # stationary distribution + out$delta = solve(t(diag(K) - out$Gamma + 1), rep(1, K)) + + out$nstates = K + + # append feature means/sds to output so new data can be scaled accordingly + out$scaling = list(feature_means = feature_means, + feature_sds = feature_sds) + + # append feature names + out$dimnames = colnames(Xs[[1]]) + + # append initialization and control parameters + out$par = par + out$control = control + + # llh of individual obs seq + out$llhs = as.numeric(out$llhs) + + # trim leading -Inf (used internally so llh increase always defined) + out$llh_seq = out$llh_seq[-1] + iters = length(out$llh_seq) + converged = TRUE + if (iters >= control$maxiter){ + if (diff(out$llh_seq[-1:0 + length(out$llh_seq)]) >= control$tol){ + converged = FALSE + warning('failed to converge by maxiter = ', control$maxiter, ' iterations (tol = ', control$tol, ')') + } + } + + # add NAs back in for leading obs for which derivatives cannot be calculated + if (derivatives > 0){ + for (i in 1:N){ + out$lstateprobs[[i]] = rbind(matrix(NA, derivatives, K), + out$lstateprobs[[i]]) + out$zetas[[i]] = cbind(matrix(NA, K, derivatives), + 
out$zetas[[i]]) + } + } + + out$convergence = list(converged=converged, + iters=iters, + resets=out$resets) + out$resets = NULL # move 'resets' from cpp output to convergence diagnostics + + class(out) = 'feelr.hmm' + + return(out) + +} diff --git a/R/llh.R b/R/llh.R index 9f85312..f203204 100644 --- a/R/llh.R +++ b/R/llh.R @@ -8,7 +8,7 @@ #' sequence and is of dimension nobs x nfeatures. For a single observation #' sequence, a single matrix can be provided #' @param mod Model object of class 'feelr.hmm', as output by \code{hmm} -#' @param control list +#' #' @return List with two components. \code{llhs} is a numeric vector of #' log-likelihoods of each observation sequence in \code{Xs}. \code{llh_total} #' is the log-likelihood of all observation sequences together, i.e. @@ -17,7 +17,9 @@ #' \code{mod$llhs}. This is because \code{hmm} estimates the starting state of #' each sequence, whereas here it is assumed that the starting state is drawn #' from the stationary distribution \code{mod$delta}. +#' @export #' +#' @examples llh = function(Xs, # data mod, # fitted feelr.hmm model control=list() @@ -29,7 +31,7 @@ llh = function(Xs, # data if (class(Xs) == 'matrix') Xs = list(Xs) - if (any(sapply(Xs, function(X) class(X)!='matrix'))) + if (any(sapply(Xs, function(X) class(X)!=c('matrix', 'array')))) stop('Xs must be matrix or list of matrices') if (any(sapply(Xs, ncol) != length(mod$mus[[1]]))) diff --git a/R/standardizeFeatures.R b/R/standardizeFeatures.R new file mode 100644 index 0000000..176c1c6 --- /dev/null +++ b/R/standardizeFeatures.R @@ -0,0 +1,332 @@ +#' Title +#' +#' @param Xs Data +#' @param feature_means Numeric vector corresponding to columns of elements in Xs +#' @param feature_sds Numeric vector corresponding to columns of elements in Xs. If not supplied, will be computed from Xs. +#' @param verbose Verbose printing +#' +#' @return Standardizes `data` of objects of class +#' `preppedAudio`. Maintains structure of original object +#' otherwise. 
Is used to standardize data where the recording +#' environment systematically shifts audio features. +#' +#' @details \code{feature_means} and \code{feature_sds} are provided to allow +#' alignment of new datasets. For example, after a model is trained, new data +#' for prediction must be transformed in the same way as the training data to +#' ensure predictions are valid. If either is \code{NULL}, both will be +#' computed from \code{Xs} and the output will be internally standardized +#' (i.e., columns of \code{do.call(rbind, standardizeFeatures(Xs))} will be +#' have a mean of 0 and a standard deviation of 1). +#' +#' @examples +#' data('audio') +#' audio$data <- standardizeFeatures( +#' lapply(audio$data, function(x) na.omit(x)) +#' ) +#' +#' @export +#' +standardizeFeatures = function(Xs, feature_means=NULL, feature_sds=NULL, verbose=1){ + + if (class(Xs) == 'matrix') + Xs = list(Xs) + + if (any(unique(diff(sapply(Xs, ncol))) != 0)) + stop('all observation sequences must have same number of observed features') + + if (length(Xs) > 1){ + for (i in 1:length(Xs)){ + if (!all.equal(colnames(Xs[[i]]), colnames(Xs[[1]]))) + stop('colnames differ across observation sequences') + } + } + + if (!is.null(feature_means)){ + if (!is.numeric(feature_means) | !length(feature_means) == ncol(Xs[[1]])) + stop('"feature_means" must be numeric vector of length ncol(Xs[[i]]) for all i') + } + + if (!is.null(feature_sds)){ + if (!is.numeric(feature_sds) | !length(feature_sds) == ncol(Xs[[1]])) + stop('"feature_sds" must be numeric vector of length ncol(Xs[[i]]) for all i') + } + + internal = TRUE + if (xor(is.null(feature_means), is.null(feature_sds))){ + warning('both "feature_means" and "feature_sds" must be supplied for external alignment; ignoring') + feature_means = NULL + feature_sds = NULL + } + if (!(is.null(feature_means) | is.null(feature_sds))) { + internal = FALSE + } + + if (verbose == 1) + cat('standardizing features\n') + + # indices + + N = length(Xs) + N_digits = 
ceiling(log(N+1, 10)) + M = ncol(Xs[[1]]) + Ts = sapply(Xs, nrow) # number of obs in each sequence + + ind_d0 = grep('_d\\d$', colnames(Xs[[1]]), invert=TRUE) # base features + ind_d1 = grep('_d1$', colnames(Xs[[1]])) # 1st deriv + ind_d2 = grep('_d2$', colnames(Xs[[1]])) # 2nd deriv + + M_d0 = length(ind_d0) + M_d1 = length(ind_d1) + M_d2 = length(ind_d2) + + derivatives = 0 + if (M_d1 > 0) + derivatives = 1 + if (M_d2 > 0) + derivatives = 2 + + if (derivatives >= 1){ + if (!all.equal(colnames(Xs[[1]])[ind_d0], gsub('_d1$', '', colnames(Xs[[1]])[ind_d1]))){ + stop('failed to align base features with first derivatives') + } + } + + if (derivatives == 2){ + if (!all.equal(colnames(Xs[[1]])[ind_d0], gsub('_d2$', '', colnames(Xs[[1]])[ind_d2]))){ + stop('failed to align base features with second derivatives') + } + } + + # identify patterns of missingness + + for (i in 1:N){ + Xs[[i]][!is.finite(Xs[[i]])] = NA + } + + missingness = lapply(Xs, is.na) + missingness_binary = lapply(missingness, function(x){ + for (m in 1:M){ + x[,m] = 2^(M-m) * x[,m] # missingness -> binary sequence -> decimal + } + return(rowSums(x)) + }) + nonmissing = lapply(missingness_binary, function(x){ + x == 0 + }) + complete_obs = sum(sapply(nonmissing, sum)) + if (complete_obs <= M && internal){ + stop('only ', complete_obs, ' complete observations (no missing features) found; ', + 'provide at least nfeatures complete observations to estimate model') + } else if (complete_obs < 1000 & complete_obs < sum(Ts)){ + warning('only ', complete_obs, ' complete observations (no missing features) found; ', + 'partially censored observations are used for likelihood calculations but ', + 'dropped for feature standardization and estimation of state distributions.') + } + + missingness_modes = unique(do.call(rbind, missingness)) + missingness_modes_binary = unique(do.call(c, missingness_binary)) + + nonmissing_features = lapply(1:nrow(missingness_modes), function(i){ + which(!missingness_modes[i,]) + }) + 
missingness_labels = lapply(missingness_binary, function(x){ + match(x, missingness_modes_binary) + }) + + attr(Xs, 'nonmissing') = lapply(nonmissing, which) + attr(Xs, 'missingness_labels') = missingness_labels + attr(Xs, 'nonmissing_features') = nonmissing_features + + # locate zeroes and contaminated adjacent derivatives + zeroes = lapply(Xs, function(X) which(X[,ind_d0]==0, arr.ind=T)) + + if (derivatives >= 1){ # indices of contaminated 1st derivatives + zeroes_back1 = zeroes_fwd1 = zeroes + for (i in 1:N){ + zeroes_back1[[i]][,'row'] = zeroes_back1[[i]][,'row'] - 1 + zeroes_back1[[i]] = zeroes_back1[[i]][zeroes_back1[[i]][,'row'] > 0,] + zeroes_fwd1[[i]][,'row'] = zeroes_fwd1[[i]][,'row'] + 1 + zeroes_fwd1[[i]] = zeroes_fwd1[[i]][zeroes_fwd1[[i]][,'row'] <= Ts[i],] + } + } + + if (derivatives == 2){ # indices of contaminated 2nd derivatives + zeroes_back2 = zeroes_fwd2 = zeroes + for (i in 1:N){ + zeroes_back2[[i]][,'row'] = zeroes_back2[[i]][,'row'] - 2 + zeroes_back2[[i]] = zeroes_back2[[i]][zeroes_back2[[i]][,'row'] > 0,] + zeroes_fwd2[[i]][,'row'] = zeroes_fwd2[[i]][,'row'] + 2 + zeroes_fwd2[[i]] = zeroes_fwd2[[i]][zeroes_fwd2[[i]][,'row'] <= Ts[i],] + } + } + + # standardize + + if (verbose == 1) + cat(' centering obs seq ', rep(' ', N_digits), sep='') + + if (internal){ # calculate internal means of features (as opposed to using means from external source) + running_sum = rep(0, M) + running_count = rep(0, M) + for (i in 1:N){ + Xs.zerotoNA <- Xs[[i]] + Xs.zerotoNA[zeroes[[i]]] <- NA + running_sum <- running_sum + colSums(Xs.zerotoNA, na.rm = TRUE) + running_count <- running_count + colSums(!is.na(Xs.zerotoNA)) + } + feature_means <- running_sum / running_count + ## rowSums(sapply(1:N, function(i){ + ## colSums(Xs[[i]][nonmissing[[i]],]), na.rm = TRUE) + ## }) / + ## sum(sapply(nonmissing, sum)) + } + + for (i in 1:N){ + if (verbose == 1){ + cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') + } + Xs[[i]] = sweep(Xs[[i]], 2, 
feature_means) + } + if (verbose == 1){ + cat('\n') + } + if (verbose == 1){ + cat(' scaling obs seq ', rep(' ', N_digits), sep='') + } + + if (internal){ # calculate internal sds of features (as opposed to using sds from external source) + running_sum_sq = rep(0, M) + for (i in 1:N){ + Xs.zerotoNA <- Xs[[i]] + Xs.zerotoNA[zeroes[[i]]] <- NA + running_sum_sq <- running_sum_sq + colSums(Xs.zerotoNA^2, na.rm = TRUE) + } + feature_sds = sqrt(running_sum_sq / (running_count - 1)) + ## feature_sds = sqrt( + ## rowSums(sapply(1:N, function(i){ + ## colSums(Xs[[i]][nonmissing[[i]],]^2)) + ## }) / + ## (sum(sapply(nonmissing, sum)) - 1) + ## ) + } + if (any(feature_sds == 0 | is.na(feature_sds))){ + stop('zero variance in the following features ', + 'after dropping incomplete obs (missing features):\n ', + paste(colnames(Xs[[1]])[feature_sds == 0], collapse=', '), + '\npartially censored obs are used in likelihood calculations ', + 'but not feature standardization or estimation of state distributions' + ) + } + for (i in 1:N){ + if (verbose == 1){ + cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') + } + Xs[[i]] = scale(Xs[[i]], center=F, scale=feature_sds) + attr(Xs[[i]], 'scaled:center') = feature_means + } + if (verbose == 1){ + cat('\n') + } + + # + + + + + # handle zeroed-out feature-observations + seqs_with_zeroes = which(sapply(zeroes, length) > 0) + if (length(seqs_with_zeroes) > 0){ + + if (verbose == 1){ + cat(' handling zeroed-out obs in seq ', rep(' ', N_digits), sep='') + } + + for (i in seqs_with_zeroes){ + + if (verbose == 1) + cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') + + # replace zeroed-out feature-obs with -2 sd + small noise + Xs[[i]][zeroes[[i]]] = stats::rnorm(nrow(zeroes[[i]]), + mean = -3, + sd = .1) + + } + if (verbose == 1){ + cat('\n') + } + + # handle 1st derivative of zeroed-out feature-observations + if (derivatives >= 1){ + + if (verbose == 1) + cat(' handling 1st derivatives 
at zeroed-out obs in seq ', rep(' ', N_digits), sep='') + + for (i in seqs_with_zeroes){ + + if (verbose == 1) + cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') + + zeroes_back1[[i]][,'col'] = zeroes_back1[[i]][,'col'] + M_d0 + zeroes[[i]][,'col'] = zeroes[[i]][,'col'] + M_d0 + zeroes_fwd1[[i]][,'col'] = zeroes_fwd1[[i]][,'col'] + M_d0 + + # pull contaminated derivatives back to zero and add small noise + Xs[[i]][zeroes_back1[[i]]] =stats::rnorm(nrow(zeroes_back1[[i]]), 0, sd=.01) + Xs[[i]][zeroes[[i]]] =stats::rnorm(nrow(zeroes[[i]]), 0, sd=.01) + Xs[[i]][zeroes_fwd1[[i]]] =stats::rnorm(nrow(zeroes_fwd1[[i]]), 0, sd=.01) + + } + if (verbose == 1){ + cat('\n') + } + + } + + # handle 2nd derivative of zeroed-out feature-observations + if (derivatives == 2){ + + if (verbose == 1) + cat(' handling 2nd derivatives at zeroed-out obs in seq ', rep(' ', N_digits), sep='') + + for (i in seqs_with_zeroes){ + + if (verbose == 1) + cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') + + zeroes_back2[[i]][,'col'] = zeroes_back2[[i]][,'col'] + M_d0 + M_d1 + zeroes_back1[[i]][,'col'] = zeroes_back1[[i]][,'col'] + M_d0 + zeroes[[i]][,'col'] = zeroes[[i]][,'col'] + M_d0 + zeroes_fwd1[[i]][,'col'] = zeroes_fwd1[[i]][,'col'] + M_d0 + zeroes_fwd2[[i]][,'col'] = zeroes_fwd2[[i]][,'col'] + M_d0 + M_d1 + + # pull contaminated derivatives back to zero and add small noise + Xs[[i]][zeroes_back2[[i]]] =stats::rnorm(nrow(zeroes_back2[[i]]), 0, sd=.01) + Xs[[i]][zeroes_back1[[i]]] =stats::rnorm(nrow(zeroes_back1[[i]]), 0, sd=.01) + Xs[[i]][zeroes[[i]]] =stats::rnorm(nrow(zeroes[[i]]), 0, sd=.01) + Xs[[i]][zeroes_fwd1[[i]]] =stats::rnorm(nrow(zeroes_fwd1[[i]]), 0, sd=.01) + Xs[[i]][zeroes_fwd2[[i]]] =stats::rnorm(nrow(zeroes_fwd2[[i]]), 0, sd=.01) + + attr(Xs[[i]], 'derivatives') = derivatives + + } + if (verbose == 1){ + cat('\n') + } + + } + + } + + if (derivatives > 0){ + for (i in 1:N){ + ## drop leading frames if they have 
missing derivatives + if (sum(is.na(Xs[[i]][derivatives,])) >= M / (derivatives + 1)){ + Xs[[i]][1:derivatives,] = NA + } + } + } + + return(Xs) + +} From 791b4c3b3873dffadd2c1f3aeb445b7af99e9f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 13:54:13 +0800 Subject: [PATCH 03/14] Revert "Update NAMESPACE" This reverts commit 96eeb336c49b1378e2bba3bebf7870bee8f12fd0. --- NAMESPACE | 2 -- 1 file changed, 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9879beb..d9bba6b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,7 +18,6 @@ export(getEdges) export(hmm) export(intensity) export(labelUtterances) -export(llh) export(loudness) export(lpc) export(magPhase) @@ -30,7 +29,6 @@ export(pipeline) export(play) export(segment.utterances) export(selectParams) -export(standardizeFeatures) export(turnDetector) export(vectorPreemphasis) export(winFrame) From 66f06d04a74423ae36bbef56c7f783f999594ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 13:55:49 +0800 Subject: [PATCH 04/14] Delete hmm_autocorr.R --- R/hmm_autocorr.R | 549 ----------------------------------------------- 1 file changed, 549 deletions(-) delete mode 100644 R/hmm_autocorr.R diff --git a/R/hmm_autocorr.R b/R/hmm_autocorr.R deleted file mode 100644 index 5a8504c..0000000 --- a/R/hmm_autocorr.R +++ /dev/null @@ -1,549 +0,0 @@ -######################### -# R wrapper for hmm_cpp # -######################### - -#' Train a hidden Markov model with multivariate normal state distributions. -#' -#' @param Xs List of nsequences matrices; each matrix represents one observation -#' sequence and is of dimension nobs x nfeatures. 
For a single observation -#' sequence, a single matrix can be provided -#' @param weights Optional vector of weights, one for each observation sequence -#' @param nstates Integer; number of states -#' @param par List of initialization parameters; see 'Details' -#' @param control List of control parameters for EM steps -#' @param labels List of observation labels for supervised training, with each -#' element corresponding to an observation sequence. Element i can either be -#' an vector of integer state labels in \code{1:nstates} or a matrix of -#' dimension \code{nstates} x \code{nrow(Xs[[i]])} with columns summing to 1. -#' If labels are supplied, E-step is suppressed. -#' -#' @details The \code{par} argument is a list of initialization parameters. -#' Can supply any of the following components: -#' \itemize{ -#' \item{\code{method}}{ -#' Name of method used to automatically initialize EM run. Currently only -#' \code{'dirichlet'} and \code{'random-spherical'} are implemented. If -#' provided, user-specified state distributions are ignored. -#' \code{'dirichlet'} randomly generates responsibilities which are in turn -#' used to calculate starting distributions. \code{'random-spherical'} randomly draws -#' nstates observations and uses their features as state means; all state covariance matrices are set to a -#' diagonal matrix with entries \code{method_arg} (default=1). -#' } -#' \item{\code{method_arg}}{ -#' Argument to supply to \code{method}. For \code{method='dirichlet'}, this -#' is a scalar concentration \code{alpha} (same value used for all states). For \code{method='random-spherical'}, this is a -#' scalar for diagonal entries of the spherical covariance matrices of the -#' starting distributions (after features are standardized). -#' \code{'dirichlet'} is implemented. If provided, all other arguments are ignored. 
-#' } -#' \item{\code{resp}}{ -#' Matrix or list of nsequences matrices with rows summing to 1; each matrix -#' represents one observation sequence and is of dimension nobs x nstates, -#' with the (t,k)-th entry giving the initial probability that the t-th -#' observation belongs to state k. If either \code{resp} or both \code{mus} -#' and \code{Sigmas} are not provided, responsibilities are randomly -#' initialized using \code{rdirichlet} with all shape parameters set to 10. -#' } -#' \item{\code{mus}}{ -#' List of nstates vectors with length nfeatures, each corresponding to the mean of -#' a state distribution -#' } -#' \item{\code{Sigmas}}{ -#' List of nstates matrices with dimension nfeatures x nfeatures, each -#' corresponding to the covariance matrix of a state distribution -#' } -#' \item{\code{Gamma}}{ -#' Matrix of transition probabilities with dimension nstates x nstates, with -#' row k representing the probabilities of each transition out of k and -#' summing to 1. If not supplied, each row is randomly drawn from -#' \code{rdirichlet} with all shape parameters set to 10. -#' } -#' \item{\code{delta}}{ -#' Vector of initial state probabilities, of length nstates and summing to -#' 1. If not supplied, \code{delta} is set to the stationary distribution of -#' \code{Gamma}, i.e. the normalized first left eigenvector. -#' } -#' } -#' -#' The \code{control} argument is a list of EM control parameters that can supply -#' any of the following components -#' \itemize{ -#' \item{\code{lambda}}{ -#' Ridge-like regularization parameter. \code{lambda} is added to each -#' \code{diag(Sigmas[[k]])} to stabilize each state's covariance matrix, -#' which might otherwise be numerically singular, before inverting to -#' calculate multivariate normal densities. Note that regularization is -#' applied after all features are standardized, so \code{diag(Sigmas[[k]])} -#' is unlikely to contain elements greater than 1. 
This parameter should be -#' selected through cross-validation. -#' } -#' \item{\code{tol}}{ -#' EM terminates when the improvement in the log-likelihood between -#' successive steps is \code{< tol}. Defaults to 1e-6. -#' } -#' \item{\code{maxiter}}{ -#' EM terminates with a warning if \code{maxiter} iterations are reached -#' without convergence as defined by \code{tol}. Defaults to 100. -#' } -#' \item{\code{uncollapse}}{ -#' Threshold for detecting and resetting state distribution when they -#' collapse on a single point. State distributions are uncollapsed by -#' re-drawing \code{mus[[k]]} from a standard multivariate normal and -#' setting \code{Sigmas[[k]]} to the nfeatures-dimensional identity matrix. -#' Note that this distribution is with respect to the standardized features. -#' } -#' \item{\code{standardize}}{ -#' Whether features should be standardized. Defaults to \code{TRUE}. This -#' option also adds a small amount of noise , equal to .01 x feature -#' standard deviation, to observation-features that have been zeroed out -#' (e.g. f0 during unvoiced periods). If set to \code{FALSE}, it is assumed -#' that features have been externally standardized and zeroed-out values -#' handled. scaling$feature_means and scaling$feature_sds are set to 0s and -#' 1s, respectively, and no check is done to ensure this is correct. If -#' features are in fact not standardized and zeroes handled, bad things will -#' happen and nobody will feel sorry for you. -#' } -#' \item{\code{verbose}}{ -#' Integer in 0:1. If \code{1}, information on the EM process is reported. -#' Defaults to 1. 
-#' } -#' } -#' -hmm_autocorr = function( - Xs, # data - weights=NULL, # weight on each element of Xs - nstates, # number of states - par=list(), # initialization - control=list(), # EM control parameters - labels=list() # labels for supervised training - ){ - - if (class(Xs) == 'matrix') - Xs = list(Xs) - - if (any(sapply(Xs, function(X) class(X)!='matrix'))) - stop('Xs must be matrix or list of matrices') - - if (any(unique(diff(sapply(Xs, ncol))) != 0)) - stop('all observation sequences must have same number of observed features') - - if (is.null(weights)){ - weights <- rep(weights, length(Xs)) - } - - # indices - - K = floor(nstates) - N = length(Xs) - M = ncol(Xs[[1]]) # number of features - Ts = sapply(Xs, nrow) # number of obs in each sequence - - K_digits = ceiling(log(K+1, 10)) # for verbose - N_digits = ceiling(log(N+1, 10)) - - if (K != nstates | nstates < 2){ - stop('nstates must be integer >= 2') - } - - if (is.null(weights)){ - weights = rep(1, N) - } - - # try to read derivatives off attr, otherwise use names _d* - - if (is.null(attr(Xs[[1]], 'derivatives'))){ - derivatives = 0 - ind_d1 = grep('_d1$', colnames(Xs[[1]])) # 1st deriv - ind_d2 = grep('_d2$', colnames(Xs[[1]])) # 2nd deriv - if (length(ind_d1) > 0) - derivatives = 1 - if (length(ind_d2) > 0) - derivatives = 2 - } else { - derivatives = attr(Xs[[1]], 'derivatives') - } - - if (derivatives > 0){ - warning('derivative features not tested with autocorrelation model') - } - - if (is.null(attr(Xs[[1]], 'lags'))){ - lags = 0 - ind_lag = grep('_lag', colnames(Xs[[1]])) - if (length(ind_lag) > 0){ - lags = max(as.numeric(gsub('.*(_lag)?', '', colnames(Xs[[1]])[ind_lag]))) - ind_lag0 = (1:M)[-ind_lag] - } - } else { - lags = attr(Xs[[1]], 'lags') - } - - # set aside leading obs where derivatives cannot be calculated - - if (derivatives > 0){ - for (i in 1:N){ - Xs[[i]] = Xs[[i]][-(1:derivatives),] - } - } - - Ts = Ts - derivatives - - # defaults - - if (!'lambda' %in% ls(control)){ - 
control$lambda = 0 - } else { - if (control$lambda < 0) - stop('regularization parameter lambda must be nonnegative') - } - - if (!'tol' %in% ls(control)) - control$tol = 1e-6 - - if (!'maxiter' %in% ls(control)) - control$maxiter = 100 - - if (!'uncollapse' %in% ls(control)) - control$uncollapse = .01^M - - if (!'verbose' %in% ls(control)) - control$verbose = 1 - - # process labels for supervised hmm - - supervised = FALSE - - if (length(labels) > 0){ - supervised = TRUE - - if (N==1 & is.atomic(labels)) - labels = list(labels) - - if (all(sapply(labels, is.numeric))){ - - if (length(labels) != N) - stop('length(labels) != length(Xs)') - - if (all(sapply(labels, is.matrix))){ - - if (any(sapply(labels, ncol) != K)) - stop('ncol of labels[[i]] must equal nstates for all i') - if (any(sapply(labels, nrow) != Ts)) - stop('nrow(labels[[i]]) must equal nrow(Xs[[i]]) for all i') - if (any(unlist(sapply(labels, rowSums)) - 1 > 4 * .Machine$double.eps)) - stop('labels for each observation must sum to 1') - - par$resp = labels - - } else if (all(sapply(labels, is.vector))){ - - if (any(sapply(labels, length) != Ts)) - stop('length of labels[[i]] must equal nrow(Xs[[i]]) for all i') - if (any(!unique(unlist(labels)) %in% 1:K)) - stop('all integer labels must be in 1:nstates') - - par$resp = lapply(labels, function(lab){ - out = matrix(0, length(lab), nstates) - out[cbind(1:length(lab), lab)] = 1 - return(out) - }) - - } else { - stop('labels for supervised training must be either (a) list of integer vectors with ', - 'length(labels)==length(Xs), length(labels[[i]])==nrow(Xs[[i]]), and labels[[i]][t] %in% 1:nstates; ', - 'or (b) list of matrices with length(labels)==length(Xs), nrow(labels[[i]])==nstates, ', - 'ncol(labels[[i]])==nrow(Xs[[i]]), and all(colSums(labels[[i]]) == 1)') - } - } else { - stop('labels for supervised training must be list of numeric vectors or matrices') - } - - if (derivatives > 0){ - for (i in 1:N){ - par$resp = par$resp[-(1:derivatives),] - } - } 
- - } - - ### standardize covariates - working in-place to conserve ry - - if (!'standardize' %in% ls(control)) - control$standardize = TRUE - - if (control$standardize){ - - Xs = standardizeFeatures(Xs, verbose=control$verbose) - - feature_means = attr(Xs[[1]], 'scaled:center') - feature_sds = attr(Xs[[1]], 'scaled:scale') - - nonmissing = attr(Xs, 'nonmissing') - missingness_labels = attr(Xs, 'missingness_labels') - nonmissing_features = attr(Xs, 'nonmissing_features') - - } else { - - if (is.null(attr(Xs[[1]], 'scaled:center'))){ - feature_means = rep(0, M) - } else { - feature_means = attr(Xs[[1]], 'scaled:center') - } - - if (is.null(attr(Xs[[1]], 'scaled:scale'))){ - feature_sds = rep(1, M) - } else { - feature_sds = attr(Xs[[1]], 'scaled:scale') - } - - if (is.null(attr(Xs, 'nonmissing'))){ - nonmissing = lapply(Ts, function(x) 1:x) # assume nothing is missing - } else { - nonmissing = attr(Xs, 'nonmissing') - } - - if (is.null(attr(Xs, 'missingness_labels'))){ - missingness_labels = lapply(Ts, function(x) rep(1,x)) # all obs have same pattern of missingness (none) - } else { - missingness_labels = attr(Xs, 'missingness_labels') - } - - if (is.null(attr(Xs, 'nonmissing_features'))){ - nonmissing_features = list(1:M) # nothing is missing - } else { - nonmissing_features = attr(Xs, 'nonmissing_features') - } - - } - - ### initialize state distributions by either - # (1) providing (near-uniform) obs-level responsibilities (recommended for small datasets < 1e6 obs), or - # (2) providing (dispersed and high-variance) starting state distributions (faster to initialize) - - if ('resp' %in% ls(par)){ - if (!is.null(par$method)) - warning('responsibilities supplied; ignoring "par$method"') - par$method = 'user-responsibilities' - } else if (all(c('mus', 'Sigmas') %in% ls(par))){ - par$method = 'user-states' - } else if (!is.null(par$method)) { - if (par$method != 'random-spherical') - warning('"par$method" unknown or improper inputs supplied; ignoring') - } else { - 
par$method = 'random-dirichlet' - } - - if (par$method == 'random-spherical'){ - Ts_running = cumsum(Ts) - seq.draws = table(sample.int(N, K, replace=TRUE, prob=sapply(nonmissing, length))) - par$mus = do.call(cbind, lapply(names(seq.draws), function(seq){ - mus.ind = sample(nonmissing[[as.numeric(seq)]], seq.draws[seq]) - t(Xs[[as.numeric(seq)]][mus.ind, , drop=FALSE]) - })) - - if (is.null(par$method_arg)) - par$method_arg = 1 # entries on diagonal of initial state covariance matrix - - par$Sigmas = replicate(K, par$method_arg*diag(M), simplify=FALSE) - - } - - # if no initialization whatsoever, assign responsibilities - if (par$method == 'random-dirichlet'){ - if (is.null(par$method_arg)) - par$method_arg = 1 # dirichlet concentration for initial responsibilities - - par$resp = list() - for (i in 1:N){ - par$resp[[i]] = rdirichlet(Ts[[i]], rep(par$method_arg, K)) - } - - } - - # if responsibilities provided - if (par$method %in% c('user-responsibilities', 'random-dirichlet')){ - - if (any(sapply(par$resp, ncol) != K)){ - stop('all responsibility matrices must have nstates columns') - } - - if (!all(sapply(par$resp, nrow) == Ts)){ - stop('each responsibility matrix must have same nrow as corresponding element of Xs') - } - - if (any(unlist(sapply(par$resp, rowSums)) - 1 > 4 * .Machine$double.eps)){ - stop('rows of all responsibility matrices must sum to 1') - } - - if (any(c('mus', 'Sigmas') %in% ls(par))){ - warning('state responsibilities provided; state distribution parameters (mus/Sigmas) ignored') - } - - state_sizes = rowSums(sapply(1:N, function(i) colSums(par$resp[[i]][nonmissing[[i]],]))) - - # for each state, get mean of obs (weighted by responsibility) - par$mus = matrix(0, K, M) - for (i in 1:N){ - par$mus = par$mus + crossprod(par$resp[[i]][nonmissing[[i]],], Xs[[i]][nonmissing[[i]],]) - } - par$mus = t(par$mus) - par$mus = sweep(par$mus, 2, state_sizes, '/') - - # for each state, get weighted cov matrix (weighted by responsibility) - par$Sigmas = 
replicate(K, matrix(0, M, M), simplify=F) - if (control$verbose == 1) - cat('\n initializing cluster ', rep(' ', K_digits + N_digits + 9), sep='') - for (k in 1:K){ - if (control$verbose == 1){ - cat(rep('\b', K_digits + N_digits + 9), sprintf(paste('%', K_digits, 'd', sep=''), k), sep='') - cat(' obs seq ', rep(' ', N_digits), sep='') - } - for (i in 1:N){ - if (control$verbose == 1) - cat(rep('\b', N_digits), sprintf(paste('%', N_digits, 'd', sep=''), i), sep='') - par$Sigmas[[k]] = par$Sigmas[[k]] + - t(Xs[[i]][nonmissing[[i]],]) %*% diag(par$resp[[i]][nonmissing[[i]], k]) %*% Xs[[i]][nonmissing[[i]],] - } - par$Sigmas[[k]] = par$Sigmas[[k]] / state_sizes[k] - } - - # if starting state distributions provided, shift and scale to account for standardization - } - if (control$verbose == 1){ - cat('\n') - } - - - if (par$method == 'user-states') { - - if (all(c('mus', 'Sigmas') %in% ls(par)) & is.null(par$method)){ - - par$mus = lapply(par$mus, function(mu){ - mu = mu - feature_means - mu = mu / feature_sds - }) - par$mus = do.call(cbind, par$mus) - - for (k in 1:K){ # standardize and check Sigmas are PD - par$Sigmas[[k]] = sweep(par$Sigmas[[k]], 1, feature_sds, '/') - par$Sigmas[[k]] = sweep(par$Sigmas[[k]], 2, feature_sds, '/') - chol(par$Sigmas[[k]] + control$lambda*diag(M)) - } - - } - } - - # initialize transition matrix - if ('Gamma' %in% ls(par)){ - - if (class(par$Gamma) != 'matrix' | !all.equal(dim(par$Gamma), c(K, K))) - stop('Gamma must be nstates x nstates numeric matrix') - - if (any(rowSums(par$Gamma) - 1 > 4 * .Machine$double.eps)){ - stop('rows of Gamma must sum to 1') - } - - } else { - - par$Gamma = rdirichlet(K, rep(10,K)) - - } - - # initialize starting distribution - if ('delta' %in% ls(par)){ - - if (class(par$delta) != 'numeric' | length(par$delta) != K) - stop('delta must be numeric vector of length nstates') - - if (sum(delta) > 4 * .Machine$double.eps) - stop('delta must sum to 1') - - par$delta = t(par$delta) - - } else { - par$delta = 
solve(t(diag(K) - par$Gamma + 1), rep(1, K)) - } - - # c++ indexing - nonmissing = lapply(nonmissing, function(x) x - 1) - missingness_labels = lapply(missingness_labels, function(x) x - 1) - nonmissing_features = lapply(nonmissing_features, function(x) x - 1) - - out = hmm_autocorr_cpp( - Xs, - weights, - par$delta, - par$mus, - par$Sigmas, - par$Gamma, - if (supervised) lapply(par$resp, t), - labels_t=ind_lag0-1, - labels_y=ind_lag-1, - control$lambda, - control$tol, - control$maxiter, - control$uncollapse, - control$verbose==1, - supervised - ) - - out$weights = weights - - # rescale mus, Sigmas from output - for (k in 1:K){ - out$mus[[k]] = out$mus[[k]] * feature_sds - out$mus[[k]] = out$mus[[k]] + feature_means - out$Sigmas[[k]] = sweep(out$Sigmas[[k]], 1, feature_sds, '*') - out$Sigmas[[k]] = sweep(out$Sigmas[[k]], 2, feature_sds, '*') - } - - # stationary distribution - out$delta = solve(t(diag(K) - out$Gamma + 1), rep(1, K)) - - out$nstates = K - - # append feature means/sds to output so new data can be scaled accordingly - out$scaling = list(feature_means = feature_means, - feature_sds = feature_sds) - - # append feature names - out$dimnames = colnames(Xs[[1]]) - - # append initialization and control parameters - out$par = par - out$control = control - - # llh of individual obs seq - out$llhs = as.numeric(out$llhs) - - # trim leading -Inf (used internally so llh increase always defined) - out$llh_seq = out$llh_seq[-1] - iters = length(out$llh_seq) - converged = TRUE - if (iters >= control$maxiter){ - if (diff(out$llh_seq[-1:0 + length(out$llh_seq)]) >= control$tol){ - converged = FALSE - warning('failed to converge by maxiter = ', control$maxiter, ' iterations (tol = ', control$tol, ')') - } - } - - # add NAs back in for leading obs for which derivatives cannot be calculated - if (derivatives > 0){ - for (i in 1:N){ - out$lstateprobs[[i]] = rbind(matrix(NA, derivatives, K), - out$lstateprobs[[i]]) - out$zetas[[i]] = cbind(matrix(NA, K, derivatives), - 
out$zetas[[i]]) - } - } - - out$convergence = list(converged=converged, - iters=iters, - resets=out$resets) - out$resets = NULL # move 'resets' from cpp output to convergence diagnostics - - class(out) = 'feelr.hmm' - - return(out) - -} From 886e8e44583d85f3963171d2c2d378e0aba01ceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 14:01:53 +0800 Subject: [PATCH 05/14] Update llh.R --- R/llh.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/llh.R b/R/llh.R index f203204..6bc4c13 100644 --- a/R/llh.R +++ b/R/llh.R @@ -8,7 +8,8 @@ #' sequence and is of dimension nobs x nfeatures. For a single observation #' sequence, a single matrix can be provided #' @param mod Model object of class 'feelr.hmm', as output by \code{hmm} -#' +#' @param control list +#' #' @return List with two components. \code{llhs} is a numeric vector of #' log-likelihoods of each observation sequence in \code{Xs}. \code{llh_total} #' is the log-likelihood of all observation sequences together, i.e. @@ -18,8 +19,7 @@ #' each sequence, whereas here it is assumed that the starting state is drawn #' from the stationary distribution \code{mod$delta}. 
#' @export -#' -#' @examples + llh = function(Xs, # data mod, # fitted feelr.hmm model control=list() From e56c47bff364ad328f845b60d75cca07d541ef21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 14:04:20 +0800 Subject: [PATCH 06/14] compatibility update --- NAMESPACE | 1 + R/hmm.R | 2 +- R/llh.R | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d9bba6b..01ca405 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(getEdges) export(hmm) export(intensity) export(labelUtterances) +export(llh) export(loudness) export(lpc) export(magPhase) diff --git a/R/hmm.R b/R/hmm.R index 3a57fe2..eaf179a 100644 --- a/R/hmm.R +++ b/R/hmm.R @@ -121,7 +121,7 @@ hmm = function(Xs, # data Xs = list(Xs) } - if (any(sapply(Xs, function(X) class(X) != c("matrix", "array")))){ + if (any(sapply(Xs, function(X) class(X) != class(matrix())))){ stop('Xs must be matrix or list of matrices') } diff --git a/R/llh.R b/R/llh.R index 6bc4c13..59dea59 100644 --- a/R/llh.R +++ b/R/llh.R @@ -31,7 +31,7 @@ llh = function(Xs, # data if (class(Xs) == 'matrix') Xs = list(Xs) - if (any(sapply(Xs, function(X) class(X)!=c('matrix', 'array')))) + if (any(sapply(Xs, function(X) class(X) != class(matrix())))) stop('Xs must be matrix or list of matrices') if (any(sapply(Xs, ncol) != length(mod$mus[[1]]))) From 904d4c81329c1ac4c04687b419ad311d917eab39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Thu, 18 Aug 2022 14:05:34 +0800 Subject: [PATCH 07/14] Update standardizeFeatures.R --- R/standardizeFeatures.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/standardizeFeatures.R b/R/standardizeFeatures.R index 176c1c6..352d2e6 100644 --- a/R/standardizeFeatures.R +++ b/R/standardizeFeatures.R @@ -23,9 +23,7 @@ #' audio$data <- standardizeFeatures( #' lapply(audio$data, function(x) na.omit(x)) #' ) -#' -#' @export -#' + 
 standardizeFeatures = function(Xs, feature_means=NULL, feature_sds=NULL, verbose=1){
 
   if (class(Xs) == 'matrix')
From c825a6586faf48197ce84ebf56fd7ea0c6d660c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?=
Date: Fri, 19 Aug 2022 11:19:04 +0800
Subject: [PATCH 08/14] Use is.matrix() for cross-version compatibility

---
 R/hmm.R | 2 +-
 R/llh.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/hmm.R b/R/hmm.R
index eaf179a..ec6ec85 100644
--- a/R/hmm.R
+++ b/R/hmm.R
@@ -121,7 +121,7 @@ hmm = function(Xs, # data
     Xs = list(Xs)
   }
 
-  if (any(sapply(Xs, function(X) class(X) != class(matrix())))){
+  if (any(sapply(Xs, function(X) !is.matrix(X)))){
     stop('Xs must be matrix or list of matrices')
   }
 
diff --git a/R/llh.R b/R/llh.R
index 59dea59..d6f2fb7 100644
--- a/R/llh.R
+++ b/R/llh.R
@@ -31,7 +31,7 @@ llh = function(Xs, # data
   if (class(Xs) == 'matrix')
     Xs = list(Xs)
 
-  if (any(sapply(Xs, function(X) class(X) != class(matrix()))))
+  if (any(sapply(Xs, function(X) !is.matrix(X))))
     stop('Xs must be matrix or list of matrices')
 
   if (any(sapply(Xs, ncol) != length(mod$mus[[1]])))
From 121db1ad8b440ff709cce47a5b028c9917f29555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?=
Date: Fri, 19 Aug 2022 11:20:50 +0800
Subject: [PATCH 09/14] export

---
 R/hmm.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/hmm.R b/R/hmm.R
index ec6ec85..27f5024 100644
--- a/R/hmm.R
+++ b/R/hmm.R
@@ -108,7 +108,7 @@
 #' Defaults to 1.
#' } #' } -#' +#' @export hmm = function(Xs, # data weights=NULL, # weight on each element of Xs nstates, # number of states From f98051d990e91499a687e5f150743988b671ae90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Fri, 19 Aug 2022 11:32:42 +0800 Subject: [PATCH 10/14] Update hmm.R --- R/hmm.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/hmm.R b/R/hmm.R index 27f5024..2f3f56f 100644 --- a/R/hmm.R +++ b/R/hmm.R @@ -129,9 +129,9 @@ hmm = function(Xs, # data stop('all observation sequences must have same number of observed features') } - if (is.null(weights)){ - weights <- rep(weights, length(Xs)) - } + ## if (is.null(weights)){ + ## weights <- rep(weights, length(Xs)) + ## } # indices From f6534679e282ba73d16a537dfc250d716d708e68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Fri, 19 Aug 2022 13:55:36 +0800 Subject: [PATCH 11/14] move raw_data and timestamps to root allows extraction of raw_data and timestamps without affecting the functionality of hmm / llh --- R/extract.features.R | 57 +++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/R/extract.features.R b/R/extract.features.R index bf3abad..a138942 100644 --- a/R/extract.features.R +++ b/R/extract.features.R @@ -2,6 +2,7 @@ #' @description Extract features from a file and return audio class with these data #' @importFrom magrittr %>% #' @importFrom purrr map +#' @importFrom purrr transpose #' @param files \code{character}. Vector of path and file name. For example, c("folder/1.wav", "folder/2.wav") #' @param config \code{audio_config}. An object of class 'audio_config' with parameters for extraction. #' @param use.full.names \code{boolean}. 
Whether to use full file path as names of returned list elements @@ -14,17 +15,17 @@ #' @export extractFeatures <- function(files, config = loudness(createConfig()), - use.full.names=T, - use.exts=T, + use.full.names = T, + use.exts = T, raw.data = F, timestamp = F, standardize = F ) { features <- purrr::map(files, function(x) { - result <- extractFeature(x, config) + result <- extractFeature(x, config, raw.data, timestamp) attr(result, "filename") <- x result - }) + }) %>% purrr::transpose() out = NULL @@ -32,15 +33,12 @@ extractFeatures <- function(files, config = loudness(createConfig()), if (!use.full.names) list_names <- basename(list_names) if (!use.exts) list_names <- tools::file_path_sans_ext(list_names) - out$data <- purrr::set_names(features, list_names) - + out <- purrr::map(features, purrr::set_names, list_names) + if (standardize) { - if (raw.data | timestamp) - stop("standardizeFeatures can only be used when both raw.data and timestamp are set to false.") - else - out$data <- out$data %>% standardizeFeatures + out$data <- out$data %>% communication:::standardizeFeatures() } - + out$files$fname <- list_names out$files$duration <- sapply(out$data, nrow) @@ -52,18 +50,27 @@ extractFeature <- function(filename, config = config, raw.data = F, timestamp = F) { - audio_nfeatures <- create_audio_object(filename, config, raw.data, timestamp) + out = NULL + audio_nfeatures <- create_audio_object(filename, config) audio <- audio_nfeatures$audio %>% as.matrix - n_features <- audio_nfeatures$n_features + timestamps <- audio_nfeatures$timestamps + + out$data <- audio if(raw.data){ - raw_data <- add_raw_data(filename, audio$timestamps) + raw_data <- add_raw_data(filename, timestamps) # For some reasons, length of audio$timestamps is not the same as raw_data (returning # more data - I have to check it) - audio$raw_data <- raw_data$cut_raw_data[1:length(audio$timestamps)] - attr(audio, "header") <- raw_data$header + raw_data_out <- 
raw_data$cut_raw_data[1:length(timestamps)] + attr(raw_data_out, "header") <- raw_data$header + + out$raw_data <- raw_data_out + } + + if(timestamp){ + out$timestamps <- timestamps } # If the no. of columns supplied by the config object is not the same as the no. of features @@ -80,8 +87,9 @@ extractFeature <- function(filename, config = config, column_names <- paste0(outputs[1], '_', seq(n_features)) } } - colnames(audio) <- c(column_names) - audio + colnames(out$data) <- c(column_names) + + return(out) } @@ -144,24 +152,19 @@ tail.speech <- function(x, ...) { tail(print(x)) } -create_audio_object <- function(filename, config, - raw.data = F, - timestamp = F) { +create_audio_object <- function(filename, config) { config_string <- communication:::generate_config_string(config) extracted_data <- communication:::rcpp_openSmileGetFeatures(filename, config_string_in = config_string) audio <- as.data.frame(extracted_data$audio_features_0) n_features <- ncol(audio) - if(timestamp) { - timestamps <- as.vector(extracted_data$audio_timestamps_0) - audio$timestamps <- timestamps - } + timestamps <- as.vector(extracted_data$audio_timestamps_0) - list(audio=audio, n_features=n_features) + list(audio=audio, timestamps = timestamps, n_features=n_features) } add_raw_data <- function(filename, timestamps) { - raw_data <- rcpp_parseAudioFile(filename) + raw_data <- communication:::rcpp_parseAudioFile(filename) # Timemust must be cut in time intervals of same size # We need to create a test for that timestamp_interval <- timestamps[2] From 486e5e476d1caad6b71e29a038b147f543b9e19e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Fri, 19 Aug 2022 18:31:50 +0800 Subject: [PATCH 12/14] typo --- R/hmm.R | 2 +- R/llh.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/hmm.R b/R/hmm.R index 2f3f56f..707d57d 100644 --- a/R/hmm.R +++ b/R/hmm.R @@ -121,7 +121,7 @@ hmm = function(Xs, # data Xs = list(Xs) } - if 
(any(sapply(Xs, function(X) is.matrix(X)))){ + if (any(sapply(Xs, function(X) !is.matrix(X)))){ stop('Xs must be matrix or list of matrices') } diff --git a/R/llh.R b/R/llh.R index d6f2fb7..40539d6 100644 --- a/R/llh.R +++ b/R/llh.R @@ -31,7 +31,7 @@ llh = function(Xs, # data if (class(Xs) == 'matrix') Xs = list(Xs) - if (any(sapply(Xs, function(X) is.matrix(X)))) + if (any(sapply(Xs, function(X) !is.matrix(X)))) stop('Xs must be matrix or list of matrices') if (any(sapply(Xs, ncol) != length(mod$mus[[1]]))) From c21fa55f86269e05abb52e0d2934ee6931dc53ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Sat, 20 Aug 2022 13:37:52 +0800 Subject: [PATCH 13/14] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fce20f0..87def04 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ src/symbols.rds .Rproj.user .RData *smile.log -src/conf/config.h \ No newline at end of file +src/conf/config.h +.vscode/* From 970f95c54772841db5c1844f1772b92234b5098c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Sat, 20 Aug 2022 17:04:36 +0800 Subject: [PATCH 14/14] documentation --- R/extract.features.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/extract.features.R b/R/extract.features.R index a138942..8b35650 100644 --- a/R/extract.features.R +++ b/R/extract.features.R @@ -8,7 +8,7 @@ #' @param use.full.names \code{boolean}. Whether to use full file path as names of returned list elements #' @param use.exts \code{boolean}. Whether to use file extensions in names of returned list elements. #' @param raw.data \code{boolean}. Whether to include raw data in the output list. -#' @param timestamp \code{boolean}. Whether to include raw data in the output list. +#' @param timestamp \code{boolean}. Whether to include timestamps in the output list. #' @param standardize \code{boolean}. 
Whether to standardize the extracted features. Only works when both raw.data and timestamp are set to false. #' Only applicable if use.full.names is FALSE #' @return audio class @@ -174,5 +174,3 @@ add_raw_data <- function(filename, timestamps) { ceiling(seq_along(raw_data[[2]])/(raw_data[[1]]$sampleRate*timestamp_interval))) list(cut_raw_data = cut_raw_data, header = raw_data[[1]]) } - -