diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..77eeb97 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,28 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version, and other tools you might need +build: + os: ubuntu-24.04 + tools: + python: "3.13" + apt_packages: + - doxygen + jobs: + pre_build: + - doxygen Doxyfile + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally, but recommended, +# declare the Python requirements required to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index f9e40f0..5381d99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.10) # Make sure this matches ./NAM/version.h! 
-project(NAM VERSION 0.3.0) +project(NAM VERSION 0.4.0) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") diff --git a/Doxyfile b/Doxyfile new file mode 100644 index 0000000..14c0366 --- /dev/null +++ b/Doxyfile @@ -0,0 +1,245 @@ +# Doxyfile configuration for NeuralAmpModelerCore + +PROJECT_NAME = "NeuralAmpModelerCore" +PROJECT_NUMBER = "0.4.0" +PROJECT_BRIEF = "Core C++ DSP library for NAM plugins" +PROJECT_LOGO = +OUTPUT_DIRECTORY = docs/doxygen +CREATE_SUBDIRS = NO +ALLOW_UNICODE_NAMES = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = YES +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 4 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = NO +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +TOC_INCLUDE_HEADINGS = 0 +AUTOLINK_SUPPORT = YES +BUILTIN_STL_SUPPORT = YES +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +GROUP_NESTED_COMPOUNDS = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +LOOKUP_CACHE_SIZE = 0 + +# Build configuration +EXTRACT_ALL = YES +EXTRACT_PRIVATE = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = YES +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +RESOLVE_UNNAMED_PARAMS = YES +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +HIDE_COMPOUND_REFERENCE= NO + +# Input configuration +INPUT = NAM/ +INPUT_ENCODING = UTF-8 
+FILE_PATTERNS = *.h \ + *.hpp +RECURSIVE = NO +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +USE_MDFILE_AS_MAINPAGE = + +# Source browsing +SOURCE_BROWSER = NO +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +SOURCE_TOOLTIPS = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +CLANG_ASSISTED_PARSING = NO +CLANG_ADD_INC_PATHS = YES +CLANG_OPTIONS = +CLANG_DATABASE_PATH = + +# Alphabetical index +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = + +# HTML output +GENERATE_HTML = YES +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +HTML_EXTRA_STYLESHEET = +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = NO +HTML_DYNAMIC_MENUS = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_FEEDURL = +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FIL = +QHP_SECT_FILTER_ATTRIBUTES= +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = NO +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +USE_MATHJAX = NO +MATHJAX_FORMAT = HTML-CSS +MATHJAX_RELPATH = +MATHJAX_EXTENSIONS = +MATHJAX_CODEFILE = +SEARCHENGINE = YES +SERVER_BASED_SEARCH = NO 
+EXTERNAL_SEARCH = NO +SEARCHENGINE_URL = +SEARCHDATA_FILE = searchdata.xml +EXTERNAL_SEARCH_ID = +EXTRA_SEARCH_MAPPINGS = + +# LaTeX output +GENERATE_LATEX = NO + +# RTF output +GENERATE_RTF = NO + +# Man page output +GENERATE_MAN = NO + +# XML output (for Breathe) +GENERATE_XML = YES +XML_OUTPUT = xml +XML_PROGRAMLISTING = YES + +# DOCBOOK output +GENERATE_DOCBOOK = NO + +# Configuration options related to the preprocessor +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = NAM_SAMPLE=float +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES + +# Configuration options related to external references +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +EXTERNAL_PAGES = YES +PERL_PATH = /usr/bin/perl + +# Configuration options related to the dot tool +CLASS_DIAGRAMS = YES +MSCGEN_PATH = +DIA_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = Helvetica +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +CALL_GRAPH = NO +CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DIAFILE_DIRS = +PLANTUML_JAR_PATH = +PLANTUML_CFG_FILE = +PLANTUML_INCLUDE_PATH = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/NAM/conv1d.h b/NAM/conv1d.h index 48967b6..8182966 100644 --- a/NAM/conv1d.h +++ b/NAM/conv1d.h @@ -6,48 +6,115 @@ namespace nam { +/// \brief 1D dilated convolution layer +/// +/// Implements a 1D convolution with support for dilation and grouped convolution. +/// Uses a ring buffer to maintain input history for efficient processing of +/// sequential audio frames. 
class Conv1D { public: + /// \brief Default constructor + /// + /// Initializes with dilation=1 and groups=1. Use set_size_() to configure. Conv1D() { this->_dilation = 1; this->_num_groups = 1; }; + + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param kernel_size Size of the convolution kernel + /// \param bias Whether to use bias (1 for true, 0 for false) + /// \param dilation Dilation factor for the convolution + /// \param groups Number of groups for grouped convolution (default: 1) Conv1D(const int in_channels, const int out_channels, const int kernel_size, const int bias, const int dilation, const int groups = 1) { set_size_(in_channels, out_channels, kernel_size, bias, dilation, groups); }; + + /// \brief Set the parameters (weights) of this module + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. void set_weights_(std::vector::iterator& weights); + + /// \brief Set the size parameters of the convolution + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param kernel_size Size of the convolution kernel + /// \param do_bias Whether to use bias + /// \param _dilation Dilation factor for the convolution + /// \param groups Number of groups for grouped convolution void set_size_(const int in_channels, const int out_channels, const int kernel_size, const bool do_bias, const int _dilation, const int groups = 1); + + /// \brief Set size and weights in one call + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param kernel_size Size of the convolution kernel + /// \param _dilation Dilation factor for the convolution + /// \param do_bias Whether to use bias + /// \param groups Number of groups for grouped convolution + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
void set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size, const int _dilation, const bool do_bias, const int groups, std::vector::iterator& weights); - // Reset the ring buffer and pre-allocate output buffer - // :param sampleRate: Unused, for interface consistency - // :param maxBufferSize: Maximum buffer size for output buffer and to size ring buffer + + /// \brief Reset the ring buffer and pre-allocate output buffer + /// \param maxBufferSize Maximum buffer size for output buffer and to size ring buffer void SetMaxBufferSize(const int maxBufferSize); - // Get the entire internal output buffer. This is intended for internal wiring - // between layers; callers should treat the buffer as pre-allocated storage - // and only consider the first `num_frames` columns valid for a given - // processing call. Slice with .leftCols(num_frames) as needed. + + /// \brief Get the entire internal output buffer + /// + /// This is intended for internal wiring between layers; callers should treat + /// the buffer as pre-allocated storage and only consider the first num_frames columns + /// valid for a given processing call. Slice with .leftCols(num_frames) as needed. 
+ /// \return Reference to the output buffer Eigen::MatrixXf& GetOutput() { return _output; } + + /// \brief Get the entire internal output buffer (const version) + /// \return Const reference to the output buffer const Eigen::MatrixXf& GetOutput() const { return _output; } - // Process input and write to internal output buffer - // :param input: Input matrix (channels x num_frames) - // :param num_frames: Number of frames to process + + /// \brief Process input and write to internal output buffer + /// \param input Input matrix (channels x num_frames) + /// \param num_frames Number of frames to process void Process(const Eigen::MatrixXf& input, const int num_frames); - // Process from input to output (legacy method, kept for compatibility) - // Rightmost indices of input go from i_start for ncols, - // Indices on output for from j_start (to j_start + ncols - i_start) + + /// \brief Process from input to output (legacy method, kept for compatibility) + /// + /// Rightmost indices of input go from i_start for ncols, + /// Indices on output go from j_start (to j_start + ncols - i_start). + /// \param input Input matrix + /// \param output Output matrix + /// \param i_start Starting index in input + /// \param ncols Number of columns to process + /// \param j_start Starting index in output void process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols, const long j_start) const; + /// \brief Get the number of input channels + /// \return Number of input channels long get_in_channels() const { return this->_weight.size() > 0 ? 
this->_weight[0].cols() : 0; }; + + /// \brief Get the kernel size + /// \return Kernel size long get_kernel_size() const { return this->_weight.size(); }; + + /// \brief Get the total number of weights + /// \return Total number of weight parameters long get_num_weights() const; + + /// \brief Get the number of output channels + /// \return Number of output channels long get_out_channels() const { return this->_weight.size() > 0 ? this->_weight[0].rows() : 0; }; + + /// \brief Get the dilation factor + /// \return Dilation factor int get_dilation() const { return this->_dilation; }; + + /// \brief Check if bias is used + /// \return true if bias is present, false otherwise bool has_bias() const { return this->_bias.size() > 0; }; protected: diff --git a/NAM/convnet.h b/NAM/convnet.h index 1765311..0d963df 100644 --- a/NAM/convnet.h +++ b/NAM/convnet.h @@ -18,17 +18,26 @@ namespace nam { namespace convnet { -// Custom Conv that avoids re-computing on pieces of the input and trusts -// that the corresponding outputs are where they need to be. -// Beware: this is clever! - -// Batch normalization -// In prod mode, so really just an elementwise affine layer. +/// \brief Batch normalization layer +/// +/// In production mode, so really just an elementwise affine layer. +/// Applies: y = (x - mean) / sqrt(variance + eps) * weight + bias +/// which simplifies to: y = scale * x + loc class BatchNorm { public: + /// \brief Default constructor BatchNorm() {}; + + /// \brief Constructor with weights + /// \param dim Dimension of the input + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
BatchNorm(const int dim, std::vector::iterator& weights); + + /// \brief Process input in-place + /// \param input Input matrix to process + /// \param i_start Start index + /// \param i_end End index void process_(Eigen::MatrixXf& input, const long i_start, const long i_end) const; private: @@ -41,22 +50,53 @@ class BatchNorm Eigen::VectorXf loc; }; +/// \brief A single block in a ConvNet +/// +/// Consists of a dilated convolution, optional batch normalization, and activation. class ConvNetBlock { public: + /// \brief Default constructor ConvNetBlock() {}; + + /// \brief Set the parameters (weights) of this block + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param _dilation Dilation factor for the convolution + /// \param batchnorm Whether to use batch normalization + /// \param activation_config Activation function configuration + /// \param groups Number of groups for grouped convolution + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
void set_weights_(const int in_channels, const int out_channels, const int _dilation, const bool batchnorm, const activations::ActivationConfig& activation_config, const int groups, std::vector::iterator& weights); + + /// \brief Resize buffers to handle maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize); - // Process input matrix directly (new API, similar to WaveNet) + + /// \brief Process input matrix directly (new API, similar to WaveNet) + /// \param input Input matrix (channels x num_frames) + /// \param num_frames Number of frames to process void Process(const Eigen::MatrixXf& input, const int num_frames); - // Legacy method for compatibility (uses indices) + + /// \brief Process input (legacy method for compatibility, uses indices) + /// \param input Input matrix + /// \param output Output matrix + /// \param i_start Start index in input + /// \param i_end End index in input void process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long i_end); - // Get output from last Process() call + + /// \brief Get output from last Process() call + /// \param num_frames Number of frames to return + /// \return Block reference to the output Eigen::Block GetOutput(const int num_frames); + + /// \brief Get the number of output channels + /// \return Number of output channels long get_out_channels() const; - Conv1D conv; + + Conv1D conv; ///< The dilated convolution layer private: BatchNorm batchnorm; @@ -77,15 +117,38 @@ class _Head Eigen::VectorXf _bias; // (out_channels,) }; +/// \brief Convolutional neural network model +/// +/// A ConvNet consists of multiple ConvNetBlocks with increasing dilation factors, +/// followed by a head layer that produces the final output. 
class ConvNet : public Buffer { public: + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param channels Number of channels in the hidden layers + /// \param dilations Vector of dilation factors, one per block + /// \param batchnorm Whether to use batch normalization + /// \param activation_config Activation function configuration + /// \param weights Model weights vector + /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown) + /// \param groups Number of groups for grouped convolution ConvNet(const int in_channels, const int out_channels, const int channels, const std::vector& dilations, const bool batchnorm, const activations::ActivationConfig& activation_config, std::vector& weights, const double expected_sample_rate = -1.0, const int groups = 1); + + /// \brief Destructor ~ConvNet() = default; + /// \brief Process audio frames + /// \param input Input audio buffers + /// \param output Output audio buffers + /// \param num_frames Number of frames to process void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override; + + /// \brief Resize all buffers to handle maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize) override; protected: @@ -102,7 +165,11 @@ class ConvNet : public Buffer int PrewarmSamples() override { return mPrewarmSamples; }; }; -// Factory +/// \brief Factory function to instantiate ConvNet from JSON +/// \param config JSON configuration object +/// \param weights Model weights vector +/// \param expectedSampleRate Expected sample rate in Hz (-1.0 if unknown) +/// \return Unique pointer to a DSP object (ConvNet instance) std::unique_ptr Factory(const nlohmann::json& config, std::vector& weights, const double expectedSampleRate); diff --git a/NAM/dsp.h b/NAM/dsp.h index 73319a2..8b984d2 100644 --- a/NAM/dsp.h +++ 
b/NAM/dsp.h @@ -17,96 +17,141 @@ #else #define NAM_SAMPLE double #endif -// Use a sample rate of -1 if we don't know what the model expects to be run at. -// TODO clean this up and track a bool for whether it knows. +/// \brief Use a sample rate of -1 if we don't know what the model expects to be run at #define NAM_UNKNOWN_EXPECTED_SAMPLE_RATE -1.0 namespace nam { namespace wavenet { -// Forward declaration to allow WaveNet to access protected members of DSP -// Not sure I like this. +/// Forward declaration to allow WaveNet to access protected members of DSP class WaveNet; } // namespace wavenet -enum EArchitectures -{ - kLinear = 0, - kConvNet, - kLSTM, - kCatLSTM, - kWaveNet, - kCatWaveNet, - kNumModels -}; +/// \brief Base class for all DSP models +/// +/// DSP provides the common interface for all neural network-based audio processing models. +/// It handles: +/// - Input/output channel management +/// - Sample rate tracking +/// - Level management (input/output levels and loudness) +/// - Prewarm functionality for settling initial conditions +/// - Buffer size management +/// +/// Subclasses should override process() to implement the actual processing algorithm. class DSP { public: - // Older models won't know, but newer ones will come with a loudness from the training based on their response to a - // standardized input. - // We may choose to have the models figure out for themselves how loud they are in here in the future. 
+ /// \brief Constructor + /// + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown) DSP(const int in_channels, const int out_channels, const double expected_sample_rate); + + /// \brief Virtual destructor virtual ~DSP() = default; - // prewarm() does any required intial work required to "settle" model initial conditions - // it can be somewhat expensive, so should not be called during realtime audio processing - // Important: don't expect the model to be outputting zeroes after this. Neural networks - // Don't know that there's anything special about "zero", and forcing this gets rid of - // some possibilities that I dont' want to rule out (e.g. models that "are noisy"). + + /// \brief Prewarm the model to settle initial conditions + /// + /// This can be somewhat expensive, so should not be called during real-time audio processing. + /// Important: don't expect the model to be outputting zeroes after this. Neural networks + /// don't know that there's anything special about "zero", and forcing this gets rid of + /// some possibilities (e.g. models that "are noisy"). virtual void prewarm(); - // process() does all of the processing requried to take `input` array and - // fill in the required values on `output`. - // To do this: - // 1. The core DSP algorithm is run (This is what should probably be - // overridden in subclasses). - // 2. The output level is applied and the result stored to `output`. - // `input` and `output` are double pointers where the first pointer indexes channels - // and the second indexes frames: input[channel][frame] + + /// \brief Process audio frames + /// + /// \param input Input audio buffers. Double pointer where the first pointer indexes channels + /// and the second indexes frames: input[channel][frame] + /// \param output Output audio buffers. Same structure as input. 
+ /// \param num_frames Number of frames to process virtual void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames); - // Expected sample rate, in Hz. - // TODO throw if it doesn't know. + /// \brief Get the expected sample rate + /// \return Expected sample rate in Hz (-1.0 if unknown) double GetExpectedSampleRate() const { return mExpectedSampleRate; }; - // Number of input channels + + /// \brief Get the number of input channels + /// \return Number of input channels int NumInputChannels() const { return mInChannels; }; - // Number of output channels + + /// \brief Get the number of output channels + /// \return Number of output channels int NumOutputChannels() const { return mOutChannels; }; - // Input Level, in dBu, corresponding to 0 dBFS for a sine wave - // You should call HasInputLevel() first to be safe. - // Note: input level is assumed global over all inputs. + + /// \brief Get the input level + /// + /// Input level is in dBu RMS, corresponding to 0 dBFS peak for a 1 kHz sine wave. + /// You should call HasInputLevel() first to be safe. + /// Note: input level is assumed global over all inputs. + /// \return Input level in dBu double GetInputLevel(); - // Get how loud this model is, in dB. - // Throws a std::runtime_error if the model doesn't know how loud it is. - // Note: loudness is assumed global over all outputs. + + /// \brief Get how loud this model's output is, in dB, if a "typical" input is processed. + /// This can be used to normalize the output level of the object. + /// + /// Throws a std::runtime_error if the model doesn't know how loud it is. + /// Note: loudness is assumed global over all outputs. + /// \return Loudness in dB + /// \throws std::runtime_error If the model doesn't know its loudness double GetLoudness() const; - // Output Level, in dBu, corresponding to 0 dBFS for a sine wave - // You should call HasOutputLevel() first to be safe. - // Note: output level is assumed global over all outputs. 
+ + /// \brief Get the output level + /// + /// Output level is in dBu RMS, corresponding to 0 dBFS peak for a 1 kHz sine wave. + /// You should call HasOutputLevel() first to be safe. + /// Note: output level is assumed global over all outputs. + /// \return Output level in dBu double GetOutputLevel(); - // Does this model know its input level? - // Note: input level is assumed global over all inputs. + + /// \brief Check if this model knows its input level + /// + /// Note: input level is assumed global over all inputs. + /// \return true if input level is known, false otherwise bool HasInputLevel(); - // Get whether the model knows how loud it is. + + /// \brief Check if the model knows how loud it is + /// \return true if loudness is known, false otherwise bool HasLoudness() const { return mHasLoudness; }; - // Does this model know its output level? - // Note: output level is assumed global over all outputs. + + /// \brief Check if this model knows its output level + /// + /// Note: output level is assumed global over all outputs. + /// \return true if output level is known, false otherwise bool HasOutputLevel(); - // General function for resetting the DSP unit. - // This doesn't call prewarm(). If you want to do that, then you might want to use ResetAndPrewarm(). - // See https://github.com/sdatkinson/NeuralAmpModelerCore/issues/96 for the reasoning. + + /// \brief General function for resetting the DSP unit + /// + /// This doesn't call prewarm(). If you want to do that, then you might want to use ResetAndPrewarm(). + /// See https://github.com/sdatkinson/NeuralAmpModelerCore/issues/96 for the reasoning. 
+ /// \param sampleRate Current sample rate + /// \param maxBufferSize Maximum buffer size to process virtual void Reset(const double sampleRate, const int maxBufferSize); - // Reset(), then prewarm() + + /// \brief Reset the DSP unit, then prewarm + /// \param sampleRate Current sample rate + /// \param maxBufferSize Maximum buffer size to process void ResetAndPrewarm(const double sampleRate, const int maxBufferSize) { Reset(sampleRate, maxBufferSize); prewarm(); } + + /// \brief Set the input level + /// \param inputLevel Input level in dBu void SetInputLevel(const double inputLevel); - // Set the loudness, in dB. - // This is usually defined to be the loudness to a standardized input. The trainer has its own, but you can always - // use this to define it a different way if you like yours better. - // Note: loudness is assumed global over all outputs. + + /// \brief Set the loudness + /// + /// This is usually defined to be the loudness to a standardized input. The trainer has its own, + /// but you can always use this to define it a different way if you like yours better. + /// Note: loudness is assumed global over all outputs. + /// \param loudness Loudness in dB void SetLoudness(const double loudness); + + /// \brief Set the output level + /// \param outputLevel Output level in dBu void SetOutputLevel(const double outputLevel); protected: @@ -123,10 +168,18 @@ class DSP // The largest buffer I expect to be told to process: int mMaxBufferSize = 0; - // How many samples should be processed for me to be considered "warmed up"? + /// \brief Get how many samples should be processed for the model to be considered "warmed up" + /// + /// Override this in subclasses to specify prewarm requirements. 
+ /// \return Number of samples needed for prewarm virtual int PrewarmSamples() { return 0; }; + /// \brief Set the maximum buffer size + /// \param maxBufferSize Maximum number of frames to process in a single call virtual void SetMaxBufferSize(const int maxBufferSize); + + /// \brief Get the maximum buffer size + /// \return Maximum buffer size int GetMaxBufferSize() const { return mMaxBufferSize; }; private: @@ -142,12 +195,20 @@ class DSP Level mOutputLevel; }; -// Class where an input buffer is kept so that long-time effects can be -// captured. (e.g. conv nets or impulse responses, where we need history that's -// longer than the sample buffer that's coming in.) +/// \brief Base class for DSP models that require input buffering +/// This class is deprecated and will be removed in a future version. +/// +/// Class where an input buffer is kept so that long-time effects can be captured. +/// (e.g. conv nets or impulse responses, where we need history that's longer than +/// the sample buffer that's coming in.) class Buffer : public DSP { public: + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param receptive_field Size of the receptive field (buffer size needed) + /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown) Buffer(const int in_channels, const int out_channels, const int receptive_field, const double expected_sample_rate = -1.0); @@ -168,12 +229,26 @@ class Buffer : public DSP virtual void _rewind_buffers_(); }; -// Basic linear model (an IR!) +/// \brief Basic linear model +/// +/// Implements a simple linear convolution, (i.e. an impulse response). 
class Linear : public Buffer { public: + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param receptive_field Size of the impulse response + /// \param _bias Whether to use bias + /// \param weights Model weights (impulse response coefficients) + /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown) Linear(const int in_channels, const int out_channels, const int receptive_field, const bool _bias, const std::vector& weights, const double expected_sample_rate = -1.0); + + /// \brief Process audio frames + /// \param input Input audio buffers + /// \param output Output audio buffers + /// \param num_frames Number of frames to process void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override; protected: @@ -183,31 +258,69 @@ class Linear : public Buffer namespace linear { +/// \brief Factory function to instantiate Linear model from JSON +/// \param config JSON configuration object +/// \param weights Model weights vector +/// \param expectedSampleRate Expected sample rate in Hz (-1.0 if unknown) +/// \return Unique pointer to a DSP object (Linear instance) std::unique_ptr Factory(const nlohmann::json& config, std::vector& weights, const double expectedSampleRate); } // namespace linear // NN modules ================================================================= -// Really just a linear layer +/// \brief 1x1 convolution (really just a fully-connected linear layer operating per-sample) +/// +/// Performs a pointwise convolution, which is equivalent to a fully connected layer +/// applied independently to each time step. Supports grouped convolution for efficiency. 
class Conv1x1 { public: + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param _bias Whether to use bias + /// \param groups Number of groups for grouped convolution (default: 1) Conv1x1(const int in_channels, const int out_channels, const bool _bias, const int groups = 1); - // Get the entire internal output buffer. This is intended for internal wiring - // between layers/arrays; callers should treat the buffer as pre-allocated - // storage and only consider the first `num_frames` columns valid for a given - // processing call. Slice with .leftCols(num_frames) as needed. + + /// \brief Get the entire internal output buffer + /// + /// This is intended for internal wiring between layers/arrays; callers should treat + /// the buffer as pre-allocated storage and only consider the first num_frames columns + /// valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \return Reference to the output buffer Eigen::MatrixXf& GetOutput() { return _output; } + + /// \brief Get the entire internal output buffer (const version) + /// \return Const reference to the output buffer const Eigen::MatrixXf& GetOutput() const { return _output; } + + /// \brief Resize the output buffer to handle maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize); + + /// \brief Set the parameters (weights) of this module + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
void set_weights_(std::vector::iterator& weights); - // :param input: (N,Cin) or (Cin,) - // :return: (N,Cout) or (Cout,), respectively + + /// \brief Process input and return output matrix + /// + /// \param input Input matrix (in_channels x num_frames) or (in_channels,) + /// \return Output matrix (out_channels x num_frames) or (out_channels,), respectively Eigen::MatrixXf process(const Eigen::MatrixXf& input) const { return process(input, (int)input.cols()); }; + + /// \brief Process input and return output matrix + /// \param input Input matrix (in_channels x num_frames) + /// \param num_frames Number of frames to process + /// \return Output matrix (out_channels x num_frames) Eigen::MatrixXf process(const Eigen::MatrixXf& input, const int num_frames) const; - // Store output to pre-allocated _output; access with GetOutput() - // Uses Eigen::Ref to accept matrices and block expressions without creating temporaries (real-time safe) + + /// \brief Process input and store output to pre-allocated buffer + /// + /// Uses Eigen::Ref to accept matrices and block expressions without creating + /// temporaries (real-time safe). Access output via GetOutput(). + /// \param input Input matrix (in_channels x num_frames) + /// \param num_frames Number of frames to process void process_(const Eigen::Ref& input, const int num_frames); long get_out_channels() const { return this->_weight.rows(); }; @@ -226,41 +339,28 @@ class Conv1x1 // Utilities ================================================================== // Implemented in get_dsp.cpp -// Data for a DSP object -// :param version: Data version. Follows the conventions established in the trainer code. -// :param architecture: Defines the high-level architecture.
Supported are (as per `get-dsp()` in get_dsp.cpp): -// * "CatLSTM" -// * "CatWaveNet" -// * "ConvNet" -// * "LSTM" -// * "Linear" -// * "WaveNet" -// :param config: -// :param metadata: -// :param weights: The model weights -// :param expected_sample_rate: Most NAM models implicitly assume that data will be provided to them at some sample -// rate. This captures it for other components interfacing with the model to understand its needs. Use -1.0 for "I -// don't know". +/// \brief Data structure for a DSP object +/// +/// Contains all information needed to instantiate and configure a DSP model. struct dspData { - std::string version; - std::string architecture; - nlohmann::json config; - nlohmann::json metadata; - std::vector weights; - double expected_sample_rate; + std::string version; ///< Data version. Follows conventions established in trainer code. + std::string architecture; ///< High-level architecture. Supported: "ConvNet", "LSTM", "Linear", "WaveNet" + nlohmann::json config; ///< Model configuration JSON + nlohmann::json metadata; ///< Model metadata JSON + std::vector weights; ///< Model weights + double expected_sample_rate; ///< Expected sample rate in Hz. Most NAM models implicitly assume data at some sample + ///< rate. Use -1.0 for "I don't know". }; -// Verify that the config that we are building our model from is supported by -// this plugin version. +/// \brief Verify that the config version is supported by this plugin version +/// \param version Config version string to verify void verify_config_version(const std::string version); -// Takes the model file and uses it to instantiate an instance of DSP. -std::unique_ptr get_dsp(const std::filesystem::path model_file); -// Creates an instance of DSP. Also returns a dspData struct that holds the data of the model. -std::unique_ptr get_dsp(const std::filesystem::path model_file, dspData& returnedConfig); -// Instantiates a DSP object from dsp_config struct. 
-std::unique_ptr get_dsp(dspData& conf); -// Legacy loader for directory-type DSPs +/// \brief Legacy loader for directory-style DSPs +/// +/// Loads models from a directory structure (older format). +/// \param dirname Path to the directory containing the model +/// \return Unique pointer to a DSP object std::unique_ptr get_dsp_legacy(const std::filesystem::path dirname); }; // namespace nam diff --git a/NAM/film.h b/NAM/film.h index b5376f0..9e1ec25 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -8,46 +8,69 @@ namespace nam { -// Feature-wise Linear Modulation (FiLM) -// -// Given an `input` (input_dim x num_frames) and a `condition` -// (condition_dim x num_frames), compute: -// scale, shift = Conv1x1(condition) split across channels -// output = input * scale + shift (elementwise) +/// \brief Feature-wise Linear Modulation (FiLM) +/// +/// Given an input (input_dim x num_frames) and a condition (condition_dim x num_frames), compute: +/// scale, shift = Conv1x1(condition) split across channels (top/bottom half, respectively) +/// output = input * scale + shift (elementwise) +/// +/// FiLM applies per-channel scaling and optional shifting based on conditioning input, +/// allowing the model to adapt its behavior based on external signals. class FiLM { public: + /// \brief Constructor + /// \param condition_dim Size of the conditioning input + /// \param input_dim Size of the input to be modulated + /// \param shift Whether to apply both scale and shift (true) or only scale (false) FiLM(const int condition_dim, const int input_dim, const bool shift) : _cond_to_scale_shift(condition_dim, (shift ? 2 : 1) * input_dim, /*bias=*/true) , _do_shift(shift) { } - // Get the entire internal output buffer. This is intended for internal wiring - // between layers; callers should treat the buffer as pre-allocated storage - // and only consider the first `num_frames` columns valid for a given - // processing call. Slice with .leftCols(num_frames) as needed. 
+ /// \brief Get the entire internal output buffer + /// + /// This is intended for internal wiring between layers; callers should treat + /// the buffer as pre-allocated storage and only consider the first num_frames columns + /// valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \return Reference to the output buffer Eigen::MatrixXf& GetOutput() { return _output; } + + /// \brief Get the entire internal output buffer (const version) + /// \return Const reference to the output buffer const Eigen::MatrixXf& GetOutput() const { return _output; } + /// \brief Resize buffers to handle maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize) { _cond_to_scale_shift.SetMaxBufferSize(maxBufferSize); _output.resize(get_input_dim(), maxBufferSize); } + /// \brief Set the parameters (weights) of this module + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. void set_weights_(std::vector::iterator& weights) { _cond_to_scale_shift.set_weights_(weights); } + /// \brief Get the condition dimension + /// \return Size of the conditioning input long get_condition_dim() const { return _cond_to_scale_shift.get_in_channels(); } + + /// \brief Get the input dimension + /// \return Size of the input to be modulated long get_input_dim() const { return _do_shift ? (_cond_to_scale_shift.get_out_channels() / 2) : _cond_to_scale_shift.get_out_channels(); } - // :param input: (input_dim x num_frames) - // :param condition: (condition_dim x num_frames) - // Writes (input_dim x num_frames) into internal output buffer; access via GetOutput(). - // Uses Eigen::Ref to accept matrices and block expressions without creating temporaries (real-time safe) + /// \brief Process input with conditioning + /// + /// Writes (input_dim x num_frames) into internal output buffer; access via GetOutput(). 
+ /// Uses Eigen::Ref to accept matrices and block expressions without creating temporaries (real-time safe). + /// \param input Input matrix (input_dim x num_frames) + /// \param condition Conditioning matrix (condition_dim x num_frames) + /// \param num_frames Number of frames to process void Process(const Eigen::Ref& input, const Eigen::Ref& condition, const int num_frames) { @@ -73,8 +96,13 @@ class FiLM } } - // in-place - // Uses Eigen::Ref to accept matrices and block expressions without creating temporaries (real-time safe) + /// \brief Process input with conditioning (in-place) + /// + /// Uses Eigen::Ref to accept matrices and block expressions without creating temporaries (real-time safe). + /// Modifies the input matrix directly. + /// \param input Input matrix (input_dim x num_frames), will be modified in-place + /// \param condition Conditioning matrix (condition_dim x num_frames) + /// \param num_frames Number of frames to process void Process_(Eigen::Ref input, const Eigen::Ref& condition, const int num_frames) { diff --git a/NAM/get_dsp.h b/NAM/get_dsp.h index 3aef8d6..6353053 100644 --- a/NAM/get_dsp.h +++ b/NAM/get_dsp.h @@ -6,22 +6,37 @@ namespace nam { -// Get NAM from a .nam file at the provided location +/// \brief Get NAM from a .nam file at the provided location +/// \param config_filename Path to the .nam model file +/// \return Unique pointer to a DSP object std::unique_ptr get_dsp(const std::filesystem::path config_filename); -// Get NAM from a provided configuration struct +/// \brief Get NAM from a provided configuration struct +/// \param conf DSP data structure containing model configuration and weights +/// \return Unique pointer to a DSP object std::unique_ptr get_dsp(dspData& conf); -// Get NAM from a provided .nam file path and store its configuration in the provided conf +/// \brief Get NAM from a .nam file and store its configuration +/// +/// Creates an instance of DSP and also returns a dspData struct that holds the data of 
the model. +/// \param config_filename Path to the .nam model file +/// \param returnedConfig Output parameter that will be filled with the model data +/// \return Unique pointer to a DSP object std::unique_ptr get_dsp(const std::filesystem::path config_filename, dspData& returnedConfig); -// Get NAM from a provided configuration JSON object +/// \brief Get NAM from a provided configuration JSON object +/// \param config JSON configuration object +/// \param returnedConfig Output parameter that will be filled with the model data +/// \return Unique pointer to a DSP object std::unique_ptr get_dsp(const nlohmann::json& config, dspData& returnedConfig); -// Get NAM from a provided configuration JSON object (convenience overload) +/// \brief Get NAM from a provided configuration JSON object (convenience overload) +/// \param config JSON configuration object +/// \return Unique pointer to a DSP object std::unique_ptr get_dsp(const nlohmann::json& config); -// Get sample rate from a .nam file -// Returns -1 if not known (Really old .nam files) +/// \brief Get sample rate from a .nam file +/// \param j JSON object from the .nam file +/// \return Sample rate in Hz, or -1 if not known (really old .nam files) double get_sample_rate_from_nam_file(const nlohmann::json& j); }; // namespace nam diff --git a/NAM/lstm.h b/NAM/lstm.h index 5c03853..d97de20 100644 --- a/NAM/lstm.h +++ b/NAM/lstm.h @@ -13,18 +13,22 @@ namespace nam { namespace lstm { -// A Single LSTM cell -// i input -// f forget -// g cell -// o output -// c cell state -// h hidden state +/// \brief A single LSTM cell class LSTMCell { public: + /// \brief Constructor + /// \param input_size Size of the input vector + /// \param hidden_size Size of the hidden state + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
LSTMCell(const int input_size, const int hidden_size, std::vector::iterator& weights); + + /// \brief Get the current hidden state + /// \return Hidden state vector Eigen::VectorXf get_hidden_state() const { return this->_xh(Eigen::placeholders::lastN(this->_get_hidden_size())); }; + + /// \brief Process a single input vector + /// \param x Input vector void process_(const Eigen::VectorXf& x); private: @@ -47,13 +51,31 @@ class LSTMCell long _get_input_size() const { return this->_xh.size() - this->_get_hidden_size(); }; }; -// The multi-layer LSTM model +/// \brief A multi-layer LSTM model +/// +/// A multi-layer LSTM processes audio frame-by-frame, maintaining hidden states +/// across layers. Each layer processes the hidden state from the previous layer as input. class LSTM : public DSP { public: + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param num_layers Number of LSTM layers + /// \param input_size Size of the input to each LSTM cell + /// \param hidden_size Size of the hidden state in each LSTM cell + /// \param weights Model weights vector + /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown) LSTM(const int in_channels, const int out_channels, const int num_layers, const int input_size, const int hidden_size, std::vector& weights, const double expected_sample_rate = -1.0); + + /// \brief Destructor ~LSTM() = default; + + /// \brief Process audio frames + /// \param input Input audio buffers + /// \param output Output audio buffers + /// \param num_frames Number of frames to process void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override; protected: @@ -73,7 +95,11 @@ class LSTM : public DSP Eigen::VectorXf _output; }; -// Factory to instantiate from nlohmann json +/// \brief Factory function to instantiate LSTM from JSON +/// \param config JSON configuration object +/// \param weights Model weights vector +/// 
\param expectedSampleRate Expected sample rate in Hz (-1.0 if unknown) +/// \return Unique pointer to a DSP object (LSTM instance) std::unique_ptr Factory(const nlohmann::json& config, std::vector& weights, const double expectedSampleRate); diff --git a/NAM/registry.h b/NAM/registry.h index e3bc4e8..0e90699 100644 --- a/NAM/registry.h +++ b/NAM/registry.h @@ -13,19 +13,28 @@ namespace nam { namespace factory { -// TODO get rid of weights and expectedSampleRate +/// \brief Factory function type for creating DSP objects using FactoryFunction = std::function(const nlohmann::json&, std::vector&, const double)>; -// Register factories for instantiating DSP objects +/// \brief Registry for factories that instantiate DSP objects +/// +/// Singleton registry that maps architecture names to factory functions. +/// Allows dynamic registration of new DSP architectures. class FactoryRegistry { public: + /// \brief Get the singleton instance + /// \return Reference to the factory registry instance static FactoryRegistry& instance() { static FactoryRegistry inst; return inst; } + /// \brief Register a factory function for an architecture + /// \param key Architecture name (e.g., "WaveNet", "LSTM") + /// \param func Factory function that creates DSP instances + /// \throws std::runtime_error If the key is already registered void registerFactory(const std::string& key, FactoryFunction func) { // Assert that the key is not already registered @@ -36,6 +45,13 @@ class FactoryRegistry factories_[key] = func; } + /// \brief Create a DSP object using a registered factory + /// \param name Architecture name + /// \param config JSON configuration object + /// \param weights Model weights vector + /// \param expectedSampleRate Expected sample rate in Hz + /// \return Unique pointer to a DSP object + /// \throws std::runtime_error If no factory is registered for the given name std::unique_ptr create(const std::string& name, const nlohmann::json& config, std::vector& weights, const double 
expectedSampleRate) const { @@ -51,9 +67,15 @@ class FactoryRegistry std::unordered_map factories_; }; -// Registration helper. Use this to register your factories. +/// \brief Registration helper for factories +/// +/// Use this to register your factories. Create a static instance to automatically +/// register a factory when the program starts. struct Helper { + /// \brief Constructor that registers a factory + /// \param name Architecture name + /// \param factory Factory function Helper(const std::string& name, FactoryFunction factory) { FactoryRegistry::instance().registerFactory(name, std::move(factory)); diff --git a/NAM/ring_buffer.h b/NAM/ring_buffer.h index f2c3dfe..5d0e9b3 100644 --- a/NAM/ring_buffer.h +++ b/NAM/ring_buffer.h @@ -4,38 +4,52 @@ namespace nam { -// Ring buffer for managing Eigen::MatrixXf buffers with write/read pointers +/// \brief Ring buffer for managing Eigen::MatrixXf buffers with write/read pointers +/// +/// Provides efficient circular buffer functionality for maintaining input history +/// in convolutional layers. Automatically handles buffer wrapping when needed. class RingBuffer { public: + /// \brief Default constructor RingBuffer() {}; - // Initialize/resize storage - // :param channels: Number of channels (rows in the storage matrix) - // :param max_buffer_size: Maximum amount that will be written or read at once + + /// \brief Initialize/resize storage + /// \param channels Number of channels (rows in the storage matrix) + /// \param max_buffer_size Maximum amount that will be written or read at once void Reset(const int channels, const int max_buffer_size); - // Write new data at write pointer - // :param input: Input matrix (channels x num_frames) - // :param num_frames: Number of frames to write - // NOTE: This function expects a full, pre-allocated, column-major MatrixXf - // covering the entire valid buffer range. Callers should not pass - // Block expressions (e.g. 
.leftCols()) across the API boundary; instead, - // pass the full buffer and slice inside the callee. This avoids Eigen - // evaluating Blocks into temporaries (which would allocate) when - // binding to MatrixXf. + + /// \brief Write new data at write pointer + /// + /// NOTE: This function expects a full, pre-allocated, column-major MatrixXf + /// covering the entire valid buffer range. Callers should not pass Block expressions + /// (e.g. .leftCols()) across the API boundary; instead, pass the full buffer and + /// slice inside the callee. This avoids Eigen evaluating Blocks into temporaries + /// (which would allocate) when binding to MatrixXf. + /// \param input Input matrix (channels x num_frames) + /// \param num_frames Number of frames to write void Write(const Eigen::MatrixXf& input, const int num_frames); - // Read data with optional lookback - // :param num_frames: Number of frames to read - // :param lookback: Number of frames to look back from write pointer (default 0) - // :return: Block reference to the storage data + + /// \brief Read data with optional lookback + /// \param num_frames Number of frames to read + /// \param lookback Number of frames to look back from write pointer (default 0) + /// \return Block reference to the storage data Eigen::Block Read(const int num_frames, const long lookback = 0); - // Advance write pointer - // :param num_frames: Number of frames to advance + + /// \brief Advance write pointer + /// \param num_frames Number of frames to advance void Advance(const int num_frames); - // Get max buffer size (the value passed to Reset()) + + /// \brief Get max buffer size (the value passed to Reset()) + /// \return Maximum buffer size int GetMaxBufferSize() const { return _max_buffer_size; } - // Get number of channels (rows) + + /// \brief Get number of channels (rows) + /// \return Number of channels int GetChannels() const { return _storage.rows(); } - // Set the max lookback (maximum history needed when rewinding) + + /// 
\brief Set the max lookback (maximum history needed when rewinding) + /// \param max_lookback Maximum lookback distance void SetMaxLookback(const long max_lookback) { _max_lookback = max_lookback; } private: diff --git a/NAM/util.h b/NAM/util.h index c0a5bd4..a13cd4a 100644 --- a/NAM/util.h +++ b/NAM/util.h @@ -9,6 +9,9 @@ namespace nam { namespace util { +/// \brief Convert a string to lowercase +/// \param s Input string +/// \return Lowercase version of the input string std::string lowercase(const std::string& s); }; // namespace util }; // namespace nam diff --git a/NAM/version.h b/NAM/version.h index f746905..11b3be5 100644 --- a/NAM/version.h +++ b/NAM/version.h @@ -1,6 +1,15 @@ #pragma once -// Make sure this matches NAM version in ../CMakeLists.txt! +/// \file version.h +/// \brief Version information for NeuralAmpModelerCore +/// +/// Make sure this matches NAM version in ../CMakeLists.txt! + +/// \brief Major version number #define NEURAL_AMP_MODELER_DSP_VERSION_MAJOR 0 -#define NEURAL_AMP_MODELER_DSP_VERSION_MINOR 3 + +/// \brief Minor version number +#define NEURAL_AMP_MODELER_DSP_VERSION_MINOR 4 + +/// \brief Patch version number #define NEURAL_AMP_MODELER_DSP_VERSION_PATCH 0 diff --git a/NAM/wavenet.h b/NAM/wavenet.h index 336d2e4..e290ef4 100644 --- a/NAM/wavenet.h +++ b/NAM/wavenet.h @@ -19,22 +19,34 @@ namespace nam namespace wavenet { -// Gating mode for WaveNet layers +/// \brief Gating mode for WaveNet layers +/// +/// Determines how the layer processes the doubled bottleneck channels when gating is enabled. 
enum class GatingMode { - NONE, // No gating or blending - GATED, // Traditional gating (element-wise multiplication) - BLENDED // Blending (weighted average) + NONE, ///< No gating or blending - standard activation + GATED, ///< Traditional gating (element-wise multiplication of activated channels) + BLENDED ///< Blending (weighted average between activated and pre-activated values) }; -// Helper function for backward compatibility with boolean gated parameter +/// \brief Helper function for backward compatibility with boolean gated parameter +/// \param gated Boolean indicating whether gating should be enabled +/// \return GatingMode::GATED if gated is true, GatingMode::NONE otherwise inline GatingMode gating_mode_from_bool(bool gated) { return gated ? GatingMode::GATED : GatingMode::NONE; } -// Parameters for head1x1 configuration + +/// \brief Parameters for head1x1 configuration +/// +/// Configures an optional 1x1 convolution that outputs directly to the head (skip connection) +/// instead of using the activation output directly. struct Head1x1Params { + /// \brief Constructor + /// \param active_ Whether the head1x1 convolution is active + /// \param out_channels_ Number of output channels for the head1x1 convolution + /// \param groups_ Number of groups for grouped convolution Head1x1Params(bool active_, int out_channels_, int groups_) : active(active_) , out_channels(out_channels_) @@ -42,26 +54,66 @@ struct Head1x1Params { } - const bool active; - const int out_channels; - const int groups; + const bool active; ///< Whether the head1x1 convolution is active + const int out_channels; ///< Number of output channels + const int groups; ///< Number of groups for grouped convolution }; +/// \brief Parameters for FiLM (Feature-wise Linear Modulation) configuration +/// +/// FiLM applies per-channel scaling and optional shifting based on conditioning input. 
struct _FiLMParams { + /// \brief Constructor + /// \param active_ Whether FiLM is active at this location + /// \param shift_ Whether to apply both scale and shift (true) or only scale (false) _FiLMParams(bool active_, bool shift_) : active(active_) , shift(shift_) { } - const bool active; - const bool shift; + const bool active; ///< Whether FiLM is active + const bool shift; ///< Whether to apply shift in addition to scale }; +/// \brief A single WaveNet layer block +/// +/// A WaveNet layer performs the following operations: +/// 1. Dilated convolution on the input (with optional pre/post-FiLM) +/// 2. Input mixin (conditioning input processing, with optional pre/post-FiLM) +/// 3. Sum of conv and input mixin outputs +/// 4. Activation (with optional gating/blending and pre/post FiLM) +/// 5. 1x1 convolution for the next layer (with optional post-FiLM) +/// 6. Optional 1x1 convolution for the head output (with optional post-FiLM) +/// 7. Residual connection (input + 1x1 output) and skip connection (to the head) +/// +/// The layer supports multiple gating modes and FiLM at various points in the computation. +/// See the walkthrough documentation for detailed step-by-step explanation.
class _Layer { public: - // Constructor with GatingMode enum and typed ActivationConfig + /// \brief Constructor with GatingMode enum and typed ActivationConfig + /// \param condition_size Size of the conditioning input + /// \param channels Number of input/output channels from layer to layer + /// \param bottleneck Internal channel count + /// \param kernel_size Kernel size for the dilated convolution + /// \param dilation Dilation factor for the convolution + /// \param activation_config Primary activation function configuration + /// \param gating_mode Gating mode (NONE, GATED, or BLENDED) + /// \param groups_input Number of groups for the input convolution + /// \param groups_input_mixin Number of groups for the input mixin convolution + /// \param groups_1x1 Number of groups for the 1x1 convolution + /// \param head1x1_params Configuration of the optional head1x1 convolution + /// \param secondary_activation_config Secondary activation (for gating/blending) + /// \param conv_pre_film_params FiLM parameters before the input convolution + /// \param conv_post_film_params FiLM parameters after the input convolution + /// \param input_mixin_pre_film_params FiLM parameters before the input mixin + /// \param input_mixin_post_film_params FiLM parameters after the input mixin + /// \param activation_pre_film_params FiLM parameters after the input/mixin summed output before activation + /// \param activation_post_film_params FiLM parameters after the activation output before the 1x1 convolution + /// \param _1x1_post_film_params FiLM parameters after the 1x1 convolution + /// \param head1x1_post_film_params FiLM parameters after the head1x1 convolution + /// \throws std::invalid_argument If head1x1_post_film_params is active but head1x1 is not _Layer(const int condition_size, const int channels, const int bottleneck, const int kernel_size, const int dilation, const activations::ActivationConfig& activation_config, const GatingMode gating_mode, const int groups_input, 
const int groups_input_mixin, const int groups_1x1, const Head1x1Params& head1x1_params, @@ -144,36 +196,72 @@ class _Layer } }; - // Resize all arrays to be able to process `maxBufferSize` frames. + /// \brief Resize all arrays to be able to process maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize); - // Set the parameters of this module + + /// \brief Set the parameters (weights) of this module + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. void set_weights_(std::vector::iterator& weights); - // Process a block of frames. - // :param `input`: from previous layer - // :param `condition`: conditioning input (input to the WaveNet / "skip-in") - // :param `num_frames`: number of frames to process - // Outputs are stored internally and accessible via GetOutputNextLayer() and GetOutputHead() + + /// \brief Process a block of frames + /// + /// Performs the complete layer computation: + /// 1. Input convolution (with optional pre/post-FiLM) + /// 2. Input mixin processing (with optional pre/post-FiLM) + /// 3. Sum and activation (with optional gating/blending and pre/post-FiLM) + /// 4. 1x1 convolution for the residual connection to the next layer (with optional post-FiLM) + /// 5. Optional 1x1 convolution for the head output (with optional post-FiLM) + /// 6. Store outputs for next layer and the layer array head + /// + /// \param input Input from previous layer (channels x num_frames) + /// \param condition Conditioning input (condition_size x num_frames) + /// \param num_frames Number of frames to process + /// + /// Outputs are stored internally and accessible via GetOutputNextLayer() and GetOutputHead(). + /// Only the first num_frames columns of the output buffers are valid.
void Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames); - // The number of channels expected as input/output from this layer + + /// \brief Get the number of channels expected as input/output from this layer + /// \return Number of channels long get_channels() const { return this->_conv.get_in_channels(); }; - // Dilation of the input convolution layer + + /// \brief Get the dilation of the input convolution layer + /// \return Dilation factor int get_dilation() const { return this->_conv.get_dilation(); }; - // Kernel size of the input convolution layer + + /// \brief Get the kernel size of the input convolution layer + /// \return Kernel size long get_kernel_size() const { return this->_conv.get_kernel_size(); }; - // Get output to next layer (residual connection: input + _1x1 output) - // Returns the full pre-allocated buffer; only the first `num_frames` columns - // are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \brief Get output to next layer (residual connection: input + _1x1 output) + /// + /// Returns the full pre-allocated buffer; only the first num_frames columns + /// are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \return Reference to the output buffer (channels x maxBufferSize) Eigen::MatrixXf& GetOutputNextLayer() { return this->_output_next_layer; } + + /// \brief Get output to next layer (const version) + /// \return Const reference to the output buffer const Eigen::MatrixXf& GetOutputNextLayer() const { return this->_output_next_layer; } - // Get output to head (skip connection: activated conv output) - // Returns the full pre-allocated buffer; only the first `num_frames` columns - // are valid for a given processing call. Slice with .leftCols(num_frames) as needed. 
+ + /// \brief Get output to head (skip connection: activated conv output) + /// + /// Returns the full pre-allocated buffer; only the first num_frames columns + /// are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \return Reference to the head output buffer Eigen::MatrixXf& GetOutputHead() { return this->_output_head; } + + /// \brief Get output to head (const version) + /// \return Const reference to the head output buffer const Eigen::MatrixXf& GetOutputHead() const { return this->_output_head; } - // Access Conv1D for Reset() propagation (needed for _LayerArray) + /// \brief Access Conv1D for Reset() propagation (needed for _LayerArray) + /// \return Reference to the internal Conv1D object Conv1D& get_conv() { return _conv; } + + /// \brief Access Conv1D (const version) + /// \return Const reference to the internal Conv1D object const Conv1D& get_conv() const { return _conv; } private: @@ -211,9 +299,37 @@ class _Layer std::unique_ptr _head1x1_post_film; }; +/// \brief Parameters for constructing a LayerArray +/// +/// Contains all configuration needed to construct a _LayerArray with multiple layers +/// sharing the same channel count, kernel size, and activation configuration. 
class LayerArrayParams { public: + /// \brief Constructor + /// \param input_size_ Input size (number of channels) to the layer array + /// \param condition_size_ Size of the conditioning input + /// \param head_size_ Size of the head output (after head rechannel) + /// \param channels_ Number of channels in each layer + /// \param bottleneck_ Bottleneck size (internal channel count) + /// \param kernel_size_ Kernel size for dilated convolutions + /// \param dilations_ Vector of dilation factors, one per layer + /// \param activation_ Primary activation configuration + /// \param gating_mode_ Gating mode for all layers + /// \param head_bias_ Whether to use bias in the head rechannel + /// \param groups_input Number of groups for input convolutions + /// \param groups_input_mixin_ Number of groups for input mixin convolutions + /// \param groups_1x1_ Number of groups for 1x1 convolutions + /// \param head1x1_params_ Parameters for optional head1x1 convolutions + /// \param secondary_activation_config_ Secondary activation for gating/blending + /// \param conv_pre_film_params_ FiLM parameters before input convolutions + /// \param conv_post_film_params_ FiLM parameters after input convolutions + /// \param input_mixin_pre_film_params_ FiLM parameters before input mixin + /// \param input_mixin_post_film_params_ FiLM parameters after input mixin + /// \param activation_pre_film_params_ FiLM parameters before activation + /// \param activation_post_film_params_ FiLM parameters after activation + /// \param _1x1_post_film_params_ FiLM parameters after 1x1 convolutions + /// \param head1x1_post_film_params_ FiLM parameters after head1x1 convolutions LayerArrayParams(const int input_size_, const int condition_size_, const int head_size_, const int channels_, const int bottleneck_, const int kernel_size_, const std::vector&& dilations_, const activations::ActivationConfig& activation_, const GatingMode gating_mode_, @@ -250,36 +366,70 @@ class LayerArrayParams { } - const 
int input_size; - const int condition_size; - const int head_size; - const int channels; - const int bottleneck; - const int kernel_size; - std::vector dilations; - const activations::ActivationConfig activation_config; - const GatingMode gating_mode; - const bool head_bias; - const int groups_input; - const int groups_input_mixin; - const int groups_1x1; - const Head1x1Params head1x1_params; - const activations::ActivationConfig secondary_activation_config; - const _FiLMParams conv_pre_film_params; - const _FiLMParams conv_post_film_params; - const _FiLMParams input_mixin_pre_film_params; - const _FiLMParams input_mixin_post_film_params; - const _FiLMParams activation_pre_film_params; - const _FiLMParams activation_post_film_params; - const _FiLMParams _1x1_post_film_params; - const _FiLMParams head1x1_post_film_params; + const int input_size; ///< Input size (number of channels) + const int condition_size; ///< Size of conditioning input + const int head_size; ///< Size of head output (after rechannel) + const int channels; ///< Number of channels in each layer + const int bottleneck; ///< Bottleneck size (internal channel count) + const int kernel_size; ///< Kernel size for dilated convolutions + std::vector dilations; ///< Dilation factors, one per layer + const activations::ActivationConfig activation_config; ///< Primary activation configuration + const GatingMode gating_mode; ///< Gating mode for all layers + const bool head_bias; ///< Whether to use bias in head rechannel + const int groups_input; ///< Number of groups for input convolutions + const int groups_input_mixin; ///< Number of groups for input mixin + const int groups_1x1; ///< Number of groups for 1x1 convolutions + const Head1x1Params head1x1_params; ///< Parameters for optional head1x1 + const activations::ActivationConfig secondary_activation_config; ///< Secondary activation for gating/blending + const _FiLMParams conv_pre_film_params; ///< FiLM params before input conv + const _FiLMParams 
conv_post_film_params; ///< FiLM params after input conv + const _FiLMParams input_mixin_pre_film_params; ///< FiLM params before input mixin + const _FiLMParams input_mixin_post_film_params; ///< FiLM params after input mixin + const _FiLMParams activation_pre_film_params; ///< FiLM params before activation + const _FiLMParams activation_post_film_params; ///< FiLM params after activation + const _FiLMParams _1x1_post_film_params; ///< FiLM params after 1x1 conv + const _FiLMParams head1x1_post_film_params; ///< FiLM params after head1x1 conv }; -// An array of layers with the same channels, kernel sizes, activations. +/// \brief An array of layers with the same channels, kernel sizes, and activations +/// +/// A LayerArray chains multiple _Layer objects together, processing them sequentially. +/// Each layer processes the output of the previous layer (residual connection). +/// All layers contribute to a shared head output (skip connection) that is accumulated +/// and then projected to the final head size. 
+/// +/// The LayerArray handles: +/// - Input projection to match layer channel count +/// - Processing layers in sequence with residual connections +/// - Accumulating head outputs from all layers +/// - Projecting the accumulated head output to the final head size class _LayerArray { public: - // Constructor with GatingMode enum and typed ActivationConfig + /// \brief Constructor with GatingMode enum and typed ActivationConfig + /// \param input_size Input size (number of channels) to the layer array + /// \param condition_size Size of the conditioning input + /// \param head_size Size of the head output (after head rechannel) + /// \param channels Number of channels in each layer + /// \param bottleneck Bottleneck size (internal channel count) + /// \param kernel_size Kernel size for dilated convolutions + /// \param dilations Vector of dilation factors, one per layer + /// \param activation_config Primary activation configuration + /// \param gating_mode Gating mode for all layers + /// \param head_bias Whether to use bias in the head rechannel + /// \param groups_input Number of groups for input convolutions + /// \param groups_input_mixin Number of groups for input mixin + /// \param groups_1x1 Number of groups for 1x1 convolutions + /// \param head1x1_params Parameters for optional head1x1 convolutions + /// \param secondary_activation_config Secondary activation for gating/blending + /// \param conv_pre_film_params FiLM parameters before input convolutions + /// \param conv_post_film_params FiLM parameters after input convolutions + /// \param input_mixin_pre_film_params FiLM parameters before input mixin + /// \param input_mixin_post_film_params FiLM parameters after input mixin + /// \param activation_pre_film_params FiLM parameters before activation + /// \param activation_post_film_params FiLM parameters after activation + /// \param _1x1_post_film_params FiLM parameters after 1x1 convolutions + /// \param head1x1_post_film_params FiLM parameters after 
head1x1 convolutions _LayerArray(const int input_size, const int condition_size, const int head_size, const int channels, const int bottleneck, const int kernel_size, const std::vector& dilations, const activations::ActivationConfig& activation_config, const GatingMode gating_mode, @@ -290,32 +440,60 @@ class _LayerArray const _FiLMParams& activation_pre_film_params, const _FiLMParams& activation_post_film_params, const _FiLMParams& _1x1_post_film_params, const _FiLMParams& head1x1_post_film_params); + /// \brief Resize all arrays to be able to process maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize); - // All arrays are "short". - // Process without head input (first layer array) - zeros head inputs before proceeding - void Process(const Eigen::MatrixXf& layer_inputs, // Short - const Eigen::MatrixXf& condition, // Short - const int num_frames); - // Process with head input (subsequent layer arrays) - copies head input before proceeding - void Process(const Eigen::MatrixXf& layer_inputs, // Short - const Eigen::MatrixXf& condition, // Short - const Eigen::MatrixXf& head_inputs, // Short - from previous layer array - const int num_frames); - // Get output from last layer (for next layer array) - // Returns the full pre-allocated buffer; only the first `num_frames` columns - // are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \brief Process without a given previous head input (first layer array) + /// + /// Zeros head accumulated output before proceeding. Used for the first layer array in a WaveNet. 
+ /// \param layer_inputs Input to the layer array (input_size x num_frames) + /// \param condition Conditioning input (condition_size x num_frames) + /// \param num_frames Number of frames to process + void Process(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames); + + /// \brief Process with a given previous head input (subsequent layer arrays) + /// + /// Copies head input before proceeding. Used for subsequent layer arrays that accumulate + /// head outputs from previous arrays. + /// \param layer_inputs Input to the layer array (input_size x num_frames) + /// \param condition Conditioning input (condition_size x num_frames) + /// \param head_inputs Head input from previous layer array (head_input_size x num_frames) + /// \param num_frames Number of frames to process + void Process(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, + const Eigen::MatrixXf& head_inputs, const int num_frames); + + /// \brief Get output from last layer (for next layer array) + /// + /// Returns the full pre-allocated buffer; only the first num_frames columns + /// are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \return Reference to the layer output buffer (channels x maxBufferSize) Eigen::MatrixXf& GetLayerOutputs() { return this->_layer_outputs; } + + /// \brief Get output from last layer (const version) + /// \return Const reference to the layer output buffer const Eigen::MatrixXf& GetLayerOutputs() const { return this->_layer_outputs; } - // Get head outputs (post head-rechannel) - // Returns the full pre-allocated buffer; only the first `num_frames` columns - // are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + + /// \brief Get head outputs (post head-rechannel) + /// + /// Returns the full pre-allocated buffer; only the first num_frames columns + /// are valid for a given processing call. Slice with .leftCols(num_frames) as needed. 
+ /// \return Reference to the head output buffer (head_size x maxBufferSize) Eigen::MatrixXf& GetHeadOutputs(); + + /// \brief Get head outputs (const version) + /// \return Const reference to the head output buffer const Eigen::MatrixXf& GetHeadOutputs() const; + + /// \brief Set the parameters (weights) of this module + /// \param it Iterator to the weights vector. Will be advanced as weights are consumed. void set_weights_(std::vector::iterator& it); - // "Zero-indexed" receptive field. - // E.g. a 1x1 convolution has a z.i.r.f. of zero. + /// \brief Get the "zero-indexed" receptive field + /// + /// The receptive field is the number of input samples that affect the output. + /// A 1x1 convolution is defined to have a zero-indexed receptive field of zero. + /// \return Receptive field size long get_receptive_field() const; private: @@ -341,16 +519,50 @@ class _LayerArray void ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames); }; -// The main WaveNet model +/// \brief The main WaveNet model +/// +/// WaveNet is a dilated convolutional neural network architecture for audio processing. +/// It consists of multiple LayerArrays, each containing multiple layers with increasing +/// dilation factors. The model processes audio through: +/// +/// 1. Condition DSP (optional) - processes input to generate conditioning signal +/// 2. LayerArrays - sequential processing with residual and skip connections +/// 3. Head scaling - final output scaling +/// +/// The model supports real-time audio processing with pre-allocated buffers. 
class WaveNet : public DSP { public: + /// \brief Constructor + /// \param in_channels Number of input channels + /// \param layer_array_params Parameters for each layer array + /// \param head_scale Scaling factor applied to the final head output + /// \param with_head Whether to use a custom "head" module that further processes the output (not currently supported) + /// \param weights Model weights (will be consumed during construction) + /// \param condition_dsp Optional DSP module for processing the conditioning input + /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown) WaveNet(const int in_channels, const std::vector& layer_array_params, const float head_scale, const bool with_head, std::vector weights, std::unique_ptr condition_dsp, const double expected_sample_rate = -1.0); + + /// \brief Destructor ~WaveNet() = default; + + /// \brief Process audio frames + /// + /// Implements the DSP::process() interface. Processes input audio through the + /// complete WaveNet pipeline and writes to output. + /// \param input Input audio buffers (in_channels x frames) + /// \param output Output audio buffers (out_channels x frames) + /// \param num_frames Number of frames to process void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override; + + /// \brief Set model weights from a vector + /// \param weights Vector containing all model weights void set_weights_(std::vector& weights); + + /// \brief Set model weights from an iterator + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
void set_weights_(std::vector::iterator& weights); protected: @@ -364,13 +576,28 @@ class WaveNet : public DSP std::vector _condition_dsp_input_ptrs; std::vector _condition_dsp_output_ptrs; + /// \brief Resize all buffers to handle maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call void SetMaxBufferSize(const int maxBufferSize) override; - // Compute the conditioning array to be given to the layer arrays + + /// \brief Compute the conditioning array to be given to the layer arrays + /// + /// Processes the condition input through the condition DSP (if present) or + /// passes it through directly. + /// \param num_frames Number of frames to process virtual void _process_condition(const int num_frames); - // Fill in the "condition" array that's fed into the various parts of the net. + + /// \brief Fill in the "condition" array that's fed into the various parts of the net + /// + /// Copies input audio into the condition buffer for processing. + /// \param input Input audio buffers + /// \param num_frames Number of frames to process virtual void _set_condition_array(NAM_SAMPLE** input, const int num_frames); - // How many conditioning inputs are there. - // Just one--the audio. + + /// \brief Get the number of conditioning inputs + /// + /// For standard WaveNet, this is just the audio input (same as input channels). 
+ /// \return Number of conditioning input channels virtual int _get_condition_dim() const { return NumInputChannels(); }; private: @@ -382,7 +609,11 @@ class WaveNet : public DSP int PrewarmSamples() override { return mPrewarmSamples; }; }; -// Factory to instantiate from nlohmann json +/// \brief Factory function to instantiate WaveNet from JSON configuration +/// \param config JSON configuration object +/// \param weights Model weights vector +/// \param expectedSampleRate Expected sample rate in Hz (-1.0 if unknown) +/// \return Unique pointer to a DSP object (WaveNet instance) std::unique_ptr Factory(const nlohmann::json& config, std::vector& weights, const double expectedSampleRate); }; // namespace wavenet diff --git a/docs/api/activations.rst b/docs/api/activations.rst new file mode 100644 index 0000000..e1595a1 --- /dev/null +++ b/docs/api/activations.rst @@ -0,0 +1,17 @@ +Activations API +=============== + +.. doxygennamespace:: nam::activations + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::activations::Activation + :project: NeuralAmpModelerCore + :members: + +.. doxygenenum:: nam::activations::ActivationType + :project: NeuralAmpModelerCore + +.. doxygenstruct:: nam::activations::ActivationConfig + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/conv1d.rst b/docs/api/conv1d.rst new file mode 100644 index 0000000..f6d87e5 --- /dev/null +++ b/docs/api/conv1d.rst @@ -0,0 +1,6 @@ +Conv1D API +========== + +.. doxygenclass:: nam::Conv1D + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/convnet.rst b/docs/api/convnet.rst new file mode 100644 index 0000000..f5fc886 --- /dev/null +++ b/docs/api/convnet.rst @@ -0,0 +1,18 @@ +ConvNet API +=========== + +.. doxygennamespace:: nam::convnet + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::convnet::ConvNet + :project: NeuralAmpModelerCore + :members: + +.. 
doxygenclass:: nam::convnet::ConvNetBlock + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::convnet::BatchNorm + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/dsp.rst b/docs/api/dsp.rst new file mode 100644 index 0000000..8fe6ae5 --- /dev/null +++ b/docs/api/dsp.rst @@ -0,0 +1,25 @@ +DSP API +======= + +.. doxygenclass:: nam::DSP + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::Buffer + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::Linear + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::Conv1x1 + :project: NeuralAmpModelerCore + :members: + +.. doxygenstruct:: nam::dspData + :project: NeuralAmpModelerCore + :members: + +.. doxygenenum:: nam::EArchitectures + :project: NeuralAmpModelerCore diff --git a/docs/api/film.rst b/docs/api/film.rst new file mode 100644 index 0000000..88dfef1 --- /dev/null +++ b/docs/api/film.rst @@ -0,0 +1,6 @@ +FiLM API +======== + +.. doxygenclass:: nam::FiLM + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/gating_activations.rst b/docs/api/gating_activations.rst new file mode 100644 index 0000000..09e8cab --- /dev/null +++ b/docs/api/gating_activations.rst @@ -0,0 +1,14 @@ +Gating Activations API +====================== + +.. doxygennamespace:: nam::gating_activations + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::gating_activations::GatingActivation + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::gating_activations::BlendingActivation + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/get_dsp.rst b/docs/api/get_dsp.rst new file mode 100644 index 0000000..efab64a --- /dev/null +++ b/docs/api/get_dsp.rst @@ -0,0 +1,6 @@ +Model Loading API +================== + +.. 
doxygennamespace:: nam + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/index.rst b/docs/api/index.rst new file mode 100644 index 0000000..fd43328 --- /dev/null +++ b/docs/api/index.rst @@ -0,0 +1,20 @@ +API Reference +============= + +This section contains the complete API reference for NeuralAmpModelerCore, automatically generated from the source code headers. + +.. toctree:: + :maxdepth: 2 + + + dsp + ring_buffer + conv1d + activations + gating_activations + film + convnet + lstm + wavenet + get_dsp + util diff --git a/docs/api/lstm.rst b/docs/api/lstm.rst new file mode 100644 index 0000000..610eb4a --- /dev/null +++ b/docs/api/lstm.rst @@ -0,0 +1,14 @@ +LSTM API +======== + +.. doxygennamespace:: nam::lstm + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::lstm::LSTM + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::lstm::LSTMCell + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/ring_buffer.rst b/docs/api/ring_buffer.rst new file mode 100644 index 0000000..ba504eb --- /dev/null +++ b/docs/api/ring_buffer.rst @@ -0,0 +1,6 @@ +Ring Buffer API +================ + +.. doxygenclass:: nam::RingBuffer + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/util.rst b/docs/api/util.rst new file mode 100644 index 0000000..a505f84 --- /dev/null +++ b/docs/api/util.rst @@ -0,0 +1,6 @@ +Utilities API +============= + +.. doxygennamespace:: nam::util + :project: NeuralAmpModelerCore + :members: diff --git a/docs/api/wavenet.rst b/docs/api/wavenet.rst new file mode 100644 index 0000000..571c7e4 --- /dev/null +++ b/docs/api/wavenet.rst @@ -0,0 +1,29 @@ +WaveNet API +=========== + +.. doxygennamespace:: nam::wavenet + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::wavenet::WaveNet + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::wavenet::_LayerArray + :project: NeuralAmpModelerCore + :members: + +.. 
doxygenclass:: nam::wavenet::_Layer + :project: NeuralAmpModelerCore + :members: + +.. doxygenclass:: nam::wavenet::LayerArrayParams + :project: NeuralAmpModelerCore + :members: + +.. doxygenstruct:: nam::wavenet::Head1x1Params + :project: NeuralAmpModelerCore + :members: + +.. doxygenenum:: nam::wavenet::GatingMode + :project: NeuralAmpModelerCore diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..305152c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,64 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import os +import sys +from pathlib import Path + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. +sys.path.insert(0, os.path.abspath('.')) + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'NeuralAmpModelerCore' +copyright = '2023-present Steven Atkinson' +author = 'Neural Amp Modeler Contributors' +release = '0.4.0' +version = '0.4.0' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'sphinx.ext.intersphinx', + 'breathe', + 'sphinxcontrib.mermaid', +] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' +# html_static_path = ['_static'] # Commented out until _static 
directory is created + +# -- Breathe configuration ---------------------------------------------------- +# https://breathe.readthedocs.io/ + +breathe_projects = { + 'NeuralAmpModelerCore': 'doxygen/xml', +} +breathe_default_project = 'NeuralAmpModelerCore' +breathe_default_members = ('members', 'undoc-members') + +# -- Intersphinx configuration ----------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html + +intersphinx_mapping = { + 'cpp': ('https://en.cppreference.com/mwiki/', None), +} + +# -- Extension configuration -------------------------------------------------- + +# Autodoc settings +autodoc_mock_imports = ['Eigen', 'nlohmann'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..7f9bdda --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,64 @@ +NeuralAmpModelerCore Documentation +=================================== + +Welcome to the NeuralAmpModelerCore documentation. This library provides a core C++ DSP implementation for Neural Amp Modeler plugins. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + wavenet_walkthrough + api/index + +Overview +-------- + +NeuralAmpModelerCore is a high-performance C++ library for running neural network-based audio processing models. 
It supports multiple architectures including: + +* **WaveNet**: Dilated convolutional neural networks with gating and conditioning +* **ConvNet**: Convolutional neural networks with batch normalization +* **LSTM**: Long Short-Term Memory networks +* **Linear**: Simple linear models (impulse responses) + +The library is designed for real-time audio processing with a focus on: + +* **Real-time safety**: Pre-allocated buffers and no dynamic allocations during processing +* **Performance**: Optimized implementations using Eigen for linear algebra +* **Flexibility**: Support for various activation functions, gating modes, and conditioning mechanisms + +Getting Started +--------------- + +For an example of how to use this library, see the `NeuralAmpModelerPlugin `_ repository. + +Architecture +------------ + +The library is organized into several namespaces: + +* :ref:`nam::wavenet `: WaveNet architecture implementation +* :ref:`nam::convnet `: ConvNet architecture implementation +* :ref:`nam::lstm `: LSTM architecture implementation +* :ref:`nam::activations `: Activation function implementations +* :ref:`nam::gating_activations `: Gating and blending activation functions + +Key Components +-------------- + +* :ref:`DSP `: Base class for all DSP models +* :ref:`WaveNet `: Main WaveNet model class +* :ref:`Conv1D `: Dilated 1D convolution implementation +* :ref:`FiLM `: Feature-wise Linear Modulation + +Documentation +------------- + +* :doc:`wavenet_walkthrough`: Step-by-step explanation of WaveNet architecture, LayerArray, and Layer computations +* :doc:`api/index`: Complete API reference + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..f5ac4b8 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +sphinx>=5.0.0 +breathe>=4.35.0 +sphinx-rtd-theme>=1.2.0 +sphinxcontrib-mermaid>=0.8.0 diff --git 
a/docs/wavenet_walkthrough.rst b/docs/wavenet_walkthrough.rst new file mode 100644 index 0000000..9dd90e6 --- /dev/null +++ b/docs/wavenet_walkthrough.rst @@ -0,0 +1,443 @@ +WaveNet Computation Walkthrough +================================== + +This document provides a detailed step-by-step explanation of how the NAM WaveNet architecture performs its computations, including the LayerArray and Layer objects that make up the model. + +"It's not *really* a Wavenet" +----------------------------- + +The name "WaveNet" is a bit of a misnomer. +There are similarities to the architecture from +`van den Oord et al. (2016) <https://arxiv.org/abs/1609.03499>`_--this is a +convolutional neural network that repeats a "Layer" motif with skip connections that +give good accuracy typical of convnets along with good training stability, but there are +a lot of differences. + +Here's a rundown of what's not exactly the same at an informal level: + +* The model in NAM is feedforward and used in a "regression" setting; + the model from the original paper is autoregressive and used for generative tasks. +* The class in NAM actually composes several "Layer array" objects. + Each one of these individually is actually far closer to a "WaveNet" in architecture. + In other words, this is more like a "stacked WaveNet". +* There are additional skip connections (e.g. input mixin) that aren't really part of + the original WaveNet architecture. +* And finally, the actual recipe within the layer has a lot of modifications. + The original layer has, roughly, a "convolution-activation-convolution" sequence with a + gated activation. + Here, the gated activation is optional (and is frequently not used, like in the popular + A1 standard/lite/feather/nano configurations). +* In v0.4.0, even more modifications have been added in--FiLMs, a bottleneck, and an + arbitrary "conditioning DSP" module that can be used to embed the input signal in a more + effective way to modulate the layers in the main model.
+ It doesn't need to be a WaveNet, but if it were then this feels more like a "cascading + (stacked) WaveNet". + +WaveNet Overview +---------------- + +WaveNet is a dilated convolutional neural network architecture designed for audio processing. The model consists of: + +* **Multiple LayerArrays**: Each LayerArray contains multiple layers with the same channel configuration +* **Conditioning**: Optional DSP processing of the input to generate conditioning signals and "skip in" this signal to the layers. +* **Residual and Skip Connections**: Information flows through both residual (layer-to-layer) and skip (to head) paths + +Computation graphs of the layer, layer array, and full model are below on this page. + +Layer Computation +----------------- + +A single Layer performs the core computation of a WaveNet block. +The computation proceeds through several stages: + +Step 1: Input Convolution +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The input first goes through a dilated 1D convolution: + +1. **Optional Pre-FiLM**: If `conv_pre_film` is active, the input is modulated by the condition signal before convolution +2. **Dilated Convolution**: The input is convolved with a dilated kernel +3. **Optional Post-FiLM**: If `conv_post_film` is active, the convolution output is modulated by the condition signal + +.. note:: + Having two FiLM layers bookending the convolution layer is mathematically equivalent + to a sort of "rank 1 adaptive LoRA" on the convolution weights. + +.. 
code-block:: cpp + :caption: Input convolution processing + + if (this->_conv_pre_film) { + this->_conv_pre_film->Process(input, condition, num_frames); + this->_conv.Process(this->_conv_pre_film->GetOutput(), num_frames); + } else { + this->_conv.Process(input, num_frames); + } + if (this->_conv_post_film) { + Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); + this->_conv_post_film->Process_(conv_output, condition, num_frames); + } + +Step 2: Input Mixin +~~~~~~~~~~~~~~~~~~~ + +The conditioning input is processed separately and added to the convolution output: + +1. **Optional Pre-FiLM**: If `input_mixin_pre_film` is active, the condition is modulated before the mixin convolution +2. **Input Mixin Convolution**: A 1x1 convolution processes the condition signal +3. **Optional Post-FiLM**: If `input_mixin_post_film` is active, the mixin output is modulated + +.. code-block:: cpp + :caption: Input mixin processing + + if (this->_input_mixin_pre_film) { + this->_input_mixin_pre_film->Process(condition, condition, num_frames); + this->_input_mixin.process_(this->_input_mixin_pre_film->GetOutput(), num_frames); + } else { + this->_input_mixin.process_(condition, num_frames); + } + if (this->_input_mixin_post_film) { + Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); + this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); + } + +Step 3: Sum and Pre-Activation FiLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The convolution output and input mixin output are summed, and optionally modulated: + +.. 
code-block:: cpp + :caption: Sum and pre-activation FiLM + + this->_z.leftCols(num_frames).noalias() = + _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); + if (this->_activation_pre_film) { + this->_activation_pre_film->Process_(this->_z, condition, num_frames); + } + +Step 4: Activation +~~~~~~~~~~~~~~~~~~ + +The activation stage depends on the gating mode: + +**No Gating (GatingMode::NONE)** + Simple activation function applied to the summed output. + +**Gated (GatingMode::GATED)** + The output channels are doubled (2 * bottleneck). The top half goes through the primary activation, + the bottom half through a secondary activation (typically sigmoid). The results are multiplied element-wise. + +**Blended (GatingMode::BLENDED)** + Similar to gated, but instead of multiplication, a weighted blend is performed: + output = alpha * activated_input + (1 - alpha) * pre_activation_input + where alpha comes from the secondary activation. + +After activation, an optional post-activation FiLM may be applied. + +.. note:: + Even though the secondary activation is classically chosen to be a sigmoid, it doesn't + need to be. It doesn't even need to output a value between 0 and 1. + The operation is still well-defined. + +.. code-block:: cpp + :caption: Activation processing (gated mode example) + + if (this->_gating_mode == GatingMode::GATED) { + auto input_block = this->_z.leftCols(num_frames); + auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); + this->_gating_activation->apply(input_block, output_block); + if (this->_activation_post_film) { + this->_activation_post_film->Process(this->_z.topRows(bottleneck), condition, num_frames); + this->_z.topRows(bottleneck).leftCols(num_frames).noalias() = + this->_activation_post_film->GetOutput().leftCols(num_frames); + } + }
code-block:: cpp + :caption: 1x1 convolution + + _1x1.process_(this->_z.topRows(bottleneck), num_frames); + if (this->_1x1_post_film) { + Eigen::MatrixXf& _1x1_output = this->_1x1.GetOutput(); + this->_1x1_post_film->Process_(_1x1_output, condition, num_frames); + } + +Step 6: Head 1x1 (Optional) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a head1x1 convolution is configured, it processes the activated output for the skip connection: + +.. code-block:: cpp + :caption: Head 1x1 processing + + if (this->_head1x1) { + this->_head1x1->process_(this->_z.topRows(bottleneck).leftCols(num_frames), num_frames); + if (this->_head1x1_post_film) { + Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); + this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); + } + this->_output_head.leftCols(num_frames).noalias() = + this->_head1x1->GetOutput().leftCols(num_frames); + } + +.. note:: + If there is no head 1x1, then the output dimension is the same as the activation + output dimension (the "bottleneck" dimension). + If there is, then the head can project to an arbitrary dimension. + +Step 7: Residual and Skip Connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Finally, the outputs are computed: + +* **Residual Connection**: `output_next_layer = input + 1x1_output` +* **Skip Connection**: `output_head = activated_output` (or head1x1 output if present) + +.. 
code-block:: cpp + :caption: Residual and skip connections + + // Store output to next layer (residual connection) + this->_output_next_layer.leftCols(num_frames).noalias() = + input.leftCols(num_frames) + _1x1.GetOutput().leftCols(num_frames); + + // Store output to head (skip connection) + if (this->_head1x1) { + this->_output_head.leftCols(num_frames).noalias() = + this->_head1x1->GetOutput().leftCols(num_frames); + } else { + this->_output_head.leftCols(num_frames).noalias() = + this->_z.topRows(bottleneck).leftCols(num_frames); + } + +Data Flow Diagram +~~~~~~~~~~~~~~~~~ + +Data arrays are marked with their dimensions as (channels, frames). +Notes: + +* ``g=2`` if a gating or blending activation is used, and ``1`` otherwise. + +* The head output dimension ``dh`` is the bottleneck dimension ``b`` when no head 1x1 is + used; otherwise, it is determined by the head 1x1's number of output channels. + + +.. mermaid:: + :caption: Layer Computation Flow + + graph TD + Input["Input (dx,n)"] --> PreFiLM1{Pre-FiLM?} + PreFiLM1 -->|Yes| ConvPre[Conv Pre-FiLM] + PreFiLM1 -->|No| Conv["Dilated Conv (g*b,n)"] + ConvPre --> Conv + Conv --> PostFiLM1{Post-FiLM?} + PostFiLM1 -->|Yes| ConvPost[Conv Post-FiLM] + PostFiLM1 -->|No| Sum["Sum (g*b,n)"] + ConvPost --> Sum + + Condition["Condition (dc,n)"] --> PreFiLM2{Pre-FiLM?} + PreFiLM2 -->|Yes| MixinPre[Input Mixin Pre-FiLM] + PreFiLM2 -->|No| Mixin["Input Mixin (g*b,n)"] + MixinPre --> Mixin + Mixin --> PostFiLM2{Post-FiLM?} + PostFiLM2 -->|Yes| MixinPost[Input Mixin Post-FiLM] + PostFiLM2 -->|No| Sum + MixinPost --> Sum + + Sum --> PreActFiLM{Pre-Act FiLM?} + PreActFiLM -->|Yes| PreAct[Pre-Activation FiLM] + PreActFiLM -->|No| Act["Activation (b,n)"] + PreAct --> Act + + Act --> PostActFiLM{Post-Act FiLM?} + PostActFiLM -->|Yes| PostActFilm[Post-Activation FiLM] + PostActFiLM -->|No| PostAct["Post-Activation Output (b,n)"] + PostActFilm --> PostAct + + PostAct --> Conv1x1["1x1 Conv (dx,n)"] + Conv1x1 --> Post1x1FiLM{Post-1x1 
FiLM?} + Post1x1FiLM -->|Yes| Post1x1[Post-1x1 FiLM] + Post1x1FiLM -->|No| Residual["Residual (dx,n)"] + Post1x1 --> Residual + + Input --> ResidualSum["Residual Sum (dx,n)"] + Residual --> ResidualSum + ResidualSum --> LayerOutput["Layer Output (dx,n)"] + + PostAct --> Head1x1{Head 1x1?} + Head1x1 -->|Yes| HeadConv["Head 1x1 Conv (dh,n)"] + Head1x1 -->|No| HeadOutput["Head Output (dh,n)"] + HeadConv --> HeadFiLM{Head FiLM?} + HeadFiLM -->|Yes| HeadPost[Head Post-FiLM] + HeadFiLM -->|No| HeadOutput + HeadPost --> HeadOutput + +LayerArray Computation +---------------------- + +A LayerArray chains multiple Layer objects together, processing them sequentially while +accumulating their "head outputs" via skip-out connections. + +Step 1: Rechanneling +~~~~~~~~~~~~~~~~~~~~~ + +The input is first projected (rechanneled) to match the layer channel count: + +.. code-block:: cpp + :caption: Input rechanneling + + this->_rechannel.process_(layer_inputs, num_frames); + Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); + +Step 2: Layer Processing +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each layer processes the output of the previous layer: + +1. **First Layer**: Processes the rechanneled input +2. **Subsequent Layers**: Process the residual output from the previous layer +3. **Head Accumulation**: Each "head output" is accumulated into the head buffer + +.. 
code-block:: cpp + :caption: Layer processing loop + + for (size_t i = 0; i < this->_layers.size(); i++) { + if (i == 0) { + // First layer consumes the rechannel output buffer + this->_layers[i].Process(rechannel_output, condition, num_frames); + } else { + // Subsequent layers consume the previous layer's output + Eigen::MatrixXf& prev_output = this->_layers[i - 1].GetOutputNextLayer(); + this->_layers[i].Process(prev_output, condition, num_frames); + } + + // Accumulate head output from this layer + this->_head_inputs.leftCols(num_frames).noalias() += + this->_layers[i].GetOutputHead().leftCols(num_frames); + } + +Step 3: Head Rechanneling +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The accumulated head outputs are projected (rechanneled) to the final output dimension +for the layer array: + +.. code-block:: cpp + :caption: Head rechanneling + + _head_rechannel.process_(this->_head_inputs, num_frames); + +LayerArray Structure +~~~~~~~~~~~~~~~~~~~~ + +.. mermaid:: + :caption: LayerArray Structure + + graph TD + Input[Layer Input] --> Rechannel[Rechannel] + Rechannel --> Layer1[Layer 1] + Layer1 --> Layer2[Layer 2] + Layer2 --> Layer3[Layer 3] + Layer3 --> LayerN[Layer N] + Layer1 -->|Skip| HeadAccum[Head Accumulator] + Layer2 -->|Skip| HeadAccum + Layer3 -->|Skip| HeadAccum + LayerN -->|Skip| HeadAccum + HeadAccum --> HeadRechannel[Head Rechannel] + HeadRechannel --> HeadOut[Head Output] + LayerN --> LayerOut[Layer Output] + +WaveNet Processing +------------------ + +The complete WaveNet processing pipeline: + +Step 1: Condition Processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a condition DSP is provided, the input is processed through it to generate the +conditioning signal: + +.. 
code-block:: cpp + :caption: Condition processing + + void WaveNet::_process_condition(const int num_frames) { + if (this->_condition_dsp != nullptr) { + // Process input through condition DSP + this->_condition_dsp->process(/* input */, /* output */, num_frames); + // Copy output to condition buffer + } else { + // Use input directly as condition + this->_condition_output = this->_condition_input; + } + } + +The condition module can be a WaveNet, but it can also be something else--a convolution, +an RNN, etc. + +Step 2: LayerArray Processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each LayerArray processes the output of the previous array: + +1. **First LayerArray**: Processes the input with zeroed head inputs +2. **Subsequent LayerArrays**: Process the previous array's output and accumulate head inputs + +.. code-block:: cpp + :caption: LayerArray processing + + // First layer array + this->_layer_arrays[0].Process(input, condition, num_frames); + + // Subsequent layer arrays + for (size_t i = 1; i < this->_layer_arrays.size(); i++) { + Eigen::MatrixXf& prev_output = this->_layer_arrays[i-1].GetLayerOutputs(); + Eigen::MatrixXf& prev_head = this->_layer_arrays[i-1].GetHeadOutputs(); + this->_layer_arrays[i].Process(prev_output, condition, prev_head, num_frames); + } + +Step 3: Head Scaling and Output +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The final head output from the last LayerArray is scaled and written to output: + +.. code-block:: cpp + :caption: Head scaling and output + + Eigen::MatrixXf& final_head = this->_layer_arrays.back().GetHeadOutputs(); + // Apply head scale and write to output buffers + // (implementation details in wavenet.cpp) + +Complete WaveNet Flow +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
mermaid:: + :caption: Complete WaveNet Processing Flow + + graph TD + AudioIn[Audio Input] --> ConditionProc{Condition DSP?} + ConditionProc -->|Yes| CondDSP[Condition DSP] + ConditionProc -->|No| Condition[Condition Signal] + CondDSP --> Condition + AudioIn --> LayerArray1[LayerArray 1] + Condition --> LayerArray1 + LayerArray1 -->|LayerN Output| LayerArray2[LayerArray 2] + LayerArray1 -->|Head Output| LayerArray2 + Condition --> LayerArray2 + LayerArray2 -->|LayerN Output| LayerArrayN[LayerArray N] + LayerArray2 -->|Head Output| LayerArrayN + Condition --> LayerArrayN + LayerArrayN -->|LayerN Output| Unused("(Unused)") + LayerArrayN -->|Head Output| HeadAccum[Head Accumulator] + HeadAccum --> HeadScale[Head Scale] + HeadScale --> AudioOut[Audio Output] + +See Also +-------- + +* :doc:`api/wavenet` - Complete API reference for WaveNet classes +* :doc:`api/dsp` - Base DSP interface documentation +* :doc:`api/conv1d` - Convolution implementation details diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp index d8a1690..39c14b0 100644 --- a/tools/benchmodel.cpp +++ b/tools/benchmodel.cpp @@ -1,7 +1,9 @@ #include #include +#include #include "NAM/dsp.h" +#include "NAM/get_dsp.h" using std::chrono::duration; using std::chrono::duration_cast; @@ -27,7 +29,7 @@ int main(int argc, char* argv[]) std::unique_ptr model; model.reset(); - model = nam::get_dsp(modelPath); + model = nam::get_dsp(std::filesystem::path(modelPath)); if (model == nullptr) { diff --git a/tools/loadmodel.cpp b/tools/loadmodel.cpp index 8a1b889..265139a 100644 --- a/tools/loadmodel.cpp +++ b/tools/loadmodel.cpp @@ -1,5 +1,7 @@ #include +#include #include "NAM/dsp.h" +#include "NAM/get_dsp.h" int main(int argc, char* argv[]) { @@ -9,7 +11,7 @@ int main(int argc, char* argv[]) fprintf(stderr, "Loading model [%s]\n", modelPath); - auto model = nam::get_dsp(modelPath); + auto model = nam::get_dsp(std::filesystem::path(modelPath)); if (model != nullptr) {