Merged

43 commits
e7ef9be
Improve Conv1D class to manage its own ring buffer
sdatkinson Jan 13, 2026
55c06ca
Fix RingBuffer to handle max lookback and add comprehensive tests
sdatkinson Jan 13, 2026
3eaccc7
Rename 'receptive field' to 'max lookback' in RingBuffer
sdatkinson Jan 13, 2026
6589535
Remove trailing whitespace (formatting cleanup)
sdatkinson Jan 13, 2026
e222741
Move RingBuffer and Conv1D to separate source and header files
sdatkinson Jan 13, 2026
3ea26af
Replace GetCapacity() with GetMaxBufferSize() in RingBuffer
sdatkinson Jan 13, 2026
d3f9180
Add assertions in RingBuffer::Rewind() to prevent aliasing
sdatkinson Jan 13, 2026
edbe0a5
Rename Conv1D::get_output to GetOutput
sdatkinson Jan 13, 2026
83b2e17
Remove _total_written tracking and GetReadPos() from RingBuffer
sdatkinson Jan 13, 2026
867462b
Refactor LayerArray::Process() to remove head_outputs parameter and a…
sdatkinson Jan 13, 2026
5b764e7
Refactor WaveNet LayerArray and remove _DilatedConv wrapper
sdatkinson Jan 13, 2026
ad166a2
Add comprehensive WaveNet tests organized by component
sdatkinson Jan 13, 2026
765620f
Refactor Conv1D and RingBuffer API, improve tests
sdatkinson Jan 13, 2026
b8cfca2
Complete ConvNet refactoring to use Conv1D ring buffer API
sdatkinson Jan 13, 2026
4f24fd2
Fix Eigen Block resize error in wavenet Layer Process
sdatkinson Jan 13, 2026
196e72a
Fix ConvNet test weight counts
sdatkinson Jan 13, 2026
e2be7f8
Add ConvNetBlock buffer management methods
sdatkinson Jan 13, 2026
6c4d009
Remove unneeded includes
sdatkinson Jan 13, 2026
a54cb2e
Remove unused _head_arrays from WaveNet class
sdatkinson Jan 13, 2026
91ce7a1
Remove unused code from WaveNet class
sdatkinson Jan 13, 2026
045adea
Optimize matrix operations and fix build warnings
sdatkinson Jan 13, 2026
7f45a7d
Add real-time safety test for WaveNet process() method
sdatkinson Jan 14, 2026
a1902f3
Add real-time safety tests for Conv1D, Layer, and LayerArray
sdatkinson Jan 14, 2026
078e043
Refine real-time tests to use full buffers and document RingBuffer usage
sdatkinson Jan 14, 2026
91f6764
Pass full buffers between WaveNet layers for real-time safety
sdatkinson Jan 14, 2026
03a4f73
Remove num_frames parameter from output getters, return full buffers
sdatkinson Jan 14, 2026
1d6f0ab
Untrack some files that were accidentally added
sdatkinson Jan 14, 2026
0892771
Remove accidentally-tracked files
sdatkinson Jan 14, 2026
85ce55d
Merge branch 'main' into 145-conv
sdatkinson Jan 14, 2026
5c98bb3
WIP: Implement grouped convolutions
sdatkinson Jan 14, 2026
e90c767
Fix test files to include groups parameter
sdatkinson Jan 14, 2026
13977a7
Refactor real-time safety tests with allocation tracking abstraction
sdatkinson Jan 14, 2026
f0996e4
Format function parameter lists in allocation test helpers
sdatkinson Jan 14, 2026
0187f28
Merge main into 172-grouped-conv
sdatkinson Jan 15, 2026
9531399
Update test_conv1d.cpp to use groups parameter in set_size_and_weight…
sdatkinson Jan 15, 2026
520fe10
Remove unnecessary console output from input buffer verification and …
sdatkinson Jan 15, 2026
1f3df5c
Add comprehensive tests for grouped convolution in test_conv1d.cpp
sdatkinson Jan 15, 2026
93dc7e5
Add tests for grouped convolution in real-time safety checks
sdatkinson Jan 15, 2026
15a64b0
Format
sdatkinson Jan 15, 2026
ddce333
Update WaveNet layer implementation to include groups_input parameter
sdatkinson Jan 15, 2026
76be336
Add test for Layer::Process() with grouped convolution
sdatkinson Jan 15, 2026
cf140e2
Update build workflow and enhance test configurations
sdatkinson Jan 15, 2026
28db09c
Refactor apply method in activations.h for batch processing
sdatkinson Jan 15, 2026
5 changes: 1 addition & 4 deletions .github/workflows/build.yml
@@ -3,9 +3,6 @@ name: Build
on:
[workflow_dispatch, pull_request]

env:
BUILD_TYPE: Release

jobs:
build-ubuntu:
name: Build Ubuntu
@@ -35,7 +32,7 @@ jobs:
env:
CXX: clang++
run: |
cmake ..
cmake .. -DCMAKE_BUILD_TYPE=Debug
cmake --build . -j4

- name: Run tests
8 changes: 4 additions & 4 deletions NAM/activations.h
@@ -323,12 +323,12 @@ class FastLUTActivation : public Activation
return table_[i] + (table_[i + 1] - table_[i]) * frac;
}

// Vector application (Batch processing)
void apply(std::vector<float>& data) const
// Override base class virtual method to apply LUT lookup to array of floats
void apply(float* data, long size) override
{
for (float& val : data)
for (long i = 0; i < size; i++)
{
val = lookup(val);
data[i] = lookup(data[i]);
}
}

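For orientation, here is a minimal usage sketch of the pointer-based override; the activation key "Tanh", the buffer size, and the includes are placeholders/assumptions, not taken from this diff:

// Assumes #include <vector> and the NAM activations header.
// get_activation() returns a pointer to a shared Activation instance.
nam::activations::Activation* act = nam::activations::Activation::get_activation("Tanh");
std::vector<float> block(64, 0.25f);
act->apply(block.data(), static_cast<long>(block.size())); // in-place batch application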
156 changes: 128 additions & 28 deletions NAM/conv1d.cpp
@@ -1,4 +1,5 @@
#include "conv1d.h"
#include <stdexcept>

namespace nam
{
@@ -10,19 +11,48 @@ void Conv1D::set_weights_(std::vector<float>::iterator& weights)
{
const long out_channels = this->_weight[0].rows();
const long in_channels = this->_weight[0].cols();
const int numGroups = this->_num_groups;
const long out_per_group = out_channels / numGroups;
const long in_per_group = in_channels / numGroups;

// For grouped convolutions, weights are organized per group
// Weight layout: for each kernel position k, weights are [group0, group1, ..., groupN-1]
// Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups)
// Crazy ordering because that's how it gets flattened.
for (auto i = 0; i < out_channels; i++)
for (auto j = 0; j < in_channels; j++)
for (size_t k = 0; k < this->_weight.size(); k++)
this->_weight[k](i, j) = *(weights++);
for (int g = 0; g < numGroups; g++)
{
for (auto i = 0; i < out_per_group; i++)
{
for (auto j = 0; j < in_per_group; j++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
this->_weight[k](g * out_per_group + i, g * in_per_group + j) = *(weights++);
}
}
}
}
}
for (long i = 0; i < this->_bias.size(); i++)
this->_bias(i) = *(weights++);
}

void Conv1D::set_size_(const int in_channels, const int out_channels, const int kernel_size, const bool do_bias,
const int _dilation)
const int _dilation, const int groups)
{
// Validate that channels divide evenly by groups
if (in_channels % groups != 0)
{
throw std::runtime_error("in_channels (" + std::to_string(in_channels) + ") must be divisible by numGroups ("
+ std::to_string(groups) + ")");
}
if (out_channels % groups != 0)
{
throw std::runtime_error("out_channels (" + std::to_string(out_channels) + ") must be divisible by numGroups ("
+ std::to_string(groups) + ")");
}

this->_num_groups = groups;
this->_weight.resize(kernel_size);
for (size_t i = 0; i < this->_weight.size(); i++)
this->_weight[i].resize(out_channels,
@@ -35,9 +65,10 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
}

void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
const int _dilation, const bool do_bias, std::vector<float>::iterator& weights)
const int _dilation, const bool do_bias, const int groups,
std::vector<float>::iterator& weights)
{
this->set_size_(in_channels, out_channels, kernel_size, do_bias, _dilation);
this->set_size_(in_channels, out_channels, kernel_size, do_bias, _dilation, groups);
this->set_weights_(weights);
}

@@ -73,25 +104,54 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
// Zero output before processing
_output.leftCols(num_frames).setZero();

const int numGroups = this->_num_groups;
const long in_channels = get_in_channels();
const long out_channels = get_out_channels();
const long in_per_group = in_channels / numGroups;
const long out_per_group = out_channels / numGroups;

// Process from ring buffer with dilation lookback
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
// For kernel tap k with offset, we need to read from _write_pos + offset
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions
// The original process_() reads: input.middleCols(i_start + offset, ncols)
// where i_start is the current position and offset is negative for lookback
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
// Offset is negative (looking back)
// Read from position: _write_pos + offset
// Since offset is negative, we compute lookback = -offset to read from _write_pos - lookback
const long lookback = -offset;

// Read num_frames starting from write_pos + offset (which is write_pos - lookback)
auto input_block = _input_buffer.Read(num_frames, lookback);

// Perform convolution: output += weight[k] * input_block
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
if (numGroups == 1)
{
// Standard convolution (no grouping)
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}
}
else
{
// Grouped convolution: process each group separately
for (int g = 0; g < numGroups; g++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);

// Extract input slice for this group
auto input_group = input_block.middleRows(g * in_per_group, in_per_group);

// Extract weight slice for this group
auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);

// Extract output slice for this group
auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);

// Perform grouped convolution: output_group += weight_group * input_group
output_group.noalias() += weight_group * input_group;
}
}
}

// Add bias if present
Expand All @@ -107,14 +167,49 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
const long j_start) const
{
// This is the clever part ;)
for (size_t k = 0; k < this->_weight.size(); k++)
const int numGroups = this->_num_groups;
const long in_channels = get_in_channels();
const long out_channels = get_out_channels();
const long in_per_group = in_channels / numGroups;
const long out_per_group = out_channels / numGroups;

if (numGroups == 1)
{
// Standard convolution (no grouping)
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
}
else
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
// Grouped convolution: process each group separately
for (int g = 0; g < numGroups; g++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());

// Extract input slice for this group
auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group);

// Extract weight slice for this group
auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);

// Extract output slice for this group
auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group);

// Perform grouped convolution
if (k == 0)
output_group.noalias() = weight_group * input_group;
else
output_group.noalias() += weight_group * input_group;
}
}
}
if (this->_bias.size() > 0)
{
@@ -125,8 +220,13 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con
long Conv1D::get_num_weights() const
{
long num_weights = this->_bias.size();
for (size_t i = 0; i < this->_weight.size(); i++)
num_weights += this->_weight[i].size();
if (this->_weight.size() > 0)
{
const long out_channels = this->_weight[0].rows();
const long in_channels = this->_weight[0].cols();
// For grouped convolutions, the number of weights is reduced by numGroups
num_weights += (out_channels * in_channels * this->_weight.size()) / this->_num_groups;
}
return num_weights;
}
} // namespace nam
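To make the grouped weight layout and accounting concrete, here is a small sketch with illustrative dimensions (in = out = 4, kernel = 2, groups = 2, no bias); it assumes the usual includes (<cassert>, <numeric>, <vector>, "conv1d.h") and that set_size_() leaves the bias empty when do_bias is false:

nam::Conv1D conv;
std::vector<float> flat(16); // 4*4*2 / 2 groups = 16 weights, no bias
std::iota(flat.begin(), flat.end(), 0.0f);
auto it = flat.begin();
conv.set_size_and_weights_(/*in*/ 4, /*out*/ 4, /*kernel*/ 2, /*dilation*/ 1,
                           /*bias*/ false, /*groups*/ 2, it);
// The flat vector is consumed group -> output row -> input col -> kernel tap:
//   _weight[0](0,0)=0, _weight[1](0,0)=1, _weight[0](0,1)=2, _weight[1](0,1)=3, ...
//   group 1 starts at the (2,2) block: _weight[0](2,2)=8, _weight[1](2,2)=9, ...
// get_num_weights() reports the grouped count: 4*4*2/2 = 16.
assert(conv.get_num_weights() == 16);

The ring-buffer lookback in Process() is unchanged by grouping: with kernel_size = 2 and dilation = 1 (the ConvNet case), tap k = 0 reads with a lookback of 1 frame and tap k = 1 reads the current frames, per offset = dilation * (k + 1 - kernel_size).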
16 changes: 11 additions & 5 deletions NAM/conv1d.h
@@ -9,16 +9,21 @@ namespace nam
class Conv1D
{
public:
Conv1D() { this->_dilation = 1; };
Conv1D(const int in_channels, const int out_channels, const int kernel_size, const int bias, const int dilation)
Conv1D()
{
set_size_(in_channels, out_channels, kernel_size, bias, dilation);
this->_dilation = 1;
this->_num_groups = 1;
};
Conv1D(const int in_channels, const int out_channels, const int kernel_size, const int bias, const int dilation,
const int groups = 1)
{
set_size_(in_channels, out_channels, kernel_size, bias, dilation, groups);
};
void set_weights_(std::vector<float>::iterator& weights);
void set_size_(const int in_channels, const int out_channels, const int kernel_size, const bool do_bias,
const int _dilation);
const int _dilation, const int groups = 1);
void set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size, const int _dilation,
const bool do_bias, std::vector<float>::iterator& weights);
const bool do_bias, const int groups, std::vector<float>::iterator& weights);
// Reset the ring buffer and pre-allocate output buffer
// :param sampleRate: Unused, for interface consistency
// :param maxBufferSize: Maximum buffer size for output buffer and to size ring buffer
@@ -50,6 +55,7 @@ class Conv1D
std::vector<Eigen::MatrixXf> _weight;
Eigen::VectorXf _bias;
int _dilation;
int _num_groups;

private:
RingBuffer _input_buffer; // Ring buffer for input (channels x buffer_size)
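A brief construction sketch against the widened constructor (channel counts, dilation, and group count are placeholders); since groups defaults to 1, existing five-argument call sites compile unchanged:

nam::Conv1D grouped(/*in_channels*/ 16, /*out_channels*/ 16, /*kernel_size*/ 3,
                    /*bias*/ 1, /*dilation*/ 2, /*groups*/ 4);
nam::Conv1D dense(16, 16, 3, 1, 2); // pre-existing form; groups defaults to 1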
11 changes: 6 additions & 5 deletions NAM/convnet.cpp
@@ -48,12 +48,12 @@ void nam::convnet::BatchNorm::process_(Eigen::MatrixXf& x, const long i_start, c
}

void nam::convnet::ConvNetBlock::set_weights_(const int in_channels, const int out_channels, const int _dilation,
const bool batchnorm, const std::string activation,
const bool batchnorm, const std::string activation, const int groups,
std::vector<float>::iterator& weights)
{
this->_batchnorm = batchnorm;
// HACK 2 kernel
this->conv.set_size_and_weights_(in_channels, out_channels, 2, _dilation, !batchnorm, weights);
this->conv.set_size_and_weights_(in_channels, out_channels, 2, _dilation, !batchnorm, groups, weights);
if (this->_batchnorm)
this->batchnorm = BatchNorm(out_channels, weights);
this->activation = activations::Activation::get_activation(activation);
@@ -148,14 +148,14 @@ void nam::convnet::_Head::process_(const Eigen::MatrixXf& input, Eigen::VectorXf

nam::convnet::ConvNet::ConvNet(const int channels, const std::vector<int>& dilations, const bool batchnorm,
const std::string activation, std::vector<float>& weights,
const double expected_sample_rate)
const double expected_sample_rate, const int groups)
: Buffer(*std::max_element(dilations.begin(), dilations.end()), expected_sample_rate)
{
this->_verify_weights(channels, dilations, batchnorm, weights.size());
this->_blocks.resize(dilations.size());
std::vector<float>::iterator it = weights.begin();
for (size_t i = 0; i < dilations.size(); i++)
this->_blocks[i].set_weights_(i == 0 ? 1 : channels, channels, dilations[i], batchnorm, activation, it);
this->_blocks[i].set_weights_(i == 0 ? 1 : channels, channels, dilations[i], batchnorm, activation, groups, it);
// Only need _block_vals for the head (one entry)
// Conv1D layers manage their own buffers now
this->_block_vals.resize(1);
@@ -280,8 +280,9 @@ std::unique_ptr<nam::DSP> nam::convnet::Factory(const nlohmann::json& config, st
const std::vector<int> dilations = config["dilations"];
const bool batchnorm = config["batchnorm"];
const std::string activation = config["activation"];
const int groups = config.value("groups", 1); // defaults to 1
return std::make_unique<nam::convnet::ConvNet>(
channels, dilations, batchnorm, activation, weights, expectedSampleRate);
channels, dilations, batchnorm, activation, weights, expectedSampleRate, groups);
}

namespace
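For reference, a sketch of how the new field is read (the "groups" key and its default come from the Factory code above; the other values are placeholders):

// Hypothetical ConvNet config fragment; only the "groups" entry is new here.
nlohmann::json config = {{"channels", 16},
                         {"dilations", {1, 2, 4, 8}},
                         {"batchnorm", true},
                         {"activation", "ReLU"},
                         {"groups", 2}};
const int groups = config.value("groups", 1); // 2 here; older files without the key load as 1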
4 changes: 2 additions & 2 deletions NAM/convnet.h
@@ -44,7 +44,7 @@ class ConvNetBlock
public:
ConvNetBlock() {};
void set_weights_(const int in_channels, const int out_channels, const int _dilation, const bool batchnorm,
const std::string activation, std::vector<float>::iterator& weights);
const std::string activation, const int groups, std::vector<float>::iterator& weights);
void SetMaxBufferSize(const int maxBufferSize);
// Process input matrix directly (new API, similar to WaveNet)
void Process(const Eigen::MatrixXf& input, const int num_frames);
@@ -78,7 +78,7 @@ class ConvNet : public Buffer
{
public:
ConvNet(const int channels, const std::vector<int>& dilations, const bool batchnorm, const std::string activation,
std::vector<float>& weights, const double expected_sample_rate = -1.0);
std::vector<float>& weights, const double expected_sample_rate = -1.0, const int groups = 1);
~ConvNet() = default;

void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
12 changes: 8 additions & 4 deletions NAM/wavenet.cpp
@@ -73,12 +73,14 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma

nam::wavenet::_LayerArray::_LayerArray(const int input_size, const int condition_size, const int head_size,
const int channels, const int kernel_size, const std::vector<int>& dilations,
const std::string activation, const bool gated, const bool head_bias)
const std::string activation, const bool gated, const bool head_bias,
const int groups_input)
: _rechannel(input_size, channels, false)
, _head_rechannel(channels, head_size, head_bias)
{
for (size_t i = 0; i < dilations.size(); i++)
this->_layers.push_back(_Layer(condition_size, channels, kernel_size, dilations[i], activation, gated));
this->_layers.push_back(
_Layer(condition_size, channels, kernel_size, dilations[i], activation, gated, groups_input));
}

void nam::wavenet::_LayerArray::SetMaxBufferSize(const int maxBufferSize)
@@ -198,7 +200,8 @@ nam::wavenet::WaveNet::WaveNet(const std::vector<nam::wavenet::LayerArrayParams>
this->_layer_arrays.push_back(nam::wavenet::_LayerArray(
layer_array_params[i].input_size, layer_array_params[i].condition_size, layer_array_params[i].head_size,
layer_array_params[i].channels, layer_array_params[i].kernel_size, layer_array_params[i].dilations,
layer_array_params[i].activation, layer_array_params[i].gated, layer_array_params[i].head_bias));
layer_array_params[i].activation, layer_array_params[i].gated, layer_array_params[i].head_bias,
layer_array_params[i].groups_input));
if (i > 0)
if (layer_array_params[i].channels != layer_array_params[i - 1].head_size)
{
@@ -295,10 +298,11 @@ std::unique_ptr<nam::DSP> nam::wavenet::Factory(const nlohmann::json& config, st
for (size_t i = 0; i < config["layers"].size(); i++)
{
nlohmann::json layer_config = config["layers"][i];
const int groups = layer_config.value("groups", 1); // defaults to 1
layer_array_params.push_back(nam::wavenet::LayerArrayParams(
layer_config["input_size"], layer_config["condition_size"], layer_config["head_size"], layer_config["channels"],
layer_config["kernel_size"], layer_config["dilations"], layer_config["activation"], layer_config["gated"],
layer_config["head_bias"]));
layer_config["head_bias"], groups));
}
const bool with_head = !config["head"].is_null();
const float head_scale = config["head_scale"];
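On the WaveNet side the lookup is per layer array, so a sketch like the following (placeholder values) yields different group counts for different arrays, with missing keys falling back to 1:

nlohmann::json layers = nlohmann::json::array({{{"channels", 16}, {"groups", 4}},
                                               {{"channels", 8}}});
const int groups0 = layers[0].value("groups", 1); // 4
const int groups1 = layers[1].value("groups", 1); // 1 (key absent)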