diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 2a1aba7..285ea69 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -13,12 +13,14 @@ void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize)
 {
   _conv.SetMaxBufferSize(maxBufferSize);
   _input_mixin.SetMaxBufferSize(maxBufferSize);
-  _z.resize(this->_conv.get_out_channels(), maxBufferSize);
+  const long z_channels = this->_conv.get_out_channels(); // This is 2*bottleneck when gated, bottleneck when not
+  _z.resize(z_channels, maxBufferSize);
   _1x1.SetMaxBufferSize(maxBufferSize);
   // Pre-allocate output buffers
   const long channels = this->get_channels();
   this->_output_next_layer.resize(channels, maxBufferSize);
-  this->_output_head.resize(channels, maxBufferSize);
+  // _output_head stores the activated portion: bottleneck rows (the actual bottleneck value, not doubled)
+  this->_output_head.resize(this->_bottleneck, maxBufferSize);
 }

 void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)
@@ -30,7 +32,7 @@ void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)
 void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition,
                                    const int num_frames)
 {
-  const long channels = this->get_channels();
+  const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels

   // Step 1: input convolutions
   this->_conv.Process(input, num_frames);
@@ -50,19 +52,20 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     // do this column-wise:
     for (int i = 0; i < num_frames; i++)
     {
-      this->_activation->apply(this->_z.block(0, i, channels, 1));
+      this->_activation->apply(this->_z.block(0, i, bottleneck, 1));
       // TODO Need to support other activation functions here instead of hardcoded sigmoid
-      activations::Activation::get_activation("Sigmoid")->apply(this->_z.block(channels, i, channels, 1));
+      activations::Activation::get_activation("Sigmoid")->apply(this->_z.block(bottleneck, i, bottleneck, 1));
     }
-    this->_z.block(0, 0, channels, num_frames).array() *= this->_z.block(channels, 0, channels, num_frames).array();
-    _1x1.process_(_z.topRows(channels), num_frames); // Might not be RT safe
+    this->_z.block(0, 0, bottleneck, num_frames).array() *=
+      this->_z.block(bottleneck, 0, bottleneck, num_frames).array();
+    _1x1.process_(_z.topRows(bottleneck), num_frames); // Might not be RT safe
   }

   // Store output to head (skip connection: activated conv output)
   if (!this->_gated)
     this->_output_head.leftCols(num_frames).noalias() = this->_z.leftCols(num_frames);
   else
-    this->_output_head.leftCols(num_frames).noalias() = this->_z.topRows(channels).leftCols(num_frames);
+    this->_output_head.leftCols(num_frames).noalias() = this->_z.topRows(bottleneck).leftCols(num_frames);
   // Store output to next layer (residual connection: input + _1x1 output)
   this->_output_next_layer.leftCols(num_frames).noalias() =
     input.leftCols(num_frames) + _1x1.GetOutput().leftCols(num_frames);
@@ -72,15 +75,17 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
 // LayerArray =================================================================
 nam::wavenet::_LayerArray::_LayerArray(const int input_size, const int condition_size, const int head_size,
-                                       const int channels, const int kernel_size, const std::vector<int>& dilations,
-                                       const std::string activation, const bool gated, const bool head_bias,
-                                       const int groups_input, const int groups_1x1)
+                                       const int channels, const int bottleneck, const int kernel_size,
+                                       const std::vector<int>& dilations, const std::string activation,
+                                       const bool gated, const bool head_bias, const int groups_input,
+                                       const int groups_1x1)
 : _rechannel(input_size, channels, false)
-, _head_rechannel(channels, head_size, head_bias)
+, _head_rechannel(bottleneck, head_size, head_bias)
+, _bottleneck(bottleneck)
 {
   for (size_t i = 0; i < dilations.size(); i++)
-    this->_layers.push_back(
-      _Layer(condition_size, channels, kernel_size, dilations[i], activation, gated, groups_input, groups_1x1));
+    this->_layers.push_back(_Layer(
+      condition_size, channels, bottleneck, kernel_size, dilations[i], activation, gated, groups_input, groups_1x1));
 }

 void nam::wavenet::_LayerArray::SetMaxBufferSize(const int maxBufferSize)
@@ -94,7 +99,7 @@ void nam::wavenet::_LayerArray::SetMaxBufferSize(const int maxBufferSize)
   // Pre-allocate output buffers
   const long channels = this->_get_channels();
   this->_layer_outputs.resize(channels, maxBufferSize);
-  this->_head_inputs.resize(channels, maxBufferSize);
+  this->_head_inputs.resize(this->_bottleneck, maxBufferSize);
 }

@@ -199,9 +204,9 @@ nam::wavenet::WaveNet::WaveNet(const std::vector<LayerArrayParams>&
 {
   this->_layer_arrays.push_back(nam::wavenet::_LayerArray(
     layer_array_params[i].input_size, layer_array_params[i].condition_size, layer_array_params[i].head_size,
-    layer_array_params[i].channels, layer_array_params[i].kernel_size, layer_array_params[i].dilations,
-    layer_array_params[i].activation, layer_array_params[i].gated, layer_array_params[i].head_bias,
-    layer_array_params[i].groups_input, layer_array_params[i].groups_1x1));
+    layer_array_params[i].channels, layer_array_params[i].bottleneck, layer_array_params[i].kernel_size,
+    layer_array_params[i].dilations, layer_array_params[i].activation, layer_array_params[i].gated,
+    layer_array_params[i].head_bias, layer_array_params[i].groups_input, layer_array_params[i].groups_1x1));
   if (i > 0)
     if (layer_array_params[i].channels != layer_array_params[i - 1].head_size)
     {
@@ -300,8 +305,10 @@ std::unique_ptr<DSP> nam::wavenet::Factory(const nlohmann::json& config, st
     nlohmann::json layer_config = config["layers"][i];
     const int groups = layer_config.value("groups", 1); // defaults to 1
     const int groups_1x1 = layer_config.value("groups_1x1", 1); // defaults to 1
+    const int channels = layer_config["channels"];
+    const int bottleneck = layer_config.value("bottleneck", channels); // defaults to channels if not present
     layer_array_params.push_back(nam::wavenet::LayerArrayParams(
-      layer_config["input_size"], layer_config["condition_size"], layer_config["head_size"], layer_config["channels"],
+      layer_config["input_size"], layer_config["condition_size"], layer_config["head_size"], channels, bottleneck,
       layer_config["kernel_size"], layer_config["dilations"], layer_config["activation"], layer_config["gated"],
       layer_config["head_bias"], groups, groups_1x1));
   }
diff --git a/NAM/wavenet.h b/NAM/wavenet.h
index 71d2eff..832673b 100644
--- a/NAM/wavenet.h
+++ b/NAM/wavenet.h
@@ -16,13 +16,14 @@ namespace wavenet
 class _Layer
 {
 public:
-  _Layer(const int condition_size, const int channels, const int kernel_size, const int dilation,
+  _Layer(const int condition_size, const int channels, const int bottleneck, const int kernel_size, const int dilation,
          const std::string activation, const bool gated, const int groups_input, const int groups_1x1)
-  : _conv(channels, gated ? 2 * channels : channels, kernel_size, true, dilation, groups_input)
-  , _input_mixin(condition_size, gated ? 2 * channels : channels, false)
-  , _1x1(channels, channels, true, groups_1x1)
+  : _conv(channels, gated ? 2 * bottleneck : bottleneck, kernel_size, true, dilation, groups_input)
+  , _input_mixin(condition_size, gated ? 2 * bottleneck : bottleneck, false)
+  , _1x1(bottleneck, channels, true, groups_1x1)
   , _activation(activations::Activation::get_activation(activation)) // needs to support activations with parameters
-  , _gated(gated) {};
+  , _gated(gated)
+  , _bottleneck(bottleneck) {};
   // Resize all arrays to be able to process `maxBufferSize` frames.
   void SetMaxBufferSize(const int maxBufferSize);
   // Set the parameters of this module
@@ -71,18 +72,21 @@ class _Layer
   activations::Activation* _activation;
   const bool _gated;
+  const int _bottleneck; // Internal channel count (not doubled when gated)
 };

 class LayerArrayParams
 {
 public:
   LayerArrayParams(const int input_size_, const int condition_size_, const int head_size_, const int channels_,
-                   const int kernel_size_, const std::vector<int>&& dilations_, const std::string activation_,
-                   const bool gated_, const bool head_bias_, const int groups_input, const int groups_1x1_)
+                   const int bottleneck_, const int kernel_size_, const std::vector<int>&& dilations_,
+                   const std::string activation_, const bool gated_, const bool head_bias_, const int groups_input,
+                   const int groups_1x1_)
   : input_size(input_size_)
   , condition_size(condition_size_)
   , head_size(head_size_)
   , channels(channels_)
+  , bottleneck(bottleneck_)
   , kernel_size(kernel_size_)
   , dilations(std::move(dilations_))
   , activation(activation_)
@@ -97,6 +101,7 @@ class LayerArrayParams
   const int condition_size;
   const int head_size;
   const int channels;
+  const int bottleneck;
   const int kernel_size;
   std::vector<int> dilations;
   const std::string activation;
@@ -111,8 +116,9 @@ class _LayerArray
 {
 public:
   _LayerArray(const int input_size, const int condition_size, const int head_size, const int channels,
-              const int kernel_size, const std::vector<int>& dilations, const std::string activation, const bool gated,
-              const bool head_bias, const int groups_input, const int groups_1x1);
+              const int bottleneck, const int kernel_size, const std::vector<int>& dilations,
+              const std::string activation, const bool gated, const bool head_bias, const int groups_input,
+              const int groups_1x1);

   void SetMaxBufferSize(const int maxBufferSize);

@@ -150,12 +156,15 @@ class _LayerArray
   std::vector<_Layer> _layers;
   // Output from last layer (for next layer array)
   Eigen::MatrixXf _layer_outputs;
-  // Accumulated head inputs from all layers
+  // Accumulated head inputs from all layers (bottleneck channels)
   Eigen::MatrixXf _head_inputs;
-  // Rechannel for the head
+  // Rechannel for the head (bottleneck -> head_size)
   Conv1x1 _head_rechannel;
+  // Bottleneck size (internal channel count)
+  const int _bottleneck;
+
   long _get_channels() const;
   // Common processing logic after head inputs are set
   void ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames);
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 01cf211..1fd5802 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -20,7 +20,7 @@ set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0")
 # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG
 # We use a compile option to undefine it, which works on GCC, Clang, and MSVC
 target_compile_options(run_tests PRIVATE
-  $<$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>,$<CONFIG:MinSizeRel>>:-U_NDEBUG>
+  $<$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>,$<CONFIG:MinSizeRel>>:-UNDEBUG>
 )

 source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES})
diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp
index aa28629..33c4d45 100644
--- a/tools/run_tests.cpp
+++ b/tools/run_tests.cpp
@@ -104,6 +104,8 @@ int main()
   test_wavenet::test_layer::test_non_gated_layer();
   test_wavenet::test_layer::test_layer_activations();
   test_wavenet::test_layer::test_layer_multichannel();
+  test_wavenet::test_layer::test_layer_bottleneck();
+  test_wavenet::test_layer::test_layer_bottleneck_gated();
   test_wavenet::test_layer_array::test_layer_array_basic();
   test_wavenet::test_layer_array::test_layer_array_receptive_field();
   test_wavenet::test_layer_array::test_layer_array_with_head_input();
@@ -118,6 +120,7 @@ int main()
   test_wavenet::test_conv1d_grouped_process_realtime_safe();
   test_wavenet::test_conv1d_grouped_dilated_process_realtime_safe();
   test_wavenet::test_layer_process_realtime_safe();
+  test_wavenet::test_layer_bottleneck_process_realtime_safe();
   test_wavenet::test_layer_grouped_process_realtime_safe();
   test_wavenet::test_layer_array_process_realtime_safe();
   test_wavenet::test_process_realtime_safe();
diff --git a/tools/test/test_wavenet/test_full.cpp b/tools/test/test_wavenet/test_full.cpp
index 3d20679..d75ae1c 100644
--- a/tools/test/test_wavenet/test_full.cpp
+++ b/tools/test/test_wavenet/test_full.cpp
@@ -19,6 +19,7 @@ void test_wavenet_model()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   std::vector<int> dilations{1};
   const std::string activation = "ReLU";
@@ -29,7 +30,7 @@ void test_wavenet_model()
   const int groups = 1;
   const int groups_1x1 = 1;

-  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, kernel_size,
+  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
                                         std::move(dilations), activation, gated, head_bias, groups, groups_1x1);
   std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
   layer_array_params.push_back(std::move(params));
@@ -85,15 +86,16 @@ void test_wavenet_multiple_arrays()
   std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
   // First array
   std::vector<int> dilations1{1};
+  const int bottleneck = channels;
   const int groups_1x1 = 1;
   layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
-                                                              kernel_size, std::move(dilations1), activation, gated,
-                                                              head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations1), activation,
+                                                              gated, head_bias, groups, groups_1x1));
   // Second array (head_size of first must match channels of second)
   std::vector<int> dilations2{1};
   layer_array_params.push_back(nam::wavenet::LayerArrayParams(head_size, condition_size, head_size, channels,
-                                                              kernel_size, std::move(dilations2), activation, gated,
-                                                              head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations2), activation,
+                                                              gated, head_bias, groups, groups_1x1));

   std::vector<float> weights;
   // Array 0: rechannel, layer, head_rechannel
@@ -127,6 +129,7 @@ void test_wavenet_zero_input()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   std::vector<int> dilations{1};
   const std::string activation = "ReLU";
@@ -137,7 +140,7 @@ void test_wavenet_zero_input()
   const int groups = 1;
   const int groups_1x1 = 1;

-  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, kernel_size,
+  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
                                         std::move(dilations), activation, gated, head_bias, groups, groups_1x1);
   std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
   layer_array_params.push_back(std::move(params));
@@ -168,6 +171,7 @@ void test_wavenet_different_buffer_sizes()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   std::vector<int> dilations{1};
   const std::string activation = "ReLU";
@@ -178,7 +182,7 @@ void test_wavenet_different_buffer_sizes()
   const int groups = 1;
   const int groups_1x1 = 1;

-  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, kernel_size,
+  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
                                         std::move(dilations), activation, gated, head_bias, groups, groups_1x1);
   std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
   layer_array_params.push_back(std::move(params));
@@ -210,6 +214,7 @@ void test_wavenet_prewarm()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 3;
   std::vector<int> dilations{1, 2, 4};
   const std::string activation = "ReLU";
@@ -220,7 +225,7 @@ void test_wavenet_prewarm()
   const int groups = 1;
   const int groups_1x1 = 1;

-  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, kernel_size,
+  nam::wavenet::LayerArrayParams params(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
                                         std::move(dilations), activation, gated, head_bias, groups, groups_1x1);
   std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
   layer_array_params.push_back(std::move(params));
diff --git a/tools/test/test_wavenet/test_layer.cpp b/tools/test/test_wavenet/test_layer.cpp
index 10eccf4..5d53be4 100644
--- a/tools/test/test_wavenet/test_layer.cpp
+++ b/tools/test/test_wavenet/test_layer.cpp
@@ -18,14 +18,15 @@ void test_gated()
   // Issue 101
   const int conditionSize = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernelSize = 1;
   const int dilation = 1;
   const std::string activation = "ReLU";
   const bool gated = true;
   const int groups_input = 1;
   const int groups_1x1 = 1;
-  auto layer =
-    nam::wavenet::_Layer(conditionSize, channels, kernelSize, dilation, activation, gated, groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    conditionSize, channels, bottleneck, kernelSize, dilation, activation, gated, groups_input, groups_1x1);

   // Conv, input mixin, 1x1
   std::vector<float> weights{
@@ -48,7 +49,7 @@ void test_gated()
   Eigen::MatrixXf input, condition, headInput, output;
   input.resize(channels, numFrames);
   condition.resize(conditionSize, numFrames);
-  headInput.resize(channels, numFrames);
+  headInput.resize(bottleneck, numFrames);
   output.resize(channels, numFrames);

   const float signalValue = 0.25f;
@@ -92,6 +93,7 @@ void test_layer_getters()
 {
   const int conditionSize = 2;
   const int channels = 4;
+  const int bottleneck = channels;
   const int kernelSize = 3;
   const int dilation = 2;
   const std::string activation = "Tanh";
@@ -99,8 +101,8 @@ void test_layer_getters()
   const int groups_input = 1;
   const int groups_1x1 = 1;

-  auto layer =
-    nam::wavenet::_Layer(conditionSize, channels, kernelSize, dilation, activation, gated, groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    conditionSize, channels, bottleneck, kernelSize, dilation, activation, gated, groups_input, groups_1x1);

   assert(layer.get_channels() == channels);
   assert(layer.get_kernel_size() == kernelSize);
@@ -112,6 +114,7 @@ void test_non_gated_layer()
 {
   const int conditionSize = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernelSize = 1;
   const int dilation = 1;
   const std::string activation = "ReLU";
@@ -119,8 +122,8 @@ void test_non_gated_layer()
   const int groups_input = 1;
   const int groups_1x1 = 1;

-  auto layer =
-    nam::wavenet::_Layer(conditionSize, channels, kernelSize, dilation, activation, gated, groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    conditionSize, channels, bottleneck, kernelSize, dilation, activation, gated, groups_input, groups_1x1);

   // For non-gated: conv outputs 1 channel, input_mixin outputs 1 channel, 1x1 outputs 1 channel
   // Conv: (1,1,1) weight + (1,) bias
@@ -152,7 +155,7 @@ void test_non_gated_layer()
   assert(layer_output.rows() == channels);
   assert(layer_output.cols() == numFrames);
-  assert(head_output.rows() == channels);
+  assert(head_output.rows() == bottleneck);
   assert(head_output.cols() == numFrames);

   // With identity-like weights: input=1, condition=1
@@ -183,10 +186,11 @@ void test_layer_activations()
   // Test Tanh activation
   {
+    const int bottleneck = channels;
     const int groups_input = 1;
     const int groups_1x1 = 1;
-    auto layer =
-      nam::wavenet::_Layer(conditionSize, channels, kernelSize, dilation, "Tanh", gated, groups_input, groups_1x1);
+    auto layer = nam::wavenet::_Layer(
+      conditionSize, channels, bottleneck, kernelSize, dilation, "Tanh", gated, groups_input, groups_1x1);

     std::vector<float> weights{1.0f, 0.0f, 1.0f, 1.0f, 0.0f};
     auto it = weights.begin();
     layer.set_weights_(it);
@@ -213,6 +217,7 @@ void test_layer_multichannel()
 {
   const int conditionSize = 2;
   const int channels = 3;
+  const int bottleneck = channels;
   const int kernelSize = 1;
   const int dilation = 1;
   const std::string activation = "ReLU";
@@ -220,8 +225,8 @@ void test_layer_multichannel()
   const int groups_input = 1;
   const int groups_1x1 = 1;

-  auto layer =
-    nam::wavenet::_Layer(conditionSize, channels, kernelSize, dilation, activation, gated, groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    conditionSize, channels, bottleneck, kernelSize, dilation, activation, gated, groups_input, groups_1x1);

   assert(layer.get_channels() == channels);

@@ -272,7 +277,158 @@ void test_layer_multichannel()

   assert(layer_output.rows() == channels);
   assert(layer_output.cols() == numFrames);
-  assert(head_output.rows() == channels);
+  assert(head_output.rows() == bottleneck);
+  assert(head_output.cols() == numFrames);
+}
+
+// Test layer with bottleneck different from channels
+void test_layer_bottleneck()
+{
+  const int conditionSize = 1;
+  const int channels = 4;
+  const int bottleneck = 2; // bottleneck < channels
+  const int kernelSize = 1;
+  const int dilation = 1;
+  const std::string activation = "ReLU";
+  const bool gated = false;
+  const int groups_input = 1;
+  const int groups_1x1 = 1;
+
+  auto layer = nam::wavenet::_Layer(
+    conditionSize, channels, bottleneck, kernelSize, dilation, activation, gated, groups_input, groups_1x1);
+
+  // With bottleneck < channels, the internal conv and input_mixin should have bottleneck channels,
+  // but the 1x1 should map from bottleneck back to channels
+  // Conv: (channels, bottleneck, kernelSize=1) + bias -> outputs bottleneck channels
+  // Input mixin: (conditionSize, bottleneck) -> outputs bottleneck channels
+  // 1x1: (bottleneck, channels) + bias -> outputs channels channels
+
+  // Set weights
+  std::vector<float> weights;
+  // Conv weights: out_channels x in_channels x kernelSize = bottleneck x channels x kernelSize = 2 x 4 x 1 = 8 weights
+  // Weight layout for Conv1D: for each out_channel, for each in_channel, for each kernel position
+  // Use identity-like pattern: out_channel i connects to in_channel i (for i < bottleneck)
+  for (int out_ch = 0; out_ch < bottleneck; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < channels; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // Conv bias: bottleneck values
+  weights.insert(weights.end(), {0.0f, 0.0f});
+  // Input mixin: conditionSize x bottleneck = 1 x 2 = 2 weights
+  weights.insert(weights.end(), {1.0f, 1.0f});
+  // 1x1 weights: out_channels x in_channels = channels x bottleneck = 4 x 2 = 8 weights
+  // Weight layout for Conv1x1: for each out_channel, for each in_channel
+  // Identity-like pattern: out_channel i connects to in_channel i (for i < bottleneck)
+  for (int out_ch = 0; out_ch < channels; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < bottleneck; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // 1x1 bias: channels values
+  weights.insert(weights.end(), {0.0f, 0.0f, 0.0f, 0.0f});
+
+  auto it = weights.begin();
+  layer.set_weights_(it);
+  assert(it == weights.end());
+
+  const int numFrames = 2;
+  layer.SetMaxBufferSize(numFrames);
+
+  Eigen::MatrixXf input(channels, numFrames);
+  Eigen::MatrixXf condition(conditionSize, numFrames);
+  input.fill(1.0f);
+  condition.fill(1.0f);
+
+  layer.Process(input, condition, numFrames);
+
+  auto layer_output = layer.GetOutputNextLayer().leftCols(numFrames);
+  auto head_output = layer.GetOutputHead().leftCols(numFrames);
+
+  // Layer output should have channels rows (for next layer)
+  assert(layer_output.rows() == channels);
+  assert(layer_output.cols() == numFrames);
+  // Head output should have bottleneck rows (internal channel count)
+  assert(head_output.rows() == bottleneck);
+  assert(head_output.cols() == numFrames);
+}
+
+// Test layer with bottleneck and gated activation
+void test_layer_bottleneck_gated()
+{
+  const int conditionSize = 1;
+  const int channels = 4;
+  const int bottleneck = 2; // bottleneck < channels
+  const int kernelSize = 1;
+  const int dilation = 1;
+  const std::string activation = "ReLU";
+  const bool gated = true; // gated doubles the internal bottleneck channels
+  const int groups_input = 1;
+  const int groups_1x1 = 1;
+
+  auto layer = nam::wavenet::_Layer(
+    conditionSize, channels, bottleneck, kernelSize, dilation, activation, gated, groups_input, groups_1x1);
+
+  // With gated=true and bottleneck=2, internal channels should be 2*bottleneck=4
+  // Conv: (channels, 2*bottleneck, kernelSize=1) = (4, 4, 1) + bias
+  // Input mixin: (conditionSize, 2*bottleneck) = (1, 4)
+  // 1x1: (bottleneck, channels) = (2, 4) + bias
+
+  // Set weights
+  std::vector<float> weights;
+  // Conv weights: out_channels x in_channels x kernelSize = (2*bottleneck) x channels x kernelSize = 4 x 4 x 1 = 16
+  // weights Weight layout for Conv1D: for each out_channel, for each in_channel, for each kernel position Identity
+  // pattern: out_channel i connects to in_channel i (for i < min(2*bottleneck, channels))
+  for (int out_ch = 0; out_ch < 2 * bottleneck; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < channels; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // Conv bias: 2*bottleneck = 4 values
+  weights.insert(weights.end(), {0.0f, 0.0f, 0.0f, 0.0f});
+  // Input mixin: conditionSize x (2*bottleneck) = 1 x 4 = 4 weights
+  weights.insert(weights.end(), {1.0f, 1.0f, 1.0f, 1.0f});
+  // 1x1 weights: out_channels x in_channels = channels x bottleneck = 4 x 2 = 8 weights
+  // Weight layout for Conv1x1: for each out_channel, for each in_channel
+  // Identity pattern: out_channel i connects to in_channel i (for i < bottleneck)
+  for (int out_ch = 0; out_ch < channels; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < bottleneck; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // 1x1 bias: channels = 4 values
+  weights.insert(weights.end(), {0.0f, 0.0f, 0.0f, 0.0f});
+
+  auto it = weights.begin();
+  layer.set_weights_(it);
+  assert(it == weights.end());
+
+  const int numFrames = 2;
+  layer.SetMaxBufferSize(numFrames);
+
+  Eigen::MatrixXf input(channels, numFrames);
+  Eigen::MatrixXf condition(conditionSize, numFrames);
+  input.fill(1.0f);
+  condition.fill(1.0f);
+
+  layer.Process(input, condition, numFrames);
+
+  auto layer_output = layer.GetOutputNextLayer().leftCols(numFrames);
+  auto head_output = layer.GetOutputHead().leftCols(numFrames);
+
+  // Layer output should have channels rows (for next layer)
+  assert(layer_output.rows() == channels);
+  assert(layer_output.cols() == numFrames);
+  // Head output should have bottleneck rows (the activated portion, not the full 2*bottleneck)
+  assert(head_output.rows() == bottleneck);
   assert(head_output.cols() == numFrames);
 }
 }; // namespace test_layer
diff --git a/tools/test/test_wavenet/test_layer_array.cpp b/tools/test/test_wavenet/test_layer_array.cpp
index 41c435a..a4581c2 100644
--- a/tools/test/test_wavenet/test_layer_array.cpp
+++ b/tools/test/test_wavenet/test_layer_array.cpp
@@ -19,6 +19,7 @@ void test_layer_array_basic()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   std::vector<int> dilations{1, 2};
   const std::string activation = "ReLU";
@@ -27,8 +28,8 @@ void test_layer_array_basic()
   const int groups = 1;
   const int groups_1x1 = 1;

-  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, kernel_size, dilations,
-                                               activation, gated, head_bias, groups, groups_1x1);
+  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
+                                               dilations, activation, gated, head_bias, groups, groups_1x1);

   const int numFrames = 4;
   layer_array.SetMaxBufferSize(numFrames);
@@ -75,6 +76,7 @@ void test_layer_array_receptive_field()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 3;
   std::vector<int> dilations{1, 2, 4};
   const std::string activation = "ReLU";
@@ -83,8 +85,8 @@ void test_layer_array_receptive_field()
   const int groups = 1;
   const int groups_1x1 = 1;

-  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, kernel_size, dilations,
-                                               activation, gated, head_bias, groups, groups_1x1);
+  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
+                                               dilations, activation, gated, head_bias, groups, groups_1x1);

   long rf = layer_array.get_receptive_field();
   // Expected: sum of dilation * (kernel_size - 1) for each layer
@@ -103,6 +105,7 @@ void test_layer_array_with_head_input()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   std::vector<int> dilations{1};
   const std::string activation = "ReLU";
@@ -111,8 +114,8 @@ void test_layer_array_with_head_input()
   const int groups = 1;
   const int groups_1x1 = 1;

-  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, kernel_size, dilations,
-                                               activation, gated, head_bias, groups, groups_1x1);
+  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
+                                               dilations, activation, gated, head_bias, groups, groups_1x1);

   const int numFrames = 2;
   layer_array.SetMaxBufferSize(numFrames);
diff --git a/tools/test/test_wavenet/test_real_time_safe.cpp b/tools/test/test_wavenet/test_real_time_safe.cpp
index a7a5e8f..91d8628 100644
--- a/tools/test/test_wavenet/test_real_time_safe.cpp
+++ b/tools/test/test_wavenet/test_real_time_safe.cpp
@@ -429,6 +429,7 @@ void test_layer_process_realtime_safe()
   // Setup: Create a Layer
   const int condition_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   const int dilation = 1;
   const std::string activation = "ReLU";
@@ -436,8 +437,8 @@ void test_layer_process_realtime_safe()
   const int groups_input = 1;
   const int groups_1x1 = 1;

-  auto layer =
-    nam::wavenet::_Layer(condition_size, channels, kernel_size, dilation, activation, gated, groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
+                                    groups_input, groups_1x1);

   // Set weights
   std::vector<float> weights{1.0f, 0.0f, // Conv (weight, bias)
@@ -477,12 +478,98 @@ void test_layer_process_realtime_safe()
   }
 }

+// Test that Layer::Process() method with bottleneck != channels does not allocate or free memory
+void test_layer_bottleneck_process_realtime_safe()
+{
+  // Setup: Create a Layer with bottleneck different from channels
+  const int condition_size = 1;
+  const int channels = 4;
+  const int bottleneck = 2; // bottleneck < channels
+  const int kernel_size = 1;
+  const int dilation = 1;
+  const std::string activation = "ReLU";
+  const bool gated = false;
+  const int groups_input = 1;
+  const int groups_1x1 = 1;
+
+  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
+                                    groups_input, groups_1x1);
+
+  // Set weights for bottleneck != channels
+  // Conv: (channels, bottleneck, kernelSize=1) = (4, 2, 1) + bias
+  // Input mixin: (conditionSize, bottleneck) = (1, 2)
+  // 1x1: (bottleneck, channels) = (2, 4) + bias
+  std::vector<float> weights;
+  // Conv weights: out_channels x in_channels x kernelSize = bottleneck x channels x kernelSize = 2 x 4 x 1 = 8 weights
+  // Weight layout for Conv1D: for each out_channel, for each in_channel, for each kernel position
+  // Identity-like pattern: out_channel i connects to in_channel i (for i < bottleneck)
+  for (int out_ch = 0; out_ch < bottleneck; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < channels; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // Conv bias: bottleneck values
+  weights.insert(weights.end(), {0.0f, 0.0f});
+  // Input mixin: conditionSize x bottleneck = 1 x 2 = 2 weights
+  weights.insert(weights.end(), {1.0f, 1.0f});
+  // 1x1 weights: out_channels x in_channels = channels x bottleneck = 4 x 2 = 8 weights
+  // Weight layout for Conv1x1: for each out_channel, for each in_channel
+  // Identity-like pattern: out_channel i connects to in_channel i (for i < bottleneck)
+  for (int out_ch = 0; out_ch < channels; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < bottleneck; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // 1x1 bias: channels values
+  weights.insert(weights.end(), {0.0f, 0.0f, 0.0f, 0.0f});
+
+  auto it = weights.begin();
+  layer.set_weights_(it);
+
+  const int maxBufferSize = 256;
+  layer.SetMaxBufferSize(maxBufferSize);
+
+  // Test with several different buffer sizes
+  std::vector<int> buffer_sizes{1, 8, 16, 32, 64, 128, 256};
+
+  for (int buffer_size : buffer_sizes)
+  {
+    // Prepare input/condition matrices (allocate before tracking)
+    Eigen::MatrixXf input(channels, buffer_size);
+    Eigen::MatrixXf condition(condition_size, buffer_size);
+    input.setConstant(0.5f);
+    condition.setConstant(0.5f);
+
+    std::string test_name = "Layer Process (bottleneck=" + std::to_string(bottleneck) + ", channels="
+                            + std::to_string(channels) + ") - Buffer size " + std::to_string(buffer_size);
+    run_allocation_test_no_allocations(
+      nullptr, // No setup needed
+      [&]() {
+        // Call Process() - this should not allocate or free
+        layer.Process(input, condition, buffer_size);
+      },
+      nullptr, // No teardown needed
+      test_name.c_str());
+
+    // Verify output is valid
+    auto output = layer.GetOutputNextLayer().leftCols(buffer_size);
+    assert(output.rows() == channels && output.cols() == buffer_size);
+    assert(std::isfinite(output(0, 0)));
+    assert(std::isfinite(output(channels - 1, buffer_size - 1)));
+  }
+}
+
 // Test that Layer::Process() method with grouped convolution (groups_input > 1) does not allocate or free memory
 void test_layer_grouped_process_realtime_safe()
 {
   // Setup: Create a Layer with grouped convolution
   const int condition_size = 1;
   const int channels = 4; // Must be divisible by groups_input
+  const int bottleneck = channels;
   const int kernel_size = 2;
   const int dilation = 1;
   const std::string activation = "ReLU";
@@ -490,8 +577,8 @@ void test_layer_grouped_process_realtime_safe()
   const int groups_input = 2; // groups_input > 1
   const int groups_1x1 = 2; // 1x1 is also grouped

-  auto layer =
-    nam::wavenet::_Layer(condition_size, channels, kernel_size, dilation, activation, gated, groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
+                                    groups_input, groups_1x1);

   // Set weights for grouped convolution
   // With groups_input=2, channels=4: each group has 2 in_channels and 2 out_channels
@@ -592,6 +679,7 @@ void test_layer_array_process_realtime_safe()
   const int condition_size = 1;
   const int head_size = 1;
   const int channels = 1;
+  const int bottleneck = channels;
   const int kernel_size = 1;
   std::vector<int> dilations{1};
   const std::string activation = "ReLU";
@@ -600,8 +688,8 @@ void test_layer_array_process_realtime_safe()
   const int groups = 1;
   const int groups_1x1 = 1;

-  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, kernel_size, dilations,
-                                               activation, gated, head_bias, groups, groups_1x1);
+  auto layer_array = nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size,
+                                               dilations, activation, gated, head_bias, groups, groups_1x1);

   // Set weights: rechannel(1), layer(conv:1+1, input_mixin:1, 1x1:1+1), head_rechannel(1)
   std::vector<float> weights{1.0f, // Rechannel
@@ -666,15 +754,16 @@ void test_process_realtime_safe()
   std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
   // First layer array
   std::vector<int> dilations1{1};
+  const int bottleneck = channels;
   const int groups_1x1 = 1;
   layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
-                                                              kernel_size, std::move(dilations1), activation, gated,
-                                                              head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations1), activation,
+                                                              gated, head_bias, groups, groups_1x1));
   // Second layer array (head_size of first must match channels of second)
   std::vector<int> dilations2{1};
   layer_array_params.push_back(nam::wavenet::LayerArrayParams(head_size, condition_size, head_size, channels,
-                                                              kernel_size, std::move(dilations2), activation, gated,
-                                                              head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations2), activation,
+                                                              gated, head_bias, groups, groups_1x1));

   // Weights: Array 0: rechannel(1), layer(conv:1+1, input_mixin:1, 1x1:1+1), head_rechannel(1)
   // Array 1: same structure