Commit 28e4e56

handle VQ returning topk minimum distance codes
1 parent 29f94f4 commit 28e4e56
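In short, passing a topk argument to VectorQuantize.forward now returns the k nearest codes per position instead of only the single nearest one (the EMA codebook update is deferred to the caller in that case). A minimal usage sketch, assuming only what the updated test below asserts; the constructor settings other than dim = 256 and the choice of topk = 2 are illustrative:

import torch
from vector_quantize_pytorch import VectorQuantize

# illustrative settings; only dim = 256 mirrors the updated test
vq = VectorQuantize(dim = 256, codebook_size = 512)

x = torch.randn(1, 1024, 256)

# topk = k appends a k dimension to every returned tensor
quantized, indices, commit_loss = vq(x, topk = 2)

print(quantized.shape)    # expected (1, 1024, 2, 256)
print(indices.shape)      # expected (1, 1024, 2)
print(commit_loss.shape)  # expected (1, 1024, 2) while the module is in training mode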

File tree

2 files changed: +51 −19 lines

tests/test_manual_ema.py

Lines changed: 14 additions & 6 deletions
@@ -1,7 +1,7 @@
 import torch
 from vector_quantize_pytorch import VectorQuantize
 
-def test_manual_ema_update():
+def test_topk_and_manual_ema_update():
 
     vq1 = VectorQuantize(
         dim = 256,
@@ -19,17 +19,25 @@ def test_manual_ema_update():
     mask = torch.randint(0, 2, (1, 1024)).bool()
 
     vq1.train()
-    quantize1, indices1, _ = vq1(x, mask = mask)
+    quantize1, indices1, commit_loss1 = vq1(x, mask = mask)
 
     vq2.train()
-    quantize2, indices2, _ = vq2(x, mask = mask, ema_update = False)
+    quantize2, indices2, commit_losses = vq2(x, mask = mask, topk = 1, ema_update = False)
 
-    assert torch.allclose(quantize1, quantize2)
-    assert torch.equal(indices1, indices2)
+    assert quantize2.shape == (1, 1024, 1, 256)
+    assert indices2.shape == (1, 1024, 1)
+    assert commit_losses.shape == (1, 1024, 1)
+
+    top_quantize2 = quantize2[..., 0, :]
+    top_indices2 = indices2[..., 0]
+
+    assert torch.allclose(commit_loss1, commit_losses.sum() / mask.sum())
+    assert torch.equal(indices1, top_indices2)
+    assert torch.allclose(quantize1, top_quantize2)
 
     assert not torch.allclose(vq1._codebook.embed_avg, vq2._codebook.embed_avg)
 
-    vq2.update_ema_indices(x, indices2, mask = mask)
+    vq2.update_ema_indices(x, top_indices2, mask = mask)
 
     assert torch.allclose(vq1._codebook.cluster_size, vq2._codebook.cluster_size)
     assert torch.allclose(vq1._codebook.embed_avg, vq2._codebook.embed_avg)
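The core of the library change below is the selection step in gumbel_sample: a topk over the logits (negative distances) replaces the argmax, the resulting one-hot gains an extra axis per selected code, and the einsum patterns gain a '...' so that axis is carried through to the quantized output. A standalone sketch of that idea with toy tensors; this is not the library's internal layout, which also packs heads and batch:

import torch
import torch.nn.functional as F

codebook_size, dim, k = 8, 4, 2

logits = torch.randn(3, codebook_size)   # e.g. negative distances for 3 positions
embed = torch.randn(codebook_size, dim)  # codebook vectors

# previous behavior: one code per position via argmax
ind = logits.argmax(dim = -1)                        # (3,)
onehot = F.one_hot(ind, codebook_size).float()       # (3, c)
quantize = torch.einsum('nc,cd->nd', onehot, embed)  # (3, d)

# this commit: k codes per position, the extra axis rides through the einsum
ind_k = logits.topk(k, dim = -1).indices                   # (3, k), best code first
onehot_k = F.one_hot(ind_k, codebook_size).float()         # (3, k, c)
quantize_k = torch.einsum('nkc,cd->nkd', onehot_k, embed)  # (3, k, d)

# the top-1 slice of the topk result matches the plain argmax result,
# which is what the updated test above checks with topk = 1
assert torch.equal(ind_k[:, 0], ind)
assert torch.allclose(quantize_k[:, 0], quantize)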

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 37 additions & 13 deletions
@@ -119,7 +119,8 @@ def gumbel_sample(
     stochastic = False,
     straight_through = False,
     dim = -1,
-    training = True
+    training = True,
+    topk = None
 ):
     dtype, size = logits.dtype, logits.shape[dim]
 
@@ -128,7 +129,11 @@ def gumbel_sample(
     else:
         sampling_logits = logits
 
-    ind = sampling_logits.argmax(dim = dim)
+    if exists(topk):
+        ind = sampling_logits.topk(topk, dim = dim).indices
+    else:
+        ind = sampling_logits.argmax(dim = dim)
+
     one_hot = F.one_hot(ind, size).type(dtype)
 
     if not straight_through or temperature <= 0. or not training:
@@ -629,7 +634,8 @@ def forward(
         codebook_transform_fn: Callable | None = None,
         ema_update_weight: Tensor | Callable | None = None,
         accum_ema_update = False,
-        ema_update = None
+        ema_update = None,
+        topk = None
     ):
         ema_update = default(ema_update, self.ema_update)
 
@@ -686,20 +692,26 @@ def forward(
 
         # sample or argmax depending on temperature
 
-        embed_ind, embed_onehot = self.gumbel_sample(dist, dim = -1, temperature = sample_codebook_temp, training = self.training)
+        embed_ind, embed_onehot = self.gumbel_sample(dist, dim = -1, topk = topk, temperature = sample_codebook_temp, training = self.training)
 
-        embed_ind = unpack_one(embed_ind, 'h *')
+        if exists(topk):
+            embed_ind = unpack_one(embed_ind, 'h * k')
+        else:
+            embed_ind = unpack_one(embed_ind, 'h *')
 
         if exists(codebook_transform_fn):
             transformed_embed = unpack_one(transformed_embed, 'h * c d')
 
         if self.training:
-            unpacked_onehot = unpack_one(embed_onehot, 'h * c')
+            if exists(topk):
+                unpacked_onehot = unpack_one(embed_onehot, 'h * k c')
+            else:
+                unpacked_onehot = unpack_one(embed_onehot, 'h * c')
 
             if exists(codebook_transform_fn):
-                quantize = einsum('h b n c, h b n c d -> h b n d', unpacked_onehot, transformed_embed)
+                quantize = einsum('h b n ... c, h b n c d -> h b n ... d', unpacked_onehot, transformed_embed)
             else:
-                quantize = einsum('h b n c, h c d -> h b n d', unpacked_onehot, embed)
+                quantize = einsum('h b n ... c, h c d -> h b n ... d', unpacked_onehot, embed)
 
         else:
             if exists(codebook_transform_fn):
@@ -1007,6 +1019,7 @@ def forward(
         indices = None,
        mask = None,
         lens = None,
+        topk = None,
         sample_codebook_temp = None,
         freeze_codebook = None,
         return_loss_breakdown = False,
@@ -1072,7 +1085,8 @@ def forward(
             codebook_transform_fn = codebook_transform_fn,
             ema_update_weight = ema_update_weight,
             accum_ema_update = accum_ema_update,
-            ema_update = ema_update
+            ema_update = ema_update and not exists(topk),
+            topk = topk
         )
 
         # quantize
@@ -1196,17 +1210,27 @@ def calculate_ce_loss(codes):
 
                     commit_loss = calculate_ce_loss(embed_ind)
                 else:
-                    if exists(mask):
+                    if exists(topk):
+                        # handle special case when returning topk
+
+                        repeated_input = repeat(orig_input, '... d -> ... k d', k = topk)
+                        commit_loss = F.mse_loss(commit_quantize, repeated_input, reduction = 'none')
+                        commit_loss = reduce(commit_loss, '... k d -> ... k', 'mean')
+
+                        if exists(mask):
+                            commit_loss = einx.where('..., ... k, -> ... k', mask, commit_loss, 0.)
+
+                    elif exists(mask):
                         # with variable lengthed sequences
-                        commit_loss = F.mse_loss(commit_quantize, x, reduction = 'none')
+                        commit_loss = F.mse_loss(commit_quantize, orig_input, reduction = 'none')
 
                         loss_mask = mask
                         if is_multiheaded:
                             loss_mask = repeat(loss_mask, 'b n -> c (b h) n', c = commit_loss.shape[0], h = commit_loss.shape[1] // mask.shape[0])
 
                         commit_loss = commit_loss[loss_mask].mean()
                     else:
-                        commit_loss = F.mse_loss(commit_quantize, x)
+                        commit_loss = F.mse_loss(commit_quantize, orig_input)
 
                 loss = loss + commit_loss * self.commitment_weight
 
@@ -1261,7 +1285,7 @@ def calculate_ce_loss(codes):
                 masked_out_value = torch.zeros_like(orig_input)
 
             quantize = einx.where(
-                'b n, b n d, b n d -> b n d',
+                'b n, b n ... d, b n d -> b n ... d',
                 mask,
                 quantize,
                 masked_out_value
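Two caller-facing consequences of the change above, both exercised by the test: EMA codebook updates are skipped whenever topk is passed (ema_update = ema_update and not exists(topk)), so the caller applies them afterwards via update_ema_indices with whichever indices it settles on; and the commitment loss comes back per position and per code rather than as a scalar, zeroed at masked positions. A hedged sketch of the reduction the test relies on; the tensors here are placeholders standing in for a masked, training-mode forward pass with topk = 1:

import torch

commit_losses = torch.rand(1, 1024, 1)        # (b, n, k) per-position, per-code losses
mask = torch.randint(0, 2, (1, 1024)).bool()  # (b, n) valid-position mask
commit_losses[~mask] = 0.                     # masked positions are already zeroed in the real output

# with topk = 1, summing and dividing by the number of valid positions recovers the
# scalar masked commitment loss of a plain call, which is exactly what the test asserts;
# for larger k one still has to decide how to average over the k codes
scalar_commit_loss = commit_losses.sum() / mask.sum()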
