8 changes: 4 additions & 4 deletions gpu/model.py
@@ -25,7 +25,7 @@ def bitnet_int8xint2_linear(input0, input1, s, ws):
stream = torch.cuda.current_stream()

M = input0.shape[0]
if len(out_shape) == 3:
if len(out_shape) == 3:
M *= input0.shape[1]
N = input1.shape[0]
K = input1.shape[1] * 4
@@ -319,7 +319,7 @@ def make_cache(
cache entries (defaults to the default dtype).

Returns:
The cache object to pass to ``Tranformer.forward``.
The cache object to pass to ``Transformer.forward``.
"""

head_dim = args.dim // args.n_heads
@@ -348,7 +348,7 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
Take a prefix view of a larger cache.

The original cache object remains of identical size and valid
after the shrinked alias has been used. This function is useful
after the shrunk alias has been used. This function is useful
when a cache was allocated for a larger batch size than what is
necessary.

@@ -363,4 +363,4 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
if len(cache) > 0:
assert cache[0][0].shape[1] >= length

return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
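The hunk above shows only the tail of cache_prefix; below is a self-contained sketch of the behaviour its docstring describes. Only the dim-1 slicing and the return expression come from the diff; the layer count and tensor shapes are invented for illustration.

# Sketch only: shapes and layer count are made up; the slicing on dim 1 and
# the return expression mirror the cache_prefix shown in the diff above.
import torch

LayerCache = tuple[torch.Tensor, torch.Tensor]

def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
    if len(cache) > 0:
        assert cache[0][0].shape[1] >= length
    # Return narrower aliases; the original cache is neither copied nor resized.
    return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]

# Hypothetical cache whose dim 1 was allocated for 8 sequences; serve only 2.
full = [(torch.zeros(16, 8, 4, 64), torch.zeros(16, 8, 4, 64)) for _ in range(2)]
small = cache_prefix(full, 2)

assert small[0][0].shape[1] == 2                        # alias is narrower
assert full[0][0].shape[1] == 8                         # original untouched
assert small[0][0].data_ptr() == full[0][0].data_ptr()  # same storage, no copy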
8 changes: 4 additions & 4 deletions gpu/tokenizer.py
@@ -117,7 +117,7 @@ def encode(
By default, setting disallowed_special=() encodes a string by ignoring
special tokens. Specifically:
- Setting `disallowed_special` to () will cause all text corresponding
to special tokens to be encoded as natural text (insteading of raising
to special tokens to be encoded as natural text (instead of raising
an error).
- Setting `allowed_special` to "all" will treat all text corresponding
to special tokens to be encoded as special tokens.
@@ -198,7 +198,7 @@ class ChatFormat:
def __init__(self, tokenizer: Tokenizer):
self.tokenizer = tokenizer
self.eot_id = tokenizer.special_tokens["<|eot_id|>"]

def decode(self, tokens: List[int]) -> str:
# Decode the tokens to a string.
decoded_str = self.tokenizer.decode(tokens)
@@ -250,8 +250,8 @@ def encode_dialog_prompt(self, dialog: Dialog, completion=False, return_target=F
# Add the start of an assistant message for the model to complete.
if completion:
tokens.extend(self.encode_header({"role": "assistant", "content": ""}))

if return_target:
return tokens, targets

return tokens
return tokens
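The allowed_special / disallowed_special behaviour documented in the encode docstring above follows the tiktoken convention; a small sketch with a stock tiktoken encoding, assuming (as in the Llama 3 reference tokenizer) that encode() forwards these arguments to tiktoken unchanged:

# Sketch of the special-token handling described in the encode() docstring.
# Uses a stock tiktoken encoding; the repo's Tokenizer is assumed to forward
# allowed_special / disallowed_special to tiktoken as-is.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "hello <|endoftext|> world"

as_plain = enc.encode(text, disallowed_special=())    # special text encoded as natural text
as_special = enc.encode(text, allowed_special="all")  # special text becomes the special token id
assert enc.eot_token in as_special and enc.eot_token not in as_plain

try:
    enc.encode(text)  # default (disallowed_special="all") raises on special-token text
except ValueError:
    pass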
16 changes: 8 additions & 8 deletions utils/codegen_tl1.py
@@ -206,7 +206,7 @@ def gen_body_core_code(bm, by):
vec_c[{7}] += vec_v_left_{0}.val[1];\n\
vec_c[{7}] += vec_v_right_{0}.val[1];\n\
".format(i, 2 * by // 2, (4 * i) % (2 * by // 2), (4 * i + 1) % (2 * by // 2), (4 * i + 2) % (2 * by // 2), (4 * i + 3) % (2 * by // 2), (i * 2) // (by // 2) * 2 + 0, (i * 2) // (by // 2) * 2 + 1)

all_code = "".join([all_code, core_code])

all_code = "".join([all_code, "\n }\n\n"])
@@ -235,7 +235,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k):
const int8x16_t vec_zero = vdupq_n_s16(0x0000);\n\
int8x16_t vec_lut[2 * KK];\n\
".format(pre, BM, BK)

kernel_code = "".join([kernel_code, " int16x8_t vec_c[{}];".format(bm // 8)])

kernel_code = "".join([kernel_code, "\n\
@@ -378,11 +378,11 @@ def gen_transform_code(kernel_shape):
"Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
[4096, 14336],
[1024, 4096],
[4096, 4096]]
[4096, 4096]]
}

parser = argparse.ArgumentParser(description='gen impl')
parser.add_argument('--model',default="input", type=str, dest="model",
parser.add_argument('--model',default="input", type=str, dest="model",
help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
parser.add_argument('--BM',default="input", type=str,
help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -398,8 +398,8 @@ def gen_transform_code(kernel_shape):
BK_list = [int(item) for item in args.BK.split(',')]
bm_list = [int(item) for item in args.bm.split(',')]

assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))

for i in range(len(kernel_shapes)):
assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
assert kernel_shapes[i][1] % BK_list[i] == 0, "K %% BK should be 0"
@@ -439,4 +439,4 @@ def gen_transform_code(kernel_shape):
config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
config.write(configfile)
config.write(configfile)
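Taken together, the --BM/--BK help strings and the asserts above fix a simple tiling contract: each (M, K) kernel shape is cut into M / BM row blocks of shape (BM, K), and BM and BK must divide M and K evenly. A quick sanity check on the Llama3-8B-1.58 shapes listed in this file (the BM/BK values below are placeholders that merely satisfy the divisibility constraints, not the repo's tuned defaults):

# Placeholder BM/BK values chosen only so that M % BM == 0 and K % BK == 0;
# the kernel shapes are the Llama3-8B-1.58-100B-tokens entries from the table above.
kernel_shapes = [[14336, 4096], [4096, 14336], [1024, 4096], [4096, 4096]]
BM_list = [256, 128, 256, 256]
BK_list = [128, 64, 128, 64]

for (M, K), BM, BK in zip(kernel_shapes, BM_list, BK_list):
    assert M % BM == 0, "M % BM should be 0"
    assert K % BK == 0, "K % BK should be 0"
    print(f"({M}, {K}) -> {M // BM} blocks of ({BM}, {K}), K swept in {K // BK} steps of {BK}")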
10 changes: 5 additions & 5 deletions utils/codegen_tl2.py
@@ -690,11 +690,11 @@ def get_three_k_two_k(K, bk):
"Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
[4096, 14336],
[1024, 4096],
[4096, 4096]]
[4096, 4096]]
}

parser = argparse.ArgumentParser(description='gen impl')
parser.add_argument('--model',default="input", type=str, dest="model",
parser.add_argument('--model',default="input", type=str, dest="model",
help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
parser.add_argument('--BM',default="input", type=str,
help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -721,8 +721,8 @@ def get_three_k_two_k(K, bk):
gen_tbl_impl("{}_{}".format(kernel_shapes[i][0], kernel_shapes[i][1]), BM_list[i], BK_list[i], bm_list[i], k_list[i])
)

assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))

for i in range(len(kernel_shapes)):
assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
assert (kernel_shapes[i][1] % BK_list[i]) % 32 == 0, "K %% BK %% 32 should be 0"
@@ -754,4 +754,4 @@ def get_three_k_two_k(K, bk):
config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
config.write(configfile)
config.write(configfile)
12 changes: 6 additions & 6 deletions utils/convert-hf-to-gguf-bitnet.py
@@ -53,13 +53,13 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

@@ -542,7 +542,7 @@ def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_w
weight = weight.reshape((M * K // bm // by, bm // 8, 8))
weight[:, [0, 1, 2, 3], :] = weight[:, [0, 2, 1, 3], :]
weight = weight.reshape(M * K // bm // by, bm)

for i in range(weight.shape[0]):
final_weight.append(weight[i, :])

@@ -590,7 +590,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
combine_weight += temp_weight
combine_weight = combine_weight.view(np.uint8)
combine_weight = combine_weight.reshape((M * K // bm // (by * 4)), bm)

for i in range(combine_weight.shape[0]):
final_weight.append(combine_weight[i, :])

@@ -958,7 +958,7 @@ class BitnetModel(Model):

def set_vocab(self):
self._set_vocab_sentencepiece()

def set_gguf_parameters(self):
super().set_gguf_parameters()

@@ -976,7 +976,7 @@ def weight_quant(self, weight):

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# quant weight to i2 (in fp16)
if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
"down_proj.weight", "up_proj.weight", "gate_proj.weight",
"o_proj.weight")):
data_torch = self.weight_quant(data_torch)
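The body of weight_quant sits outside this hunk; the absmean ternary quantiser that BitNet b1.58 conversion scripts typically implement behind the "quant weight to i2 (in fp16)" comment looks roughly like the sketch below (an assumption, not the file's actual code):

# Sketch only: weight_quant's real body is not part of this diff. Standard
# BitNet b1.58 absmean quantisation to the ternary grid {-1, 0, +1}, returned
# in the original (fp16) dtype to match the "i2 (in fp16)" comment above.
import torch

def weight_quant_sketch(weight: torch.Tensor) -> torch.Tensor:
    dtype = weight.dtype
    w = weight.float()
    s = 1.0 / w.abs().mean().clamp(min=1e-5)  # per-tensor inverse absmean scale
    return ((w * s).round().clamp(-1, 1) / s).to(dtype)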
30 changes: 15 additions & 15 deletions utils/convert-ms-to-gguf-bitnet.py
@@ -739,7 +739,7 @@ def preprocess_weights(
kfactor = int(cf.get(sec, 'kfactor'))
simd_n_in = int(cf.get(sec, 'simd_n_in'))
simd_n_out = int(cf.get(sec, 'simd_n_out'))
break
break

M = M * bits
ngroups_per_elem = 8 // g
@@ -1241,8 +1241,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

def add_meta_arch(self, params: Params) -> None:
name = "bitnet"
@@ -1364,7 +1364,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency:
logger.info(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)

if i2_scale is not None:
i2_scale = np.tile(i2_scale, 8)
ndarray = preprocess_weights(ndarray)
@@ -1379,11 +1379,11 @@ def close(self) -> None:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)
of = OutputFile(fname_out, endianness=endianness)

# meta data
of.add_meta_arch(params)
@@ -1410,12 +1410,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
@staticmethod
def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)
of = OutputFile(fname_out, endianness=endianness)

if 'bitnet' in of.gguf.arch:
svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
@@ -1493,7 +1493,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
# 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
rope_ndarray = (1.0 / (torch.tensor(500000.0) ** (torch.arange(0, 128, 2).float() / 128))).numpy().astype(np.float32)
# print(rope_ndarray)


def load() -> UnquantizedTensor:
return UnquantizedTensor(rope_ndarray)
@@ -1508,7 +1508,7 @@ def load() -> UnquantizedTensor:
# print(lazy_tensor.load().ndarray)
# asfasf

# HF models permut or pack some of the tensors, so we need to undo that
# HF models permute or pack some of the tensors, so we need to undo that

# if ARCH == gguf.MODEL_ARCH.LLAMA or ARCH == gguf.MODEL_ARCH.BITNET:
# print(tmp.keys())
@@ -1560,7 +1560,7 @@ def load() -> UnquantizedTensor:
# break

# for name, lazy_tensor in model.items():
# if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
# if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
# "w1.weight", "w2.weight", "w3.weight",
# "wo.weight")):
# tmp[name] = part_lazy_weight_quant(tmp[name], name)
@@ -1791,9 +1791,9 @@ def main(args_in: list[str] | None = None) -> None:
do_dump_model(model_plus)
return

endianess = gguf.GGUFEndian.LITTLE
endianness = gguf.GGUFEndian.LITTLE
if args.big_endian:
endianess = gguf.GGUFEndian.BIG
endianness = gguf.GGUFEndian.BIG

params = Params.load(model_plus)
if params.n_ctx == -1:
@@ -1828,7 +1828,7 @@ def main(args_in: list[str] | None = None) -> None:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab)
endianness=endianness, pad_vocab=args.pad_vocab)
logger.info(f"Wrote {outfile}")
return

@@ -1847,7 +1847,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
concurrency=args.concurrency, endianness=endianness, pad_vocab=args.pad_vocab)
logger.info(f"Wrote {outfile}")

