diff --git a/gpu/model.py b/gpu/model.py
index cd5abec01..a606cfe27 100755
--- a/gpu/model.py
+++ b/gpu/model.py
@@ -25,7 +25,7 @@ def bitnet_int8xint2_linear(input0, input1, s, ws):
     stream = torch.cuda.current_stream()
 
     M = input0.shape[0]
-    if len(out_shape) == 3:
+    if len(out_shape) == 3:
         M *= input0.shape[1]
     N = input1.shape[0]
     K = input1.shape[1] * 4
@@ -319,7 +319,7 @@ def make_cache(
             cache entries (defaults to the default dtype).
 
     Returns:
-        The cache object to pass to ``Tranformer.forward``.
+        The cache object to pass to ``Transformer.forward``.
     """
 
     head_dim = args.dim // args.n_heads
@@ -348,7 +348,7 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
    Take a prefix view of a larger cache.

    The original cache object remains of identical size and valid
-    after the shrinked alias has been used. This function is useful
+    after the shrunk alias has been used. This function is useful
    when a cache was allocated for a larger batch size than what is
    necessary.

@@ -363,4 +363,4 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:

     if len(cache) > 0:
         assert cache[0][0].shape[1] >= length
-    return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
\ No newline at end of file
+    return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
diff --git a/gpu/tokenizer.py b/gpu/tokenizer.py
index 38e0fd2d1..1d7a7c90c 100755
--- a/gpu/tokenizer.py
+++ b/gpu/tokenizer.py
@@ -117,7 +117,7 @@ def encode(
        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
-          to special tokens to be encoded as natural text (insteading of raising
+          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will treat all text corresponding
          to special tokens to be encoded as special tokens.
@@ -198,7 +198,7 @@ class ChatFormat:
     def __init__(self, tokenizer: Tokenizer):
         self.tokenizer = tokenizer
         self.eot_id = tokenizer.special_tokens["<|eot_id|>"]
-        
+
     def decode(self, tokens: List[int]) -> str:
         # Decode the tokens to a string.
         decoded_str = self.tokenizer.decode(tokens)
@@ -250,8 +250,8 @@ def encode_dialog_prompt(self, dialog: Dialog, completion=False, return_target=F

         # Add the start of an assistant message for the model to complete.
         if completion:
             tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
-        
+
         if return_target:
             return tokens, targets
-        return tokens
\ No newline at end of file
+        return tokens
diff --git a/utils/codegen_tl1.py b/utils/codegen_tl1.py
index 4c2e7dd3f..35b3a537c 100644
--- a/utils/codegen_tl1.py
+++ b/utils/codegen_tl1.py
@@ -206,7 +206,7 @@ def gen_body_core_code(bm, by):
    vec_c[{7}] += vec_v_left_{0}.val[1];\n\
    vec_c[{7}] += vec_v_right_{0}.val[1];\n\
".format(i, 2 * by // 2, (4 * i) % (2 * by // 2), (4 * i + 1) % (2 * by // 2), (4 * i + 2) % (2 * by // 2), (4 * i + 3) % (2 * by // 2), (i * 2) // (by // 2) * 2 + 0, (i * 2) // (by // 2) * 2 + 1)
-        
+
         all_code = "".join([all_code, core_code])

     all_code = "".join([all_code, "\n }\n\n"])
@@ -235,7 +235,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k):
    const int8x16_t vec_zero = vdupq_n_s16(0x0000);\n\
    int8x16_t vec_lut[2 * KK];\n\
".format(pre, BM, BK)
-    
+
    kernel_code = "".join([kernel_code, " int16x8_t vec_c[{}];".format(bm // 8)])

    kernel_code = "".join([kernel_code, "\n\
@@ -378,11 +378,11 @@ def gen_transform_code(kernel_shape):
        "Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
                                        [4096, 14336],
                                        [1024, 4096],
-                                        [4096, 4096]]
+                                        [4096, 4096]]
    }
-    
+
    parser = argparse.ArgumentParser(description='gen impl')
-    parser.add_argument('--model',default="input", type=str, dest="model", 
+    parser.add_argument('--model',default="input", type=str, dest="model",
                        help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
    parser.add_argument('--BM',default="input", type=str,
                        help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -398,8 +398,8 @@ def gen_transform_code(kernel_shape):
    BK_list = [int(item) for item in args.BK.split(',')]
    bm_list = [int(item) for item in args.bm.split(',')]

-    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
-    
+    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))
+
    for i in range(len(kernel_shapes)):
        assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
        assert kernel_shapes[i][1] % BK_list[i] == 0, "K %% BK should be 0"
@@ -439,4 +439,4 @@ def gen_transform_code(kernel_shape):
        config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

    with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
-        config.write(configfile)
\ No newline at end of file
+        config.write(configfile)
diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
index 4d9408123..348691447 100644
--- a/utils/codegen_tl2.py
+++ b/utils/codegen_tl2.py
@@ -690,11 +690,11 @@ def get_three_k_two_k(K, bk):
        "Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
                                        [4096, 14336],
                                        [1024, 4096],
-                                        [4096, 4096]]
+                                        [4096, 4096]]
    }

    parser = argparse.ArgumentParser(description='gen impl')
-    parser.add_argument('--model',default="input", type=str, dest="model", 
+    parser.add_argument('--model',default="input", type=str, dest="model",
                        help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
    parser.add_argument('--BM',default="input", type=str,
                        help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -721,8 +721,8 @@ def get_three_k_two_k(K, bk):
            gen_tbl_impl("{}_{}".format(kernel_shapes[i][0], kernel_shapes[i][1]), BM_list[i], BK_list[i], bm_list[i], k_list[i])
        )

-    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
-    
+    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))
+
    for i in range(len(kernel_shapes)):
        assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
        assert (kernel_shapes[i][1] % BK_list[i]) % 32 == 0, "K %% BK %% 32 should be 0"
@@ -754,4 +754,4 @@ def get_three_k_two_k(K, bk):
        config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

    with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
-        config.write(configfile)
\ No newline at end of file
+        config.write(configfile)
diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 23e84384c..b63d52111 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -53,13 +53,13 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
-        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness, use_temp_file=self.use_temp_file)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

@@ -542,7 +542,7 @@ def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_w
    weight = weight.reshape((M * K // bm // by, bm // 8, 8))
    weight[:, [0, 1, 2, 3], :] = weight[:, [0, 2, 1, 3], :]
    weight = weight.reshape(M * K // bm // by, bm)
-    
+
    for i in range(weight.shape[0]):
        final_weight.append(weight[i, :])

@@ -590,7 +590,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
            combine_weight += temp_weight
    combine_weight = combine_weight.view(np.uint8)
    combine_weight = combine_weight.reshape((M * K // bm // (by * 4)), bm)
-    
+
    for i in range(combine_weight.shape[0]):
        final_weight.append(combine_weight[i, :])

@@ -958,7 +958,7 @@ class BitnetModel(Model):

    def set_vocab(self):
        self._set_vocab_sentencepiece()
-    
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

@@ -976,7 +976,7 @@ def weight_quant(self, weight):

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
                          "o_proj.weight")):
            data_torch = self.weight_quant(data_torch)
diff --git a/utils/convert-ms-to-gguf-bitnet.py b/utils/convert-ms-to-gguf-bitnet.py
index e9e91622e..775ff60ed 100644
--- a/utils/convert-ms-to-gguf-bitnet.py
+++ b/utils/convert-ms-to-gguf-bitnet.py
@@ -739,7 +739,7 @@ def preprocess_weights(
            kfactor = int(cf.get(sec, 'kfactor'))
            simd_n_in = int(cf.get(sec, 'simd_n_in'))
            simd_n_out = int(cf.get(sec, 'simd_n_out'))
-            break
+            break

    M = M * bits
    ngroups_per_elem = 8 // g
@@ -1241,8 +1241,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

    def add_meta_arch(self, params: Params) -> None:
        name = "bitnet"
@@ -1364,7 +1364,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency:
            logger.info(
                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
-            
+
            if i2_scale is not None:
                i2_scale = np.tile(i2_scale, 8)
                ndarray = preprocess_weights(ndarray)
@@ -1379,11 +1379,11 @@ def close(self) -> None:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        # meta data
        of.add_meta_arch(params)
@@ -1410,12 +1410,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
    @staticmethod
    def write_all(
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        if 'bitnet' in of.gguf.arch:
            svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
@@ -1493,7 +1493,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
    # 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
    rope_ndarray = (1.0 / (torch.tensor(500000.0) ** (torch.arange(0, 128, 2).float() / 128))).numpy().astype(np.float32)
    # print(rope_ndarray)
-    
+
    def load() -> UnquantizedTensor:
        return UnquantizedTensor(rope_ndarray)

@@ -1508,7 +1508,7 @@ def load() -> UnquantizedTensor:
    # print(lazy_tensor.load().ndarray)
    # asfasf

-    # HF models permut or pack some of the tensors, so we need to undo that
+    # HF models permute or pack some of the tensors, so we need to undo that
    # if ARCH == gguf.MODEL_ARCH.LLAMA or ARCH == gguf.MODEL_ARCH.BITNET:
    #     print(tmp.keys())

@@ -1560,7 +1560,7 @@ def load() -> UnquantizedTensor:
    #             break

    # for name, lazy_tensor in model.items():
-    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
    #                       "w1.weight", "w2.weight", "w3.weight",
    #                       "wo.weight")):
    #         tmp[name] = part_lazy_weight_quant(tmp[name], name)
@@ -1791,9 +1791,9 @@ def main(args_in: list[str] | None = None) -> None:
        do_dump_model(model_plus)
        return

-    endianess = gguf.GGUFEndian.LITTLE
+    endianness = gguf.GGUFEndian.LITTLE
    if args.big_endian:
-        endianess = gguf.GGUFEndian.BIG
+        endianness = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1828,7 +1828,7 @@ def main(args_in: list[str] | None = None) -> None:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianness=endianness, pad_vocab=args.pad_vocab)
        logger.info(f"Wrote {outfile}")
        return

@@ -1847,7 +1847,7 @@ def main(args_in: list[str] | None = None) -> None:

    logger.info(f"Writing {outfile}, format {ftype}")
    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianness=endianness, pad_vocab=args.pad_vocab)
    logger.info(f"Wrote {outfile}")

diff --git a/utils/convert.py b/utils/convert.py
index 5938c42f2..e4298ff58 100644
--- a/utils/convert.py
+++ b/utils/convert.py
@@ -739,7 +739,7 @@ def preprocess_weights(
            kfactor = int(cf.get(sec, 'kfactor'))
            simd_n_in = int(cf.get(sec, 'simd_n_in'))
            simd_n_out = int(cf.get(sec, 'simd_n_out'))
-            break
+            break

    M = M * bits
    ngroups_per_elem = 8 // g
@@ -1167,8 +1167,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -1295,7 +1295,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency:
            logger.info(
                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
-            
+
            if i2_scale is not None:
                i2_scale = np.tile(i2_scale, 8)
                ndarray = preprocess_weights(ndarray)
@@ -1310,11 +1310,11 @@ def close(self) -> None:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        # meta data
        of.add_meta_arch(params)
@@ -1341,12 +1341,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
    @staticmethod
    def write_all(
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        # meta data
        of.add_meta_arch(params)
@@ -1418,7 +1418,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
                raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
            tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)

-    # HF models permut or pack some of the tensors, so we need to undo that
+    # HF models permute or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
            logger.debug(f"Permuting layer {i}")
@@ -1433,7 +1433,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
        else:
            break
-    
+
    # check if is bitnet
    if ARCH == 33:
        del tmp['output.weight']
@@ -1647,9 +1647,9 @@ def main(args_in: list[str] | None = None) -> None:
        do_dump_model(model_plus)
        return

-    endianess = gguf.GGUFEndian.LITTLE
+    endianness = gguf.GGUFEndian.LITTLE
    if args.big_endian:
-        endianess = gguf.GGUFEndian.BIG
+        endianness = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1684,7 +1684,7 @@ def main(args_in: list[str] | None = None) -> None:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianness=endianness, pad_vocab=args.pad_vocab)
        logger.info(f"Wrote {outfile}")
        return

@@ -1703,7 +1703,7 @@ def main(args_in: list[str] | None = None) -> None:

    logger.info(f"Writing {outfile}, format {ftype}")
    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianness=endianness, pad_vocab=args.pad_vocab)
    logger.info(f"Wrote {outfile}")

diff --git a/utils/generate-dummy-bitnet-model.py b/utils/generate-dummy-bitnet-model.py
index be3f6cdaa..e5dae298a 100644
--- a/utils/generate-dummy-bitnet-model.py
+++ b/utils/generate-dummy-bitnet-model.py
@@ -125,13 +125,13 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
-        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness, use_temp_file=self.use_temp_file)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

@@ -601,14 +601,14 @@ def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_w
        left_weight = func_weights[0]
        left_sub_weights = np.split(left_weight, 4, axis=0)
        new_left_weight = np.reshape(
-            np.concatenate([left_sub_weights[0], left_sub_weights[2], 
+            np.concatenate([left_sub_weights[0], left_sub_weights[2],
                            left_sub_weights[1], left_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))

        right_weight = func_weights[1]
        right_sub_weights = np.split(right_weight, 4, axis=0)
        new_right_weight = np.reshape(
-            np.concatenate([right_sub_weights[0], right_sub_weights[2], 
+            np.concatenate([right_sub_weights[0], right_sub_weights[2],
                            right_sub_weights[1], right_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))

        hi_weight = new_left_weight.astype(np.uint8) << 4
@@ -651,7 +651,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
        left_weight = func_weights[0]
        left_sub_weights = np.split(left_weight, 4, axis=0)
        new_left_weight = np.reshape(
-            np.concatenate([left_sub_weights[0], left_sub_weights[2], 
+            np.concatenate([left_sub_weights[0], left_sub_weights[2],
                            left_sub_weights[1], left_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))
@@ -659,7 +659,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
        right_sub_weights = np.split(right_weight, 4, axis=0)
        new_right_weight = np.reshape(
-            np.concatenate([right_sub_weights[0], right_sub_weights[2], 
+            np.concatenate([right_sub_weights[0], right_sub_weights[2],
                            right_sub_weights[1], right_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))

        hi_weight = new_left_weight.astype(np.uint8) << 4
@@ -771,13 +771,13 @@ def preprocess_weights_tl2(
    weight = np.array(final_weight, dtype=np.uint8)
    return weight
-    
+


 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
    model_arch = gguf.MODEL_ARCH.BITNET
    params: str = ""
-    
+
    def set_params(self, params: str):
        self.params = params
        hp_config = model_config[self.params]
@@ -788,11 +788,11 @@ def set_params(self, params: str):
        self.hparams["num_key_value_heads"] = hp_config["num_attention_heads"]
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-    
+
    def set_vocab(self):
        self._set_vocab_sentencepiece()
-    
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

@@ -819,7 +819,7 @@ def transform_to_tl2(self, x: np.ndarray):
        # res = np.round(x / scale + 2).astype(np.uint8)
        res = preprocess_weights_tl2(x)
        return res, scale
-    
+
    # generate dummy model
    def generate_tensors(self) -> Iterator[tuple[str, np.ndarray]]:
        hp_config = model_config[self.params]
@@ -851,7 +851,7 @@ def generate_tensors(self) -> Iterator[tuple[str, np.ndarray]]:

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
                          "o_proj.weight")):
            data_torch = self.weight_quant(data_torch)
@@ -1015,7 +1015,7 @@ def read_gguf_file(gguf_file_path):
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
-    
+
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Generate a dummy bitnet model with GGUF format")
@@ -1045,4 +1045,4 @@ def parse_args() -> argparse.Namespace:

 if __name__ == '__main__':
    args = parse_args()
-    main()
\ No newline at end of file
+    main()