diff --git a/gpu/model.py b/gpu/model.py
index cd5abec01..a606cfe27 100755
--- a/gpu/model.py
+++ b/gpu/model.py
@@ -25,7 +25,7 @@ def bitnet_int8xint2_linear(input0, input1, s, ws):
     stream = torch.cuda.current_stream()
 
     M = input0.shape[0]
-    if len(out_shape) == 3:
+    if len(out_shape) == 3:
         M *= input0.shape[1]
     N = input1.shape[0]
     K = input1.shape[1] * 4
@@ -319,7 +319,7 @@ def make_cache(
             cache entries (defaults to the default dtype).
 
     Returns:
-        The cache object to pass to ``Tranformer.forward``.
+        The cache object to pass to ``Transformer.forward``.
     """
 
     head_dim = args.dim // args.n_heads
@@ -348,7 +348,7 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
    Take a prefix view of a larger cache.

    The original cache object remains of identical size and valid
-    after the shrinked alias has been used. This function is useful
+    after the shrunk alias has been used. This function is useful
    when a cache was allocated for a larger batch size than what is
    necessary.

@@ -363,4 +363,4 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:

     if len(cache) > 0:
         assert cache[0][0].shape[1] >= length
-    return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
\ No newline at end of file
+    return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
diff --git a/gpu/tokenizer.py b/gpu/tokenizer.py
index 38e0fd2d1..1d7a7c90c 100755
--- a/gpu/tokenizer.py
+++ b/gpu/tokenizer.py
@@ -117,7 +117,7 @@ def encode(
        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
-          to special tokens to be encoded as natural text (insteading of raising
+          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will treat all text corresponding
          to special tokens to be encoded as special tokens.
@@ -198,7 +198,7 @@ class ChatFormat:
     def __init__(self, tokenizer: Tokenizer):
         self.tokenizer = tokenizer
         self.eot_id = tokenizer.special_tokens["<|eot_id|>"]
-        
+
     def decode(self, tokens: List[int]) -> str:
         # Decode the tokens to a string.
         decoded_str = self.tokenizer.decode(tokens)
@@ -250,8 +250,8 @@ def encode_dialog_prompt(self, dialog: Dialog, completion=False, return_target=F

         # Add the start of an assistant message for the model to complete.
         if completion:
             tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
-        
+
         if return_target:
             return tokens, targets
-        return tokens
\ No newline at end of file
+        return tokens
diff --git a/utils/codegen_tl1.py b/utils/codegen_tl1.py
index 4c2e7dd3f..35b3a537c 100644
--- a/utils/codegen_tl1.py
+++ b/utils/codegen_tl1.py
@@ -206,7 +206,7 @@ def gen_body_core_code(bm, by):
    vec_c[{7}] += vec_v_left_{0}.val[1];\n\
    vec_c[{7}] += vec_v_right_{0}.val[1];\n\
".format(i, 2 * by // 2, (4 * i) % (2 * by // 2), (4 * i + 1) % (2 * by // 2), (4 * i + 2) % (2 * by // 2), (4 * i + 3) % (2 * by // 2), (i * 2) // (by // 2) * 2 + 0, (i * 2) // (by // 2) * 2 + 1)
-        
+
         all_code = "".join([all_code, core_code])

     all_code = "".join([all_code, "\n }\n\n"])
@@ -235,7 +235,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k):
    const int8x16_t vec_zero = vdupq_n_s16(0x0000);\n\
    int8x16_t vec_lut[2 * KK];\n\
".format(pre, BM, BK)
-    
+
    kernel_code = "".join([kernel_code, " int16x8_t vec_c[{}];".format(bm // 8)])

    kernel_code = "".join([kernel_code, "\n\
@@ -378,11 +378,11 @@ def gen_transform_code(kernel_shape):
        "Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
                                        [4096, 14336],
                                        [1024, 4096],
-                                        [4096, 4096]]
+                                        [4096, 4096]]
    }
-    
+
    parser = argparse.ArgumentParser(description='gen impl')
-    parser.add_argument('--model',default="input", type=str, dest="model", 
+    parser.add_argument('--model',default="input", type=str, dest="model",
                        help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
    parser.add_argument('--BM',default="input", type=str,
                        help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -398,8 +398,8 @@ def gen_transform_code(kernel_shape):
    BK_list = [int(item) for item in args.BK.split(',')]
    bm_list = [int(item) for item in args.bm.split(',')]

-    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
-    
+    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))
+
    for i in range(len(kernel_shapes)):
        assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
        assert kernel_shapes[i][1] % BK_list[i] == 0, "K %% BK should be 0"
@@ -439,4 +439,4 @@ def gen_transform_code(kernel_shape):
        config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

    with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
-        config.write(configfile)
\ No newline at end of file
+        config.write(configfile)
diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
index 4d9408123..348691447 100644
--- a/utils/codegen_tl2.py
+++ b/utils/codegen_tl2.py
@@ -690,11 +690,11 @@ def get_three_k_two_k(K, bk):
        "Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
                                        [4096, 14336],
                                        [1024, 4096],
-                                        [4096, 4096]]
+                                        [4096, 4096]]
    }

    parser = argparse.ArgumentParser(description='gen impl')
-    parser.add_argument('--model',default="input", type=str, dest="model", 
+    parser.add_argument('--model',default="input", type=str, dest="model",
                        help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
    parser.add_argument('--BM',default="input", type=str,
                        help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -721,8 +721,8 @@ def get_three_k_two_k(K, bk):
            gen_tbl_impl("{}_{}".format(kernel_shapes[i][0], kernel_shapes[i][1]), BM_list[i], BK_list[i], bm_list[i], k_list[i])
        )

-    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
-    
+    assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))
+
    for i in range(len(kernel_shapes)):
        assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
        assert (kernel_shapes[i][1] % BK_list[i]) % 32 == 0, "K %% BK %% 32 should be 0"
@@ -754,4 +754,4 @@ def get_three_k_two_k(K, bk):
        config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

    with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
-        config.write(configfile)
\ No newline at end of file
+        config.write(configfile)
diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 23e84384c..b63d52111 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -53,13 +53,13 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
-        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness, use_temp_file=self.use_temp_file)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

@@ -542,7 +542,7 @@ def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_w
    weight = weight.reshape((M * K // bm // by, bm // 8, 8))
    weight[:, [0, 1, 2, 3], :] = weight[:, [0, 2, 1, 3], :]
    weight = weight.reshape(M * K // bm // by, bm)
-    
+
    for i in range(weight.shape[0]):
        final_weight.append(weight[i, :])

@@ -590,7 +590,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
            combine_weight += temp_weight
    combine_weight = combine_weight.view(np.uint8)
    combine_weight = combine_weight.reshape((M * K // bm // (by * 4)), bm)
-    
+
    for i in range(combine_weight.shape[0]):
        final_weight.append(combine_weight[i, :])

@@ -958,7 +958,7 @@ class BitnetModel(Model):

    def set_vocab(self):
        self._set_vocab_sentencepiece()
-    
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

@@ -976,7 +976,7 @@ def weight_quant(self, weight):

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
                          "o_proj.weight")):
            data_torch = self.weight_quant(data_torch)
diff --git a/utils/convert-ms-to-gguf-bitnet.py b/utils/convert-ms-to-gguf-bitnet.py
index e9e91622e..775ff60ed 100644
--- a/utils/convert-ms-to-gguf-bitnet.py
+++ b/utils/convert-ms-to-gguf-bitnet.py
@@ -739,7 +739,7 @@ def preprocess_weights(
            kfactor = int(cf.get(sec, 'kfactor'))
            simd_n_in = int(cf.get(sec, 'simd_n_in'))
            simd_n_out = int(cf.get(sec, 'simd_n_out'))
-            break
+            break

    M = M * bits
    ngroups_per_elem = 8 // g
@@ -1241,8 +1241,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

    def add_meta_arch(self, params: Params) -> None:
        name = "bitnet"
@@ -1364,7 +1364,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency:
            logger.info(
                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
-            
+
            if i2_scale is not None:
                i2_scale = np.tile(i2_scale, 8)
                ndarray = preprocess_weights(ndarray)
@@ -1379,11 +1379,11 @@ def close(self) -> None:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        # meta data
        of.add_meta_arch(params)
@@ -1410,12 +1410,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
    @staticmethod
    def write_all(
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        if 'bitnet' in of.gguf.arch:
            svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
@@ -1493,7 +1493,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
    # 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
    rope_ndarray = (1.0 / (torch.tensor(500000.0) ** (torch.arange(0, 128, 2).float() / 128))).numpy().astype(np.float32)
    # print(rope_ndarray)
-    
+
    def load() -> UnquantizedTensor:
        return UnquantizedTensor(rope_ndarray)

@@ -1508,7 +1508,7 @@ def load() -> UnquantizedTensor:
    # print(lazy_tensor.load().ndarray)
    # asfasf

-    # HF models permut or pack some of the tensors, so we need to undo that
+    # HF models permute or pack some of the tensors, so we need to undo that
    # if ARCH == gguf.MODEL_ARCH.LLAMA or ARCH == gguf.MODEL_ARCH.BITNET:
    #     print(tmp.keys())

@@ -1560,7 +1560,7 @@ def load() -> UnquantizedTensor:
    #             break

    # for name, lazy_tensor in model.items():
-    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
    #                       "w1.weight", "w2.weight", "w3.weight",
    #                       "wo.weight")):
    #         tmp[name] = part_lazy_weight_quant(tmp[name], name)
@@ -1791,9 +1791,9 @@ def main(args_in: list[str] | None = None) -> None:
        do_dump_model(model_plus)
        return

-    endianess = gguf.GGUFEndian.LITTLE
+    endianness = gguf.GGUFEndian.LITTLE
    if args.big_endian:
-        endianess = gguf.GGUFEndian.BIG
+        endianness = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1828,7 +1828,7 @@ def main(args_in: list[str] | None = None) -> None:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianness=endianness, pad_vocab=args.pad_vocab)
        logger.info(f"Wrote {outfile}")
        return

@@ -1847,7 +1847,7 @@ def main(args_in: list[str] | None = None) -> None:

    logger.info(f"Writing {outfile}, format {ftype}")
    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianness=endianness, pad_vocab=args.pad_vocab)
    logger.info(f"Wrote {outfile}")

diff --git a/utils/convert.py b/utils/convert.py
index 5938c42f2..e4298ff58 100644
--- a/utils/convert.py
+++ b/utils/convert.py
@@ -739,7 +739,7 @@ def preprocess_weights(
            kfactor = int(cf.get(sec, 'kfactor'))
            simd_n_in = int(cf.get(sec, 'simd_n_in'))
            simd_n_out = int(cf.get(sec, 'simd_n_out'))
-            break
+            break

    M = M * bits
    ngroups_per_elem = 8 // g
@@ -1167,8 +1167,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -1295,7 +1295,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency:
            logger.info(
                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
-            
+
            if i2_scale is not None:
                i2_scale = np.tile(i2_scale, 8)
                ndarray = preprocess_weights(ndarray)
@@ -1310,11 +1310,11 @@ def close(self) -> None:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        # meta data
        of.add_meta_arch(params)
@@ -1341,12 +1341,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
    @staticmethod
    def write_all(
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

        # meta data
        of.add_meta_arch(params)
@@ -1418,7 +1418,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
                raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
            tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)

-    # HF models permut or pack some of the tensors, so we need to undo that
+    # HF models permute or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
            logger.debug(f"Permuting layer {i}")
@@ -1433,7 +1433,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
        else:
            break
-    
+
    # check if is bitnet
    if ARCH == 33:
        del tmp['output.weight']
@@ -1647,9 +1647,9 @@ def main(args_in: list[str] | None = None) -> None:
        do_dump_model(model_plus)
        return

-    endianess = gguf.GGUFEndian.LITTLE
+    endianness = gguf.GGUFEndian.LITTLE
    if args.big_endian:
-        endianess = gguf.GGUFEndian.BIG
+        endianness = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1684,7 +1684,7 @@ def main(args_in: list[str] | None = None) -> None:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianness=endianness, pad_vocab=args.pad_vocab)
        logger.info(f"Wrote {outfile}")
        return

@@ -1703,7 +1703,7 @@ def main(args_in: list[str] | None = None) -> None:

    logger.info(f"Writing {outfile}, format {ftype}")
    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianness=endianness, pad_vocab=args.pad_vocab)
    logger.info(f"Wrote {outfile}")

diff --git a/utils/generate-dummy-bitnet-model.py b/utils/generate-dummy-bitnet-model.py
index be3f6cdaa..e5dae298a 100644
--- a/utils/generate-dummy-bitnet-model.py
+++ b/utils/generate-dummy-bitnet-model.py
@@ -125,13 +125,13 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
-        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness, use_temp_file=self.use_temp_file)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

@@ -601,14 +601,14 @@ def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_w
        left_weight = func_weights[0]
        left_sub_weights = np.split(left_weight, 4, axis=0)
        new_left_weight = np.reshape(
-            np.concatenate([left_sub_weights[0], left_sub_weights[2], 
+            np.concatenate([left_sub_weights[0], left_sub_weights[2],
                            left_sub_weights[1], left_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))

        right_weight = func_weights[1]
        right_sub_weights = np.split(right_weight, 4, axis=0)
        new_right_weight = np.reshape(
-            np.concatenate([right_sub_weights[0], right_sub_weights[2], 
+            np.concatenate([right_sub_weights[0], right_sub_weights[2],
                            right_sub_weights[1], right_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))

        hi_weight = new_left_weight.astype(np.uint8) << 4
@@ -651,7 +651,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
        left_weight = func_weights[0]
        left_sub_weights = np.split(left_weight, 4, axis=0)
        new_left_weight = np.reshape(
-            np.concatenate([left_sub_weights[0], left_sub_weights[2], 
+            np.concatenate([left_sub_weights[0], left_sub_weights[2],
                            left_sub_weights[1], left_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))
@@ -659,7 +659,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
        right_sub_weights = np.split(right_weight, 4, axis=0)
        new_right_weight = np.reshape(
-            np.concatenate([right_sub_weights[0], right_sub_weights[2], 
+            np.concatenate([right_sub_weights[0], right_sub_weights[2],
                            right_sub_weights[1], right_sub_weights[3]], axis=0,
                            dtype=np.uint8), (bm))

        hi_weight = new_left_weight.astype(np.uint8) << 4
@@ -771,13 +771,13 @@ def preprocess_weights_tl2(
    weight = np.array(final_weight, dtype=np.uint8)
    return weight
-    
+


 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
    model_arch = gguf.MODEL_ARCH.BITNET
    params: str = ""
-    
+
    def set_params(self, params: str):
        self.params = params
        hp_config = model_config[self.params]
@@ -788,11 +788,11 @@ def set_params(self, params: str):
        self.hparams["num_key_value_heads"] = hp_config["num_attention_heads"]
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-    
+
    def set_vocab(self):
        self._set_vocab_sentencepiece()
-    
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

@@ -819,7 +819,7 @@ def transform_to_tl2(self, x: np.ndarray):
        # res = np.round(x / scale + 2).astype(np.uint8)
        res = preprocess_weights_tl2(x)
        return res, scale
-    
+
    # generate dummy model
    def generate_tensors(self) -> Iterator[tuple[str, np.ndarray]]:
        hp_config = model_config[self.params]
@@ -851,7 +851,7 @@ def generate_tensors(self) -> Iterator[tuple[str, np.ndarray]]:

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
                          "o_proj.weight")):
            data_torch = self.weight_quant(data_torch)
@@ -1015,7 +1015,7 @@ def read_gguf_file(gguf_file_path):
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
-    
+
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Generate a dummy bitnet model with GGUF format")
@@ -1045,4 +1045,4 @@ def parse_args() -> argparse.Namespace:

 if __name__ == '__main__':
    args = parse_args()
-    main()
\ No newline at end of file
+    main()