8 changes: 4 additions & 4 deletions gpu/model.py
@@ -25,7 +25,7 @@ def bitnet_int8xint2_linear(input0, input1, s, ws):
stream = torch.cuda.current_stream()

M = input0.shape[0]
if len(out_shape) == 3:
if len(out_shape) == 3:
M *= input0.shape[1]
N = input1.shape[0]
K = input1.shape[1] * 4
@@ -319,7 +319,7 @@ def make_cache(
cache entries (defaults to the default dtype).

Returns:
The cache object to pass to ``Tranformer.forward``.
The cache object to pass to ``Transformer.forward``.
"""

head_dim = args.dim // args.n_heads
@@ -348,7 +348,7 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
Take a prefix view of a larger cache.

The original cache object remains of identical size and valid
after the shrinked alias has been used. This function is useful
after the shrunk alias has been used. This function is useful
when a cache was allocated for a larger batch size than what is
necessary.

@@ -363,4 +363,4 @@ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
if len(cache) > 0:
assert cache[0][0].shape[1] >= length

return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
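The hunk above shows only the tail of cache_prefix; below is a self-contained sketch of the behaviour its docstring describes. Only the dim-1 slicing and the return expression come from the diff; the layer count and tensor shapes are invented for illustration.

# Sketch only: shapes and layer count are made up; the slicing on dim 1 and
# the return expression mirror the cache_prefix shown in the diff above.
import torch

LayerCache = tuple[torch.Tensor, torch.Tensor]

def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
    if len(cache) > 0:
        assert cache[0][0].shape[1] >= length
    # Return narrower aliases; the original cache is neither copied nor resized.
    return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]

# Hypothetical cache whose dim 1 was allocated for 8 sequences; serve only 2.
full = [(torch.zeros(16, 8, 4, 64), torch.zeros(16, 8, 4, 64)) for _ in range(2)]
small = cache_prefix(full, 2)

assert small[0][0].shape[1] == 2                        # alias is narrower
assert full[0][0].shape[1] == 8                         # original untouched
assert small[0][0].data_ptr() == full[0][0].data_ptr()  # same storage, no copy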
8 changes: 4 additions & 4 deletions gpu/tokenizer.py
@@ -117,7 +117,7 @@ def encode(
By default, setting disallowed_special=() encodes a string by ignoring
special tokens. Specifically:
- Setting `disallowed_special` to () will cause all text corresponding
to special tokens to be encoded as natural text (insteading of raising
to special tokens to be encoded as natural text (instead of raising
an error).
- Setting `allowed_special` to "all" will treat all text corresponding
to special tokens to be encoded as special tokens.
@@ -198,7 +198,7 @@ class ChatFormat:
def __init__(self, tokenizer: Tokenizer):
self.tokenizer = tokenizer
self.eot_id = tokenizer.special_tokens["<|eot_id|>"]

def decode(self, tokens: List[int]) -> str:
# Decode the tokens to a string.
decoded_str = self.tokenizer.decode(tokens)
@@ -250,8 +250,8 @@ def encode_dialog_prompt(self, dialog: Dialog, completion=False, return_target=F
# Add the start of an assistant message for the model to complete.
if completion:
tokens.extend(self.encode_header({"role": "assistant", "content": ""}))

if return_target:
return tokens, targets

return tokens
return tokens
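The allowed_special / disallowed_special behaviour documented in the encode docstring above follows the tiktoken convention; a small sketch with a stock tiktoken encoding, assuming (as in the Llama 3 reference tokenizer) that encode() forwards these arguments to tiktoken unchanged:

# Sketch of the special-token handling described in the encode() docstring.
# Uses a stock tiktoken encoding; the repo's Tokenizer is assumed to forward
# allowed_special / disallowed_special to tiktoken as-is.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "hello <|endoftext|> world"

as_plain = enc.encode(text, disallowed_special=())    # special text encoded as natural text
as_special = enc.encode(text, allowed_special="all")  # special text becomes the special token id
assert enc.eot_token in as_special and enc.eot_token not in as_plain

try:
    enc.encode(text)  # default (disallowed_special="all") raises on special-token text
except ValueError:
    pass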
16 changes: 8 additions & 8 deletions utils/codegen_tl1.py
@@ -206,7 +206,7 @@ def gen_body_core_code(bm, by):
vec_c[{7}] += vec_v_left_{0}.val[1];\n\
vec_c[{7}] += vec_v_right_{0}.val[1];\n\
".format(i, 2 * by // 2, (4 * i) % (2 * by // 2), (4 * i + 1) % (2 * by // 2), (4 * i + 2) % (2 * by // 2), (4 * i + 3) % (2 * by // 2), (i * 2) // (by // 2) * 2 + 0, (i * 2) // (by // 2) * 2 + 1)

all_code = "".join([all_code, core_code])

all_code = "".join([all_code, "\n }\n\n"])
@@ -235,7 +235,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k):
const int8x16_t vec_zero = vdupq_n_s16(0x0000);\n\
int8x16_t vec_lut[2 * KK];\n\
".format(pre, BM, BK)

kernel_code = "".join([kernel_code, " int16x8_t vec_c[{}];".format(bm // 8)])

kernel_code = "".join([kernel_code, "\n\
@@ -378,11 +378,11 @@ def gen_transform_code(kernel_shape):
"Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
[4096, 14336],
[1024, 4096],
[4096, 4096]]
[4096, 4096]]
}

parser = argparse.ArgumentParser(description='gen impl')
parser.add_argument('--model',default="input", type=str, dest="model",
parser.add_argument('--model',default="input", type=str, dest="model",
help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
parser.add_argument('--BM',default="input", type=str,
help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -398,8 +398,8 @@ def gen_transform_code(kernel_shape):
BK_list = [int(item) for item in args.BK.split(',')]
bm_list = [int(item) for item in args.bm.split(',')]

assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))

for i in range(len(kernel_shapes)):
assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
assert kernel_shapes[i][1] % BK_list[i] == 0, "K %% BK should be 0"
@@ -439,4 +439,4 @@ def gen_transform_code(kernel_shape):
config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
config.write(configfile)
config.write(configfile)
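Taken together, the --BM/--BK help strings and the asserts above fix a simple tiling contract: each (M, K) kernel shape is cut into M / BM row blocks of shape (BM, K), and BM and BK must divide M and K evenly. A quick sanity check on the Llama3-8B-1.58 shapes listed in this file (the BM/BK values below are placeholders that merely satisfy the divisibility constraints, not the repo's tuned defaults):

# Placeholder BM/BK values chosen only so that M % BM == 0 and K % BK == 0;
# the kernel shapes are the Llama3-8B-1.58-100B-tokens entries from the table above.
kernel_shapes = [[14336, 4096], [4096, 14336], [1024, 4096], [4096, 4096]]
BM_list = [256, 128, 256, 256]
BK_list = [128, 64, 128, 64]

for (M, K), BM, BK in zip(kernel_shapes, BM_list, BK_list):
    assert M % BM == 0, "M % BM should be 0"
    assert K % BK == 0, "K % BK should be 0"
    print(f"({M}, {K}) -> {M // BM} blocks of ({BM}, {K}), K swept in {K // BK} steps of {BK}")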
10 changes: 5 additions & 5 deletions utils/codegen_tl2.py
@@ -690,11 +690,11 @@ def get_three_k_two_k(K, bk):
"Llama3-8B-1.58-100B-tokens" : [[14336, 4096],
[4096, 14336],
[1024, 4096],
[4096, 4096]]
[4096, 4096]]
}

parser = argparse.ArgumentParser(description='gen impl')
parser.add_argument('--model',default="input", type=str, dest="model",
parser.add_argument('--model',default="input", type=str, dest="model",
help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
parser.add_argument('--BM',default="input", type=str,
help="block length when cutting one weight (M, K) into M / BM weights (BM, K).")
@@ -721,8 +721,8 @@ def get_three_k_two_k(K, bk):
gen_tbl_impl("{}_{}".format(kernel_shapes[i][0], kernel_shapes[i][1]), BM_list[i], BK_list[i], bm_list[i], k_list[i])
)

assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm shoud be {}".format(len(kernel_shapes))
assert(len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes)), "number of BM / BK / bm should be {}".format(len(kernel_shapes))

for i in range(len(kernel_shapes)):
assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
assert (kernel_shapes[i][1] % BK_list[i]) % 32 == 0, "K %% BK %% 32 should be 0"
@@ -754,4 +754,4 @@ def get_three_k_two_k(K, bk):
config.set('Kernels_{}'.format(i), 'bmm'.format(i), str(bm_list[i]))

with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
config.write(configfile)
config.write(configfile)
12 changes: 6 additions & 6 deletions utils/convert-hf-to-gguf-bitnet.py
@@ -53,13 +53,13 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

@@ -542,7 +542,7 @@ def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_w
weight = weight.reshape((M * K // bm // by, bm // 8, 8))
weight[:, [0, 1, 2, 3], :] = weight[:, [0, 2, 1, 3], :]
weight = weight.reshape(M * K // bm // by, bm)

for i in range(weight.shape[0]):
final_weight.append(weight[i, :])

@@ -590,7 +590,7 @@ def preprocess_three_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final
combine_weight += temp_weight
combine_weight = combine_weight.view(np.uint8)
combine_weight = combine_weight.reshape((M * K // bm // (by * 4)), bm)

for i in range(combine_weight.shape[0]):
final_weight.append(combine_weight[i, :])

@@ -958,7 +958,7 @@ class BitnetModel(Model):

def set_vocab(self):
self._set_vocab_sentencepiece()

def set_gguf_parameters(self):
super().set_gguf_parameters()

@@ -976,7 +976,7 @@ def weight_quant(self, weight):

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# quant weight to i2 (in fp16)
if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
"down_proj.weight", "up_proj.weight", "gate_proj.weight",
"o_proj.weight")):
data_torch = self.weight_quant(data_torch)
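The body of weight_quant sits outside this hunk; the absmean ternary quantiser that BitNet b1.58 conversion scripts typically implement behind the "quant weight to i2 (in fp16)" comment looks roughly like the sketch below (an assumption, not the file's actual code):

# Sketch only: weight_quant's real body is not part of this diff. Standard
# BitNet b1.58 absmean quantisation to the ternary grid {-1, 0, +1}, returned
# in the original (fp16) dtype to match the "i2 (in fp16)" comment above.
import torch

def weight_quant_sketch(weight: torch.Tensor) -> torch.Tensor:
    dtype = weight.dtype
    w = weight.float()
    s = 1.0 / w.abs().mean().clamp(min=1e-5)  # per-tensor inverse absmean scale
    return ((w * s).round().clamp(-1, 1) / s).to(dtype)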
30 changes: 15 additions & 15 deletions utils/convert-ms-to-gguf-bitnet.py
@@ -739,7 +739,7 @@ def preprocess_weights(
kfactor = int(cf.get(sec, 'kfactor'))
simd_n_in = int(cf.get(sec, 'simd_n_in'))
simd_n_out = int(cf.get(sec, 'simd_n_out'))
break
break

M = M * bits
ngroups_per_elem = 8 // g
@@ -1241,8 +1241,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

def add_meta_arch(self, params: Params) -> None:
name = "bitnet"
@@ -1364,7 +1364,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency:
logger.info(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)

if i2_scale is not None:
i2_scale = np.tile(i2_scale, 8)
ndarray = preprocess_weights(ndarray)
@@ -1379,11 +1379,11 @@ def close(self) -> None:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)
of = OutputFile(fname_out, endianness=endianness)

# meta data
of.add_meta_arch(params)
@@ -1410,12 +1410,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
@staticmethod
def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)
of = OutputFile(fname_out, endianness=endianness)

if 'bitnet' in of.gguf.arch:
svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
@@ -1493,7 +1493,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
# 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
rope_ndarray = (1.0 / (torch.tensor(500000.0) ** (torch.arange(0, 128, 2).float() / 128))).numpy().astype(np.float32)
# print(rope_ndarray)


def load() -> UnquantizedTensor:
return UnquantizedTensor(rope_ndarray)
@@ -1508,7 +1508,7 @@ def load() -> UnquantizedTensor:
# print(lazy_tensor.load().ndarray)
# asfasf

# HF models permut or pack some of the tensors, so we need to undo that
# HF models permute or pack some of the tensors, so we need to undo that

# if ARCH == gguf.MODEL_ARCH.LLAMA or ARCH == gguf.MODEL_ARCH.BITNET:
# print(tmp.keys())
@@ -1560,7 +1560,7 @@ def load() -> UnquantizedTensor:
# break

# for name, lazy_tensor in model.items():
# if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
# if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
# "w1.weight", "w2.weight", "w3.weight",
# "wo.weight")):
# tmp[name] = part_lazy_weight_quant(tmp[name], name)
@@ -1791,9 +1791,9 @@ def main(args_in: list[str] | None = None) -> None:
do_dump_model(model_plus)
return

endianess = gguf.GGUFEndian.LITTLE
endianness = gguf.GGUFEndian.LITTLE
if args.big_endian:
endianess = gguf.GGUFEndian.BIG
endianness = gguf.GGUFEndian.BIG

params = Params.load(model_plus)
if params.n_ctx == -1:
@@ -1828,7 +1828,7 @@ def main(args_in: list[str] | None = None) -> None:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab)
endianness=endianness, pad_vocab=args.pad_vocab)
logger.info(f"Wrote {outfile}")
return

@@ -1847,7 +1847,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
concurrency=args.concurrency, endianness=endianness, pad_vocab=args.pad_vocab)
logger.info(f"Wrote {outfile}")

