From d2c35fc1b69e39ed91ab7357d18457c702fa913a Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 7 May 2022 18:08:22 +0700 Subject: [PATCH 001/148] added train script but with prefix manually declared --- pretrain_mp3_gpt.py | 258 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 pretrain_mp3_gpt.py diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py new file mode 100644 index 000000000..cd1000631 --- /dev/null +++ b/pretrain_mp3_gpt.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pretrain GPT""" + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import subprocess + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + prefix_lm=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=True + ) + see_memory_usage(f"After Building Model", force=True) + return model + +_KEYS = ['text', 'prompt', 'answer'] + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + + # Get the masks and postition ids. 
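+    # `prefix_indices` marks, for each sample, where the prompt ends. The call below
+    # uses it to build a prefix-LM attention mask (bidirectional over the prompt,
+    # causal afterwards) and, when `loss_on_targets_only` is set, to zero the loss
+    # on prompt tokens so that only target tokens contribute to training.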
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + + # Get the masks and position ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
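+    # get_batch() is mainly exercised on the plain GPTModel path; under the DeepSpeed
+    # pipeline engine, batches are instead pulled through get_batch_pipe, which was
+    # registered above via model._megatron_batch_fn / set_batch_fn.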
+ timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + + import sys + sys.exit() + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From f977b857f83a28d34b4a5e11b503554292e21594 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 11:06:47 +0700 Subject: [PATCH 002/148] made new dataset --- megatron/data/non_causal_mtf_dataset.py | 527 ++++++++++++++++++++++++ pretrain_mp3_gpt.py | 12 +- train_mp3_gpt.sh | 115 ++++++ 3 files changed, 649 insertions(+), 5 deletions(-) create mode 100644 megatron/data/non_causal_mtf_dataset.py create mode 100644 
train_mp3_gpt.sh diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py new file mode 100644 index 000000000..4c3bc8e50 --- /dev/null +++ b/megatron/data/non_causal_mtf_dataset.py @@ -0,0 +1,527 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPT style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. 
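+    # `paths`, `weights` and `splits` are parallel lists parsed from the group string,
+    # e.g. "mygroup 0.3 0:0.6 pathA, 0.7 0:0.6 pathB" would arrive here as
+    # paths=[pathA, pathB], weights=[0.3, 0.7], splits=["0:0.6", "0:0.6"]
+    # (names and numbers are illustrative only).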
+ if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. 
+ print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +class NonCausalMTFDataset(torch.utils.data.Dataset): + + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed + ): + + self.name = name + self.indexed_dataset = indexed_dataset + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, data_prefix, documents, self.indexed_dataset.sizes, + num_samples, seq_length, seed) + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + + return { + 'text': np.array(sample, dtype=np.int64), + 'prefix_len': 0 + } + + +def _build_index_mappings(name, data_prefix, documents, sizes, + num_samples, seq_length, seed, cutoff_last_epoch=0.95): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. 
+ shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + _filename = data_prefix + _filename += '_{}_indexmap'.format(name) + _filename += '_{}ns'.format(num_samples) + _filename += '_{}sl'.format(seq_length) + _filename += '_{}s'.format(seed) + doc_idx_filename = _filename + '_doc_idx.npy' + sample_idx_filename = _filename + '_sample_idx.npy' + shuffle_idx_filename = _filename + '_shuffle_idx.npy' + + # Build the indexed mapping if not exist. + if torch.distributed.get_rank() == 0: + if (not os.path.isfile(doc_idx_filename)) or \ + (not os.path.isfile(sample_idx_filename)) or \ + (not os.path.isfile(shuffle_idx_filename)): + + print_rank_0(' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # For the last epoch, decide whether include the entire epoch + # in the global shuffle or not. + + # If we need only one epoch, then separating last epoch does + # not mean anything. + if num_epochs == 1: + separate_last_epoch = False + print(' > only one epoch required, setting ' + 'separate_last_epoch to False', flush=True) + + else: + # Get the number of samples for the last epoch + num_samples_from_epochs_minus_one = ( + (num_epochs - 1) * tokens_per_epoch - 1) // seq_length + last_epoch_num_samples = num_samples - \ + num_samples_from_epochs_minus_one + assert last_epoch_num_samples >= 0, \ + f'last epoch number of samples {last_epoch_num_samples} should be non-negative.' + num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length + assert last_epoch_num_samples <= num_samples_per_epoch, \ + f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.' + # If we have less than cutoff_last_epoch * samples_per_epoch of the samples for the last epoch, + # seperate out the epoch and treat it differently. + separate_last_epoch = (last_epoch_num_samples < + int(cutoff_last_epoch * num_samples_per_epoch)) + if separate_last_epoch: + string = ' > last epoch number of samples ({}) is smaller '\ + 'than {}% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to True' + else: + string = ' > last epoch number of samples ({}) is larger '\ + 'than {}% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to False' + print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100, + num_samples_per_epoch), flush=True) + + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, + separate_last_epoch) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save doc-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. 
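+            # `helpers` is the C++ extension built from megatron/data/helpers.cpp;
+            # build_sample_idx is a faster drop-in for the pure-Python
+            # _build_sample_idx defined further down in this file.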
+ from megatron.data import helpers + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch) + # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, + # num_epochs, tokens_per_epoch) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save sample-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + if separate_last_epoch: + num_samples_ = num_samples_from_epochs_minus_one + else: + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save shuffle-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load mappings. + start_time = time.time() + print_rank_0(' > loading doc-idx mapping from {}'.format( + doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading sample-idx mapping from {}'.format( + sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading shuffle-idx mapping from {}'.format( + shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + sample_idx.shape[0])) + print_rank_0(' total number of epochs: {}'.format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence lenght, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): + """Build an array with length = number-of-epochs * number-of-dcuments. 
+ Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Begining offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += (remaining_seq_length + doc_length - 1) + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. 
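+        # Each recorded row is (index into doc_idx, token offset within that document),
+        # i.e. where the next sample begins; sample i spans sample_idx[i]..sample_idx[i+1].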
+ sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print(' > building shuffle index with split [0, {}) and [{}, {}) ' + '...'.format(num_samples, num_samples, total_size), flush=True) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index cd1000631..acf143573 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -22,7 +22,8 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +# from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -66,7 +67,7 @@ def model_provider(pre_process=True, post_process=True): see_memory_usage(f"After Building Model", force=True) return model -_KEYS = ['text', 'prompt', 'answer'] +_KEYS = ['text', 'prefix_len'] def get_batch(data_iterator): """Generate a batch""" @@ -90,7 +91,10 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + prefix_indices = data_b['prefix_len'].long() + print(prefix_indices) + import sys + sys.exit() # Get the masks and postition ids. 
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -192,8 +196,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): seed=args.seed, skip_warmup=(not args.mmap_warmup)) - import sys - sys.exit() # Option 2 of data loading using --(train|valid|test)-weighted-split-paths elif args.train_weighted_split_paths: assigned_train_valid_test = [] diff --git a/train_mp3_gpt.sh b/train_mp3_gpt.sh new file mode 100644 index 000000000..56a1c8767 --- /dev/null +++ b/train_mp3_gpt.sh @@ -0,0 +1,115 @@ +CHECKPOINT_PATH=checkpoints/gpt2 +VOCAB_FILE=data/gpt2-vocab.json +MERGE_FILE=data/gpt2-merges.txt +DATA_PATH=data/meg-gpt2_oscar-combined_text_document +TENSORBOARD_PATH=output_dir/tensorboard +CODECARBON_PATH=output_dir/codecarbon + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=16 +TP_SIZE=1 +PP_SIZE=1 + +N_GPUS=2 +SAVE_INTERVAL=100 + +# --train-samples 10_000 \ +# --exit-interval $EXIT_INTERVAL \ + +# --exit-interval 100 \ +GPT_ARGS=" \ + --num-layers 2 \ + --hidden-size 64 \ + --num-attention-heads 2 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --rampup-batch-size 2 2 1_000 \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples 100 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 1e-4 \ + --lr-warmup-samples 5 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --fp16 \ + " +# --train-iters 500 \ + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 100 \ + --eval-iters 10 \ + --checkpoint-activations \ + " + +# --codecarbon-dir $CODECARBON_PATH \ +DATA_ARGS=" \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS" + +# if you can't stand pt-1.9 launcher noise +export LOGLEVEL=WARNING + +LAUNCHER="deepspeed --num_gpus $N_GPUS" +export CMD=" \ + $LAUNCHER pretrain_mp3_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --distributed-backend nccl \ + $ALL_ARGS \ + " + +echo $CMD + +$CMD From fcfbf176da5c05c498484b1e628ad77e35d20e29 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 11:12:26 +0700 Subject: [PATCH 003/148] minor adjustments --- train_mp3_gpt.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train_mp3_gpt.sh b/train_mp3_gpt.sh index 56a1c8767..0a9407a90 100644 --- a/train_mp3_gpt.sh +++ b/train_mp3_gpt.sh @@ -1,9 +1,9 @@ -CHECKPOINT_PATH=checkpoints/gpt2 +CHECKPOINT_PATH=data/checkpoints/gpt2 
VOCAB_FILE=data/gpt2-vocab.json MERGE_FILE=data/gpt2-merges.txt -DATA_PATH=data/meg-gpt2_oscar-combined_text_document -TENSORBOARD_PATH=output_dir/tensorboard -CODECARBON_PATH=output_dir/codecarbon +DATA_PATH=data/t0-test +TENSORBOARD_PATH=data/checkpoints/tensorboard +CODECARBON_PATH=data/checkpoints/codecarbon MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=16 From 870dfd889bc8d96d21f41377eeeeee1039d07b04 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 May 2022 08:00:24 +0000 Subject: [PATCH 004/148] added capabilities for padding and prefix lm index --- megatron/data/non_causal_mtf_dataset.py | 59 +++++++++++++++---------- megatron/tokenizer/tokenizer.py | 4 ++ pretrain_mp3_gpt.py | 7 +-- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 4c3bc8e50..da8556c50 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -21,7 +21,7 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import mpu, print_rank_0, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ @@ -239,6 +239,10 @@ def __init__( self.name = name self.indexed_dataset = indexed_dataset + self.seq_length = seq_length + + # vocab + self.tokenizer = get_tokenizer() # Checks assert np.min(documents) >= 0 @@ -257,32 +261,27 @@ def __len__(self): def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) + doc_idx = self.sample_idx[idx][0] + + sample = self.indexed_dataset.get( + self.doc_idx[doc_idx] + ) + + eod_idx = np.where(sample == self.tokenizer.eod)[0] + if len(eod_idx) > 0: + prefix_len = eod_idx[0] else: - # Otherwise, get the rest of the initial document. - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) - sample = np.concatenate(sample_list) + prefix_len = 0 + + sample = pad_and_convert_to_numpy( + sample, + self.tokenizer.pad, + self.seq_length + ) return { 'text': np.array(sample, dtype=np.int64), - 'prefix_len': 0 + 'prefix_len': prefix_len } @@ -525,3 +524,17 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + +def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + + # Tokens and token types. 
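+    # Right-pad the sequence with `pad_id` up to `max_seq_length` and return it as int64.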
+ filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 66f6522f2..9fa9ca216 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -348,3 +348,7 @@ def detokenize(self, token_ids): @property def eod(self): return self.tokenizer.eos_token_id + + @property + def pad(self): + return self.tokenizer.pad_token_id diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index acf143573..8dccce361 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -22,7 +22,6 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -# from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain @@ -83,6 +82,7 @@ def get_batch(data_iterator): data = next(data_iterator) else: data = None + print(data) data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. @@ -92,9 +92,6 @@ def get_batch(data_iterator): # Prefix prefix_indices = data_b['prefix_len'].long() - print(prefix_indices) - import sys - sys.exit() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -131,7 +128,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + prefix_indices = data_b['prefix_len'].long() # Get the masks and position ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( From 791bbd0270667101b39b5b6ad29c34c0c3dc9fc6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 May 2022 08:09:24 +0000 Subject: [PATCH 005/148] added finetune script --- examples/finetune_mp3.sh | 42 ++++++++++++++++++++++++++++++++++++++++ pretrain_mp3_gpt.py | 3 +-- 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 examples/finetune_mp3.sh diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh new file mode 100644 index 000000000..1e9919a86 --- /dev/null +++ b/examples/finetune_mp3.sh @@ -0,0 +1,42 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH=data/t0-test_text_document +CHECKPOINT_PATH=data + + +deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ + --num-layers 2 \ + --hidden-size 128 \ + --num-attention-heads 4 \ + --micro-batch-size 4 \ + --global-batch-size 16 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --train-iters 10000 \ + --lr-decay-iters 5000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 \ + --tensorboard-dir GPT2 diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index 8dccce361..4dd8a4160 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -82,7 +82,6 @@ def get_batch(data_iterator): data = next(data_iterator) else: data = None - print(data) data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. @@ -91,7 +90,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].long() + prefix_indices = data_b['prefix_len'].cpu().tolist() #.long() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( From 0f44b92da37f4444517fba1829381b153ce62384 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 May 2022 08:19:38 +0000 Subject: [PATCH 006/148] removed script --- train_mp3_gpt.sh | 115 ----------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 train_mp3_gpt.sh diff --git a/train_mp3_gpt.sh b/train_mp3_gpt.sh deleted file mode 100644 index 0a9407a90..000000000 --- a/train_mp3_gpt.sh +++ /dev/null @@ -1,115 +0,0 @@ -CHECKPOINT_PATH=data/checkpoints/gpt2 -VOCAB_FILE=data/gpt2-vocab.json -MERGE_FILE=data/gpt2-merges.txt -DATA_PATH=data/t0-test -TENSORBOARD_PATH=data/checkpoints/tensorboard -CODECARBON_PATH=data/checkpoints/codecarbon - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=16 -TP_SIZE=1 -PP_SIZE=1 - -N_GPUS=2 -SAVE_INTERVAL=100 - -# --train-samples 10_000 \ -# --exit-interval $EXIT_INTERVAL \ - -# --exit-interval 100 \ -GPT_ARGS=" \ - --num-layers 2 \ - --hidden-size 64 \ - --num-attention-heads 2 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size $MICRO_BATCH_SIZE \ - --rampup-batch-size 2 2 1_000 \ - --global-batch-size $GLOBAL_BATCH_SIZE \ - --train-samples 100 \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-8 \ - --lr 1e-4 \ - --lr-warmup-samples 5 \ - --clip-grad 1.0 \ - --weight-decay 1e-1 \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --fp16 \ - " -# --train-iters 500 \ - -OUTPUT_ARGS=" \ - --log-interval 10 \ - --save-interval $SAVE_INTERVAL \ - --eval-interval 100 \ - --eval-iters 10 \ - --checkpoint-activations \ - " - -# --codecarbon-dir $CODECARBON_PATH \ -DATA_ARGS=" \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tensorboard-dir $TENSORBOARD_PATH \ - --tensorboard-queue-size 5 \ - --log-timers-to-tensorboard \ - --log-batch-size-to-tensorboard \ - 
--log-validation-ppl-to-tensorboard \ - " - - -ZERO_STAGE=1 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ - " - -ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS" - -# if you can't stand pt-1.9 launcher noise -export LOGLEVEL=WARNING - -LAUNCHER="deepspeed --num_gpus $N_GPUS" -export CMD=" \ - $LAUNCHER pretrain_mp3_gpt.py \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --distributed-backend nccl \ - $ALL_ARGS \ - " - -echo $CMD - -$CMD From 2ff081506836eed447a0791248886994d42e8743 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 22:59:26 +0700 Subject: [PATCH 007/148] added adjustments and new dataset --- megatron/data/non_causal_mlm_dataset.py | 165 ++++++++++++++++++++++++ megatron/data/non_causal_mtf_dataset.py | 52 +------- megatron/tokenizer/tokenizer.py | 19 +++ pretrain_mp3_gpt.py | 4 +- 4 files changed, 187 insertions(+), 53 deletions(-) create mode 100644 megatron/data/non_causal_mlm_dataset.py diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py new file mode 100644 index 000000000..d5f435d37 --- /dev/null +++ b/megatron/data/non_causal_mlm_dataset.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""T5 Style dataset.""" + +import collections + +import numpy as np +import torch + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping +) + +class NonCausalMLMDataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, + short_seq_prob, seed): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. 
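+        # The tokenizer must expose cls/sep/mask/pad ids plus sentinel-token ids
+        # (additional_special_tokens_ids); per the assert below, run the script with
+        # --vocab-extra-ids (e.g. 100) so that sentinel tokens are available.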
+ tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, + self.max_seq_length, # needed for padding + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_id, self.sep_id, + self.mask_id, self.pad_id, + self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, + self.sentinel_tokens) + + +def build_training_sample(sample, + max_seq_length, + vocab_id_list, vocab_id_to_token_dict, + cls_id, sep_id, mask_id, pad_id, + masked_lm_prob, np_rng, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Build training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_id: Start of example id. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + masked_lm_prob: Probability to mask tokens. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + bos_id: start of decoder example id + eos_id: end of generation id + sentinel_tokens: unique value to be substituted for every replaced span + """ + + # flatten sentences into one list + tokens = [token for sentence in sample for token in sentence] + + # Truncate to `target_sequence_length`. + max_num_tokens = max_seq_length + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] + + # Masking. + max_predictions_per_seq = masked_lm_prob * max_num_tokens + (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng + ) + + # Padding. + padded_tokens = pad_and_convert_to_numpy(tokens, max_seq_length) + padded_labels = pad_and_convert_to_numpy(labels, max_seq_length) + padded_masks = pad_and_convert_to_numpy(masks, max_seq_length) + + print(padded_tokens) + print(padded_labels) + import sys + sys.exit() + + train_sample = { + 'text': padded_tokens, + 'labels': padded_labels, + 'mask': padded_masks, + 'prefix_len': 0 + } + return train_sample + + +def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + + # Tokens and token types. 
+ filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np \ No newline at end of file diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index da8556c50..95a005833 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -370,8 +370,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, assert sizes.dtype == np.int32 sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch) - # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, - # num_epochs, tokens_per_epoch) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save sample-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) @@ -455,55 +454,6 @@ def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): return np.concatenate((doc_idx_first, doc_idx_last)) -def _build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch): - """Sample index mapping is a 2D array with sizes - [number-of-samples + 1, 2] where [..., 0] contains - the index into `doc_idx` and [..., 1] is the - starting offset in that document.""" - - # Total number of samples. For -1 see comments in `_num_epochs`. - num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length - sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) - - # Index into sample_idx. - sample_index = 0 - # Index into doc_idx. - doc_idx_index = 0 - # Begining offset for each document. - doc_offset = 0 - # Start with first document and no offset. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - while sample_index <= num_samples: - # Start with a fresh sequence. - remaining_seq_length = seq_length + 1 - while remaining_seq_length != 0: - # Get the document length. - doc_id = doc_idx[doc_idx_index] - doc_length = sizes[doc_id] - doc_offset - # And add it to the current sequence. - remaining_seq_length -= doc_length - # If we have more than a full sequence, adjust offset and set - # remaining length to zero so we return from the while loop. - # Note that -1 here is for the same reason we have -1 in - # `_num_epochs` calculations. - if remaining_seq_length <= 0: - doc_offset += (remaining_seq_length + doc_length - 1) - remaining_seq_length = 0 - else: - # Otherwise, start from the begining of the next document. - doc_idx_index += 1 - doc_offset = 0 - # Record the sequence. 
- sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - return sample_idx - - def _build_shuffle_idx(num_samples, total_size, np_rng): """Build the range [0, size) and shuffle.""" print(' > building shuffle index with split [0, {}) and [{}, {}) ' diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 9fa9ca216..2f5bb657c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -327,6 +327,9 @@ def __init__(self, tokenizer_name_or_path): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} + if 'mask_token' not in self.tokenizer.special_tokens_map: + tokenizer.mask_token = "" + @property def vocab_size(self): return self.tokenizer.vocab_size @@ -345,6 +348,22 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def cls(self): + return self.cls_id + + @property + def sep(self): + return self.sep_id + + @property + def pad(self): + return self.pad_id + + @property + def mask(self): + return self.mask_id + @property def eod(self): return self.tokenizer.eos_token_id diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index 4dd8a4160..4e61c184e 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -90,7 +90,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() #.long() + prefix_indices = data_b['prefix_len'].cpu().tolist() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -127,7 +127,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].long() + prefix_indices = data_b['prefix_len'].cpu().tolist() # Get the masks and position ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( From f0a79f69206414e128bb214d4eac5fe60cde47e1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:08:19 +0700 Subject: [PATCH 008/148] try mlm dataset --- megatron/data/non_causal_mlm_dataset.py | 210 +++++++++++++++++++++++- pretrain_mp3_gpt.py | 3 +- 2 files changed, 206 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d5f435d37..bb75d7367 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -13,18 +13,216 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""T5 Style dataset.""" +"""GPT Non-Causal Mask Language Model Finetune Style dataset.""" import collections import numpy as np import torch -from megatron import get_tokenizer -from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping -) +from megatron import mpu, print_rank_0, get_tokenizer +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.dataset_utils import create_masked_lm_predictions, get_samples_mapping +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. + if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
+ datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. 
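# Editorial sketch, not part of the patch: how a splits string such as "949,50,1" is
# expected to become the four document boundaries used just above. The real logic lives
# in megatron.data.dataset_utils.get_train_valid_test_split_; the parsing and rounding in
# this toy version are an approximation of that convention, not the repo's implementation.
def toy_train_valid_test_split(splits_string, size):
    weights = [float(s) for s in splits_string.split(',')]
    fractions = [w / sum(weights) for w in weights]
    boundaries = [0]
    for frac in fractions:
        boundaries.append(boundaries[-1] + int(round(frac * size)))
    boundaries[-1] = size  # the last split always ends at the final document
    return boundaries

print(toy_train_valid_test_split("949,50,1", 10000))
# -> [0, 9490, 9990, 10000]: train docs [0, 9490), valid [9490, 9990), test [9990, 10000)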
+ print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + class NonCausalMLMDataset(torch.utils.data.Dataset): diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index 4e61c184e..b7af289a5 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -22,7 +22,8 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From eb416c7620d26ae5fccdde3c23cec076d24a8612 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:09:42 +0700 Subject: [PATCH 009/148] minor changes --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 2f5bb657c..a13cc031a 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -328,7 +328,7 @@ def __init__(self, tokenizer_name_or_path): self.decoder = {v: k for k, v in self.encoder.items()} if 'mask_token' not in self.tokenizer.special_tokens_map: - tokenizer.mask_token = "" + self.tokenizer.mask_token = "" @property def vocab_size(self): From c0bc21b6b567e2ddfaee4d216f2e9951c2b78cd3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:10:51 +0700 Subject: [PATCH 010/148] minor addition of import packages --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index bb75d7367..82c9eb66b 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -15,6 +15,8 @@ """GPT Non-Causal Mask Language Model Finetune Style dataset.""" +import os +import time import collections import numpy as np From 
82e824c0048c25dc6a1e07186f80977dab61c02c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:12:04 +0700 Subject: [PATCH 011/148] minor error fix --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 82c9eb66b..c7a251f78 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -156,7 +156,7 @@ def build_dataset(name): if splits[1] > splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = NonCausalMTFDataset(name, data_prefix, + dataset = NonCausalMLMDataset(name, data_prefix, documents, indexed_dataset, train_valid_test_num_samples[index], seq_length, seed) From 7bb17ec4ea20194e49d887bb3f70778769d88b0c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:14:07 +0700 Subject: [PATCH 012/148] minor error fix --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index c7a251f78..ceeceb559 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -198,7 +198,7 @@ def build_dataset(index, name): if splits[index + 1] > splits[index]: documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - dataset = NonCausalMTFDataset(name, data_prefix, + dataset = NonCausalMLMDataset(name, data_prefix, documents, indexed_dataset, train_valid_test_num_samples[index], seq_length, seed) From 99297666eed750c590fdc709f424de6a8cd98415 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:24:14 +0700 Subject: [PATCH 013/148] samples follow how gpt dataset is loaded --- megatron/data/non_causal_mlm_dataset.py | 68 +++++++++++++++++-------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ceeceb559..b4959e469 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -228,31 +228,26 @@ def get_indexed_dataset_(path, data_impl, skip_warmup): class NonCausalMLMDataset(torch.utils.data.Dataset): - def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, masked_lm_prob, - max_seq_length, - short_seq_prob, seed): + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed, + ): # Params to store. self.name = name + self.seq_length = seq_length self.seed = seed self.masked_lm_prob = masked_lm_prob - self.max_seq_length = max_seq_length # Dataset. self.indexed_dataset = indexed_dataset - # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 2, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False) - # Vocab stuff. tokenizer = get_tokenizer() self.vocab_id_list = list(tokenizer.inv_vocab.keys()) @@ -266,15 +261,46 @@ def __init__(self, name, indexed_dataset, data_prefix, self.sentinel_tokens = tokenizer.additional_special_tokens_ids assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. 
+ self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, data_prefix, documents, self.indexed_dataset.sizes, + num_samples, seq_length, seed) + def __len__(self): - return self.samples_mapping.shape[0] + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 def __getitem__(self, idx): - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. np_rng = np.random.RandomState(seed=(self.seed + idx)) From 861c41f3f6ef0cdfa5373ba1b43869a91836f216 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:26:03 +0700 Subject: [PATCH 014/148] added masked_lm_prob --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index b4959e469..e3599be22 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -237,6 +237,7 @@ def __init__( num_samples, seq_length, seed, + masked_lm_prob=0.15, ): # Params to store. 
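The __getitem__ added in the patches above follows the GPT dataset recipe: shuffle_idx picks a shuffled sample, sample_idx stores the (document index, offset) pair where each sample starts, and doc_idx maps those positions back to real document ids, so one sample can span document boundaries. Below is a minimal self-contained sketch of that lookup; FakeIndexedDataset and all the numbers are illustrative stand-ins for the mmap-backed indexed dataset, not code from the repo.

import numpy as np

class FakeIndexedDataset:
    def __init__(self, docs):
        self.docs = [np.array(d, dtype=np.int64) for d in docs]

    def get(self, doc_id, offset=0, length=None):
        doc = self.docs[doc_id]
        if length is None:
            return doc[offset:]
        return doc[offset:offset + length]

def get_sample(indexed, doc_idx, sample_idx, shuffle_idx, idx):
    idx = shuffle_idx[idx]
    doc_index_f, offset_f = sample_idx[idx]
    doc_index_l, offset_l = sample_idx[idx + 1]
    if doc_index_f == doc_index_l:
        # The whole sample lives inside one document.
        return indexed.get(doc_idx[doc_index_f], offset=offset_f,
                           length=offset_l - offset_f + 1)
    # Otherwise stitch together the tail of the first document, any documents
    # in between, and the head of the last one.
    pieces = [indexed.get(doc_idx[doc_index_f], offset=offset_f)]
    for i in range(doc_index_f + 1, doc_index_l):
        pieces.append(indexed.get(doc_idx[i]))
    pieces.append(indexed.get(doc_idx[doc_index_l], length=offset_l + 1))
    return np.concatenate(pieces)

# Two toy documents, samples of 4 tokens (seq_length 3 plus 1 for the shifted labels).
indexed = FakeIndexedDataset([[10, 11, 12, 13, 14], [20, 21, 22]])
doc_idx = np.array([0, 1])                       # one epoch, documents unshuffled
sample_idx = np.array([[0, 0], [0, 3], [1, 1]])  # sample boundaries as (doc, offset)
shuffle_idx = np.array([0, 1])

print(get_sample(indexed, doc_idx, sample_idx, shuffle_idx, 0))  # [10 11 12 13]
print(get_sample(indexed, doc_idx, sample_idx, shuffle_idx, 1))  # [13 14 20 21]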
From fe9511562ab7416436f61d99dfdab5a624615874 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:35:25 +0700 Subject: [PATCH 015/148] fixed tokenizer abstractions for HF tokenizer --- megatron/tokenizer/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index a13cc031a..b6bb15df7 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -336,11 +336,11 @@ def vocab_size(self): @property def vocab(self): - return self.tokenizer.encoder + return self.tokenizer.vocab @property def inv_vocab(self): - return self.tokenizer.decoder + return {v: k for k, v in self.tokenizer.vocab.items()} def tokenize(self, text): return self.tokenizer.encode(text) From 8ea594302a2726d1d3b8a9c41d1ec3f96e5a5e56 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:51:48 +0700 Subject: [PATCH 016/148] added mask id --- megatron/tokenizer/tokenizer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index b6bb15df7..d0df8b917 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -329,6 +329,15 @@ def __init__(self, tokenizer_name_or_path): if 'mask_token' not in self.tokenizer.special_tokens_map: self.tokenizer.mask_token = "" + self.tokenizer.mask_id = self.tokenizer.vocab_size-1 + + if 'cls_token' not in self.tokenizer.special_tokens_map: + self.tokenizer.cls_token = "" + self.tokenizer.cls_id = self.tokenizer.vocab_size-2 + + if 'sep_token' not in self.tokenizer.special_tokens_map: + self.tokenizer.sep_token = "" + self.tokenizer.sep_id = self.tokenizer.vocab_size-3 @property def vocab_size(self): From aa0d146a33f3ca98008336d0b2869d10bd1a68f6 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:53:37 +0700 Subject: [PATCH 017/148] added mask id --- megatron/tokenizer/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index d0df8b917..b38b4bbc4 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -359,19 +359,19 @@ def detokenize(self, token_ids): @property def cls(self): - return self.cls_id + return self.cls_token_id @property def sep(self): - return self.sep_id + return self.sep_token_id @property def pad(self): - return self.pad_id + return self.pad_token_id @property def mask(self): - return self.mask_id + return self.mask_token_id @property def eod(self): From 215e8ccbc8fedf59c5f6382a9ef52f28541cd5b3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:54:54 +0700 Subject: [PATCH 018/148] added mask id --- megatron/tokenizer/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index b38b4bbc4..8ea4d6af4 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -329,15 +329,15 @@ def __init__(self, tokenizer_name_or_path): if 'mask_token' not in self.tokenizer.special_tokens_map: self.tokenizer.mask_token = "" - self.tokenizer.mask_id = self.tokenizer.vocab_size-1 + self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1 if 'cls_token' not in self.tokenizer.special_tokens_map: self.tokenizer.cls_token = "" - self.tokenizer.cls_id = self.tokenizer.vocab_size-2 + self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2 if 'sep_token' not in 
self.tokenizer.special_tokens_map: self.tokenizer.sep_token = "" - self.tokenizer.sep_id = self.tokenizer.vocab_size-3 + self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3 @property def vocab_size(self): From b6eef436c2d5c25987526ba8689463153eba91e5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:19:03 +0700 Subject: [PATCH 019/148] added mask id --- megatron/tokenizer/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 8ea4d6af4..a4a53533a 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -329,15 +329,15 @@ def __init__(self, tokenizer_name_or_path): if 'mask_token' not in self.tokenizer.special_tokens_map: self.tokenizer.mask_token = "" - self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1 + # self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1 if 'cls_token' not in self.tokenizer.special_tokens_map: self.tokenizer.cls_token = "" - self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2 + # self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2 if 'sep_token' not in self.tokenizer.special_tokens_map: self.tokenizer.sep_token = "" - self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3 + # self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3 @property def vocab_size(self): From bfc73a5ca1e6886d9af761e03d8b18264db46fa8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:20:20 +0700 Subject: [PATCH 020/148] added fix --- megatron/tokenizer/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index a4a53533a..e8cc5ebeb 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -359,19 +359,19 @@ def detokenize(self, token_ids): @property def cls(self): - return self.cls_token_id + return self.tokenizer.cls_token_id @property def sep(self): - return self.sep_token_id + return self.tokenizer.sep_token_id @property def pad(self): - return self.pad_token_id + return self.tokenizer.pad_token_id @property def mask(self): - return self.mask_token_id + return self.tokenizer.mask_token_id @property def eod(self): From 1890f87fae66ea848490d6a4455997325bd65d9f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:23:02 +0700 Subject: [PATCH 021/148] added bos and eos token id --- megatron/tokenizer/tokenizer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index e8cc5ebeb..31d6dcb05 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -373,6 +373,16 @@ def pad(self): def mask(self): return self.tokenizer.mask_token_id + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary.""" + return self.tokenizer.bos_token_id + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary.""" + return self.tokenizer.eos_token_id + @property def eod(self): return self.tokenizer.eos_token_id From 01392a90d08a331563c5f81b58395334befd46a3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:25:13 +0700 Subject: [PATCH 022/148] no need for sentinal token --- megatron/data/non_causal_mlm_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py 
index e3599be22..4929f0dac 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -259,8 +259,6 @@ def __init__( self.pad_id = tokenizer.pad self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id - self.sentinel_tokens = tokenizer.additional_special_tokens_ids - assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" # Checks assert np.min(documents) >= 0 @@ -313,7 +311,7 @@ def __getitem__(self, idx): self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.bos_id, self.eos_id, - self.sentinel_tokens) + ) def build_training_sample(sample, From 923decbec7c7c92e7ce00a96e00c78ebb1b49873 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:27:02 +0700 Subject: [PATCH 023/148] add aux functions --- megatron/data/non_causal_mlm_dataset.py | 191 ++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 4929f0dac..a5239f402 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -375,6 +375,197 @@ def build_training_sample(sample, return train_sample +def _build_index_mappings(name, data_prefix, documents, sizes, + num_samples, seq_length, seed, cutoff_last_epoch=0.95): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + _filename = data_prefix + _filename += '_{}_indexmap'.format(name) + _filename += '_{}ns'.format(num_samples) + _filename += '_{}sl'.format(seq_length) + _filename += '_{}s'.format(seed) + doc_idx_filename = _filename + '_doc_idx.npy' + sample_idx_filename = _filename + '_sample_idx.npy' + shuffle_idx_filename = _filename + '_shuffle_idx.npy' + + # Build the indexed mapping if not exist. + if torch.distributed.get_rank() == 0: + if (not os.path.isfile(doc_idx_filename)) or \ + (not os.path.isfile(sample_idx_filename)) or \ + (not os.path.isfile(shuffle_idx_filename)): + + print_rank_0(' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # For the last epoch, decide whether include the entire epoch + # in the global shuffle or not. + + # If we need only one epoch, then separating last epoch does + # not mean anything. + if num_epochs == 1: + separate_last_epoch = False + print(' > only one epoch required, setting ' + 'separate_last_epoch to False', flush=True) + + else: + # Get the number of samples for the last epoch + num_samples_from_epochs_minus_one = ( + (num_epochs - 1) * tokens_per_epoch - 1) // seq_length + last_epoch_num_samples = num_samples - \ + num_samples_from_epochs_minus_one + assert last_epoch_num_samples >= 0, \ + f'last epoch number of samples {last_epoch_num_samples} should be non-negative.' + num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length + assert last_epoch_num_samples <= num_samples_per_epoch, \ + f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.' 
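# Editorial worked example, not part of the patch: concrete numbers for the bookkeeping
# above. Suppose the documents hold tokens_per_epoch = 1000 tokens, seq_length = 100 and
# num_samples = 25 (all values made up).
tokens_per_epoch, seq_length, num_samples = 1000, 100, 25
num_epochs = 0
total_tokens = 0
while True:
    num_epochs += 1
    total_tokens += tokens_per_epoch
    if (total_tokens - 1) // seq_length >= num_samples:
        break
# num_epochs == 3: two full epochs only give (2*1000 - 1)//100 = 19 samples, so a third is needed.
num_samples_from_epochs_minus_one = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length  # 19
last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one                     # 6
num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length                                 # 9
separate_last_epoch = last_epoch_num_samples < int(0.95 * num_samples_per_epoch)             # 6 < 8 -> True
print(num_epochs, last_epoch_num_samples, separate_last_epoch)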
+ # If we have less than cutoff_last_epoch * samples_per_epoch of the samples for the last epoch, + # seperate out the epoch and treat it differently. + separate_last_epoch = (last_epoch_num_samples < + int(cutoff_last_epoch * num_samples_per_epoch)) + if separate_last_epoch: + string = ' > last epoch number of samples ({}) is smaller '\ + 'than {}% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to True' + else: + string = ' > last epoch number of samples ({}) is larger '\ + 'than {}% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to False' + print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100, + num_samples_per_epoch), flush=True) + + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, + separate_last_epoch) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save doc-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. + from megatron.data import helpers + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch) + + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save sample-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + if separate_last_epoch: + num_samples_ = num_samples_from_epochs_minus_one + else: + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save shuffle-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load mappings. 
+ start_time = time.time() + print_rank_0(' > loading doc-idx mapping from {}'.format( + doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading sample-idx mapping from {}'.format( + sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading shuffle-idx mapping from {}'.format( + shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + sample_idx.shape[0])) + print_rank_0(' total number of epochs: {}'.format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence lenght, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): + """Build an array with length = number-of-epochs * number-of-dcuments. + Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print(' > building shuffle index with split [0, {}) and [{}, {}) ' + '...'.format(num_samples, num_samples, total_size), flush=True) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + + def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" From 4611d67964b732cd9b973c10b78c0514cd92813e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:28:06 +0700 Subject: [PATCH 024/148] add aux functions --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a5239f402..d9da3b6a6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -304,7 +304,7 @@ def __getitem__(self, idx): # python randint is inclusive whereas the numpy one is exclusive. 
np_rng = np.random.RandomState(seed=(self.seed + idx)) return build_training_sample(sample, - self.max_seq_length, # needed for padding + self.seq_length, # needed for padding self.vocab_id_list, self.vocab_id_to_token_dict, self.cls_id, self.sep_id, From 4356de3a1cc4b7321e029ebf7a2418a4654e84e3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:29:39 +0700 Subject: [PATCH 025/148] add aux functions --- megatron/data/non_causal_mlm_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d9da3b6a6..0ab54df8a 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -342,7 +342,8 @@ def build_training_sample(sample, """ # flatten sentences into one list - tokens = [token for sentence in sample for token in sentence] + # tokens = [token for sentence in sample for token in sentence] + tokens = sample # Truncate to `target_sequence_length`. max_num_tokens = max_seq_length From f31c686d255e8206d64dd56d96f352717f554f62 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:31:41 +0700 Subject: [PATCH 026/148] add pad_id --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0ab54df8a..aec33a23f 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -358,9 +358,9 @@ def build_training_sample(sample, ) # Padding. - padded_tokens = pad_and_convert_to_numpy(tokens, max_seq_length) - padded_labels = pad_and_convert_to_numpy(labels, max_seq_length) - padded_masks = pad_and_convert_to_numpy(masks, max_seq_length) + padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) + padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) + padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) print(padded_tokens) print(padded_labels) From a3951e8b8ca1db1380b1b9184999b1ae9f9bdf2d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 18 May 2022 23:47:31 +0700 Subject: [PATCH 027/148] changed lm predictions to t5 --- megatron/data/non_causal_mlm_dataset.py | 19 +++++++++++-------- megatron/data/non_causal_mtf_dataset.py | 7 +++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index aec33a23f..c27cfecf8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -354,16 +354,19 @@ def build_training_sample(sample, max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng - ) + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. - padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) - padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) - padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) - print(padded_tokens) - print(padded_labels) + # Padding. 
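# Editorial sketch, not part of the patch: what the masking_style="t5" spans turn into
# once the later commits assemble them into input/output sequences. Span objects with
# .index / .label fields mimic the masked_spans returned by create_masked_lm_predictions;
# every token id and sentinel id below is a made-up placeholder.
import collections

Span = collections.namedtuple("Span", ["index", "label"])

tokens = [10, 11, 12, 13, 14, 15, 16]            # corrupted sequence; span positions get skipped
masked_spans = [Span(index=[1, 2], label=[11, 12]),
                Span(index=[5], label=[15])]
sentinel_ids = collections.deque([900, 901])     # placeholders for sentinel token ids
eos_id = 999

input_tokens_ids, output_tokens_ids = [], []
start_index = 0
for span in masked_spans:
    flag = sentinel_ids.popleft()
    output_tokens_ids.append(flag)               # target: sentinel followed by the original span
    output_tokens_ids.extend(span.label)
    input_tokens_ids.extend(tokens[start_index:span.index[0]])
    input_tokens_ids.append(flag)                # input: text with each span replaced by its sentinel
    start_index = span.index[-1] + 1
input_tokens_ids.extend(tokens[start_index:])
output_tokens_ids.append(eos_id)

print(input_tokens_ids)    # [10, 900, 13, 14, 901, 16]
print(output_tokens_ids)   # [900, 11, 12, 901, 15, 999]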
+ # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) + # padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) + # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) + # print(padded_tokens) + # print(padded_labels) + + print(tokens) + print(labels) import sys sys.exit() diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 95a005833..6bce2c4ef 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""GPT style dataset.""" +"""GPT Non-Causal Multitask Finetune style dataset.""" import os import time @@ -237,10 +237,13 @@ def __init__( seed ): + # Params to store. self.name = name - self.indexed_dataset = indexed_dataset self.seq_length = seq_length + # Dataset. + self.indexed_dataset = indexed_dataset + # vocab self.tokenizer = get_tokenizer() From 97b9a92d3eb951881580dfccccde1c15f36277dc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 18 May 2022 23:51:06 +0700 Subject: [PATCH 028/148] changed lm predictions to t5 --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index c27cfecf8..36456e819 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -365,7 +365,9 @@ def build_training_sample(sample, # print(padded_tokens) # print(padded_labels) + print("tokens") print(tokens) + print("labels") print(labels) import sys sys.exit() From fe73a73d4c3bb147cd606581531dffef8678291b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 18 May 2022 23:57:28 +0700 Subject: [PATCH 029/148] changed lm predictions to t5 --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 36456e819..de47bd9ad 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -369,6 +369,8 @@ def build_training_sample(sample, print(tokens) print("labels") print(labels) + print("masked_spans") + print(masked_spans) import sys sys.exit() From 6a9cb758f5ba738ed5f433957699c8ab95bf7347 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 19 May 2022 00:00:11 +0700 Subject: [PATCH 030/148] changed lm predictions to t5 --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index de47bd9ad..434576ef3 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -367,6 +367,8 @@ def build_training_sample(sample, print("tokens") print(tokens) + print("masks") + print(masks) print("labels") print(labels) print("masked_spans") From 469848f38aa0f7765150f341e9ca13cb27960700 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 19 May 2022 00:11:48 +0700 Subject: [PATCH 031/148] changed lm predictions to t5 --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 434576ef3..b21720618 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -365,6 +365,8 @@ def 
build_training_sample(sample, # print(padded_tokens) # print(padded_labels) + print("sample") + print(sample) print("tokens") print(tokens) print("masks") From e68283ff72b31bd22413c6d302ab5f1f5790f011 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 19 May 2022 00:59:41 +0700 Subject: [PATCH 032/148] tokenizer add mask, cls, sep tokens --- megatron/tokenizer/tokenizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 31d6dcb05..482d666be 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -328,16 +328,16 @@ def __init__(self, tokenizer_name_or_path): self.decoder = {v: k for k, v in self.encoder.items()} if 'mask_token' not in self.tokenizer.special_tokens_map: + self.tokenizer.add_tokens("") self.tokenizer.mask_token = "" - # self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1 if 'cls_token' not in self.tokenizer.special_tokens_map: - self.tokenizer.cls_token = "" - # self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2 + self.tokenizer.add_tokens("") + self.tokenizer.mask_token = "" if 'sep_token' not in self.tokenizer.special_tokens_map: - self.tokenizer.sep_token = "" - # self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3 + self.tokenizer.add_tokens("") + self.tokenizer.mask_token = "" @property def vocab_size(self): From 476ae94d5e6b676d9f753c90146607ae6b2f40e1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:08:10 +0700 Subject: [PATCH 033/148] commit latest changes --- megatron/data/non_causal_mlm_dataset.py | 35 +++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index b21720618..970bd7c55 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -259,7 +259,7 @@ def __init__( self.pad_id = tokenizer.pad self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id - + self.sentinel_tokens = tokenizer.additional_special_tokens_ids # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -311,6 +311,7 @@ def __getitem__(self, idx): self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.bos_id, self.eos_id, + self.sentinel_tokens ) @@ -357,7 +358,6 @@ def build_training_sample(sample, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. 
# padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) # padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) @@ -365,6 +365,33 @@ def build_training_sample(sample, # print(padded_tokens) # print(padded_labels) + sentinel_tokens = collections.deque(sentinel_tokens) + t5_input = [] + (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() + + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_in.extend(span.label) + t5_decoder_out.append(flag) + t5_decoder_out.extend(span.label) + + end_index = span.index[0] + t5_input.extend(tokens[start_index: end_index]) + t5_input.append(flag) + + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 + + # Add token to the t5_decoder_out + t5_decoder_out.append(eos_id) + + # Add the remaining tokens to the t5 input + t5_input.extend(tokens[start_index:]) + + print("sample") print(sample) print("tokens") @@ -375,6 +402,10 @@ def build_training_sample(sample, print(labels) print("masked_spans") print(masked_spans) + for idx, spans in enumerate(masked_spans): + spans.index + sentinel_tokens + labels = spans.labels import sys sys.exit() From 72ff5752e13b8df2bd5e1f964e435e7d88303a58 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:09:07 +0700 Subject: [PATCH 034/148] commit latest changes --- megatron/data/non_causal_mlm_dataset.py | 50 ++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 970bd7c55..dc5d36db5 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -365,31 +365,31 @@ def build_training_sample(sample, # print(padded_tokens) # print(padded_labels) - sentinel_tokens = collections.deque(sentinel_tokens) - t5_input = [] - (t5_decoder_in, t5_decoder_out) = ([bos_id], []) - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) - t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) - t5_decoder_out.extend(span.label) - - end_index = span.index[0] - t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - # Add token to the t5_decoder_out - t5_decoder_out.append(eos_id) - - # Add the remaining tokens to the t5 input - t5_input.extend(tokens[start_index:]) + # sentinel_tokens = collections.deque(sentinel_tokens) + # t5_input = [] + # (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + # (start_index, end_index) = (0, None) + # for span in masked_spans: + # flag = sentinel_tokens.popleft() + + # # Append the same tokens in decoder input and output + # t5_decoder_in.append(flag) + # t5_decoder_in.extend(span.label) + # t5_decoder_out.append(flag) + # t5_decoder_out.extend(span.label) + + # end_index = span.index[0] + # t5_input.extend(tokens[start_index: end_index]) + # t5_input.append(flag) + + # # the next start index is the token after the last span token + # start_index = span.index[-1] + 1 + + # # Add token to the t5_decoder_out + # t5_decoder_out.append(eos_id) + + # # Add the remaining tokens to the t5 input + # t5_input.extend(tokens[start_index:]) print("sample") From 
36472913920f07e0ec341137e043ed75f640b4aa Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:19:29 +0700 Subject: [PATCH 035/148] added sentinal tokens --- megatron/data/non_causal_mlm_dataset.py | 66 +++++++++++++++++-------- 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index dc5d36db5..8c5029fe6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -251,6 +251,32 @@ def __init__( # Vocab stuff. tokenizer = get_tokenizer() + tokenizer.add_special_tokens({ + 'additional_special_tokens': [ + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + ] + }) + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) self.vocab_id_to_token_dict = tokenizer.inv_vocab self.cls_id = tokenizer.cls @@ -365,31 +391,31 @@ def build_training_sample(sample, # print(padded_tokens) # print(padded_labels) - # sentinel_tokens = collections.deque(sentinel_tokens) - # t5_input = [] - # (t5_decoder_in, t5_decoder_out) = ([bos_id], []) - # (start_index, end_index) = (0, None) - # for span in masked_spans: - # flag = sentinel_tokens.popleft() + sentinel_tokens = collections.deque(sentinel_tokens) + t5_input = [] + (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() - # # Append the same tokens in decoder input and output - # t5_decoder_in.append(flag) - # t5_decoder_in.extend(span.label) - # t5_decoder_out.append(flag) - # t5_decoder_out.extend(span.label) + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_in.extend(span.label) + t5_decoder_out.append(flag) + t5_decoder_out.extend(span.label) - # end_index = span.index[0] - # t5_input.extend(tokens[start_index: end_index]) - # t5_input.append(flag) + end_index = span.index[0] + t5_input.extend(tokens[start_index: end_index]) + t5_input.append(flag) - # # the next start index is the token after the last span token - # start_index = span.index[-1] + 1 + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 - # # Add token to the t5_decoder_out - # t5_decoder_out.append(eos_id) + # Add token to the t5_decoder_out + t5_decoder_out.append(eos_id) - # # Add the remaining tokens to the t5 input - # t5_input.extend(tokens[start_index:]) + # Add the remaining tokens to the t5 input + t5_input.extend(tokens[start_index:]) print("sample") From fcdc9870f14747581d917086b6e8aa3003f696be Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:23:32 +0700 Subject: [PATCH 036/148] added sentinal tokens --- megatron/tokenizer/tokenizer.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 482d666be..b8432af7c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -339,6 +339,33 @@ def __init__(self, tokenizer_name_or_path): self.tokenizer.add_tokens("") self.tokenizer.mask_token = "" + self.tokenizer.add_special_tokens({ + 'additional_special_tokens': [ + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + '', + ] + }) + + @property def vocab_size(self): return self.tokenizer.vocab_size @@ -373,6 +400,11 @@ def pad(self): def mask(self): return 
self.tokenizer.mask_token_id + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings).""" + return self.tokenizer.additional_special_tokens + @property def bos_token_id(self): """ Id of the beginning of sentence token in the vocabulary.""" From d6fbe783bfcc6da81c47c74c09cb8491143082b2 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:23:59 +0700 Subject: [PATCH 037/148] added sentinal tokens --- megatron/data/non_causal_mlm_dataset.py | 26 ------------------------- 1 file changed, 26 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 8c5029fe6..970bd7c55 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -251,32 +251,6 @@ def __init__( # Vocab stuff. tokenizer = get_tokenizer() - tokenizer.add_special_tokens({ - 'additional_special_tokens': [ - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - ] - }) - self.vocab_id_list = list(tokenizer.inv_vocab.keys()) self.vocab_id_to_token_dict = tokenizer.inv_vocab self.cls_id = tokenizer.cls From c44daba03a335b08a86a16600b7fb7c1074ad3b1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:25:33 +0700 Subject: [PATCH 038/148] added additional_special_tokens --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 970bd7c55..3616cab80 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -259,7 +259,7 @@ def __init__( self.pad_id = tokenizer.pad self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id - self.sentinel_tokens = tokenizer.additional_special_tokens_ids + self.sentinel_tokens = tokenizer.additional_special_tokens # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] From a2725d81477d217b0c88485ee8bc66c3eb3315b7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 20:26:53 +0700 Subject: [PATCH 039/148] added additional_special_tokens --- megatron/data/non_causal_mlm_dataset.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 3616cab80..1620c763b 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -392,20 +392,20 @@ def build_training_sample(sample, t5_input.extend(tokens[start_index:]) - print("sample") - print(sample) - print("tokens") - print(tokens) - print("masks") - print(masks) - print("labels") - print(labels) + # print("sample") + # print(sample) + # print("tokens") + # print(tokens) + # print("masks") + # print(masks) + # print("labels") + # print(labels) print("masked_spans") print(masked_spans) - for idx, spans in enumerate(masked_spans): - spans.index - sentinel_tokens - labels = spans.labels + # for idx, spans in enumerate(masked_spans): + # spans.index + # sentinel_tokens + # labels = spans.labels import sys sys.exit() From 0e942458d6aaf5f55d0d3bd55f1b60e6972cfeff Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 21:13:28 +0700 Subject: [PATCH 040/148] check t5_input and output --- megatron/data/non_causal_mlm_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) 
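The prefix_len carried by these samples is what lets the training side treat the packed text as a prefix LM: bidirectional attention over the input portion, causal attention over the generated portion. The following is only a rough numpy illustration of that masking idea; in the repo the mask is actually built by get_ltor_masks_and_position_ids from the prefix_indices, so the sketch below is not the implementation used.

import numpy as np

def toy_prefix_lm_mask(seq_length, prefix_len):
    # Start from a standard lower-triangular (causal) mask: 1 = may attend.
    mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64))
    # Let every position attend to the whole prefix, making the prefix bidirectional;
    # the causal tril already prevents prefix positions from peeking at the target side.
    mask[:, :prefix_len] = 1
    return mask

print(toy_prefix_lm_mask(6, 3))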
diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 1620c763b..83909d9a3 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -391,6 +391,10 @@ def build_training_sample(sample, # Add the remaining tokens to the t5 input t5_input.extend(tokens[start_index:]) + print("t5_input") + print(t5_input) + print("t5_decoder_out") + print(t5_decoder_out) # print("sample") # print(sample) From b599ab6b04a0b89f962619c9db7edda283f86cb9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 21 May 2022 21:59:13 +0700 Subject: [PATCH 041/148] check decoder in and decoder out --- megatron/data/non_causal_mlm_dataset.py | 22 +++++++++++----------- megatron/tokenizer/tokenizer.py | 12 ++---------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 83909d9a3..98eb51a13 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -259,7 +259,7 @@ def __init__( self.pad_id = tokenizer.pad self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id - self.sentinel_tokens = tokenizer.additional_special_tokens + self.sentinel_tokens = tokenizer.additional_special_tokens_ids # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -366,7 +366,7 @@ def build_training_sample(sample, # print(padded_labels) sentinel_tokens = collections.deque(sentinel_tokens) - t5_input = [] + input_tokens_ids = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: @@ -379,8 +379,8 @@ def build_training_sample(sample, t5_decoder_out.extend(span.label) end_index = span.index[0] - t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) + input_tokens_ids.extend(tokens[start_index: end_index]) + input_tokens_ids.append(flag) # the next start index is the token after the last span token start_index = span.index[-1] + 1 @@ -389,12 +389,14 @@ def build_training_sample(sample, t5_decoder_out.append(eos_id) # Add the remaining tokens to the t5 input - t5_input.extend(tokens[start_index:]) + input_tokens_ids.extend(tokens[start_index:]) - print("t5_input") - print(t5_input) + print("input_tokens_ids") + print(input_tokens_ids) print("t5_decoder_out") print(t5_decoder_out) + print("t5_decoder_in") + print(t5_decoder_in) # print("sample") # print(sample) @@ -404,8 +406,8 @@ def build_training_sample(sample, # print(masks) # print("labels") # print(labels) - print("masked_spans") - print(masked_spans) + # print("masked_spans") + # print(masked_spans) # for idx, spans in enumerate(masked_spans): # spans.index # sentinel_tokens @@ -415,8 +417,6 @@ def build_training_sample(sample, train_sample = { 'text': padded_tokens, - 'labels': padded_labels, - 'mask': padded_masks, 'prefix_len': 0 } return train_sample diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index b8432af7c..d8041a0bc 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -401,9 +401,9 @@ def mask(self): return self.tokenizer.mask_token_id @property - def additional_special_tokens(self): + def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings).""" - return self.tokenizer.additional_special_tokens + return self.tokenizer.additional_special_tokens_ids @property def bos_token_id(self): @@ -414,11 
+414,3 @@ def bos_token_id(self): def eos_token_id(self): """ Id of the end of sentence token in the vocabulary.""" return self.tokenizer.eos_token_id - - @property - def eod(self): - return self.tokenizer.eos_token_id - - @property - def pad(self): - return self.tokenizer.pad_token_id From 626b0ae355dbf6d49f308444a8a7910f2d992621 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:07:14 +0700 Subject: [PATCH 042/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 48 +++++++++---------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 98eb51a13..c9b5ecfc6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -367,16 +367,13 @@ def build_training_sample(sample, sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] - (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + output_tokens_ids = [] #[bos_id] (start_index, end_index) = (0, None) for span in masked_spans: flag = sentinel_tokens.popleft() - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) - t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) - t5_decoder_out.extend(span.label) + output_tokens_ids.append(flag) + output_tokens_ids.extend(span.label) end_index = span.index[0] input_tokens_ids.extend(tokens[start_index: end_index]) @@ -385,41 +382,28 @@ def build_training_sample(sample, # the next start index is the token after the last span token start_index = span.index[-1] + 1 - # Add token to the t5_decoder_out - t5_decoder_out.append(eos_id) - # Add the remaining tokens to the t5 input + # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) + # Add token to the output_tokens_ids + output_tokens_ids.append(eos_id) + prefix_len = len(input_tokens_ids) + text_tokens_ids = input_tokens_ids + output_tokens_ids + print("text_tokens_ids") + print(text_tokens_ids) print("input_tokens_ids") print(input_tokens_ids) - print("t5_decoder_out") - print(t5_decoder_out) - print("t5_decoder_in") - print(t5_decoder_in) - - # print("sample") - # print(sample) - # print("tokens") - # print(tokens) - # print("masks") - # print(masks) - # print("labels") - # print(labels) - # print("masked_spans") - # print(masked_spans) - # for idx, spans in enumerate(masked_spans): - # spans.index - # sentinel_tokens - # labels = spans.labels + print("output_tokens_ids") + print(output_tokens_ids) + import sys sys.exit() - train_sample = { - 'text': padded_tokens, - 'prefix_len': 0 + return { + 'text': input_tokens_ids, + 'prefix_len': prefix_len } - return train_sample def _build_index_mappings(name, data_prefix, documents, sizes, From 6008937deef5e346bd086f659c057e1ab06e74f8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:14:31 +0700 Subject: [PATCH 043/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index c9b5ecfc6..73f0e72c4 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -364,6 +364,8 @@ def build_training_sample(sample, # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) # print(padded_tokens) # print(padded_labels) + print("sentinel_tokens") + print(sentinel_tokens) sentinel_tokens = 
collections.deque(sentinel_tokens) input_tokens_ids = [] From c1524db53b66d3e2ff1341a383dc3facf675f9ad Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:17:44 +0700 Subject: [PATCH 044/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 73f0e72c4..a39c099f1 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -260,6 +260,9 @@ def __init__( self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids + + print("self.sentinel_tokens") + print(self.sentinel_tokens) # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -366,6 +369,8 @@ def build_training_sample(sample, # print(padded_labels) print("sentinel_tokens") print(sentinel_tokens) + import sys + sys.exit() sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] From c59c061c31851a1c90a08af552f4430e7dd281c3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:24:44 +0700 Subject: [PATCH 045/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a39c099f1..39e7105c8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -367,10 +367,6 @@ def build_training_sample(sample, # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) # print(padded_tokens) # print(padded_labels) - print("sentinel_tokens") - print(sentinel_tokens) - import sys - sys.exit() sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] @@ -398,11 +394,11 @@ def build_training_sample(sample, text_tokens_ids = input_tokens_ids + output_tokens_ids print("text_tokens_ids") - print(text_tokens_ids) + print(len(text_tokens_ids)) print("input_tokens_ids") - print(input_tokens_ids) + print(len(input_tokens_ids)) print("output_tokens_ids") - print(output_tokens_ids) + print(len(output_tokens_ids)) import sys sys.exit() From e677e165fd97f2a4b700addb69b62505ed6ce8fc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:29:32 +0700 Subject: [PATCH 046/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 9 ++------- megatron/tokenizer/tokenizer.py | 4 ++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 39e7105c8..2b3ace71d 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -361,13 +361,6 @@ def build_training_sample(sample, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. 
- # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) - # padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) - # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) - # print(padded_tokens) - # print(padded_labels) - sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] output_tokens_ids = [] #[bos_id] @@ -402,6 +395,8 @@ def build_training_sample(sample, import sys sys.exit() + # Padding. + # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) return { 'text': input_tokens_ids, diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index d8041a0bc..faa5f4d1b 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -365,6 +365,10 @@ def __init__(self, tokenizer_name_or_path): ] }) + print(self.tokenizer.special_tokens_map) + import sys + sys.exit() + @property def vocab_size(self): From 9ffaeb9c33227aac61b5aece1200f6c38db41364 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:30:50 +0700 Subject: [PATCH 047/148] made into input and output tokens --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index faa5f4d1b..8c7dc6354 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -365,7 +365,7 @@ def __init__(self, tokenizer_name_or_path): ] }) - print(self.tokenizer.special_tokens_map) + print(self.tokenizer.additional_special_tokens_ids) import sys sys.exit() From d0a6a2fc5e56aaa0f3b0d8cb3d9cd3c1fd323de0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:40:56 +0700 Subject: [PATCH 048/148] made into input and output tokens --- megatron/tokenizer/tokenizer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 8c7dc6354..2e0e954ec 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -327,17 +327,17 @@ def __init__(self, tokenizer_name_or_path): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} - if 'mask_token' not in self.tokenizer.special_tokens_map: - self.tokenizer.add_tokens("") - self.tokenizer.mask_token = "" + # if 'mask_token' not in self.tokenizer.special_tokens_map: + # self.tokenizer.add_tokens("") + # self.tokenizer.mask_token = "" - if 'cls_token' not in self.tokenizer.special_tokens_map: - self.tokenizer.add_tokens("") - self.tokenizer.mask_token = "" + # if 'cls_token' not in self.tokenizer.special_tokens_map: + # self.tokenizer.add_tokens("") + # self.tokenizer.mask_token = "" - if 'sep_token' not in self.tokenizer.special_tokens_map: - self.tokenizer.add_tokens("") - self.tokenizer.mask_token = "" + # if 'sep_token' not in self.tokenizer.special_tokens_map: + # self.tokenizer.add_tokens("") + # self.tokenizer.mask_token = "" self.tokenizer.add_special_tokens({ 'additional_special_tokens': [ From 47fd987aadeb5dbca25ee95646e623f8b3302263 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:00:49 +0700 Subject: [PATCH 049/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 8 +++++++- megatron/tokenizer/tokenizer.py | 5 ----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 2b3ace71d..6c06dfcea 
100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -385,7 +385,7 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - text_tokens_ids = input_tokens_ids + output_tokens_ids + text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) print("text_tokens_ids") print(len(text_tokens_ids)) print("input_tokens_ids") @@ -393,6 +393,12 @@ def build_training_sample(sample, print("output_tokens_ids") print(len(output_tokens_ids)) + # input_tokens_ids = pad_and_convert_to_numpy( + # input_tokens_ids, + # self.tokenizer.pad, + # self.seq_length + # ) + import sys sys.exit() # Padding. diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 2e0e954ec..637cda9a3 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -365,11 +365,6 @@ def __init__(self, tokenizer_name_or_path): ] }) - print(self.tokenizer.additional_special_tokens_ids) - import sys - sys.exit() - - @property def vocab_size(self): return self.tokenizer.vocab_size From 4f377e8fea94324f6ddcda972374157463d6c7a7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:02:18 +0700 Subject: [PATCH 050/148] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 6c06dfcea..a61372cc0 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -261,8 +261,6 @@ def __init__( self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids - print("self.sentinel_tokens") - print(self.sentinel_tokens) # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] From 5c0bf76c984d77eba9492a4d112676960dfb379b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:05:04 +0700 Subject: [PATCH 051/148] added eos --- megatron/tokenizer/tokenizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 637cda9a3..d588e6e66 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -395,6 +395,10 @@ def sep(self): def pad(self): return self.tokenizer.pad_token_id + @property + def eod(self): + return self.tokenizer.eos_token_id + @property def mask(self): return self.tokenizer.mask_token_id From 7c63e4bf41c75f76531c94559d82fcb5343b53f8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:06:34 +0700 Subject: [PATCH 052/148] added eos --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index d588e6e66..5a278ebbe 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -396,7 +396,7 @@ def pad(self): return self.tokenizer.pad_token_id @property - def eod(self): + def eod(self): return self.tokenizer.eos_token_id @property From 871124cc95dac124544e53f0bba7a85654fee17e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:09:21 +0700 Subject: [PATCH 053/148] test text_token --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a61372cc0..e5beec9ba 100644 --- 
a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -385,6 +385,7 @@ def build_training_sample(sample, text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) print("text_tokens_ids") + print(text_tokens_ids) print(len(text_tokens_ids)) print("input_tokens_ids") print(len(input_tokens_ids)) From 55a593dec27e517081f06848d1dd720889fa2e52 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:13:24 +0700 Subject: [PATCH 054/148] test text_token --- megatron/data/non_causal_mlm_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e5beec9ba..a691e3070 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -383,14 +383,14 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) - print("text_tokens_ids") - print(text_tokens_ids) - print(len(text_tokens_ids)) print("input_tokens_ids") print(len(input_tokens_ids)) print("output_tokens_ids") print(len(output_tokens_ids)) + text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) + print("text_tokens_ids") + print(text_tokens_ids) + print(len(text_tokens_ids)) # input_tokens_ids = pad_and_convert_to_numpy( # input_tokens_ids, From adb59caedd99446eb392e64dc9830d217768b6de Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:14:50 +0700 Subject: [PATCH 055/148] test text_token --- megatron/data/non_causal_mlm_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a691e3070..382e6a5c2 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -384,9 +384,9 @@ def build_training_sample(sample, prefix_len = len(input_tokens_ids) print("input_tokens_ids") - print(len(input_tokens_ids)) + print(input_tokens_ids) print("output_tokens_ids") - print(len(output_tokens_ids)) + print(output_tokens_ids) text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) print("text_tokens_ids") print(text_tokens_ids) From d71afb4805c6b5824bd325095508df6135544c75 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:16:09 +0700 Subject: [PATCH 056/148] test text_token --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 382e6a5c2..caad3319b 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -387,7 +387,7 @@ def build_training_sample(sample, print(input_tokens_ids) print("output_tokens_ids") print(output_tokens_ids) - text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) + text_tokens_ids = input_tokens_ids+output_tokens_ids print("text_tokens_ids") print(text_tokens_ids) print(len(text_tokens_ids)) From 7b99bb7c16167524a93d08a65bd3495baddb0934 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 15:11:11 +0700 Subject: [PATCH 057/148] test text_token --- megatron/data/non_causal_mlm_dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index caad3319b..f0cf81287 100644 --- 
a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -383,13 +383,13 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - print("input_tokens_ids") - print(input_tokens_ids) - print("output_tokens_ids") - print(output_tokens_ids) text_tokens_ids = input_tokens_ids+output_tokens_ids + # print("input_tokens_ids") + # print(input_tokens_ids) + # print("output_tokens_ids") + # print(output_tokens_ids) print("text_tokens_ids") - print(text_tokens_ids) + # print(text_tokens_ids) print(len(text_tokens_ids)) # input_tokens_ids = pad_and_convert_to_numpy( @@ -398,10 +398,10 @@ def build_training_sample(sample, # self.seq_length # ) + # Padding. + padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) import sys sys.exit() - # Padding. - # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) return { 'text': input_tokens_ids, From 922b09d287beaa0942177f3daccc2e565cd784d4 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:24:31 +0700 Subject: [PATCH 058/148] assigned array --- megatron/data/non_causal_mlm_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index f0cf81287..5236ef25f 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -383,7 +383,7 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - text_tokens_ids = input_tokens_ids+output_tokens_ids + text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) # print("input_tokens_ids") # print(input_tokens_ids) # print("output_tokens_ids") @@ -399,7 +399,7 @@ def build_training_sample(sample, # ) # Padding. - padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) + padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() From 469a02dbbf3f05ce6aaddf22f8f90913a3ddd9c7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:29:41 +0700 Subject: [PATCH 059/148] assigned array --- examples/finetune_mp3.sh | 4 ++-- megatron/data/non_causal_mlm_dataset.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh index 1e9919a86..598f2acec 100644 --- a/examples/finetune_mp3.sh +++ b/examples/finetune_mp3.sh @@ -15,8 +15,8 @@ deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ --num-attention-heads 4 \ --micro-batch-size 4 \ --global-batch-size 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ + --seq-length 626 \ + --max-position-embeddings 1024 \ --train-iters 10000 \ --lr-decay-iters 5000 \ --save $CHECKPOINT_PATH \ diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 5236ef25f..e2fb731b8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -399,6 +399,7 @@ def build_training_sample(sample, # ) # Padding. 
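The switch from .extend() to + in the hunk above is significant: list.extend mutates in place and returns None, so the earlier assignment left text_tokens_ids as None and any len() call on it would fail. A quick illustration:

import numpy as np

inputs, outputs = [1, 2, 3], [4, 5]
assert inputs.extend(outputs) is None      # extend returns None...
assert inputs == [1, 2, 3, 4, 5]           # ...and has already modified `inputs` in place

inputs, outputs = [1, 2, 3], [4, 5]
text = inputs + outputs                    # concatenation, as the later revision uses
assert text == [1, 2, 3, 4, 5] and inputs == [1, 2, 3]

text_ids = np.array(inputs + outputs)      # copied into a numpy array, as in the follow-up patch
assert text_ids.shape == (5,)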
+ print(max_seq_length) padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() From 15cb6a0f3c5bc37361c4607c2b5ca130efc83048 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:31:31 +0700 Subject: [PATCH 060/148] assigned array --- megatron/data/non_causal_mlm_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e2fb731b8..7c283ef8d 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -384,10 +384,10 @@ def build_training_sample(sample, prefix_len = len(input_tokens_ids) text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - # print("input_tokens_ids") - # print(input_tokens_ids) - # print("output_tokens_ids") - # print(output_tokens_ids) + print("input_tokens_ids") + print(len(input_tokens_ids)) + print("output_tokens_ids") + print(len(output_tokens_ids)) print("text_tokens_ids") # print(text_tokens_ids) print(len(text_tokens_ids)) From 5b0bc17605486aed962da7f62137c52f7229bc61 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:33:42 +0700 Subject: [PATCH 061/148] hardcoded sequence length --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 7c283ef8d..0229f39cc 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -348,7 +348,7 @@ def build_training_sample(sample, tokens = sample # Truncate to `target_sequence_length`. - max_num_tokens = max_seq_length + max_num_tokens = 512 #max_seq_length truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] From 0671c79dd40bdcf3f7a902bf9e5cbb973670635f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 22:20:13 +0700 Subject: [PATCH 062/148] check again --- megatron/data/non_causal_mlm_dataset.py | 3 +- megatron/tokenizer/tokenizer.py | 44 ++++++++++++------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0229f39cc..eae03d4ac 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -386,8 +386,10 @@ def build_training_sample(sample, text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) print("input_tokens_ids") print(len(input_tokens_ids)) + print(input_tokens_ids) print("output_tokens_ids") print(len(output_tokens_ids)) + print(output_tokens_ids) print("text_tokens_ids") # print(text_tokens_ids) print(len(text_tokens_ids)) @@ -399,7 +401,6 @@ def build_training_sample(sample, # ) # Padding. 
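pad_and_convert_to_numpy itself is not shown in these patches. A minimal stand-in with the behavior the call sites appear to expect (right-pad with pad_id to max_seq_length and return an int64 numpy array), offered only as an assumption for illustration:

import numpy as np

def pad_and_convert_to_numpy_sketch(token_ids, pad_id, max_seq_length):
    """Right-pad `token_ids` with `pad_id` up to `max_seq_length` (illustrative stand-in)."""
    num_pad = max_seq_length - len(token_ids)
    assert num_pad >= 0, "sequence longer than max_seq_length"
    return np.array(list(token_ids) + [pad_id] * num_pad, dtype=np.int64)

padded = pad_and_convert_to_numpy_sketch([5, 6, 7], pad_id=0, max_seq_length=8)
assert padded.tolist() == [5, 6, 7, 0, 0, 0, 0, 0]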
- print(max_seq_length) padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 5a278ebbe..28b4221dc 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -17,7 +17,7 @@ from abc import ABC from abc import abstractmethod -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AddedToken from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer @@ -341,27 +341,27 @@ def __init__(self, tokenizer_name_or_path): self.tokenizer.add_special_tokens({ 'additional_special_tokens': [ - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), ] }) From 6db5c9b0bfbe9c3602d0f212c625937407df7199 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:09:31 +0700 Subject: [PATCH 063/148] show sentinal tokens --- megatron/data/non_causal_mlm_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index eae03d4ac..09e6a70ef 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -348,10 +348,13 @@ def build_training_sample(sample, tokens = sample # Truncate to `target_sequence_length`. - max_num_tokens = 512 #max_seq_length + max_num_tokens = max_seq_length truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] + print(sentinel_tokens) + import sys + sys.exit() # Masking. 
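The special-token strings in the tokenizer hunk above did not survive extraction (they render as empty quotes). As a hedged sketch only, registering T5-style sentinels with a Hugging Face tokenizer could look like the following; the <extra_id_i> names are assumptions rather than the tokens the patch actually adds, and the keyword accepted by AddedToken in transformers/tokenizers is normalized.

from transformers import AddedToken, AutoTokenizer

def add_sentinel_tokens(tokenizer, num_sentinels=25):
    """Register sentinel tokens as additional special tokens.
    Token strings are assumed (<extra_id_i>); lstrip/rstrip are off and
    normalization is disabled so the sentinels survive tokenization untouched."""
    sentinels = [
        AddedToken(f"<extra_id_{i}>", lstrip=False, rstrip=False, normalized=False)
        for i in range(num_sentinels)
    ]
    tokenizer.add_special_tokens({"additional_special_tokens": sentinels})
    return tokenizer.additional_special_tokens_ids

# Example usage (downloads a tokenizer; any GPT-style tokenizer would do):
# tok = AutoTokenizer.from_pretrained("gpt2")
# sentinel_ids = add_sentinel_tokens(tok)
# assert len(sentinel_ids) == 25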
max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( @@ -379,6 +382,7 @@ def build_training_sample(sample, # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) + input_tokens_ids.append(eos_id) # Add token to the output_tokens_ids output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) From 8a58007e44ef058d72236bd04edac4d4b673d089 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:10:52 +0700 Subject: [PATCH 064/148] show sentinal tokens --- examples/finetune_mp3.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh index 598f2acec..d813a9258 100644 --- a/examples/finetune_mp3.sh +++ b/examples/finetune_mp3.sh @@ -5,11 +5,11 @@ RANK=0 WORLD_SIZE=1 -DATA_PATH=data/t0-test_text_document +DATA_PATH=data/mc4-id_text_document CHECKPOINT_PATH=data -deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ +deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 4 \ @@ -23,7 +23,7 @@ deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ --load $CHECKPOINT_PATH \ --data-path $DATA_PATH \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles \ + --tokenizer-name-or-path bigscience/tokenizer \ --data-impl mmap \ --split 949,50,1 \ --distributed-backend nccl \ From 8b0bbc20423013d85879a6549f15687c7a46fd84 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:14:17 +0700 Subject: [PATCH 065/148] show sentinal tokens --- examples/finetune_mp3.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh index d813a9258..59cb34d4c 100644 --- a/examples/finetune_mp3.sh +++ b/examples/finetune_mp3.sh @@ -14,7 +14,7 @@ deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ --hidden-size 128 \ --num-attention-heads 4 \ --micro-batch-size 4 \ - --global-batch-size 16 \ + --global-batch-size 1024 \ --seq-length 626 \ --max-position-embeddings 1024 \ --train-iters 10000 \ @@ -39,4 +39,4 @@ deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ --eval-interval 1000 \ --eval-iters 10 \ --fp16 \ - --tensorboard-dir GPT2 + --tensorboard-dir LOG From 3d1b2560aa058bf9304402c5a02b2a378cfebbf5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:18:05 +0700 Subject: [PATCH 066/148] show sentinal tokens --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 09e6a70ef..578b53120 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -353,8 +353,7 @@ def build_training_sample(sample, tokens = tokens[:max_num_tokens] print(sentinel_tokens) - import sys - sys.exit() + # Masking. 
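To make the masking step concrete: with masking_style="t5", create_masked_lm_predictions returns spans whose index/label fields drive the sentinel substitution shown earlier. A toy stand-in (not the Megatron function) for the quantities involved:

import collections

MaskedSpan = collections.namedtuple("MaskedSpan", ["index", "label"])  # illustrative stand-in

masked_lm_prob, max_num_tokens = 0.15, 512
max_predictions_per_seq = masked_lm_prob * max_num_tokens   # 76.8, an upper bound on masked tokens

# Each span records the positions it covers and the original token ids at those
# positions, e.g. two spans drawn from one sample:
masked_spans = [
    MaskedSpan(index=[2, 3], label=[105, 106]),
    MaskedSpan(index=[7], label=[110]),
]
assert sum(len(s.index) for s in masked_spans) <= max_predictions_per_seq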
max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( From ce00fd9e2f92399f6f8b3a3fea897febfabcda00 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:25:10 +0700 Subject: [PATCH 067/148] add more special tokens --- megatron/data/non_causal_mlm_dataset.py | 2 -- megatron/tokenizer/tokenizer.py | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 578b53120..e00cad309 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -352,8 +352,6 @@ def build_training_sample(sample, truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] - print(sentinel_tokens) - # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 28b4221dc..c1924cfa1 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -362,6 +362,10 @@ def __init__(self, tokenizer_name_or_path): AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), ] }) From 3bcc50c14b9ec1802738ea6c9f51648ebba7d09c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:35:56 +0700 Subject: [PATCH 068/148] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 79 +++++++++++++++---------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e00cad309..5811f7fe7 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -249,6 +249,17 @@ def __init__( # Dataset. self.indexed_dataset = indexed_dataset + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + # Vocab stuff. tokenizer = get_tokenizer() self.vocab_id_list = list(tokenizer.inv_vocab.keys()) @@ -261,45 +272,51 @@ def __init__( self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids - # Checks - assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] + # # Checks + # assert np.min(documents) >= 0 + # assert np.max(documents) < indexed_dataset.sizes.shape[0] - # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - self.name, data_prefix, documents, self.indexed_dataset.sizes, - num_samples, seq_length, seed) + # # Build index mappings. 
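A minimal sketch of the samples_mapping-based retrieval the rewritten __getitem__ uses, assuming get_samples_mapping yields (start_index, end_index, seq_length) rows that select a run of documents to pack into one sample; plain Python lists stand in for the mmap-backed indexed dataset:

import numpy as np

# Stand-in for the indexed dataset: one token array per document.
indexed_dataset = [
    np.array([11, 12, 13], dtype=np.int64),
    np.array([21, 22], dtype=np.int64),
    np.array([31, 32, 33, 34], dtype=np.int64),
]

# Stand-in for get_samples_mapping output: rows of (start_index, end_index, seq_length).
samples_mapping = np.array([[0, 2, 5], [2, 3, 4]], dtype=np.int64)

def get_sample(idx):
    start_index, end_index, _seq_length = samples_mapping[idx]
    return np.concatenate([indexed_dataset[i] for i in range(start_index, end_index)])

assert get_sample(0).tolist() == [11, 12, 13, 21, 22]
assert get_sample(1).tolist() == [31, 32, 33, 34]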
+ # self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + # self.name, data_prefix, documents, self.indexed_dataset.sizes, + # num_samples, seq_length, seed) def __len__(self): # -1 is due to data structure used to retieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 + # return self.sample_idx.shape[0] - 1 + return self.samples_mapping.shape[0] def __getitem__(self, idx): - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) - else: - # Otherwise, get the rest of the initial document. - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) - sample = np.concatenate(sample_list) + # idx = self.shuffle_idx[idx] + # # Start and end documents and offsets. + # doc_index_f = self.sample_idx[idx][0] + # doc_index_l = self.sample_idx[idx + 1][0] + # offset_f = self.sample_idx[idx][1] + # offset_l = self.sample_idx[idx + 1][1] + # # If we are within the same document, just extract the chunk. + # if doc_index_f == doc_index_l: + # sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + # offset=offset_f, + # length=offset_l - offset_f + 1) + # else: + # # Otherwise, get the rest of the initial document. + # sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + # offset=offset_f)] + # # Loop over all in between documents and add the entire document. + # for i in range(doc_index_f + 1, doc_index_l): + # sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # # And finally add the relevant portion of last document. + # sample_list.append(self.indexed_dataset.get( + # self.doc_idx[doc_index_l], + # length=offset_l + 1)) + # sample = np.concatenate(sample_list) + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. From 76960f7c3be109a5637162b2ee3b97d2cb6f7011 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:51:36 +0700 Subject: [PATCH 069/148] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 5811f7fe7..d8dab0153 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -252,10 +252,10 @@ def __init__( # Build the samples mapping. 
self.samples_mapping = get_samples_mapping(self.indexed_dataset, data_prefix, - num_epochs, - max_num_samples, + False, #num_epochs, + num_samples, #max_num_samples, self.max_seq_length - 2, # account for added tokens - short_seq_prob, + 0.1, #short_seq_prob, self.seed, self.name, False) From 229d6617b81e34d3db50aeda650bec8d90264ca0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:53:54 +0700 Subject: [PATCH 070/148] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d8dab0153..b8b6dda16 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -254,7 +254,7 @@ def __init__( data_prefix, False, #num_epochs, num_samples, #max_num_samples, - self.max_seq_length - 2, # account for added tokens + self.seq_length-2, #self.max_seq_length - 2, # account for added tokens 0.1, #short_seq_prob, self.seed, self.name, From 55e3df7555ea9aa92c9746f870aa6a54bba421d0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:01:27 +0700 Subject: [PATCH 071/148] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index b8b6dda16..eaa98d0f7 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,6 +317,8 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + # sample = np.concatenate(sample_list) + print(sample) # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. From 05dea6d1e995ee5a995302ce06235c1c34ea8858 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:03:53 +0700 Subject: [PATCH 072/148] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index eaa98d0f7..d1839c2e6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,7 +317,7 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - # sample = np.concatenate(sample_list) + sample = np.concatenate(sample) print(sample) # Note that this rng state should be numpy and not python since From 661c8bba3960f6db1fb82bdc98fdeb2f1d813f58 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:16:31 +0700 Subject: [PATCH 073/148] added new script --- examples/train_non_causal_mlm_adaption.sh | 42 ++++ megatron/data/non_causal_mlm_dataset.py | 1 - train_non_causal_mlm_adaptation_gpt.py | 257 ++++++++++++++++++++++ 3 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 examples/train_non_causal_mlm_adaption.sh create mode 100644 train_non_causal_mlm_adaptation_gpt.py diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh new file mode 100644 index 000000000..b47ea7142 --- /dev/null +++ b/examples/train_non_causal_mlm_adaption.sh @@ -0,0 +1,42 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH=data/mc4-id_text_document +CHECKPOINT_PATH=data + + +deepspeed --num_gpus 8 train_non_causal_mlm_adaption_mlm.py \ + --num-layers 2 \ + --hidden-size 128 \ + --num-attention-heads 4 \ + --micro-batch-size 4 \ + --global-batch-size 1024 \ + --seq-length 626 \ + --max-position-embeddings 1024 \ + --train-iters 10000 \ + --lr-decay-iters 5000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience/tokenizer \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 \ + --tensorboard-dir LOG diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d1839c2e6..fadfcb6fc 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -398,7 +398,6 @@ def build_training_sample(sample, # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) - input_tokens_ids.append(eos_id) # Add token to the output_tokens_ids output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py new file mode 100644 index 000000000..b7af289a5 --- /dev/null +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
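Before the training script below, it may help to see how a packed sample of the form {'text': ids, 'prefix_len': n} is split into model inputs and shifted labels, which is what get_batch does; the ids here are made up for illustration:

import torch

# One packed example: prefix (inputs) followed by targets, already padded elsewhere.
sample = {"text": torch.tensor([[5, 6, 7, 32000, 8, 9, 1, 0]]), "prefix_len": 4}

tokens_ = sample["text"].long()
tokens = tokens_[:, :-1].contiguous()    # model inputs
labels = tokens_[:, 1:].contiguous()     # next-token targets (inputs shifted left by one)
prefix_indices = [sample["prefix_len"]]  # one prefix boundary per sequence in the batch

assert tokens.shape == labels.shape == (1, 7)
assert labels[0, 0].item() == tokens[0, 1].item()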
+ +"""Pretrain GPT""" + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu +# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import subprocess + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + prefix_lm=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=True + ) + see_memory_usage(f"After Building Model", force=True) + return model + +_KEYS = ['text', 'prefix_len'] + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and position ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, 
shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 97d3810fdd5b78c1efb3465a9153d78adc1064c0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:26:13 +0700 Subject: [PATCH 074/148] added new script --- megatron/data/non_causal_mlm_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index fadfcb6fc..4bcd41e53 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,8 +317,12 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + print(self.indexed_dataset[index]) + print(len(self.indexed_dataset[index])) sample = np.concatenate(sample) print(sample) + import sys + sys.exit() # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. From 71388ee30815fc99411b9e62b35bec0dc4fce880 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:27:55 +0700 Subject: [PATCH 075/148] added new script --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 4bcd41e53..e6cab3c59 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,10 +317,9 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - print(self.indexed_dataset[index]) print(len(self.indexed_dataset[index])) sample = np.concatenate(sample) - print(sample) + print(len(sample)) import sys sys.exit() From b0f04d5c19c654c94b96348042c485955b876eff Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:50:32 +0700 Subject: [PATCH 076/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 654 ++++++------------------ train_non_causal_mlm_adaptation_gpt.py | 18 +- 2 files changed, 172 insertions(+), 500 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e6cab3c59..1b5d37250 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -15,236 +15,30 @@ """GPT Non-Causal Mask Language Model Finetune Style dataset.""" -import os -import time import collections import numpy as np import torch -from megatron import mpu, print_rank_0, get_tokenizer -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ -from megatron.data.dataset_utils import create_masked_lm_predictions, get_samples_mapping -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - - -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup): - """Build train, valid, and 
test datasets.""" - - # Single dataset. - if len(data_prefix) == 1: - all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup) - # Blending dataset. - else: - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - all_train_datasets = BlendableDataset(train_datasets, weights) \ - if train_datasets else None - all_valid_datasets = BlendableDataset(valid_datasets, weights) \ - if valid_datasets else None - all_test_datasets = BlendableDataset(test_datasets, weights) \ - if test_datasets else None - - return all_train_datasets, all_valid_datasets, all_test_datasets - - -def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, train_valid_test): - ''' - Build a single dataset group corresponding to Option 2 of data loading see arguments.py - a dataset group is passed on the following form - GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 - or alternatively - GIVEN_NAME PATH1 # for a single dataset to be used fully - ''' - - assert train_valid_test in ["train","valid","test"] - - # Single dataset. - if len(paths) == 1: - dataset = _build_single_datasets(paths[0], - splits[0], - data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - dataset_group_name, train_valid_test) - return dataset - # Blending dataset. - else: - - data_prefix = [] - # data_prefix is on the shape: - # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] - for w,p in zip(weights, paths): - data_prefix += [w,p] - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - datasets = [] - for i in range(len(prefixes)): - ds = _build_single_datasets(prefixes[i], - splits[i], - data_impl, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, skip_warmup, - dataset_group_name, train_valid_test) - - datasets.append(ds) - all_datasets = BlendableDataset(datasets, weights) - - return all_datasets - -def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, - seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): - """Build a single dataset""" - - assert train_valid_test in ["train","valid","test"] - index = ["train","valid","test"].index(train_valid_test) - - # Indexed dataset. 
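The removed code above, together with --split 949,50,1 in the example scripts, relies on Megatron's get_train_valid_test_split_. A hedged sketch of the idea (normalize the weights, then turn them into cumulative document boundaries; the real function's rounding may differ):

def train_valid_test_split_sketch(splits_string, size):
    """Turn a weight string like '949,50,1' into [train_start, valid_start, test_start, end]
    document indices over `size` documents (illustrative only)."""
    weights = [float(w) for w in splits_string.split(",")]
    total = sum(weights)
    boundaries = [0]
    for w in weights:
        boundaries.append(boundaries[-1] + int(round(size * w / total)))
    boundaries[-1] = size   # absorb rounding drift into the last split
    return boundaries

assert train_valid_test_split_sketch("949,50,1", 1000) == [0, 949, 999, 1000]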
- indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - # this corresponds to option2 for data loading on the form - # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 - # splits here is an array of size 2 [start_index, end_index] - splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) - - # Print stats about the splits. - print_rank_0(' > dataset split:') - - print_rank_0(' {}:'.format(dataset_group_name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[0], splits[1], - splits[1] - splits[0])) - - def build_dataset(name): - dataset = None - if splits[1] > splits[0]: - documents = np.arange(start=splits[0], stop=splits[1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) - return dataset - - dataset = build_dataset(dataset_group_name) - - return dataset - - -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup): - """Build train, valid, and test datasets.""" - - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - # Print stats about the splits. - print_rank_0(' > dataset split:') - - def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -def get_indexed_dataset_(path, data_impl, skip_warmup): - """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') - start_time = time.time() - indexed_dataset = make_indexed_dataset(path, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -class NonCausalMLMDataset(torch.utils.data.Dataset): - - def __init__( - self, - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - masked_lm_prob=0.15, - ): +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping +) + +class T5Dataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, max_seq_length_dec, + 
short_seq_prob, seed): # Params to store. self.name = name - self.seq_length = seq_length self.seed = seed self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec # Dataset. self.indexed_dataset = indexed_dataset @@ -252,10 +46,10 @@ def __init__( # Build the samples mapping. self.samples_mapping = get_samples_mapping(self.indexed_dataset, data_prefix, - False, #num_epochs, - num_samples, #max_num_samples, - self.seq_length-2, #self.max_seq_length - 2, # account for added tokens - 0.1, #short_seq_prob, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, self.seed, self.name, False) @@ -271,75 +65,34 @@ def __init__( self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids - - # # Checks - # assert np.min(documents) >= 0 - # assert np.max(documents) < indexed_dataset.sizes.shape[0] - - # # Build index mappings. - # self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - # self.name, data_prefix, documents, self.indexed_dataset.sizes, - # num_samples, seq_length, seed) + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - # return self.sample_idx.shape[0] - 1 return self.samples_mapping.shape[0] def __getitem__(self, idx): - # idx = self.shuffle_idx[idx] - # # Start and end documents and offsets. - # doc_index_f = self.sample_idx[idx][0] - # doc_index_l = self.sample_idx[idx + 1][0] - # offset_f = self.sample_idx[idx][1] - # offset_l = self.sample_idx[idx + 1][1] - # # If we are within the same document, just extract the chunk. - # if doc_index_f == doc_index_l: - # sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - # offset=offset_f, - # length=offset_l - offset_f + 1) - # else: - # # Otherwise, get the rest of the initial document. - # sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - # offset=offset_f)] - # # Loop over all in between documents and add the entire document. - # for i in range(doc_index_f + 1, doc_index_l): - # sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # # And finally add the relevant portion of last document. - # sample_list.append(self.indexed_dataset.get( - # self.doc_idx[doc_index_l], - # length=offset_l + 1)) - # sample = np.concatenate(sample_list) - start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - print(len(self.indexed_dataset[index])) - sample = np.concatenate(sample) - print(len(sample)) - import sys - sys.exit() - # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
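Seeding a fresh numpy RandomState with seed + idx, as done just below, makes the masking drawn for a given sample reproducible. For example:

import numpy as np

seed, idx = 1234, 42
draw_a = np.random.RandomState(seed + idx).randint(0, 10, size=3)
draw_b = np.random.RandomState(seed + idx).randint(0, 10, size=3)
assert (draw_a == draw_b).all()
# np.random.RandomState.randint excludes the upper bound, unlike Python's random.randint.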
np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, - self.seq_length, # needed for padding + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, self.vocab_id_to_token_dict, self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.bos_id, self.eos_id, - self.sentinel_tokens - ) + self.sentinel_tokens) -def build_training_sample(sample, - max_seq_length, +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, vocab_id_list, vocab_id_to_token_dict, cls_id, sep_id, mask_id, pad_id, masked_lm_prob, np_rng, bos_id=None, @@ -348,6 +101,7 @@ def build_training_sample(sample, Arguments: sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. @@ -365,47 +119,35 @@ def build_training_sample(sample, sentinel_tokens: unique value to be substituted for every replaced span """ + assert target_seq_length <= max_seq_length + # flatten sentences into one list - # tokens = [token for sentence in sample for token in sentence] - tokens = sample + tokens = [token for sentence in sample for token in sentence] # Truncate to `target_sequence_length`. - max_num_tokens = max_seq_length + max_num_tokens = target_seq_length truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( + (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - sentinel_tokens = collections.deque(sentinel_tokens) - input_tokens_ids = [] - output_tokens_ids = [] #[bos_id] - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - output_tokens_ids.append(flag) - output_tokens_ids.extend(span.label) - - end_index = span.index[0] - input_tokens_ids.extend(tokens[start_index: end_index]) - input_tokens_ids.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 + # Padding. + input_tokens_ids, output_tokens_ids, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - # Add the remaining tokens to input_tokens_ids - input_tokens_ids.extend(tokens[start_index:]) - # Add token to the output_tokens_ids - output_tokens_ids.append(eos_id) - prefix_len = len(input_tokens_ids) - text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) + text_tokens_ids = input_tokens_ids + output_tokens_ids print("input_tokens_ids") print(len(input_tokens_ids)) print(input_tokens_ids) @@ -422,8 +164,6 @@ def build_training_sample(sample, # self.seq_length # ) - # Padding. 
- padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() @@ -433,207 +173,127 @@ def build_training_sample(sample, } -def _build_index_mappings(name, data_prefix, documents, sizes, - num_samples, seq_length, seed, cutoff_last_epoch=0.95): - """Build doc-idx, sample-idx, and shuffle-idx. - doc-idx: is an array (ordered) of documents to be used in training. - sample-idx: is the start document index and document offset for each - training sample. - shuffle-idx: maps the sample index into a random index into sample-idx. - """ - # Number of tokens in each epoch and number of required epochs. - tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) - # rng state - np_rng = np.random.RandomState(seed=seed) - - # Filename of the index mappings. - _filename = data_prefix - _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) - _filename += '_{}sl'.format(seq_length) - _filename += '_{}s'.format(seed) - doc_idx_filename = _filename + '_doc_idx.npy' - sample_idx_filename = _filename + '_sample_idx.npy' - shuffle_idx_filename = _filename + '_shuffle_idx.npy' - - # Build the indexed mapping if not exist. - if torch.distributed.get_rank() == 0: - if (not os.path.isfile(doc_idx_filename)) or \ - (not os.path.isfile(sample_idx_filename)) or \ - (not os.path.isfile(shuffle_idx_filename)): - - print_rank_0(' > WARNING: could not find index map files, building ' - 'the indices on rank 0 ...') - - # For the last epoch, decide whether include the entire epoch - # in the global shuffle or not. - - # If we need only one epoch, then separating last epoch does - # not mean anything. - if num_epochs == 1: - separate_last_epoch = False - print(' > only one epoch required, setting ' - 'separate_last_epoch to False', flush=True) - - else: - # Get the number of samples for the last epoch - num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - 1) // seq_length - last_epoch_num_samples = num_samples - \ - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, \ - f'last epoch number of samples {last_epoch_num_samples} should be non-negative.' - num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples <= num_samples_per_epoch, \ - f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.' - # If we have less than cutoff_last_epoch * samples_per_epoch of the samples for the last epoch, - # seperate out the epoch and treat it differently. - separate_last_epoch = (last_epoch_num_samples < - int(cutoff_last_epoch * num_samples_per_epoch)) - if separate_last_epoch: - string = ' > last epoch number of samples ({}) is smaller '\ - 'than {}% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to True' - else: - string = ' > last epoch number of samples ({}) is larger '\ - 'than {}% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to False' - print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100, - num_samples_per_epoch), flush=True) - - # doc-idx. - start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng, - separate_last_epoch) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save doc-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. 
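# Illustrative arithmetic only (made-up numbers) for the index-mapping logic removed above:
# 1000 tokens per epoch, seq_length 100, 21 requested samples.
tokens_per_epoch, seq_length, num_samples = 1000, 100, 21
num_epochs = 3  # smallest n with (n * tokens_per_epoch - 1) // seq_length >= num_samples
num_samples_from_epochs_minus_one = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length  # 19
last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one                     # 2
num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length                                 # 9
separate_last_epoch = last_epoch_num_samples < int(0.95 * num_samples_per_epoch)             # 2 < 8 -> True
# Only 2 of the 9 possible samples are needed from epoch 3, so that last, mostly-unused
# epoch is shuffled separately from the full epochs.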
- # First compile and then import. - from megatron.data import helpers - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch) - - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save sample-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - if separate_last_epoch: - num_samples_ = num_samples_from_epochs_minus_one - else: - num_samples_ = sample_idx.shape[0] - 1 - shuffle_idx = _build_shuffle_idx(num_samples_, - sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save shuffle-idx mapping' - ' (seconds): {:4f}'.format(time.time() - start_time)) - - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) - - # Load mappings. - start_time = time.time() - print_rank_0(' > loading doc-idx mapping from {}'.format( - doc_idx_filename)) - doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading sample-idx mapping from {}'.format( - sample_idx_filename)) - sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading shuffle-idx mapping from {}'.format( - shuffle_idx_filename)) - shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( - time.time() - start_time)) - print_rank_0(' total number of samples: {}'.format( - sample_idx.shape[0])) - print_rank_0(' total number of epochs: {}'.format(num_epochs)) - - return doc_idx, sample_idx, shuffle_idx - - -def _num_tokens(documents, sizes): - """Total number of tokens in the dataset.""" - return np.sum(sizes[documents]) - - -def _num_epochs(tokens_per_epoch, seq_length, num_samples): - """Based on number of samples and sequence lenght, calculate how many - epochs will be needed.""" - num_epochs = 0 - total_tokens = 0 - while True: - num_epochs += 1 - total_tokens += tokens_per_epoch - # -1 is because we need to retrieve seq_length + 1 token each time - # but the last token will overlap with the first token of the next - # sample except for the last sample. - if ((total_tokens - 1) // seq_length) >= num_samples: - return num_epochs - - -def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): - """Build an array with length = number-of-epochs * number-of-dcuments. 
- Each index is mapped to a corresponding document.""" - if not separate_last_epoch or num_epochs == 1: - doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] - doc_idx[:] = documents - doc_idx = doc_idx.reshape(-1) - doc_idx = doc_idx.astype(np.int32) - np_rng.shuffle(doc_idx) - return doc_idx - - doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) - doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) - return np.concatenate((doc_idx_first, doc_idx_last)) - - -def _build_shuffle_idx(num_samples, total_size, np_rng): - """Build the range [0, size) and shuffle.""" - print(' > building shuffle index with split [0, {}) and [{}, {}) ' - '...'.format(num_samples, num_samples, total_size), flush=True) - - dtype_ = np.uint32 - if total_size >= (np.iinfo(np.uint32).max - 1): - dtype_ = np.int64 - - shuffle_idx_first = np.arange(start=0, stop=num_samples, - step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_first) - if num_samples == total_size: - return shuffle_idx_first - - shuffle_idx_last = np.arange(start=num_samples, stop=total_size, - step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_last) - - return np.concatenate((shuffle_idx_first, shuffle_idx_last)) - - -def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): + # train_sample = { + # 'text_enc': tokens_enc, + # 'text_dec': tokens_dec_in, + # 'labels': labels, + # 'loss_mask': loss_mask, + # 'truncated': int(truncated), + # 'enc_mask': enc_mask, + # 'dec_mask': dec_mask, + # 'enc_dec_mask': enc_dec_mask, + # } + # return train_sample + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): """Pad sequences and convert them to numpy.""" + sentinel_tokens = collections.deque(sentinel_tokens) + t5_input = [] + (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() + + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_in.extend(span.label) + t5_decoder_out.append(flag) + t5_decoder_out.extend(span.label) + + end_index = span.index[0] + t5_input.extend(tokens[start_index: end_index]) + t5_input.append(flag) + + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 + + # Add token to the t5_decoder_out + t5_decoder_out.append(eos_id) + + # Add the remaining tokens to the t5 input + t5_input.extend(tokens[start_index:]) + + # assert (len(t5_input) - len(masked_spans)) + \ + # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) + # Some checks. - num_tokens = len(tokens) + + # Encoder-side padding mask. + num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens assert padding_length >= 0 + assert len(masked_positions) == len(masked_labels) + + # Tokens.. + filler = [pad_id] * padding_length + tokens_enc = np.array(t5_input + filler, dtype=np.int64) - # Tokens and token types. - filler = np.array([pad_id] * padding_length) - tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + # Decoder-side padding mask. 
+ num_tokens_dec = len(t5_decoder_in) + padding_length_dec = max_seq_length_dec - num_tokens_dec + assert padding_length_dec >= 0 + filler_dec = [pad_id] * padding_length_dec + tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - return tokens_np \ No newline at end of file + # Create attention masks + enc_mask = make_attention_mask(tokens_enc, tokens_enc) + enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) + dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) + dec_mask = dec_mask * make_history_mask(tokens_dec_in) + + # Labels mask. + labels = t5_decoder_out + ([-1] * padding_length_dec) + labels = np.array(labels, dtype=np.int64) + + # Loss mask + loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) + loss_mask = np.array(loss_mask, dtype=np.int64) + + return tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask + + +def make_attention_mask(source_block, target_block): + """ + Returns a 2-dimensional (2-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + mask = mask.astype(np.int64) + # (source_length, target_length) + return mask + + +def make_attention_mask_3d(source_block, target_block): + """ + Returns a 3-dimensional (3-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1) + # (batch, source_length, target_length) + # mask = mask.astype(np.int64) + return mask + + +def make_history_mask(block): + length = block.shape[0] + arange = np.arange(length) + history_mask = (arange[None, ] <= arange[:, None]) + history_mask = history_mask.astype(np.int64) + return history_mask + + +def make_history_mask_3d(block): + batch, length = block.shape + arange = torch.arange(length, device=block.device) + history_mask = (arange[None, ] <= arange[:, None])[None, ] + history_mask = history_mask.expand(batch, length, length) + return history_mask diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index b7af289a5..3a12e0b95 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -22,7 +22,7 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group + from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain @@ -184,14 +184,26 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: + # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + # data_prefix=args.data_path, + # data_impl=args.data_impl, + # splits_string=args.split, + # train_valid_test_num_samples=train_val_test_num_samples, + # seq_length=args.seq_length, + # seed=args.seed, + # skip_warmup=(not args.mmap_warmup)) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, + max_seq_length=512,#args.encoder_seq_length, + max_seq_length_dec=114,#args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, 
seed=args.seed, - skip_warmup=(not args.mmap_warmup)) + skip_warmup=(not args.mmap_warmup), + dataset_type='t5') # Option 2 of data loading using --(train|valid|test)-weighted-split-paths elif args.train_weighted_split_paths: From cd43a54249f6e999215c63acfb4bc6427f41a34c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:51:46 +0700 Subject: [PATCH 077/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 1b5d37250..43b25544a 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -144,8 +144,7 @@ def build_training_sample(sample, target_seq_length, max_seq_length_dec, masked_spans, bos_id, eos_id, sentinel_tokens) - text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - + #text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) text_tokens_ids = input_tokens_ids + output_tokens_ids print("input_tokens_ids") From e0dc666b18f432495711d379f7a29509cffb7703 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:56:19 +0700 Subject: [PATCH 078/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 212 +++++++++++++++++++++++- 1 file changed, 205 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 43b25544a..6425c8b72 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -15,18 +15,216 @@ """GPT Non-Causal Mask Language Model Finetune Style dataset.""" +import os +import time import collections import numpy as np import torch -from megatron import get_tokenizer -from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping -) - -class T5Dataset(torch.utils.data.Dataset): +from megatron import mpu, print_rank_0, get_tokenizer +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_samples_mapping, create_masked_lm_predictions +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. + if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. 
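# Illustrative sketch (paths and weights made up) of the flat data_prefix layout that the
# group handling above builds before calling get_datasets_weights_and_num_samples:
weights = [0.7, 0.3]
paths = ["data/web-en_text_document", "data/code_text_document"]
data_prefix = []
for w, p in zip(weights, paths):
    data_prefix += [w, p]
print(data_prefix)  # [0.7, 'data/web-en_text_document', 0.3, 'data/code_text_document']
# i.e. the same alternating "WEIGHT1 PATH1 WEIGHT2 PATH2 ..." shape noted in the comment above.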
+ print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = NonCausalMLMDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = NonCausalMLMDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + +class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, masked_lm_prob, From 866cee153a9398382c9e95709539c63278f16735 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:13:34 +0700 Subject: [PATCH 079/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 236 +++++++++--------------- 1 file changed, 83 insertions(+), 153 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 6425c8b72..adc752280 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -25,150 +25,72 @@ from megatron import mpu, print_rank_0, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_samples_mapping, create_masked_lm_predictions -from megatron.data.dataset_utils import get_train_valid_test_split_, 
get_split_by_range_ +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_, get_indexed_dataset_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, - seq_length, seed, skip_warmup): - """Build train, valid, and test datasets.""" - - # Single dataset. + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + skip_warmup, binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): if len(data_prefix) == 1: - all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, train_valid_test_num_samples, - seq_length, seed, skip_warmup) - # Blending dataset. - else: - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - all_train_datasets = BlendableDataset(train_datasets, weights) \ - if train_datasets else None - all_valid_datasets = BlendableDataset(valid_datasets, weights) \ - if valid_datasets else None - all_test_datasets = BlendableDataset(test_datasets, weights) \ - if test_datasets else None - - return all_train_datasets, all_valid_datasets, all_test_datasets - - -def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, train_valid_test): - ''' - Build a single dataset group corresponding to Option 2 of data loading see arguments.py - a dataset group is passed on the following form - GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 - or alternatively - GIVEN_NAME PATH1 # for a single dataset to be used fully - ''' - - assert train_valid_test in ["train","valid","test"] - - # Single dataset. - if len(paths) == 1: - dataset = _build_single_datasets(paths[0], - splits[0], - data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - dataset_group_name, train_valid_test) - return dataset + max_seq_length, masked_lm_prob, + short_seq_prob, seed, + skip_warmup, + binary_head, + max_seq_length_dec, + dataset_type=dataset_type) # Blending dataset. - else: - - data_prefix = [] - # data_prefix is on the shape: - # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] - for w,p in zip(weights, paths): - data_prefix += [w,p] - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. 
- datasets = [] - for i in range(len(prefixes)): - ds = _build_single_datasets(prefixes[i], - splits[i], - data_impl, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, skip_warmup, - dataset_group_name, train_valid_test) - - datasets.append(ds) - all_datasets = BlendableDataset(datasets, weights) - - return all_datasets - -def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, - seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): - """Build a single dataset""" - - assert train_valid_test in ["train","valid","test"] - index = ["train","valid","test"].index(train_valid_test) - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - # this corresponds to option2 for data loading on the form - # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 - # splits here is an array of size 2 [start_index, end_index] - splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) - - # Print stats about the splits. - print_rank_0(' > dataset split:') - - print_rank_0(' {}:'.format(dataset_group_name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[0], splits[1], - splits[1] - splits[0])) - - def build_dataset(name): - dataset = None - if splits[1] > splits[0]: - documents = np.arange(start=splits[0], stop=splits[1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) - return dataset - - dataset = build_dataset(dataset_group_name) - - return dataset + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) + + return (blending_train_dataset, blending_valid_dataset, + blending_test_dataset) def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, - seq_length, seed, skip_warmup): + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + skip_warmup, binary_head, + max_seq_length_dec, + dataset_type='standard_bert'): """Build train, valid, and test datasets.""" @@ -177,8 +99,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0] - # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') @@ -188,6 +109,11 @@ def print_split_stats(name, index): print_rank_0(' document indices in [{}, {}) total of {} ' 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index])) + start_index = indexed_dataset.doc_idx[splits[index]] + end_index = indexed_dataset.doc_idx[splits[index + 1]] + print_rank_0(' sentence indices in [{}, {}) total of {} ' + 'sentences'.format(start_index, end_index, + end_index - start_index)) print_split_stats('train', 0) print_split_stats('validation', 1) print_split_stats('test', 2) @@ -195,12 +121,30 @@ def print_split_stats(name, index): def build_dataset(index, name): dataset = None if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + # Get the pointer to the original doc-idx so we can set it later. + doc_idx_ptr = indexed_dataset.get_doc_idx() + # Slice the doc-idx + start_index = splits[index] + # Add +1 so we can index into the dataset to get the upper bound. + end_index = splits[index + 1] + 1 + # New doc_idx view. + indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) + # Build the dataset accordingly. 
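# Illustrative sketch (made-up numbers) of the doc-idx slicing just above: with 100 documents
# and splits [0, 80, 90, 100], the train view keeps boundaries 0..80 (81 entries), so document
# i of the view still spans [doc_idx[i], doc_idx[i + 1]); that is why end_index gets the "+ 1".
splits = [0, 80, 90, 100]
doc_idx_ptr = list(range(0, 101))                        # stand-in for indexed_dataset.get_doc_idx()
index = 0                                                # 0 = train split
view = doc_idx_ptr[splits[index]:splits[index + 1] + 1]
print(len(view), view[0], view[-1])                      # 81 0 80
# set_doc_idx(view) narrows the indexed dataset to this split; the original pointer is
# expected to be restored afterwards with set_doc_idx(doc_idx_ptr).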
+ kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=max_seq_length, + seed=seed, + ) + dataset = NonCausalMLMDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs + ) return dataset train_dataset = build_dataset(0, 'train') @@ -210,20 +154,6 @@ def build_dataset(index, name): return (train_dataset, valid_dataset, test_dataset) -def get_indexed_dataset_(path, data_impl, skip_warmup): - """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') - start_time = time.time() - indexed_dataset = make_indexed_dataset(path, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, From 0b56a7da5f64e80209e15710cd0d5196bae44915 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:14:54 +0700 Subject: [PATCH 080/148] try t5 dataset --- train_non_causal_mlm_adaptation_gpt.py | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 3a12e0b95..7bbb4308d 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -205,34 +205,34 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): skip_warmup=(not args.mmap_warmup), dataset_type='t5') - # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - elif args.train_weighted_split_paths: - assigned_train_valid_test = [] - if args.train_weighted_split_paths is not None: - train_ds = [] - assigned_train_valid_test.append("train") - if args.valid_weighted_split_paths is not None: - valid_ds = [] - assigned_train_valid_test.append("valid") - if args.test_weighted_split_paths is not None: - test_ds = [] - assigned_train_valid_test.append("test") - - for s in assigned_train_valid_test: - data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - eval(f"args.{s}_weighted_split_weights"), - eval(f"args.{s}_weighted_split_splits"), - eval(f"args.{s}_weighted_split_names")) - for paths, weights, splits, name in data_groups: - d = build_dataset_group(name, paths, weights, splits, - args.data_impl, - train_val_test_num_samples, - args.seq_length, args.seed, - (not args.mmap_warmup), - train_valid_test=s) - eval(f"{s}_ds").append(d) - else: - raise NotImplementedError("No dataloading argument passed") + # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + # elif args.train_weighted_split_paths: + # assigned_train_valid_test = [] + # if args.train_weighted_split_paths is not None: + # train_ds = [] + # assigned_train_valid_test.append("train") + # if args.valid_weighted_split_paths is not None: + # valid_ds = [] + # assigned_train_valid_test.append("valid") + # if args.test_weighted_split_paths is not None: + # test_ds = [] + # assigned_train_valid_test.append("test") + + # for s in assigned_train_valid_test: + # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + # eval(f"args.{s}_weighted_split_weights"), + # eval(f"args.{s}_weighted_split_splits"), + # eval(f"args.{s}_weighted_split_names")) + # for paths, 
weights, splits, name in data_groups: + # d = build_dataset_group(name, paths, weights, splits, + # args.data_impl, + # train_val_test_num_samples, + # args.seq_length, args.seed, + # (not args.mmap_warmup), + # train_valid_test=s) + # eval(f"{s}_ds").append(d) + # else: + # raise NotImplementedError("No dataloading argument passed") print_rank_0("> finished creating GPT datasets ...") return train_ds, valid_ds, test_ds From 5bb512b9991b9fc7ce995367c0193caf33bab80a Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:15:18 +0700 Subject: [PATCH 081/148] try t5 dataset --- train_non_causal_mlm_adaptation_gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 7bbb4308d..3175f35da 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -23,7 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From 31d844f18ff8c4249692c31ae4989ad767783c73 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:16:50 +0700 Subject: [PATCH 082/148] try t5 dataset --- train_non_causal_mlm_adaptation_gpt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 3175f35da..6e57dfd15 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -23,7 +23,8 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets +#, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From 1d21963bce3d4d16bc6dcb9685637c7d32dbcf62 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:18:27 +0700 Subject: [PATCH 083/148] try t5 dataset --- examples/train_non_causal_mlm_adaption.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh index b47ea7142..6cb1c38a0 100644 --- a/examples/train_non_causal_mlm_adaption.sh +++ b/examples/train_non_causal_mlm_adaption.sh @@ -9,7 +9,7 @@ DATA_PATH=data/mc4-id_text_document CHECKPOINT_PATH=data -deepspeed --num_gpus 8 train_non_causal_mlm_adaption_mlm.py \ +deepspeed --num_gpus 8 train_non_causal_mlm_adaption_gpt.py \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 4 \ From 1429645bced0f12583a724b80c279800fac5b605 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:19:16 +0700 Subject: [PATCH 084/148] try t5 dataset --- examples/train_non_causal_mlm_adaption.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh index 6cb1c38a0..e150e3cee 100644 --- 
a/examples/train_non_causal_mlm_adaption.sh +++ b/examples/train_non_causal_mlm_adaption.sh @@ -9,7 +9,7 @@ DATA_PATH=data/mc4-id_text_document CHECKPOINT_PATH=data -deepspeed --num_gpus 8 train_non_causal_mlm_adaption_gpt.py \ +deepspeed --num_gpus 8 train_non_causal_mlm_adaptation_gpt.py \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 4 \ From f5341f8687d067a850ba9293260de4c038f2fac9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:29:01 +0700 Subject: [PATCH 085/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 65 +++++++++++++------------ train_non_causal_mlm_adaptation_gpt.py | 3 +- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index adc752280..b8b17a8a6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -47,38 +47,39 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, max_seq_length_dec, dataset_type=dataset_type) # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) + else: + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 6e57dfd15..3175f35da 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -23,8 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets -#, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From b05b17512872d61e340eab596083b6482ae85fec Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:39:19 +0700 Subject: [PATCH 086/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 67 ++++++++++++------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index b8b17a8a6..e3db5f00c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -47,39 +47,38 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, max_seq_length_dec, dataset_type=dataset_type) # Blending dataset. - else: - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. + blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) @@ -100,7 +99,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 + total_num_of_documents = indexed_dataset.sizes.shape[0]# - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') From 59a6e322c97584c4676afb49321284dc6ec29685 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:45:51 +0700 Subject: [PATCH 087/148] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e3db5f00c..6fd2fef81 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -36,6 +36,9 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, skip_warmup, binary_head=False, max_seq_length_dec=None, dataset_type='standard_bert'): + print(len(data_prefix)) + import sys + sys.exit() if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, @@ -99,7 +102,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0]# - 1 + total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') From ab76d4924407c9cfa99eece3b9caa60502158b50 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:54:53 +0700 Subject: [PATCH 088/148] developing --- megatron/data/non_causal_mlm_dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 6fd2fef81..e45966fa8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -36,9 +36,6 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, skip_warmup, binary_head=False, max_seq_length_dec=None, dataset_type='standard_bert'): - print(len(data_prefix)) - import sys - sys.exit() if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, @@ -148,6 +145,11 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + indexed_dataset.set_doc_idx(doc_idx_ptr) + # Checks. 
+ assert indexed_dataset.doc_idx[0] == 0 + assert indexed_dataset.doc_idx.shape[0] == \ + (total_num_of_documents + 1) return dataset train_dataset = build_dataset(0, 'train') From 0d8dfacf6e78d73f0cb819a6f751a57888029bfb Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:03:09 +0700 Subject: [PATCH 089/148] developing --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e45966fa8..231275b8c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -147,9 +147,9 @@ def build_dataset(index, name): ) indexed_dataset.set_doc_idx(doc_idx_ptr) # Checks. - assert indexed_dataset.doc_idx[0] == 0 - assert indexed_dataset.doc_idx.shape[0] == \ - (total_num_of_documents + 1) + # assert indexed_dataset.doc_idx[0] == 0 + # assert indexed_dataset.doc_idx.shape[0] == \ + # (total_num_of_documents + 1) return dataset train_dataset = build_dataset(0, 'train') From e6292245edd5dbd43249355119bd23950c031fe8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:04:37 +0700 Subject: [PATCH 090/148] developing --- megatron/data/non_causal_mlm_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 231275b8c..063cb9a61 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -277,18 +277,18 @@ def build_training_sample(sample, target_seq_length, max_seq_length_dec, masked_spans, bos_id, eos_id, sentinel_tokens) - #text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) + # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - text_tokens_ids = input_tokens_ids + output_tokens_ids + # text_tokens_ids = input_tokens_ids + output_tokens_ids print("input_tokens_ids") print(len(input_tokens_ids)) print(input_tokens_ids) print("output_tokens_ids") print(len(output_tokens_ids)) print(output_tokens_ids) - print("text_tokens_ids") - # print(text_tokens_ids) - print(len(text_tokens_ids)) + # print("text_tokens_ids") + # # print(text_tokens_ids) + # print(len(text_tokens_ids)) # input_tokens_ids = pad_and_convert_to_numpy( # input_tokens_ids, From efcf50f6b79ce4822dd6714c5d689c27bced8de5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:24:27 +0700 Subject: [PATCH 091/148] developing --- examples/train_non_causal_mlm_adaption.sh | 1 + megatron/data/non_causal_mlm_dataset.py | 38 ++++++++++++++++++----- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh index e150e3cee..a595fe161 100644 --- a/examples/train_non_causal_mlm_adaption.sh +++ b/examples/train_non_causal_mlm_adaption.sh @@ -39,4 +39,5 @@ deepspeed --num_gpus 8 train_non_causal_mlm_adaptation_gpt.py \ --eval-interval 1000 \ --eval-iters 10 \ --fp16 \ + --loss-on-targets-only \ --tensorboard-dir LOG diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 063cb9a61..a02c27e12 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -269,13 +269,37 @@ def build_training_sample(sample, target_seq_length, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. 
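# Illustrative sketch (names and numbers assumed, not patch code): because the sample comes
# back as one concatenated [input ; output] sequence plus prefix_len, a
# --loss-on-targets-only style mask can be derived downstream by zeroing every position
# before prefix_len as well as any trailing padding.
import numpy as np

def targets_only_loss_mask(total_len, prefix_len, num_pad):
    loss_mask = np.ones(total_len, dtype=np.int64)
    loss_mask[:prefix_len] = 0                 # no loss on the (bidirectionally visible) prefix
    if num_pad > 0:
        loss_mask[total_len - num_pad:] = 0    # no loss on padding
    return loss_mask

print(targets_only_loss_mask(total_len=12, prefix_len=7, num_pad=2))
# [0 0 0 0 0 0 0 1 1 1 0 0]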
- input_tokens_ids, output_tokens_ids, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask \ - = pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, max_seq_length, - max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) + sentinel_tokens = collections.deque(sentinel_tokens) + input_tokens_ids = [] + output_tokens_ids = [] #[bos_id] + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() + + output_tokens_ids.append(flag) + output_tokens_ids.extend(span.label) + + end_index = span.index[0] + input_tokens_ids.extend(tokens[start_index: end_index]) + input_tokens_ids.append(flag) + + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 + + + # Add the remaining tokens to input_tokens_ids + input_tokens_ids.extend(tokens[start_index:]) + # Add token to the output_tokens_ids + output_tokens_ids.append(eos_id) + prefix_len = len(input_tokens_ids) + + # # Padding. + # input_tokens_ids, _, output_tokens_ids, enc_mask, \ + # dec_mask, enc_dec_mask, loss_mask \ + # = pad_and_convert_to_numpy(tokens, masked_positions, + # masked_labels, pad_id, max_seq_length, + # max_seq_length_dec, masked_spans, + # bos_id, eos_id, sentinel_tokens) # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) From e5eb615971ef8bfa3475a923b336700bc3045003 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:34:17 +0700 Subject: [PATCH 092/148] developing --- megatron/data/non_causal_mlm_dataset.py | 169 +++++++++++++----------- 1 file changed, 90 insertions(+), 79 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a02c27e12..ef5eeea82 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -289,6 +289,7 @@ def build_training_sample(sample, target_seq_length, # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) + input_tokens_ids.append(eos_id) # Add token to the output_tokens_ids output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) @@ -303,28 +304,26 @@ def build_training_sample(sample, target_seq_length, # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - # text_tokens_ids = input_tokens_ids + output_tokens_ids + text_tokens_ids = pad_and_convert_to_numpy( + input_tokens_ids+output_tokens_ids, + pad_id, + max_seq_length+max_seq_length_dec + ) + print("input_tokens_ids") print(len(input_tokens_ids)) print(input_tokens_ids) print("output_tokens_ids") print(len(output_tokens_ids)) print(output_tokens_ids) - # print("text_tokens_ids") - # # print(text_tokens_ids) - # print(len(text_tokens_ids)) - - # input_tokens_ids = pad_and_convert_to_numpy( - # input_tokens_ids, - # self.tokenizer.pad, - # self.seq_length - # ) - + print("text_tokens_ids") + print(text_tokens_ids) + print(len(text_tokens_ids)) import sys sys.exit() return { - 'text': input_tokens_ids, + 'text': text_tokens_ids, 'prefix_len': prefix_len } @@ -341,78 +340,90 @@ def build_training_sample(sample, target_seq_length, # } # return train_sample - -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): +def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" - sentinel_tokens = collections.deque(sentinel_tokens) - t5_input = [] - 
(t5_decoder_in, t5_decoder_out) = ([bos_id], []) - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) - t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) - t5_decoder_out.extend(span.label) - - end_index = span.index[0] - t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - # Add token to the t5_decoder_out - t5_decoder_out.append(eos_id) - - # Add the remaining tokens to the t5 input - t5_input.extend(tokens[start_index:]) - - # assert (len(t5_input) - len(masked_spans)) + \ - # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) - # Some checks. - - # Encoder-side padding mask. - num_tokens = len(t5_input) + num_tokens = len(tokens) padding_length = max_seq_length - num_tokens assert padding_length >= 0 - assert len(masked_positions) == len(masked_labels) - - # Tokens.. - filler = [pad_id] * padding_length - tokens_enc = np.array(t5_input + filler, dtype=np.int64) - - # Decoder-side padding mask. - num_tokens_dec = len(t5_decoder_in) - padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 - filler_dec = [pad_id] * padding_length_dec - tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - - # Create attention masks - enc_mask = make_attention_mask(tokens_enc, tokens_enc) - enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) - dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) - dec_mask = dec_mask * make_history_mask(tokens_dec_in) - - # Labels mask. - labels = t5_decoder_out + ([-1] * padding_length_dec) - labels = np.array(labels, dtype=np.int64) - - # Loss mask - loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) - loss_mask = np.array(loss_mask, dtype=np.int64) - - return tokens_enc, tokens_dec_in, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask + + # Tokens and token types. + filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np +# def pad_and_convert_to_numpy(tokens, masked_positions, +# masked_labels, pad_id, +# max_seq_length, max_seq_length_dec, +# masked_spans=None, bos_id=None, +# eos_id=None, sentinel_tokens=None): +# """Pad sequences and convert them to numpy.""" + +# sentinel_tokens = collections.deque(sentinel_tokens) +# t5_input = [] +# (t5_decoder_in, t5_decoder_out) = ([bos_id], []) +# (start_index, end_index) = (0, None) +# for span in masked_spans: +# flag = sentinel_tokens.popleft() + +# # Append the same tokens in decoder input and output +# t5_decoder_in.append(flag) +# t5_decoder_in.extend(span.label) +# t5_decoder_out.append(flag) +# t5_decoder_out.extend(span.label) + +# end_index = span.index[0] +# t5_input.extend(tokens[start_index: end_index]) +# t5_input.append(flag) + +# # the next start index is the token after the last span token +# start_index = span.index[-1] + 1 + +# # Add token to the t5_decoder_out +# t5_decoder_out.append(eos_id) + +# # Add the remaining tokens to the t5 input +# t5_input.extend(tokens[start_index:]) + +# # assert (len(t5_input) - len(masked_spans)) + \ +# # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) + +# # Some checks. + +# # Encoder-side padding mask. 
+# num_tokens = len(t5_input) +# padding_length = max_seq_length - num_tokens +# assert padding_length >= 0 +# assert len(masked_positions) == len(masked_labels) + +# # Tokens.. +# filler = [pad_id] * padding_length +# tokens_enc = np.array(t5_input + filler, dtype=np.int64) + +# # Decoder-side padding mask. +# num_tokens_dec = len(t5_decoder_in) +# padding_length_dec = max_seq_length_dec - num_tokens_dec +# assert padding_length_dec >= 0 +# filler_dec = [pad_id] * padding_length_dec +# tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) + +# # Create attention masks +# enc_mask = make_attention_mask(tokens_enc, tokens_enc) +# enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) +# dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) +# dec_mask = dec_mask * make_history_mask(tokens_dec_in) + +# # Labels mask. +# labels = t5_decoder_out + ([-1] * padding_length_dec) +# labels = np.array(labels, dtype=np.int64) + +# # Loss mask +# loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) +# loss_mask = np.array(loss_mask, dtype=np.int64) + +# return tokens_enc, tokens_dec_in, labels, enc_mask, \ +# dec_mask, enc_dec_mask, loss_mask def make_attention_mask(source_block, target_block): From 2eee807dcfe4ec4dfe98bf85ac788ec94cb3053e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:35:50 +0700 Subject: [PATCH 093/148] developing --- megatron/data/non_causal_mlm_dataset.py | 147 +----------------------- 1 file changed, 1 insertion(+), 146 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ef5eeea82..0380d1623 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -294,52 +294,18 @@ def build_training_sample(sample, target_seq_length, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - # # Padding. 
- # input_tokens_ids, _, output_tokens_ids, enc_mask, \ - # dec_mask, enc_dec_mask, loss_mask \ - # = pad_and_convert_to_numpy(tokens, masked_positions, - # masked_labels, pad_id, max_seq_length, - # max_seq_length_dec, masked_spans, - # bos_id, eos_id, sentinel_tokens) - - # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - text_tokens_ids = pad_and_convert_to_numpy( input_tokens_ids+output_tokens_ids, pad_id, max_seq_length+max_seq_length_dec ) - print("input_tokens_ids") - print(len(input_tokens_ids)) - print(input_tokens_ids) - print("output_tokens_ids") - print(len(output_tokens_ids)) - print(output_tokens_ids) - print("text_tokens_ids") - print(text_tokens_ids) - print(len(text_tokens_ids)) - import sys - sys.exit() - return { 'text': text_tokens_ids, 'prefix_len': prefix_len } - # train_sample = { - # 'text_enc': tokens_enc, - # 'text_dec': tokens_dec_in, - # 'labels': labels, - # 'loss_mask': loss_mask, - # 'truncated': int(truncated), - # 'enc_mask': enc_mask, - # 'dec_mask': dec_mask, - # 'enc_dec_mask': enc_dec_mask, - # } - # return train_sample - def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" @@ -352,115 +318,4 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): filler = np.array([pad_id] * padding_length) tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - return tokens_np -# def pad_and_convert_to_numpy(tokens, masked_positions, -# masked_labels, pad_id, -# max_seq_length, max_seq_length_dec, -# masked_spans=None, bos_id=None, -# eos_id=None, sentinel_tokens=None): -# """Pad sequences and convert them to numpy.""" - -# sentinel_tokens = collections.deque(sentinel_tokens) -# t5_input = [] -# (t5_decoder_in, t5_decoder_out) = ([bos_id], []) -# (start_index, end_index) = (0, None) -# for span in masked_spans: -# flag = sentinel_tokens.popleft() - -# # Append the same tokens in decoder input and output -# t5_decoder_in.append(flag) -# t5_decoder_in.extend(span.label) -# t5_decoder_out.append(flag) -# t5_decoder_out.extend(span.label) - -# end_index = span.index[0] -# t5_input.extend(tokens[start_index: end_index]) -# t5_input.append(flag) - -# # the next start index is the token after the last span token -# start_index = span.index[-1] + 1 - -# # Add token to the t5_decoder_out -# t5_decoder_out.append(eos_id) - -# # Add the remaining tokens to the t5 input -# t5_input.extend(tokens[start_index:]) - -# # assert (len(t5_input) - len(masked_spans)) + \ -# # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) - -# # Some checks. - -# # Encoder-side padding mask. -# num_tokens = len(t5_input) -# padding_length = max_seq_length - num_tokens -# assert padding_length >= 0 -# assert len(masked_positions) == len(masked_labels) - -# # Tokens.. -# filler = [pad_id] * padding_length -# tokens_enc = np.array(t5_input + filler, dtype=np.int64) - -# # Decoder-side padding mask. -# num_tokens_dec = len(t5_decoder_in) -# padding_length_dec = max_seq_length_dec - num_tokens_dec -# assert padding_length_dec >= 0 -# filler_dec = [pad_id] * padding_length_dec -# tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - -# # Create attention masks -# enc_mask = make_attention_mask(tokens_enc, tokens_enc) -# enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) -# dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) -# dec_mask = dec_mask * make_history_mask(tokens_dec_in) - -# # Labels mask. 
-# labels = t5_decoder_out + ([-1] * padding_length_dec) -# labels = np.array(labels, dtype=np.int64) - -# # Loss mask -# loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) -# loss_mask = np.array(loss_mask, dtype=np.int64) - -# return tokens_enc, tokens_dec_in, labels, enc_mask, \ -# dec_mask, enc_dec_mask, loss_mask - - -def make_attention_mask(source_block, target_block): - """ - Returns a 2-dimensional (2-D) attention mask - :param source_block: 1-D array - :param target_block: 1-D array - """ - mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) - mask = mask.astype(np.int64) - # (source_length, target_length) - return mask - - -def make_attention_mask_3d(source_block, target_block): - """ - Returns a 3-dimensional (3-D) attention mask - :param source_block: 1-D array - :param target_block: 1-D array - """ - mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1) - # (batch, source_length, target_length) - # mask = mask.astype(np.int64) - return mask - - -def make_history_mask(block): - length = block.shape[0] - arange = np.arange(length) - history_mask = (arange[None, ] <= arange[:, None]) - history_mask = history_mask.astype(np.int64) - return history_mask - - -def make_history_mask_3d(block): - batch, length = block.shape - arange = torch.arange(length, device=block.device) - history_mask = (arange[None, ] <= arange[:, None])[None, ] - history_mask = history_mask.expand(batch, length, length) - return history_mask + return tokens_np \ No newline at end of file From 5840a119b77d144975121aad3f01b8278244b3f3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:44:33 +0700 Subject: [PATCH 094/148] developing --- megatron/tokenizer/tokenizer.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index c1924cfa1..e20cd1f39 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -366,6 +366,21 @@ def __init__(self, tokenizer_name_or_path): AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), ] }) From 6d38f7359210255a48b84bf8ad8f4098b2403359 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 11:44:13 +0700 Subject: [PATCH 095/148] test to see output of get_ltor_masks_and_position_ids --- megatron/tokenizer/tokenizer.py | 12 ------------ 
train_non_causal_mlm_adaptation_gpt.py | 9 +++++++++ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index e20cd1f39..a5aa7e973 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -327,18 +327,6 @@ def __init__(self, tokenizer_name_or_path): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} - # if 'mask_token' not in self.tokenizer.special_tokens_map: - # self.tokenizer.add_tokens("") - # self.tokenizer.mask_token = "" - - # if 'cls_token' not in self.tokenizer.special_tokens_map: - # self.tokenizer.add_tokens("") - # self.tokenizer.mask_token = "" - - # if 'sep_token' not in self.tokenizer.special_tokens_map: - # self.tokenizer.add_tokens("") - # self.tokenizer.mask_token = "" - self.tokenizer.add_special_tokens({ 'additional_special_tokens': [ AddedToken('', lstrip=False, rstrip=False, normalization=False), diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 3175f35da..b19e7c811 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -104,6 +104,15 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) + print("attention_mask") + print(attention_mask) + print("loss_mask") + print(loss_mask) + print("position_ids") + print(position_ids) + import sys + sys.exit() + # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) From 430fa6f01cff3938e3aa5af01228944284c6fee3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 11:51:29 +0700 Subject: [PATCH 096/148] test to see output of get_ltor_masks_and_position_ids --- train_non_causal_mlm_adaptation_gpt.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index b19e7c811..62c461bad 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -104,15 +104,16 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) - print("attention_mask") - print(attention_mask) - print("loss_mask") - print(loss_mask) - print("position_ids") - print(position_ids) + import numpy as np + with open('attention_mask.npy', 'wb') as f: + np.save(f, attention_mask.cpu().numpy()) + with open('loss_mask.npy', 'wb') as f: + np.save(f, loss_mask.cpu().numpy()) + with open('position_ids.npy', 'wb') as f: + np.save(f, position_ids.cpu().numpy()) import sys sys.exit() - + # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) From 444314f97af4ed855902550aa4fbdcfe06640923 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 23:11:53 +0700 Subject: [PATCH 097/148] add new script --- examples/4B8-en.sh | 153 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 examples/4B8-en.sh diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh new file mode 100644 index 000000000..4d86fb2d6 --- /dev/null +++ b/examples/4B8-en.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +EXPERIMENT_NAME=4B8-en +REPO_PATH=experiments/$EXPERIMENT_NAME +CHECKPOINT_PATH=$REPO_PATH/checkpoints +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs + +DATA_PATH=data/mc4-id_text_document + + +# XXX: edit me +GPUS_PER_NODE=8 +NNODES=1 
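+# Illustrative arithmetic for the values used in this script: the DP_SIZE
+# derivation below resolves to NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE)
+# = 1*8/(2*1) = 4 data-parallel replicas, and DeepSpeed then infers the
+# gradient accumulation steps as GLOBAL_BATCH_SIZE/(MICRO_BATCH_SIZE*DP_SIZE)
+# = 2048/(1*4) = 512.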
+PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=2048 +TRAIN_ITER=131_072 + +NLAYERS=24 +NHIDDEN=4096 +NHEADS=64 +FFN_HIDDEN_SIZE=10240 +SEQ_LEN=626 + +if [[ ${ROUND} == 1 ]]; then EXIT_INTERVAL=100 SAVE_INTERVAL=10 +elif [[ ${ROUND} == 2 ]]; then SAVE_INTERVAL=1500 +else echo "invalid ROUND: $ROUND" +fi + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 146_484_375 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ + --train-samples $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience/tokenizer \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + " + # --nnodes $NNODES \ + # --master_addr $MASTER_ADDR \ + # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. 
+mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) +srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file From 26c837de481f7730559dc76e153963f0b6b9c826 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 23:13:32 +0700 Subject: [PATCH 098/148] add new script --- examples/4B8-en.sh | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index 4d86fb2d6..21f48cbfe 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -117,12 +117,12 @@ DEEPSPEED_ARGS=" \ --deepspeed-activation-checkpointing \ " -export LAUNCHER="python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ - " - # --nnodes $NNODES \ - # --master_addr $MASTER_ADDR \ - # --master_port $MASTER_PORT \ +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ +# # --master_port $MASTER_PORT \ export CMD=" \ `pwd`/pretrain_gpt.py \ @@ -150,4 +150,9 @@ echo $CMD mkdir -p $REPO_PATH mkdir -p $LOGS_PATH # to debug - add echo (it exits and prints what it would have launched) -srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file From feb023cad840a14988a72675d69cbe7ca35ffb06 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 23:14:27 +0700 Subject: [PATCH 099/148] add new script --- examples/4B8-en.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index 21f48cbfe..c34176c98 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -27,10 +27,7 @@ NHEADS=64 FFN_HIDDEN_SIZE=10240 SEQ_LEN=626 -if [[ ${ROUND} == 1 ]]; then EXIT_INTERVAL=100 SAVE_INTERVAL=10 -elif [[ ${ROUND} == 2 ]]; then SAVE_INTERVAL=1500 -else echo "invalid ROUND: $ROUND" -fi +SAVE_INTERVAL=1500 OPTIMIZER_ARGS=" \ --optimizer adam \ From f30b9b16a0fab05bf01afd157eb1f8b83cd14052 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 30 May 2022 10:29:01 +0700 Subject: [PATCH 100/148] changed settings --- examples/4B8-en.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index c34176c98..fc8064450 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -37,8 +37,6 @@ OPTIMIZER_ARGS=" \ --lr 2e-4 \ --min-lr 1e-5 \ --lr-decay-style cosine \ - --lr-decay-samples 146_484_375 \ - --lr-warmup-samples 183_105 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ " @@ -57,7 +55,7 @@ GPT_ARGS=" \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --rampup-batch-size 32 32 2_000_000 \ - --train-samples $TRAIN_ITER \ + --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ --tokenizer-name-or-path bigscience/tokenizer \ --loss-scale 12 \ @@ -71,8 +69,8 @@ GPT_ARGS=" \ OUTPUT_ARGS=" \ --log-interval 200 \ --save-interval $SAVE_INTERVAL \ - --eval-interval 1000 \ - --eval-iters 100 \ + --eval-interval 0 \ + --eval-iters 0 \ --tensorboard-dir $TENSORBOARD_PATH \ --tensorboard-queue-size 5 \ --log-timers-to-tensorboard \ From 0a9203a560c78089ff702cbb062930add318992c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika 
Date: Mon, 30 May 2022 10:32:03 +0700 Subject: [PATCH 101/148] changed settings --- examples/4B8-en.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index fc8064450..5a44d30d8 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -54,7 +54,6 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ - --rampup-batch-size 32 32 2_000_000 \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ --tokenizer-name-or-path bigscience/tokenizer \ From 672a8666fd5a434b41d51ea83a1f59579ebd06dd Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 31 May 2022 23:11:51 +0700 Subject: [PATCH 102/148] tidy up --- examples/4B8-en.sh => 4B8-en-CD-FLM.sh | 31 +- 4B8-en-ND-MLM.sh | 154 ++++++++++ 4B8-en-ND-MTF.sh | 154 ++++++++++ examples/finetune_mp3.sh | 42 --- examples/train_non_causal_mlm_adaption.sh | 43 --- ...m_adaptation_gpt.py => train_ND_MLM_gpt.py | 22 +- train_ND_MTF_gpt.py | 287 ++++++++++++++++++ 7 files changed, 625 insertions(+), 108 deletions(-) rename examples/4B8-en.sh => 4B8-en-CD-FLM.sh (89%) create mode 100644 4B8-en-ND-MLM.sh create mode 100644 4B8-en-ND-MTF.sh delete mode 100644 examples/finetune_mp3.sh delete mode 100644 examples/train_non_causal_mlm_adaption.sh rename train_non_causal_mlm_adaptation_gpt.py => train_ND_MLM_gpt.py (94%) create mode 100644 train_ND_MTF_gpt.py diff --git a/examples/4B8-en.sh b/4B8-en-CD-FLM.sh similarity index 89% rename from examples/4B8-en.sh rename to 4B8-en-CD-FLM.sh index 5a44d30d8..f8963414c 100644 --- a/examples/4B8-en.sh +++ b/4B8-en-CD-FLM.sh @@ -1,14 +1,13 @@ #!/bin/bash -EXPERIMENT_NAME=4B8-en +EXPERIMENT_NAME=4B8-en-CD-FLM REPO_PATH=experiments/$EXPERIMENT_NAME CHECKPOINT_PATH=$REPO_PATH/checkpoints TENSORBOARD_PATH=$REPO_PATH/tensorboard CODECARBON_PATH=$REPO_PATH/codecarbon LOGS_PATH=$REPO_PATH/logs -DATA_PATH=data/mc4-id_text_document - +DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document # XXX: edit me GPUS_PER_NODE=8 @@ -17,15 +16,17 @@ PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here TP_SIZE=1 # always fixed to the size of a single node DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer -MICRO_BATCH_SIZE=1 +MICRO_BATCH_SIZE=32 GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=131_072 +SEQ_LEN=626 + NLAYERS=24 NHIDDEN=4096 NHEADS=64 FFN_HIDDEN_SIZE=10240 -SEQ_LEN=626 +MAX_POSITION_EMBEDDING=1280 SAVE_INTERVAL=1500 @@ -45,13 +46,16 @@ EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " + + GPT_ARGS=" \ --num-layers $NLAYERS \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --seq-length $SEQ_LEN \ --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ + --seq-length $SEQ_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ @@ -66,10 +70,10 @@ GPT_ARGS=" \ " OUTPUT_ARGS=" \ - --log-interval 200 \ + --log-interval 1 \ --save-interval $SAVE_INTERVAL \ - --eval-interval 0 \ - --eval-iters 0 \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ --tensorboard-dir $TENSORBOARD_PATH \ --tensorboard-queue-size 5 \ --log-timers-to-tensorboard \ @@ -79,7 +83,7 @@ OUTPUT_ARGS=" \ ZERO_STAGE=1 -config_json="./ds_config.$SLURM_JOBID.json" +config_json="./ds_config.json" # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() cat < $config_json @@ -145,8 +149,11 @@ mkdir -p $REPO_PATH mkdir -p $LOGS_PATH # to debug - add echo (it exits 
and prints what it would have launched) -python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ +# python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# $CMD + +deepspeed --num_gpus $GPUS_PER_NODE \ $CMD # srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh new file mode 100644 index 000000000..5f96a39b1 --- /dev/null +++ b/4B8-en-ND-MLM.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +EXPERIMENT_NAME=4B8-en-ND-MLM +REPO_PATH=experiments/$EXPERIMENT_NAME +CHECKPOINT_PATH=$REPO_PATH/checkpoints +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs + +DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document + +# XXX: edit me +GPUS_PER_NODE=8 +NNODES=1 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=2048 +TRAIN_ITER=39_718 +INPUT_LEN=512 +TARGET_LEN=114 + +NLAYERS=24 +NHIDDEN=4096 +NHEADS=64 +FFN_HIDDEN_SIZE=10240 +MAX_POSITION_EMBEDDING=1280 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-iters $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path t5-base \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ +# # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/train_ND_MLM_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ 
+ --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. +mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/4B8-en-ND-MTF.sh b/4B8-en-ND-MTF.sh new file mode 100644 index 000000000..a532a96fa --- /dev/null +++ b/4B8-en-ND-MTF.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +EXPERIMENT_NAME=4B8-en-ND-MTF +REPO_PATH=experiments/$EXPERIMENT_NAME +CHECKPOINT_PATH=$REPO_PATH/checkpoints +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs + +DATA_PATH=data/mc4-id_text_document + +# XXX: edit me +GPUS_PER_NODE=8 +NNODES=1 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=1024 +TRAIN_ITER=10_000 +INPUT_LEN=1024 +TARGET_LEN=256 + +NLAYERS=24 +NHIDDEN=4096 +NHEADS=64 +FFN_HIDDEN_SIZE=10240 +MAX_POSITION_EMBEDDING=1280 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-iters $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path t5-base \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ 
+# # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/train_ND_MTF_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. +mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh deleted file mode 100644 index 59cb34d4c..000000000 --- a/examples/finetune_mp3.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! /bin/bash - -# Runs the "345M" parameter model - -RANK=0 -WORLD_SIZE=1 - -DATA_PATH=data/mc4-id_text_document -CHECKPOINT_PATH=data - - -deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ - --num-layers 2 \ - --hidden-size 128 \ - --num-attention-heads 4 \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --seq-length 626 \ - --max-position-embeddings 1024 \ - --train-iters 10000 \ - --lr-decay-iters 5000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --min-lr 1.0e-5 \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --checkpoint-activations \ - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --fp16 \ - --tensorboard-dir LOG diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh deleted file mode 100644 index a595fe161..000000000 --- a/examples/train_non_causal_mlm_adaption.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! 
/bin/bash - -# Runs the "345M" parameter model - -RANK=0 -WORLD_SIZE=1 - -DATA_PATH=data/mc4-id_text_document -CHECKPOINT_PATH=data - - -deepspeed --num_gpus 8 train_non_causal_mlm_adaptation_gpt.py \ - --num-layers 2 \ - --hidden-size 128 \ - --num-attention-heads 4 \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --seq-length 626 \ - --max-position-embeddings 1024 \ - --train-iters 10000 \ - --lr-decay-iters 5000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --min-lr 1.0e-5 \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --checkpoint-activations \ - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --fp16 \ - --loss-on-targets-only \ - --tensorboard-dir LOG diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_ND_MLM_gpt.py similarity index 94% rename from train_non_causal_mlm_adaptation_gpt.py rename to train_ND_MLM_gpt.py index 62c461bad..0326e778a 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_ND_MLM_gpt.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Pretrain GPT""" +"""Non-Causal Decoder GPT MLM Adaptation""" import torch from functools import partial @@ -52,6 +52,14 @@ def model_provider(pre_process=True, post_process=True): parallel_output=True, prefix_lm=True ) + # loaded_dir, state_dict = model[0].load_checkpoint( + # args.finetune, load_optimizer_states=False) + # if loaded_dir is None: + # print_rank_0('WARNING: could not find the metadata file {} '.format( + # load_dir)) + # print_rank_0(' will not load any checkpoints and will start from ' + # 'random') + # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize model._megatron_batch_fn = get_batch_pipe @@ -194,21 +202,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: - # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - # data_prefix=args.data_path, - # data_impl=args.data_impl, - # splits_string=args.split, - # train_valid_test_num_samples=train_val_test_num_samples, - # seq_length=args.seq_length, - # seed=args.seed, - # skip_warmup=(not args.mmap_warmup)) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=512,#args.encoder_seq_length, - max_seq_length_dec=114,#args.decoder_seq_length, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, diff --git a/train_ND_MTF_gpt.py b/train_ND_MTF_gpt.py new file mode 100644 index 000000000..d16c9bb82 --- /dev/null +++ b/train_ND_MTF_gpt.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Non-Causal Decoder GPT Multitask Finetuning""" + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu + +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets #, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import subprocess + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + prefix_lm=True + ) + # loaded_dir, state_dict = model[0].load_checkpoint( + # args.finetune, load_optimizer_states=False) + # if loaded_dir is None: + # print_rank_0('WARNING: could not find the metadata file {} '.format( + # load_dir)) + # print_rank_0(' will not load any checkpoints and will start from ' + # 'random') + + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=True + ) + see_memory_usage(f"After Building Model", force=True) + return model + +_KEYS = ['text', 'prefix_len'] + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and postition ids. 
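+    # prefix_indices records, per sample, where the prompt ends;
+    # get_ltor_masks_and_position_ids uses it to let positions inside the
+    # prefix attend bidirectionally (prefix-LM) and, when
+    # --loss-on-targets-only is set, to zero the loss mask over the prefix
+    # so that only target tokens contribute to the loss.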
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + import numpy as np + with open('attention_mask.npy', 'wb') as f: + np.save(f, attention_mask.cpu().numpy()) + with open('loss_mask.npy', 'wb') as f: + np.save(f, loss_mask.cpu().numpy()) + with open('position_ids.npy', 'wb') as f: + np.save(f, position_ids.cpu().numpy()) + import sys + sys.exit() + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and position ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
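+    # The loss mask returned by get_batch is bound into loss_func via
+    # functools.partial below, so the reported 'lm loss' averages only over
+    # the positions that mask keeps.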
+ timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + # data_prefix=args.data_path, + # data_impl=args.data_impl, + # splits_string=args.split, + # train_valid_test_num_samples=train_val_test_num_samples, + # seq_length=args.seq_length, + # seed=args.seed, + # skip_warmup=(not args.mmap_warmup)) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='t5') + + # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + # elif args.train_weighted_split_paths: + # assigned_train_valid_test = [] + # if args.train_weighted_split_paths is not None: + # train_ds = [] + # assigned_train_valid_test.append("train") + # if args.valid_weighted_split_paths is not None: + # valid_ds = [] + # assigned_train_valid_test.append("valid") + # if args.test_weighted_split_paths is not None: + # test_ds = [] + # assigned_train_valid_test.append("test") + + # for s in assigned_train_valid_test: + # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + # eval(f"args.{s}_weighted_split_weights"), + # eval(f"args.{s}_weighted_split_splits"), + # eval(f"args.{s}_weighted_split_names")) + # for paths, weights, splits, name in data_groups: + # d = build_dataset_group(name, paths, weights, splits, + # args.data_impl, + # train_val_test_num_samples, + # args.seq_length, args.seed, + # (not args.mmap_warmup), + # train_valid_test=s) + # eval(f"{s}_ds").append(d) + # else: + # raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + 
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 3780e611e089fb047ad2f942b33a40bad27981f5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 31 May 2022 23:15:04 +0700 Subject: [PATCH 103/148] changed tokenizer and position embedding --- 4B8-en-CD-FLM.sh | 2 -- 4B8-en-ND-MLM.sh | 5 +++-- 4B8-en-ND-MTF.sh | 5 +++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/4B8-en-CD-FLM.sh b/4B8-en-CD-FLM.sh index f8963414c..17079579d 100644 --- a/4B8-en-CD-FLM.sh +++ b/4B8-en-CD-FLM.sh @@ -46,8 +46,6 @@ EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " - - GPT_ARGS=" \ --num-layers $NLAYERS \ --hidden-size $NHIDDEN \ diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 5f96a39b1..583ff1893 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -51,14 +51,15 @@ GPT_ARGS=" \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ --encoder-seq-length $INPUT_LEN \ --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path t5-base \ + --tokenizer-name-or-path bigscience/tokenizer \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ diff --git a/4B8-en-ND-MTF.sh b/4B8-en-ND-MTF.sh index a532a96fa..209732ad3 100644 --- a/4B8-en-ND-MTF.sh +++ b/4B8-en-ND-MTF.sh @@ -51,14 +51,15 @@ GPT_ARGS=" \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ --encoder-seq-length $INPUT_LEN \ --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path t5-base \ + --tokenizer-name-or-path bigscience/tokenizer \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ From 2130c31535f532c9c846a8ac13ba0cc582799fb5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:31:45 +0700 Subject: [PATCH 104/148] modifying mlm to reflect original implementation --- 4B8-en-ND-MLM.sh | 3 +- megatron/data/non_causal_mlm_dataset.py | 231 ++++++++++++++++++--- megatron/tokenizer/tokenizer.py | 55 +---- prepare_tokenizer.py | 16 ++ pretrain_mp3_gpt.py | 257 ------------------------ 5 files changed, 223 insertions(+), 339 deletions(-) create mode 100644 prepare_tokenizer.py delete mode 100644 pretrain_mp3_gpt.py diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 583ff1893..c8e1ba0d6 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -8,6 +8,7 @@ CODECARBON_PATH=$REPO_PATH/codecarbon LOGS_PATH=$REPO_PATH/logs DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document +TOKENIZER_PATH=bigscience-tokenizer-padded # XXX: edit me GPUS_PER_NODE=8 @@ -59,7 +60,7 @@ GPT_ARGS=" \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ + --tokenizer-name-or-path $TOKENIZER_PATH \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0380d1623..ed633fb32 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -262,38 +262,63 @@ def build_training_sample(sample, 
target_seq_length, truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] - # Masking. - max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( - tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, - max_ngrams=10, geometric_dist=True, masking_style="t5") - - sentinel_tokens = collections.deque(sentinel_tokens) - input_tokens_ids = [] - output_tokens_ids = [] #[bos_id] - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - output_tokens_ids.append(flag) - output_tokens_ids.extend(span.label) - - end_index = span.index[0] - input_tokens_ids.extend(tokens[start_index: end_index]) - input_tokens_ids.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - - # Add the remaining tokens to input_tokens_ids - input_tokens_ids.extend(tokens[start_index:]) - input_tokens_ids.append(eos_id) - # Add token to the output_tokens_ids - output_tokens_ids.append(eos_id) + max_ngrams = 3 + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. + expanded_inputs_length, targets_length = compute_input_and_target_lengths( + max_seq_length, + masked_lm_prob, + max_ngrams + ) + + mask_indices = np.asarray([random_spans_noise_mask(expanded_inputs_length)]) + labels_mask = ~mask_indices + + input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8)) + labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8)) + + input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel) + output_tokens_ids = filter_input_ids(tokens, labels_sentinel) + + # # Masking. 
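+    # The commented-out lines that follow are the earlier
+    # create_masked_lm_predictions path, replaced by the span-corruption
+    # helpers above. For example, with max_seq_length=512, masked_lm_prob=0.15
+    # and max_ngrams=3 (passed as the mean noise-span length),
+    # compute_input_and_target_lengths returns (568, 114): 568 raw tokens
+    # become a 512-token input (one sentinel per masked span, plus EOS) and a
+    # 114-token target, matching INPUT_LEN=512 / TARGET_LEN=114 in
+    # 4B8-en-ND-MLM.sh.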
+ # max_predictions_per_seq = masked_lm_prob * max_num_tokens + # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( + # tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + # cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + # max_ngrams=max_ngrams, geometric_dist=True, masking_style="t5") + + # sentinel_tokens = collections.deque(sentinel_tokens) + # input_tokens_ids = [] + # output_tokens_ids = [] #[bos_id] + # (start_index, end_index) = (0, None) + # for span in masked_spans: + # flag = sentinel_tokens.popleft() + + # output_tokens_ids.append(flag) + # output_tokens_ids.extend(span.label) + + # end_index = span.index[0] + # input_tokens_ids.extend(tokens[start_index: end_index]) + # input_tokens_ids.append(flag) + + # # the next start index is the token after the last span token + # start_index = span.index[-1] + 1 + + + # # Add the remaining tokens to input_tokens_ids + # input_tokens_ids.extend(tokens[start_index:]) + # input_tokens_ids.append(eos_id) + # # Add token to the output_tokens_ids + # output_tokens_ids.append(eos_id) + prefix_len = len(input_tokens_ids) + print("input_tokens_ids") + print(input_tokens_ids) + print("output_tokens_ids") + print(output_tokens_ids) + text_tokens_ids = pad_and_convert_to_numpy( input_tokens_ids+output_tokens_ids, pad_id, @@ -318,4 +343,148 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): filler = np.array([pad_id] * padding_length) tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - return tokens_np \ No newline at end of file + return tokens_np + + +def create_sentinel_ids(self, mask_indices): + """ + Sentinel ids creation given the indices that should be masked. + The start indices of each mask are replaced by the sentinel ids in increasing + order. Consecutive mask indices to be deleted are replaced with `-1`. + """ + start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices + start_indices[:, 0] = mask_indices[:, 0] + + sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices) + sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0) + sentinel_ids -= mask_indices - start_indices + + return sentinel_ids + + +def filter_input_ids(self, input_ids, sentinel_ids): + """ + Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. + This will reduce the sequence length from `expanded_inputs_length` to `input_length`. + """ + batch_size = input_ids.shape[0] + + input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids) + # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are + # masked tokens coming after sentinel tokens and should be removed + input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1)) + input_ids = np.concatenate( + [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1 + ) + return input_ids + + +def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length): + """This function is copy of `random_spans_helper `__ . + Training parameters to avoid padding with random_spans_noise_mask. + When training a model with random_spans_noise_mask, we would like to set the other + training hyperparmeters in a way that avoids padding. + This function helps us compute these hyperparameters. 
+ We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens, + and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens. + This function tells us the required number of tokens in the raw example (for split_tokens()) + as well as the length of the encoded targets. Note that this function assumes + the inputs and targets will have EOS appended and includes that in the reported length. + Args: + inputs_length: an integer - desired length of the tokenized inputs sequence + noise_density: a float + mean_noise_span_length: a float + Returns: + tokens_length: length of original text in tokens + targets_length: an integer - length in tokens of encoded targets sequence + """ + + def _tokens_length_to_inputs_length_targets_length(tokens_length): + num_noise_tokens = int(round(tokens_length * noise_density)) + num_nonnoise_tokens = tokens_length - num_noise_tokens + num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length)) + # inputs contain all nonnoise tokens, sentinels for all noise spans + # and one EOS token. + _input_length = num_nonnoise_tokens + num_noise_spans + 1 + _output_length = num_noise_tokens + num_noise_spans + 1 + return _input_length, _output_length + + tokens_length = inputs_length + + while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length: + tokens_length += 1 + + inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length(tokens_length) + + # minor hack to get the targets length to be equal to inputs length + # which is more likely to have been set to a nice round number. + if noise_density == 0.5 and targets_length > inputs_length: + tokens_length -= 1 + targets_length -= 1 + return tokens_length, targets_length + + +def random_spans_noise_mask( + length, + noise_density=0.15, + mean_noise_span_length=3 + ): + + """This function is copy of `random_spans_helper `__ . + Noise mask consisting of random spans of noise tokens. + The number of noise tokens and the number of noise spans and non-noise spans + are determined deterministically as follows: + num_noise_tokens = round(length * noise_density) + num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) + Spans alternate between non-noise and noise, beginning with non-noise. + Subject to the above restrictions, all masks are equally likely. + Args: + length: an int32 scalar (length of the incoming token sequence) + noise_density: a float - approximate density of output mask + mean_noise_span_length: a number + Returns: + a boolean tensor with shape [length] + """ + + orig_length = length + + num_noise_tokens = int(np.round(length * noise_density)) + # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. + num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) + num_noise_spans = int(np.round(num_noise_tokens / mean_noise_span_length)) + + # avoid degeneracy by ensuring positive number of noise spans + num_noise_spans = max(num_noise_spans, 1) + num_nonnoise_tokens = length - num_noise_tokens + + # pick the lengths of the noise spans and the non-noise spans + def _random_segmentation(num_items, num_segments): + """Partition a sequence of items randomly into non-empty segments. 
+ Args: + num_items: an integer scalar > 0 + num_segments: an integer scalar in [1, num_items] + Returns: + a Tensor with shape [num_segments] containing positive integers that add + up to num_items + """ + mask_indices = np.arange(num_items - 1) < (num_segments - 1) + np.random.shuffle(mask_indices) + first_in_segment = np.pad(mask_indices, [[1, 0]]) + segment_id = np.cumsum(first_in_segment) + # count length of sub segments assuming that list is sorted + _, segment_length = np.unique(segment_id, return_counts=True) + return segment_length + + noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans) + nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans) + + interleaved_span_lengths = np.reshape( + np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2] + ) + span_starts = np.cumsum(interleaved_span_lengths)[:-1] + span_start_indicator = np.zeros((length,), dtype=np.int8) + span_start_indicator[span_starts] = True + span_num = np.cumsum(span_start_indicator) + is_noise = np.equal(span_num % 2, 1) + + return is_noise[:orig_length] \ No newline at end of file diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index a5aa7e973..8b3fcc934 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -17,7 +17,7 @@ from abc import ABC from abc import abstractmethod -from transformers import AutoTokenizer, AddedToken +from transformers import AutoTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer @@ -327,51 +327,6 @@ def __init__(self, tokenizer_name_or_path): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} - self.tokenizer.add_special_tokens({ - 'additional_special_tokens': [ - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, 
rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - ] - }) - @property def vocab_size(self): return self.tokenizer.vocab_size @@ -390,6 +345,10 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def eod(self): + return self.tokenizer.eos_token_id + @property def cls(self): return self.tokenizer.cls_token_id @@ -402,10 +361,6 @@ def sep(self): def pad(self): return self.tokenizer.pad_token_id - @property - def eod(self): - return self.tokenizer.eos_token_id - @property def mask(self): return self.tokenizer.mask_token_id diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py new file mode 100644 index 000000000..e058ac62a --- /dev/null +++ b/prepare_tokenizer.py @@ -0,0 +1,16 @@ +from transformers import AutoTokenizer, AddedToken + +tokenizer = AutoTokenizer.from_pretrained('bigscience/tokenizer') + +tokenizer.add_special_tokens({ + 'additional_special_tokens': [ + AddedToken( + ''.format(str(idx).zfill(3)), + lstrip=False, + rstrip=False, + normalization=False + ) for idx in reversed(range(0,200)) + ] + }) + +tokenizer.save_pretrained('bigscience-tokenizer-padded') \ No newline at end of file diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py deleted file mode 100644 index b7af289a5..000000000 --- a/pretrain_mp3_gpt.py +++ /dev/null @@ -1,257 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Pretrain GPT""" - -import torch -from functools import partial -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer -from megatron import mpu -# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group -from megatron.model import GPTModel, GPTModelPipe -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ -from megatron.utils import average_losses_across_data_parallel_group - -import deepspeed -from deepspeed.runtime.utils import see_memory_usage -import subprocess - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - - args = get_args() - - with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed: - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - prefix_lm=True - ) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - else: - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=True - ) - see_memory_usage(f"After Building Model", force=True) - return model - -_KEYS = ['text', 'prefix_len'] - -def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return tokens, labels, loss_mask, attention_mask, position_ids - -def get_batch_pipe(data): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and position ids. 
- attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. - timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - train_ds, valid_ds, test_ds = None, None, None - - print_rank_0('> building train, validation, and test datasets for GPT ...') - # Option 1 of data loading using --data-path - - if args.data_path: - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) - - # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - elif args.train_weighted_split_paths: - assigned_train_valid_test = [] - if args.train_weighted_split_paths is not None: - train_ds = [] - assigned_train_valid_test.append("train") - if args.valid_weighted_split_paths is not None: - valid_ds = [] - assigned_train_valid_test.append("valid") - if args.test_weighted_split_paths is not None: - test_ds = [] - assigned_train_valid_test.append("test") - - for s in assigned_train_valid_test: - data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - eval(f"args.{s}_weighted_split_weights"), - eval(f"args.{s}_weighted_split_splits"), - eval(f"args.{s}_weighted_split_names")) - for paths, weights, splits, name in data_groups: - d = build_dataset_group(name, paths, weights, splits, - args.data_impl, - train_val_test_num_samples, - args.seq_length, args.seed, - (not args.mmap_warmup), - train_valid_test=s) - eval(f"{s}_ds").append(d) - else: - raise NotImplementedError("No dataloading argument passed") - - print_rank_0("> finished creating GPT datasets ...") - return train_ds, valid_ds, test_ds - -def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - return result.wait() == 0 - -def git_ds_info(): - from deepspeed.env_report import main as ds_report - ds_report() - - # Write out version/git info - git_hash_cmd = "git rev-parse --short HEAD" - git_branch_cmd = "git rev-parse --abbrev-ref HEAD" - if command_exists('git'): - try: - result = subprocess.check_output(git_hash_cmd, shell=True) - git_hash = result.decode('utf-8').strip() - result = subprocess.check_output(git_branch_cmd, 
shell=True) - git_branch = result.decode('utf-8').strip() - except subprocess.CalledProcessError: - git_hash = "unknown" - git_branch = "unknown" - else: - git_hash = "unknown" - git_branch = "unknown" - print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') - - -if __name__ == "__main__": - git_ds_info() - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 26afe438201e9825315e92cb415e8193a88e21a1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:40:15 +0700 Subject: [PATCH 105/148] minor fix --- 4B8-en-ND-MLM.sh | 1 + prepare_tokenizer.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index c8e1ba0d6..f0a5f59fb 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,6 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 +SEQ_LEN=INPUT_LEN+TARGET_LEN NLAYERS=24 NHIDDEN=4096 diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py index e058ac62a..280ba458d 100644 --- a/prepare_tokenizer.py +++ b/prepare_tokenizer.py @@ -13,4 +13,13 @@ ] }) -tokenizer.save_pretrained('bigscience-tokenizer-padded') \ No newline at end of file +tokenizer.save_pretrained('bigscience-tokenizer-padded') + +# python tools/preprocess_data.py \ +# --input data/oscar-en-10k.jsonl \ +# --output-prefix data/meg-gpt2-oscar-en-10k \ +# --dataset-impl mmap \ +# --tokenizer-type PretrainedFromHF \ +# --tokenizer-name-or-path bigscience-tokenizer-padded \ +# --append-eod \ +# --workers 4 \ No newline at end of file From c1b98168367bcbb8750e7b832768efb901d46596 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:41:28 +0700 Subject: [PATCH 106/148] minor fix --- 4B8-en-ND-MLM.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index f0a5f59fb..e2317f119 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=INPUT_LEN+TARGET_LEN +SEQ_LEN=$(INPUT_LEN+TARGET_LEN) NLAYERS=24 NHIDDEN=4096 From 453822f066fe9878d0b0a8bfd715f6e40b02b484 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:42:10 +0700 Subject: [PATCH 107/148] minor fix --- 4B8-en-ND-MLM.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index e2317f119..bbc5be97d 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,13 +22,13 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=$(INPUT_LEN+TARGET_LEN) +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 NHEADS=64 FFN_HIDDEN_SIZE=10240 -MAX_POSITION_EMBEDDING=1280 + SAVE_INTERVAL=1500 From a62266a67b757e0bae0070dfb69f6944b43b67d4 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:43:47 +0700 Subject: [PATCH 108/148] minor fix --- 4B8-en-ND-MLM.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index bbc5be97d..77f91de81 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -150,8 +150,7 @@ mkdir -p $REPO_PATH mkdir -p $LOGS_PATH # to debug - add echo (it exits and prints what it would have launched) -python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ +deepspeed --num_gpus $GPUS_PER_NODE \ $CMD # srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at 
end of file From 02dda79c4bee7434d2c3846c8bca91909343525f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:47:38 +0700 Subject: [PATCH 109/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ed633fb32..4e12b59ee 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -278,8 +278,8 @@ def build_training_sample(sample, target_seq_length, input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8)) - input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel) - output_tokens_ids = filter_input_ids(tokens, labels_sentinel) + input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) + output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) # # Masking. # max_predictions_per_seq = masked_lm_prob * max_num_tokens @@ -346,7 +346,7 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): return tokens_np -def create_sentinel_ids(self, mask_indices): +def create_sentinel_ids(mask_indices): """ Sentinel ids creation given the indices that should be masked. The start indices of each mask are replaced by the sentinel ids in increasing @@ -356,13 +356,13 @@ def create_sentinel_ids(self, mask_indices): start_indices[:, 0] = mask_indices[:, 0] sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices) - sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0) + sentinel_ids = np.where(sentinel_ids != 0, (len(tokenizer) - sentinel_ids), 0) sentinel_ids -= mask_indices - start_indices return sentinel_ids -def filter_input_ids(self, input_ids, sentinel_ids): +def filter_input_ids(input_ids, sentinel_ids, eos_id): """ Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. This will reduce the sequence length from `expanded_inputs_length` to `input_length`. 
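    As a toy walkthrough (illustrative ids only, assuming vocab_len=100 and eos_id=1): for
    input_ids [[10, 11, 12, 13, 14, 15]] with a single noise span over positions 2-3,
    create_sentinel_ids marks position 2 with sentinel 99 (vocab_len - 1) and position 3 with -1,
    so filter_input_ids returns [10, 11, 99, 14, 15, 1]; on the label side the two non-noise spans
    become sentinels 99 and 98, giving [99, 12, 13, 98, 1] -- the usual T5 layout in which each run
    of consecutive masked tokens collapses to one sentinel in the inputs and is spelled out after
    that same sentinel in the targets.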
@@ -374,7 +374,7 @@ def filter_input_ids(self, input_ids, sentinel_ids): # masked tokens coming after sentinel tokens and should be removed input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1)) input_ids = np.concatenate( - [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1 + [input_ids, np.full((batch_size, 1), eos_id, dtype=np.int32)], axis=-1 ) return input_ids From 80331cb05991450f4649e747d6fb620fa92c816f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:48:05 +0700 Subject: [PATCH 110/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 4e12b59ee..0d771c58d 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -319,6 +319,9 @@ def build_training_sample(sample, target_seq_length, print("output_tokens_ids") print(output_tokens_ids) + import sys + sys.exit() + text_tokens_ids = pad_and_convert_to_numpy( input_tokens_ids+output_tokens_ids, pad_id, From 350227dbdf2fc385540dbb9e6d9ac3a33ac24773 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:52:27 +0700 Subject: [PATCH 111/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0d771c58d..8ed22f234 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -275,8 +275,8 @@ def build_training_sample(sample, target_seq_length, mask_indices = np.asarray([random_spans_noise_mask(expanded_inputs_length)]) labels_mask = ~mask_indices - input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8)) - labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8)) + input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) + labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) @@ -349,7 +349,7 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): return tokens_np -def create_sentinel_ids(mask_indices): +def create_sentinel_ids(mask_indices, vocab_len): """ Sentinel ids creation given the indices that should be masked. 
The start indices of each mask are replaced by the sentinel ids in increasing @@ -359,7 +359,7 @@ def create_sentinel_ids(mask_indices): start_indices[:, 0] = mask_indices[:, 0] sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices) - sentinel_ids = np.where(sentinel_ids != 0, (len(tokenizer) - sentinel_ids), 0) + sentinel_ids = np.where(sentinel_ids != 0, (vocab_len - sentinel_ids), 0) sentinel_ids -= mask_indices - start_indices return sentinel_ids From d0eecd4195b20a0d0c7f8a879068da1cb023d796 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:54:29 +0700 Subject: [PATCH 112/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 8ed22f234..641162050 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -278,6 +278,7 @@ def build_training_sample(sample, target_seq_length, input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) + tokens = np.asarray(tokens) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) From 243cebea904a87a51ead48910cf4a219b1f98699 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:05:02 +0700 Subject: [PATCH 113/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 26 ++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 641162050..27d0c1a4e 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -278,7 +278,19 @@ def build_training_sample(sample, target_seq_length, input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - tokens = np.asarray(tokens) + + if len(tokens) < expanded_inputs_length: + tokens = pad_and_convert_to_numpy( + tokens, + pad_id, + expanded_inputs_length + ) + + tokens = np.asarray([tokens]) + print("input_ids_sentinel.shape") + print(input_ids_sentinel.shape) + print("tokens.shape") + print(tokens.shape) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) @@ -313,6 +325,12 @@ def build_training_sample(sample, target_seq_length, # # Add token to the output_tokens_ids # output_tokens_ids.append(eos_id) + # text_tokens_ids = pad_and_convert_to_numpy( + # input_tokens_ids+output_tokens_ids, + # pad_id, + # max_seq_length+max_seq_length_dec + # ) + prefix_len = len(input_tokens_ids) print("input_tokens_ids") @@ -323,12 +341,6 @@ def build_training_sample(sample, target_seq_length, import sys sys.exit() - text_tokens_ids = pad_and_convert_to_numpy( - input_tokens_ids+output_tokens_ids, - pad_id, - max_seq_length+max_seq_length_dec - ) - return { 'text': text_tokens_ids, 'prefix_len': prefix_len From da22e0b8002d65f85c0db0411b1e042aac9aaea4 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:13:58 +0700 Subject: [PATCH 114/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 8 ++------ 1 file changed, 2 insertions(+), 6 
deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 27d0c1a4e..371479e22 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -287,10 +287,6 @@ def build_training_sample(sample, target_seq_length, ) tokens = np.asarray([tokens]) - print("input_ids_sentinel.shape") - print(input_ids_sentinel.shape) - print("tokens.shape") - print(tokens.shape) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) @@ -334,9 +330,9 @@ def build_training_sample(sample, target_seq_length, prefix_len = len(input_tokens_ids) print("input_tokens_ids") - print(input_tokens_ids) + print(len(input_tokens_ids)) print("output_tokens_ids") - print(output_tokens_ids) + print(len(output_tokens_ids)) import sys sys.exit() From 083dce75809b4c920ab54bfdde63670776fef96c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:15:43 +0700 Subject: [PATCH 115/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 371479e22..7bf16ce03 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -287,8 +287,8 @@ def build_training_sample(sample, target_seq_length, ) tokens = np.asarray([tokens]) - input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) - output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) + input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] + output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] # # Masking. 
# max_predictions_per_seq = masked_lm_prob * max_num_tokens From 541e9d60ca3544baa4f7127be8745b965590a584 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:29:31 +0700 Subject: [PATCH 116/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 7bf16ce03..f709a9a8c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -181,7 +181,8 @@ def __init__(self, name, indexed_dataset, data_prefix, data_prefix, num_epochs, max_num_samples, - self.max_seq_length - 2, # account for added tokens + # self.max_seq_length - 2, # account for added tokens + self.max_seq_length*2, short_seq_prob, self.seed, self.name, @@ -329,14 +330,6 @@ def build_training_sample(sample, target_seq_length, prefix_len = len(input_tokens_ids) - print("input_tokens_ids") - print(len(input_tokens_ids)) - print("output_tokens_ids") - print(len(output_tokens_ids)) - - import sys - sys.exit() - return { 'text': text_tokens_ids, 'prefix_len': prefix_len From 86bfc8a4d10ed4754bc7d775498b5153a53102b1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:31:09 +0700 Subject: [PATCH 117/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index f709a9a8c..ddbc1333c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -291,6 +291,13 @@ def build_training_sample(sample, target_seq_length, input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] + print("input_tokens_ids") + print(len(input_tokens_ids)) + print(input_tokens_ids) + print("output_tokens_ids") + print(len(output_tokens_ids)) + print(output_tokens_ids) + # # Masking. # max_predictions_per_seq = masked_lm_prob * max_num_tokens # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( From e21a4484458dd0f82ba51bee57660ed1de07b1f7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:42:18 +0700 Subject: [PATCH 118/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ddbc1333c..64a474a18 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -181,8 +181,7 @@ def __init__(self, name, indexed_dataset, data_prefix, data_prefix, num_epochs, max_num_samples, - # self.max_seq_length - 2, # account for added tokens - self.max_seq_length*2, + self.max_seq_length - 2, # account for added tokens short_seq_prob, self.seed, self.name, @@ -210,6 +209,14 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + + #concat more to avoid padding + for i in range(0,2): + _idx = random.randint(idx, self.__len__) + start_index, end_index, seq_length = self.samples_mapping[_idx] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
np_rng = np.random.RandomState(seed=(self.seed + idx)) @@ -335,6 +342,9 @@ def build_training_sample(sample, target_seq_length, # max_seq_length+max_seq_length_dec # ) + text_tokens_ids = input_tokens_ids+output_tokens_ids + print*text_tokens_ids + prefix_len = len(input_tokens_ids) return { From f47d678661e9aa641f83bfb6e75804448f32d4c9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:43:53 +0700 Subject: [PATCH 119/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 64a474a18..ad7c5dc35 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -17,6 +17,7 @@ import os import time +import random import collections import numpy as np From 415b8bce80bdeb4fd5b55bd649ad408031236ae3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:45:16 +0700 Subject: [PATCH 120/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ad7c5dc35..1eff6762e 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -213,7 +213,7 @@ def __getitem__(self, idx): #concat more to avoid padding for i in range(0,2): - _idx = random.randint(idx, self.__len__) + _idx = random.randint(idx, self.__len__()) start_index, end_index, seq_length = self.samples_mapping[_idx] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) From 79bd6f8f6b0e52396ab3782355dde63dd87223a0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:42:55 +0700 Subject: [PATCH 121/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 1eff6762e..286c2a3d8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -212,12 +212,14 @@ def __getitem__(self, idx): sample.append(self.indexed_dataset[index]) #concat more to avoid padding - for i in range(0,2): + while seq_length < (self.max_seq_length/self.masked_lm_prob): _idx = random.randint(idx, self.__len__()) - start_index, end_index, seq_length = self.samples_mapping[_idx] + start_index, end_index, _seq_length = self.samples_mapping[_idx] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + seq_length += _seq_length + # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
np_rng = np.random.RandomState(seed=(self.seed + idx)) From ba19fdf308f892b6089a95c5e39a58fcbb08420d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:47:06 +0700 Subject: [PATCH 122/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 286c2a3d8..cfd8730d5 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -263,15 +263,15 @@ def build_training_sample(sample, target_seq_length, sentinel_tokens: unique value to be substituted for every replaced span """ - assert target_seq_length <= max_seq_length + # assert target_seq_length <= max_seq_length # flatten sentences into one list tokens = [token for sentence in sample for token in sentence] - # Truncate to `target_sequence_length`. - max_num_tokens = target_seq_length - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + # # Truncate to `target_sequence_length`. + # max_num_tokens = target_seq_length + # truncated = len(tokens) > max_num_tokens + # tokens = tokens[:max_num_tokens] max_ngrams = 3 # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. From d200f4d7ae4e25c6087050cef6817311b37b9e8e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:49:16 +0700 Subject: [PATCH 123/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index cfd8730d5..161af4885 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -290,12 +290,14 @@ def build_training_sample(sample, target_seq_length, labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - if len(tokens) < expanded_inputs_length: + if len(tokens) <= expanded_inputs_length: tokens = pad_and_convert_to_numpy( tokens, pad_id, expanded_inputs_length ) + else: + tokens = tokens[:expanded_inputs_length] tokens = np.asarray([tokens]) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] From 102a4614b97861a61e5494fbc4ebe0b1773fc19c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:51:42 +0700 Subject: [PATCH 124/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 161af4885..3920b9d95 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -347,8 +347,7 @@ def build_training_sample(sample, target_seq_length, # max_seq_length+max_seq_length_dec # ) - text_tokens_ids = input_tokens_ids+output_tokens_ids - print*text_tokens_ids + text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) prefix_len = len(input_tokens_ids) From e5304404db797fe7579fe06bb9f2dbeffb73896f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:56:08 +0700 Subject: [PATCH 125/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 3920b9d95..59a1f2ee5 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ 
b/megatron/data/non_causal_mlm_dataset.py @@ -303,13 +303,6 @@ def build_training_sample(sample, target_seq_length, input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] - print("input_tokens_ids") - print(len(input_tokens_ids)) - print(input_tokens_ids) - print("output_tokens_ids") - print(len(output_tokens_ids)) - print(output_tokens_ids) - # # Masking. # max_predictions_per_seq = masked_lm_prob * max_num_tokens # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( @@ -348,6 +341,9 @@ def build_training_sample(sample, target_seq_length, # ) text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) + print("text_tokens_ids") + print(len(text_tokens_ids)) + print(text_tokens_ids) prefix_len = len(input_tokens_ids) From 25680392589efb1f610e2600bedbb80d19057caa Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 22:51:52 +0700 Subject: [PATCH 126/148] minor fix --- megatron/data/non_causal_mlm_dataset.py | 3 --- train_ND_MLM_gpt.py | 10 ---------- train_ND_MTF_gpt.py | 10 ---------- 3 files changed, 23 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 59a1f2ee5..2df7dcfd3 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -341,9 +341,6 @@ def build_training_sample(sample, target_seq_length, # ) text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) - print("text_tokens_ids") - print(len(text_tokens_ids)) - print(text_tokens_ids) prefix_len = len(input_tokens_ids) diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 0326e778a..5ba98cd11 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -112,16 +112,6 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) - import numpy as np - with open('attention_mask.npy', 'wb') as f: - np.save(f, attention_mask.cpu().numpy()) - with open('loss_mask.npy', 'wb') as f: - np.save(f, loss_mask.cpu().numpy()) - with open('position_ids.npy', 'wb') as f: - np.save(f, position_ids.cpu().numpy()) - import sys - sys.exit() - # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) diff --git a/train_ND_MTF_gpt.py b/train_ND_MTF_gpt.py index d16c9bb82..69b8c825b 100644 --- a/train_ND_MTF_gpt.py +++ b/train_ND_MTF_gpt.py @@ -112,16 +112,6 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) - import numpy as np - with open('attention_mask.npy', 'wb') as f: - np.save(f, attention_mask.cpu().numpy()) - with open('loss_mask.npy', 'wb') as f: - np.save(f, loss_mask.cpu().numpy()) - with open('position_ids.npy', 'wb') as f: - np.save(f, position_ids.cpu().numpy()) - import sys - sys.exit() - # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) From e6b4120b3c69ef190a213bd527426266493c4086 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 23:34:37 +0700 Subject: [PATCH 127/148] minor fix --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 8b3fcc934..c45468951 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -329,7 +329,7 @@ def __init__(self, tokenizer_name_or_path): @property def vocab_size(self): - return 
self.tokenizer.vocab_size + return self.tokenizer.__len__() #self.tokenizer.vocab_size @property def vocab(self): From fd7fe97811135595b5412c3b2f696cbc9c1f174e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 23:57:41 +0700 Subject: [PATCH 128/148] minor fix --- 4B8-en-ND-MLM.sh | 2 -- train_ND_MLM_gpt.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 77f91de81..3af4ed0bc 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -55,8 +55,6 @@ GPT_ARGS=" \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ - --encoder-seq-length $INPUT_LEN \ - --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 5ba98cd11..148d287bb 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,8 +197,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, + max_seq_length=512, #args.encoder_seq_length, + max_seq_length_dec=114, #args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 861fc7b9a84290e50fa5715637598f2e0eacec0d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:01:37 +0700 Subject: [PATCH 129/148] minor fix --- 4B8-en-ND-MLM.sh | 3 ++- train_ND_MLM_gpt.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 3af4ed0bc..3de57d8ad 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=$INPUT_LEN #$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 @@ -55,6 +55,7 @@ GPT_ARGS=" \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ + --seq-length $SEQ_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 148d287bb..80789f9ef 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,8 +197,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=512, #args.encoder_seq_length, - max_seq_length_dec=114, #args.decoder_seq_length, + max_seq_length=args.seq_length, + max_seq_length_dec=args.seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 21c19840e53e4a109a39ed045dca5ffba6e2a7be Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:04:55 +0700 Subject: [PATCH 130/148] minor fix --- 4B8-en-ND-MLM.sh | 2 +- train_ND_MLM_gpt.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 3de57d8ad..969bba02a 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=$INPUT_LEN #$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 diff --git a/train_ND_MLM_gpt.py 
b/train_ND_MLM_gpt.py index 80789f9ef..8a254cee0 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,8 +197,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, - max_seq_length_dec=args.seq_length, + max_seq_length=512, #args.seq_length, + max_seq_length_dec=114, #args.seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 14e8d0f6e74dc862ff55b2cea0528b3b432e71d8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:23:21 +0700 Subject: [PATCH 131/148] minor fix --- 4B8-en-ND-MLM.sh | 4 +--- train_ND_MLM_gpt.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 969bba02a..53e3d9d01 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -20,9 +20,7 @@ DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatic MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 -INPUT_LEN=512 -TARGET_LEN=114 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=512 NLAYERS=24 NHIDDEN=4096 diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 8a254cee0..e286d1e0d 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -23,7 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, compute_input_and_target_lengths #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -192,13 +192,18 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: + + extended_seq_length, target_length = compute_input_and_target_lengths(args.seq_length, args.mask_prob, 3) + args.max_position_embeddings = extended_seq_length + args.seq_length = extended_seq_length + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=512, #args.seq_length, - max_seq_length_dec=114, #args.seq_length, + max_seq_length=extended_seq_length, + max_seq_length_dec=target_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 920343f88d6caafa2060f55c2e757eee5a636738 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:26:58 +0700 Subject: [PATCH 132/148] minor fix --- train_ND_MLM_gpt.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index e286d1e0d..8765e05f8 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -194,15 +194,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if args.data_path: extended_seq_length, target_length = compute_input_and_target_lengths(args.seq_length, args.mask_prob, 3) - args.max_position_embeddings = extended_seq_length - args.seq_length = extended_seq_length train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, 
train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=extended_seq_length, + max_seq_length=args.seq_length, max_seq_length_dec=target_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, From a68873d81b0ab0299997f7879b8dea7d2f5b4fbf Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:34:30 +0700 Subject: [PATCH 133/148] minor fix --- 4B8-en-ND-MLM.sh | 8 ++++++-- train_ND_MLM_gpt.py | 9 +++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 53e3d9d01..58247e9e4 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -20,7 +20,9 @@ DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatic MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 -SEQ_LEN=512 +INPUT_LEN=512 +TARGET_LEN=114 +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 @@ -54,6 +56,8 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ @@ -68,7 +72,7 @@ GPT_ARGS=" \ " OUTPUT_ARGS=" \ - --log-interval 200 \ + --log-interval 1 \ --save-interval $SAVE_INTERVAL \ --eval-interval $TRAIN_ITER \ --eval-iters 1 \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 8765e05f8..5ba98cd11 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -23,7 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, compute_input_and_target_lengths #, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -192,16 +192,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: - - extended_seq_length, target_length = compute_input_and_target_lengths(args.seq_length, args.mask_prob, 3) - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, - max_seq_length_dec=target_length, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 5d43986328646c8fd39d132b757adce07ef1b50b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:42:02 +0700 Subject: [PATCH 134/148] minor fix --- 4B8-en-ND-MLM.sh | 1 - train_ND_MLM_gpt.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 58247e9e4..4d0be40cd 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -56,7 +56,6 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ - --encoder-seq-length $INPUT_LEN \ --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 5ba98cd11..a83ded877 100644 --- a/train_ND_MLM_gpt.py +++ 
b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, + max_seq_length=args.seq_length-args.decoder_seq_length, max_seq_length_dec=args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, From 79e8c1a1ed194d3403cf8db3f3991b30402a57ae Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:48:21 +0700 Subject: [PATCH 135/148] set correct seq len --- 4B8-en-ND-MLM.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 4d0be40cd..949a84ed8 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -18,10 +18,10 @@ TP_SIZE=1 # always fixed to the size of a single node DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=2048 -TRAIN_ITER=39_718 -INPUT_LEN=512 -TARGET_LEN=114 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=48_562 +INPUT_LEN=1675 +TARGET_LEN=373 SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 From 786d252f09f0520e088b9dde88b3f9cf2c7c7503 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 19:14:31 +0700 Subject: [PATCH 136/148] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 247 +++++++++++++----------- 1 file changed, 130 insertions(+), 117 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 2df7dcfd3..f36730c64 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -25,7 +25,7 @@ from megatron import mpu, print_rank_0, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_samples_mapping, create_masked_lm_predictions +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, create_masked_lm_predictions from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_, get_indexed_dataset_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -177,16 +177,23 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset + max_ngrams = 3 + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. + expanded_inputs_length, targets_length = compute_input_and_target_lengths( + self.max_seq_length, + self.masked_lm_prob, + max_ngrams + ) + # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 2, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False) + self.samples_mapping = get_samples_mapping( + self.indexed_dataset, + data_prefix, + self.name, + max_len=expanded_inputs_length + ) # Vocab stuff. 
tokenizer = get_tokenizer() @@ -202,88 +209,40 @@ def __init__(self, name, indexed_dataset, data_prefix, assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" def __len__(self): - return self.samples_mapping.shape[0] + return len(self.samples_mapping) def __getitem__(self, idx): - start_index, end_index, seq_length = self.samples_mapping[idx] + indices = self.samples_mapping[idx] sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - - #concat more to avoid padding - while seq_length < (self.max_seq_length/self.masked_lm_prob): - _idx = random.randint(idx, self.__len__()) - start_index, end_index, _seq_length = self.samples_mapping[_idx] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - - seq_length += _seq_length - - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) - - -def build_training_sample(sample, target_seq_length, - max_seq_length, max_seq_length_dec, - vocab_id_list, vocab_id_to_token_dict, - cls_id, sep_id, mask_id, pad_id, - masked_lm_prob, np_rng, bos_id=None, - eos_id=None, sentinel_tokens=None): + for doc_idx, start_index, end_index in indices: + sample.append(self.indexed_dataset[index][start_index:end_index]) + + return build_training_sample( + sample, expanded_inputs_length, self.vocab_id_list, + self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.bos_id, self.eos_id, + self.sentinel_tokens + ) + + +def build_training_sample( + sample, expanded_inputs_length, vocab_id_list, + cls_id, sep_id, mask_id, pad_id, bos_id=None, eos_id=None, sentinel_tokens=None + ): """Build training sample. Arguments: - sample: A list of sentences in which each sentence is a list token ids. - target_seq_length: Desired sequence length. - max_seq_length: Maximum length of the sequence. All values are padded to - this length. - vocab_id_list: List of vocabulary ids. Used to pick a random id. - vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. - cls_id: Start of example id. - sep_id: Separator id. - mask_id: Mask token id. - pad_id: Padding token id. - masked_lm_prob: Probability to mask tokens. - np_rng: Random number genenrator. Note that this rng state should be - numpy and not python since python randint is inclusive for - the opper bound whereas the numpy one is exclusive. - bos_id: start of decoder example id - eos_id: end of generation id - sentinel_tokens: unique value to be substituted for every replaced span + TODO: Add description """ - # assert target_seq_length <= max_seq_length - # flatten sentences into one list tokens = [token for sentence in sample for token in sentence] - # # Truncate to `target_sequence_length`. - # max_num_tokens = target_seq_length - # truncated = len(tokens) > max_num_tokens - # tokens = tokens[:max_num_tokens] - - max_ngrams = 3 - # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. 
- # To ensure that the input length is `max_seq_length`, we need to increase the maximum length - # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. - expanded_inputs_length, targets_length = compute_input_and_target_lengths( - max_seq_length, - masked_lm_prob, - max_ngrams - ) - - mask_indices = np.asarray([random_spans_noise_mask(expanded_inputs_length)]) + mask_indices = np.asarray([random_spans_noise_mask( + expanded_inputs_length, + noise_density=0.15, + mean_noise_span_length=3 + )]) labels_mask = ~mask_indices input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) @@ -303,43 +262,6 @@ def build_training_sample(sample, target_seq_length, input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] - # # Masking. - # max_predictions_per_seq = masked_lm_prob * max_num_tokens - # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( - # tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - # cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, - # max_ngrams=max_ngrams, geometric_dist=True, masking_style="t5") - - # sentinel_tokens = collections.deque(sentinel_tokens) - # input_tokens_ids = [] - # output_tokens_ids = [] #[bos_id] - # (start_index, end_index) = (0, None) - # for span in masked_spans: - # flag = sentinel_tokens.popleft() - - # output_tokens_ids.append(flag) - # output_tokens_ids.extend(span.label) - - # end_index = span.index[0] - # input_tokens_ids.extend(tokens[start_index: end_index]) - # input_tokens_ids.append(flag) - - # # the next start index is the token after the last span token - # start_index = span.index[-1] + 1 - - - # # Add the remaining tokens to input_tokens_ids - # input_tokens_ids.extend(tokens[start_index:]) - # input_tokens_ids.append(eos_id) - # # Add token to the output_tokens_ids - # output_tokens_ids.append(eos_id) - - # text_tokens_ids = pad_and_convert_to_numpy( - # input_tokens_ids+output_tokens_ids, - # pad_id, - # max_seq_length+max_seq_length_dec - # ) - text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) prefix_len = len(input_tokens_ids) @@ -350,6 +272,97 @@ def build_training_sample(sample, target_seq_length, } +def get_samples_mapping(indexed_dataset, data_prefix, name, max_len=568): + + def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): + + if idx_list is None: + idx_list = [] + + if idx_offset is None: + idx_offset = 0 + + if sample_len < max_len: + idx_list.append(idx_offset+sample_len) + else: + sample_len = sample_len - max_len + idx_list.append(idx_offset+max_len) + idx_offset += max_len + + breakdown(sample_len, idx_offset=idx_offset, idx_list=idx_list) + + idx_list = [0]+idx_list + return list(zip(idx_list[:-1], idx_list[1:])) + + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. 
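    # A sketch of what the mapping built below holds, with illustrative numbers: every
    # samples_mapping entry is a list of [doc_idx, start, end] slices whose lengths sum to
    # max_len, so a 1200-token document with max_len=568 is cut by breakdown() into
    # (0, 568), (568, 1136) and a trailing (1136, 1200) piece that is grouped together with
    # slices taken from the following document(s).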
+ if torch.distributed.get_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + + samples_mapping = [] + sample_indices = [] + doc_idx = 0 + current_len = 0 + _idx = 0 + for doc_idx, sample_len in zip(indexed_dataset.doc_idx, indexed_dataset.sizes): + _idx = 0 + + if current_len + sample_len > max_len: + end_idx = max_len - current_len + sample_indices.append([doc_idx, 0, end_idx]) + samples_mapping.append(sample_indices) + sample_indices = [] + current_len = 0 + sample_len -= end_idx + _idx = end_idx + + break_len = current_len + sample_len + + indices = breakdown(sample_len) + for _start_idx, _end_idx in indices: + _len = _end_idx - _start_idx + if _len == max_len: + samples_mapping.append([[doc_idx, _start_idx+_idx, _end_idx+_idx]]) + else: + sample_indices.append([doc_idx, _start_idx+_idx, _end_idx+_idx]) + current_len += _len + + print_rank_0(' > done building sapmles index maping') + np.save(indexmap_filename, samples_mapping, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elasped time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + samples_mapping = np.load(indexmap_filename, allow_pickle=True) + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + len(samples_mapping))) + + return samples_mapping + + def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" From 9110520e389b97e4c6c016f73094ef012ae482eb Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 19:18:25 +0700 Subject: [PATCH 137/148] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index f36730c64..e37a1981c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -303,7 +303,18 @@ def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): # Build the indexed mapping if not exist. if torch.distributed.get_rank() == 0 and \ not os.path.isfile(indexmap_filename): - + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. 
+ assert indexed_dataset.doc_idx.dtype == np.int64 + assert indexed_dataset.sizes.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building sapmles index mapping for {} ...'.format( + name)) samples_mapping = [] sample_indices = [] doc_idx = 0 From 7db34b9e8a3f2d787e44463a962db9831e05a7fd Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 19:19:34 +0700 Subject: [PATCH 138/148] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e37a1981c..cb2ee5b34 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -216,7 +216,7 @@ def __getitem__(self, idx): indices = self.samples_mapping[idx] sample = [] for doc_idx, start_index, end_index in indices: - sample.append(self.indexed_dataset[index][start_index:end_index]) + sample.append(self.indexed_dataset[doc_idx][start_index:end_index]) return build_training_sample( sample, expanded_inputs_length, self.vocab_id_list, From d9465158b78844b40c00b8230dceadc06c6da620 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 21:28:31 +0700 Subject: [PATCH 139/148] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index cb2ee5b34..7c8ca01a0 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -216,7 +216,7 @@ def __getitem__(self, idx): indices = self.samples_mapping[idx] sample = [] for doc_idx, start_index, end_index in indices: - sample.append(self.indexed_dataset[doc_idx][start_index:end_index]) + sample.append(self.indexed_dataset.get(doc_idx)[start_index:end_index]) return build_training_sample( sample, expanded_inputs_length, self.vocab_id_list, From bb4e6561479be758ab1ef215cad42e5f703a8462 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 21:30:30 +0700 Subject: [PATCH 140/148] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 7c8ca01a0..0c6213fa8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -186,6 +186,8 @@ def __init__(self, name, indexed_dataset, data_prefix, self.masked_lm_prob, max_ngrams ) + self.expanded_inputs_length = expanded_inputs_length + self.targets_length = targets_length # Build the samples mapping. 
self.samples_mapping = get_samples_mapping( @@ -219,7 +221,7 @@ def __getitem__(self, idx): sample.append(self.indexed_dataset.get(doc_idx)[start_index:end_index]) return build_training_sample( - sample, expanded_inputs_length, self.vocab_id_list, + sample, self.expanded_inputs_length, self.vocab_id_list, self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.bos_id, self.eos_id, self.sentinel_tokens ) From 2e7161d4199ac580f70cbf48c59707ac3354eb31 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 21:36:21 +0700 Subject: [PATCH 141/148] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0c6213fa8..9289f6b51 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -250,16 +250,6 @@ def build_training_sample( input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - - if len(tokens) <= expanded_inputs_length: - tokens = pad_and_convert_to_numpy( - tokens, - pad_id, - expanded_inputs_length - ) - else: - tokens = tokens[:expanded_inputs_length] - tokens = np.asarray([tokens]) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] @@ -385,7 +375,7 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): assert padding_length >= 0 # Tokens and token types. - filler = np.array([pad_id] * padding_length) + filler = np.array([pad_id] * padding_length, dtype=np.int64) tokens_np = np.concatenate((tokens, filler), dtype=np.int64) return tokens_np From 00473e42162c9fd5fc674cf3e6769de42f6fe39d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:08:16 +0700 Subject: [PATCH 142/148] first commit, adding non causal mlm dataset --- 4B8-en-ND-MLM.sh | 3 +- megatron/data/non_causal_mlm_dataset.py | 50 ++++++------------------- train_ND_MLM_gpt.py | 6 +-- 3 files changed, 15 insertions(+), 44 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 949a84ed8..a856f2e77 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=512 TRAIN_ITER=48_562 INPUT_LEN=1675 TARGET_LEN=373 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=$INPUT_LEN NLAYERS=24 NHIDDEN=4096 @@ -56,7 +56,6 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ - --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 9289f6b51..bab07ceba 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -33,20 +33,16 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, max_seq_length, - masked_lm_prob, short_seq_prob, seed, - skip_warmup, binary_head=False, - max_seq_length_dec=None, - dataset_type='standard_bert'): + masked_lm_prob, seed, + skip_warmup + ): if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, train_valid_test_num_samples, max_seq_length, masked_lm_prob, - short_seq_prob, seed, - skip_warmup, - 
binary_head, - max_seq_length_dec, - dataset_type=dataset_type) + seed, skip_warmup + ) # Blending dataset. # Parse the values. output = get_datasets_weights_and_num_samples(data_prefix, @@ -61,8 +57,8 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( prefixes[i], data_impl, splits_string, datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) + max_seq_length, masked_lm_prob, + seed, skip_warmup) if train_ds: train_datasets.append(train_ds) if valid_ds: @@ -87,11 +83,8 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, short_seq_prob, seed, - skip_warmup, binary_head, - max_seq_length_dec, - dataset_type='standard_bert'): + max_seq_length, masked_lm_prob, seed, + skip_warmup): """Build train, valid, and test datasets.""" @@ -134,16 +127,12 @@ def build_dataset(index, name): kwargs = dict( name=name, data_prefix=data_prefix, - num_epochs=None, - max_num_samples=train_valid_test_num_samples[index], max_seq_length=max_seq_length, seed=seed, ) dataset = NonCausalMLMDataset( indexed_dataset=indexed_dataset, masked_lm_prob=masked_lm_prob, - max_seq_length_dec=max_seq_length_dec, - short_seq_prob=short_seq_prob, **kwargs ) indexed_dataset.set_doc_idx(doc_idx_ptr) @@ -163,9 +152,9 @@ def build_dataset(index, name): class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, masked_lm_prob, - max_seq_length, max_seq_length_dec, - short_seq_prob, seed): + masked_lm_prob, + max_seq_length, + seed): # Params to store. self.name = name @@ -366,21 +355,6 @@ def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): return samples_mapping -def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - - # Tokens and token types. - filler = np.array([pad_id] * padding_length, dtype=np.int64) - tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - - return tokens_np - - def create_sentinel_ids(mask_indices, vocab_len): """ Sentinel ids creation given the indices that should be masked. 
diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index a83ded877..b942cd8bc 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,13 +197,11 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length-args.decoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, + max_seq_length=args.seq_length, masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), - dataset_type='t5') + ) # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths # elif args.train_weighted_split_paths: From 5992776b44ad8202cb0536782bde543c5bd20521 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:10:10 +0700 Subject: [PATCH 143/148] fixed mlm dataset --- megatron/data/non_causal_mlm_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index bab07ceba..df66ef9e7 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -161,7 +161,6 @@ def __init__(self, name, indexed_dataset, data_prefix, self.seed = seed self.masked_lm_prob = masked_lm_prob self.max_seq_length = max_seq_length - self.max_seq_length_dec = max_seq_length_dec # Dataset. self.indexed_dataset = indexed_dataset From 83f5dee9a4e022900bdc91dbed20eae39c0a524f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:16:21 +0700 Subject: [PATCH 144/148] fixed mlm dataset --- 4B8-en-ND-MLM.sh | 2 +- train_ND_MLM_gpt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index a856f2e77..4b25609a7 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=512 TRAIN_ITER=48_562 INPUT_LEN=1675 TARGET_LEN=373 -SEQ_LEN=$INPUT_LEN +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index b942cd8bc..c6a729acd 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, + max_seq_length=args.seq_length-373, masked_lm_prob=args.mask_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), From 3235c2d094b72bb4e297131e53f69c5fbcad01f2 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:20:02 +0700 Subject: [PATCH 145/148] fixed mlm dataset --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index df66ef9e7..ceb99e7ce 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -252,9 +252,9 @@ def build_training_sample( } -def get_samples_mapping(indexed_dataset, data_prefix, name, max_len=568): +def get_samples_mapping(indexed_dataset, data_prefix, name, max_len): - def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): + def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=None): if idx_list is None: idx_list = [] @@ -314,7 +314,7 @@ def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): break_len = current_len + 
sample_len - indices = breakdown(sample_len) + indices = breakdown(sample_len, max_len=max_len) for _start_idx, _end_idx in indices: _len = _end_idx - _start_idx if _len == max_len: From 5449978127fc2ee0a0e68dd48f3ef942204c57f9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:30:33 +0700 Subject: [PATCH 146/148] fixed mlm dataset --- 4B8-en-ND-MLM.sh | 1 + train_ND_MLM_gpt.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 4b25609a7..a765b1b4a 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -56,6 +56,7 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ + --encoder-seq-length $INPUT_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index c6a729acd..881b02c45 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length-373, + max_seq_length=args.encoder_seq_length, masked_lm_prob=args.mask_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), From 95c98515e7e79d16120d23b2fabe12eade3fd18c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:48:30 +0700 Subject: [PATCH 147/148] fixed mlm dataset --- 4B8-en-ND-MLM.sh | 2 +- megatron/arguments.py | 2 ++ train_ND_MLM_gpt.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index a765b1b4a..75fc3e89d 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -56,7 +56,7 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ - --encoder-seq-length $INPUT_LEN \ + --input-length $INPUT_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/megatron/arguments.py b/megatron/arguments.py index 2be64b77d..a0f70ce83 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -878,6 +878,8 @@ def __call__(self, parser, args, values, option_string=None): 'They are used for span masking in the T5 model') group.add_argument('--seq-length', type=int, default=None, help='Maximum sequence length to process.') + group.add_argument('--input-length', type=int, default=None, + help='Maximum sequence length to process.') group.add_argument('--encoder-seq-length', type=int, default=None, help='Maximum encoder sequence length to process.' 
'This should be exclusive of --seq-length') diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 881b02c45..3f23320e8 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, + max_seq_length=args.input_length, masked_lm_prob=args.mask_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), From 451318f12a50b1aa1fcb40123c46522d1e243637 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 14 Jun 2022 20:36:09 +0700 Subject: [PATCH 148/148] minor changes --- megatron/arguments.py | 2 +- megatron/data/non_causal_mlm_dataset.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index a0f70ce83..8f4f4fc5c 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -879,7 +879,7 @@ def __call__(self, parser, args, values, option_string=None): group.add_argument('--seq-length', type=int, default=None, help='Maximum sequence length to process.') group.add_argument('--input-length', type=int, default=None, - help='Maximum sequence length to process.') + help='Maximum input length to process for MLM adaptation.') group.add_argument('--encoder-seq-length', type=int, default=None, help='Maximum encoder sequence length to process.' 'This should be exclusive of --seq-length') diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ceb99e7ce..aa4a45a9f 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -154,7 +154,8 @@ class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, masked_lm_prob, max_seq_length, - seed): + seed, + max_ngrams = 3): # Params to store. self.name = name @@ -165,14 +166,14 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset - max_ngrams = 3 + self.max_ngrams = max_ngrams # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. # To ensure that the input length is `max_seq_length`, we need to increase the maximum length # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. expanded_inputs_length, targets_length = compute_input_and_target_lengths( self.max_seq_length, self.masked_lm_prob, - max_ngrams + self.max_ngrams ) self.expanded_inputs_length = expanded_inputs_length self.targets_length = targets_length
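
Illustrative note (not part of the patches above): the dataset changes in this series hinge on compute_input_and_target_lengths turning the configured encoder length (INPUT_LEN / --input-length) into the expanded_inputs_length gathered per sample and the targets_length for the decoder side. The sketch below is a minimal, self-contained reconstruction of that arithmetic, assuming the helper follows the T5 span-corruption convention that the in-diff comment describes (the Hugging Face T5 MLM reference formulation); it is not copied from this repository, and the names are for illustration only.

def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
    """Return (expanded_inputs_length, targets_length) for T5-style span corruption.

    expanded_inputs_length is how many raw tokens a sample must gather so that,
    after every masked span is collapsed into a single sentinel token (plus EOS),
    the encoder input comes out at exactly `inputs_length` tokens.
    """

    def _lengths(tokens_length):
        num_noise_tokens = int(round(tokens_length * noise_density))
        num_nonnoise_tokens = tokens_length - num_noise_tokens
        num_noise_spans = max(int(round(num_noise_tokens / mean_noise_span_length)), 1)
        # Inputs:  non-noise tokens + one sentinel per noise span + EOS.
        # Targets: noise tokens     + one sentinel per noise span + EOS.
        return (num_nonnoise_tokens + num_noise_spans + 1,
                num_noise_tokens + num_noise_spans + 1)

    # Grow the raw token count until one more token would overflow `inputs_length`.
    tokens_length = inputs_length
    while _lengths(tokens_length + 1)[0] <= inputs_length:
        tokens_length += 1

    _, targets_length = _lengths(tokens_length)
    return tokens_length, targets_length


if __name__ == "__main__":
    # Assuming the default --mask-prob of 0.15 and max_ngrams=3 (as in patch 148),
    # an input length of 1675 gives (1860, 373): 1860 raw tokens gathered per
    # sample and a 373-token decoder side, which lines up with INPUT_LEN=1675 and
    # TARGET_LEN=373 in 4B8-en-ND-MLM.sh (SEQ_LEN = 1675 + 373 = 2048).
    print(compute_input_and_target_lengths(1675, 0.15, 3))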