From ce8d5e1182e20873c30bb926ca5902384b4af02c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 4 Jul 2022 20:02:05 +0200 Subject: [PATCH 01/25] That's what I'm hoping is all I need --- megatron/arguments.py | 1 + megatron/model/transformer.py | 2 ++ megatron/tokenizer/tokenizer.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 31a8d4000..dbb77c6e0 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -388,6 +388,7 @@ def _add_network_size_args(parser): group.add_argument('--onnx-safe', type=bool, required=False, help='Use workarounds for known problems with ' 'Torch ONNX exporter') + group.add_argument('--relu', type=bool, action='store_true') group.add_argument('--bert-no-binary-head', action='store_false', help='Disable BERT binary head.', dest='bert_binary_head') diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 48401a9f1..59170acb9 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -86,6 +86,8 @@ def __init__(self, init_method, output_layer_init_method): self.activation_func = openai_gelu elif args.onnx_safe: self.activation_func = erf_gelu + elif args.relu: + self.activation_func = F.relu # Project back to h. self.dense_4h_to_h = mpu.RowParallelLinear( diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index fcc3ed20d..69702c8ee 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -327,7 +327,7 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): if vocab_extra_ids > 0: # TODO @thomasw21 we might need to concatenate to a pre-existing list? hf_tokenizer_kwargs["additional_special_tokens"] = [f"" for _id in range(vocab_extra_ids)] - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs, use_fast=False) self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} From b21d773ea22a432649d4d8a0ba12db3ee56e224b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 4 Jul 2022 21:24:09 +0200 Subject: [PATCH 02/25] relu to work --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index dbb77c6e0..058da4640 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -388,7 +388,7 @@ def _add_network_size_args(parser): group.add_argument('--onnx-safe', type=bool, required=False, help='Use workarounds for known problems with ' 'Torch ONNX exporter') - group.add_argument('--relu', type=bool, action='store_true') + group.add_argument('--relu', action='store_true') group.add_argument('--bert-no-binary-head', action='store_false', help='Disable BERT binary head.', dest='bert_binary_head') From 43a9ab44dfad27b956c6ae97ca261172662b86b8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 4 Jul 2022 21:45:56 +0200 Subject: [PATCH 03/25] Run inference for opt --- tasks/eval_harness/evaluate.py | 42 +--------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..fc2eb3b20 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -288,48 +288,11 @@ def 
override_args(args, override_args, skip_keys, skip_if_specified_keys): # We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. def load_ds_checkpoint_and_setup_megatron(args): _print_args = megatron.arguments._print_args - megatron.arguments._print_args = lambda *_args, **kwarg: None + # megatron.arguments._print_args = lambda *_args, **kwarg: None if not os.path.exists(args.load): raise ValueError(f"checkpoint path {args.load} doesn't exit") - ds_checkpoint = DeepSpeedCheckpoint(args.load, - tp_degree=args.tensor_model_parallel_size, - pp_degree=args.pipeline_model_parallel_size) - - - cp_args = ds_checkpoint.get_args() - # Merge the current args with the checkpoint args. - skip_keys = [ - 'abort_on_unmet_fused_kernel_constraints', - 'batch_size', - 'data_parallel_size', - 'deepspeed', - 'deepspeed_config', - 'device_count', - 'global_batch_size', - 'inference', - 'iteration', - 'load', - 'local_rank', - 'micro_batch_size', - 'pipeline_model_parallel_size', - 'rampup_batch_size', - 'rank', - 'tensor_model_parallel_size', - 'tensorboard_dir', - 'world_size', - ] - - skip_if_specified = ['merge_file', 'vocab_file'] - - if args.eval_fp32: - cp_args.fp16 = False - cp_args.bf16 = False - cp_args.params_dtype = torch.float32 - - override_args(args, cp_args, skip_keys, skip_if_specified) - # stop megatron from reparsing the arguments. megatron.global_vars._parse_args = lambda *_args, **kwarg: args megatron.global_vars._GLOBAL_ARGS = args @@ -337,9 +300,6 @@ def load_ds_checkpoint_and_setup_megatron(args): initialize_megatron() torch.distributed.barrier() - # Initializing megatron will update eg. tokenizer size. Override again. - override_args(args, cp_args, skip_keys, skip_if_specified) - # print final arguments. 
_print_args(args) if args.deepspeed: From 8e095da2189de6856878167146ce8de31c87d9b6 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 02:09:00 +0200 Subject: [PATCH 04/25] Test --- tasks/eval_harness/evaluate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index fc2eb3b20..d2db4bb4d 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -82,7 +82,7 @@ def loglikelihood(self, requests): # end of text as context context_enc = [self.EOT_TOKEN_ID] else: - context_enc = self.tokenizer_encode(context) + context_enc = [self.EOT_TOKEN_ID, self.tokenizer_encode(context)] continuation_enc = self.tokenizer_encode(continuation) @@ -194,6 +194,8 @@ def create_model_inputs(self, tokens): prefix_indices=None, loss_on_targets_only=False) + position_ids = position_ids -1 + return (tokens, position_ids, attention_mask), (tokens, loss_mask) def _model_call(self, inps): From e742a3fecfec5d6fdeb9a381e20baa2bd9304bab Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 02:52:02 +0200 Subject: [PATCH 05/25] I think OPT doesn't have shared vocabulary embedding --- megatron/model/gpt_model.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 31d33a91b..62f29c804 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -17,6 +17,7 @@ from functools import partial import torch +from torch import nn from megatron import get_args from megatron import mpu @@ -195,6 +196,20 @@ def CrossEntropy(output, labels): return CrossEntropy +class OutputVocab(nn.Module): + """Useful when models don't have a shared embedding space""" + def __init__(self, hidden_size: int, padded_vocab_size: int, parallel_output: bool): + super(OutputVocab, self).__init__() + self.parallel_output = parallel_output + self.lm_head = mpu.VocabParallelEmbedding( + padded_vocab_size, + hidden_size, + ) + + def forward(self, x): + return self.parallel_lm_logits(self.lm_head, x, self.parallel_output) + + class GPTModelPipe(PipelineModule,MegatronModule): """GPT-2 Language model.""" @@ -268,23 +283,12 @@ def _to_float16(inputs): args.hidden_size, eps=args.layernorm_epsilon)) - def _logits_helper(embedding, lm_output): - """A wrapper to massage inputs/outputs from pipeline. """ - return parallel_lm_logits( - lm_output, - embedding.word_embeddings_weight, - self.parallel_output) - self.specs.append( - TiedLayerSpec('embed', - EmbeddingPipe, - args.hidden_size, - args.padded_vocab_size, - args.hidden_dropout, - init_method=init_method, - num_tokentypes=num_tokentypes, - forward_fn=_logits_helper, - tied_weight_attr='word_embeddings_weight') + LayerSpec(OutputVocab, + args.hidden_size, + args.padded_vocab_size, + self.parallel_output + ) ) # Convert to fp32 if needed From 55d1a4bf10cc0576f6d315152796b9ca17e6c798 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 03:06:22 +0200 Subject: [PATCH 06/25] Revert "I think OPT doesn't have shared vocabulary embedding" This reverts commit e742a3fecfec5d6fdeb9a381e20baa2bd9304bab. 
--- megatron/model/gpt_model.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 62f29c804..31d33a91b 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -17,7 +17,6 @@ from functools import partial import torch -from torch import nn from megatron import get_args from megatron import mpu @@ -196,20 +195,6 @@ def CrossEntropy(output, labels): return CrossEntropy -class OutputVocab(nn.Module): - """Useful when models don't have a shared embedding space""" - def __init__(self, hidden_size: int, padded_vocab_size: int, parallel_output: bool): - super(OutputVocab, self).__init__() - self.parallel_output = parallel_output - self.lm_head = mpu.VocabParallelEmbedding( - padded_vocab_size, - hidden_size, - ) - - def forward(self, x): - return self.parallel_lm_logits(self.lm_head, x, self.parallel_output) - - class GPTModelPipe(PipelineModule,MegatronModule): """GPT-2 Language model.""" @@ -283,12 +268,23 @@ def _to_float16(inputs): args.hidden_size, eps=args.layernorm_epsilon)) + def _logits_helper(embedding, lm_output): + """A wrapper to massage inputs/outputs from pipeline. """ + return parallel_lm_logits( + lm_output, + embedding.word_embeddings_weight, + self.parallel_output) + self.specs.append( - LayerSpec(OutputVocab, - args.hidden_size, - args.padded_vocab_size, - self.parallel_output - ) + TiedLayerSpec('embed', + EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.hidden_dropout, + init_method=init_method, + num_tokentypes=num_tokentypes, + forward_fn=_logits_helper, + tied_weight_attr='word_embeddings_weight') ) # Convert to fp32 if needed From b8090b7619121b486c648456db47036f053bf24e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 11:26:39 +0200 Subject: [PATCH 07/25] Maybe this is what's wrong --- megatron/checkpointing.py | 8 +++++++- tasks/eval_harness/evaluate.py | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index d9a30f468..4a062888c 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -273,7 +273,13 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True if args.deepspeed: load_optimizer_states = False if args.no_load_optim else True - loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_optimizer_states=load_optimizer_states) + loaded_dir, state_dict = model[0].load_checkpoint( + load_dir, + tag=".", + load_optimizer_states=load_optimizer_states, + load_lr_scheduler_states=load_optimizer_states, + load_module_only=not load_optimizer_states + ) if loaded_dir is None: print_rank_0('WARNING: could not find the metadata file {} '.format( load_dir)) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index d2db4bb4d..bfeaa83b6 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -3,6 +3,9 @@ import os import sys import datetime + +from megatron.checkpointing import load_checkpoint + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) @@ -323,7 +326,7 @@ def load_ds_checkpoint_and_setup_megatron(args): model = model[0] zero_enabled = model._config.zero_enabled model._config.zero_enabled = False - _, _ = model.load_checkpoint(cp_path, tag = '.', load_optimizer_states=False, load_lr_scheduler_states=False, 
load_module_only=True) + load_checkpoint(model, optimizer=None, lr_scheduler=None) model._config.zero_enabled = zero_enabled else: model = get_model(model_provider)[0] From dff9766fcf5b8c381cfc5de4b873a83bac2a8c24 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 11:42:51 +0200 Subject: [PATCH 08/25] Update model loading --- tasks/eval_harness/evaluate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index bfeaa83b6..abb21c467 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -323,11 +323,11 @@ def load_ds_checkpoint_and_setup_megatron(args): cp_path = args.load args.load = None model, _, _ = setup_model_and_optimizer(model_provider) - model = model[0] - zero_enabled = model._config.zero_enabled - model._config.zero_enabled = False + zero_enabled = model[0]._config.zero_enabled + model[0]._config.zero_enabled = False load_checkpoint(model, optimizer=None, lr_scheduler=None) - model._config.zero_enabled = zero_enabled + model[0]._config.zero_enabled = zero_enabled + model = model[0] else: model = get_model(model_provider)[0] # Initialize megatron model using the parsed state dict. From 01e269d83b44b1eb7c87d2b2c9895b7a01b1162d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 11:55:12 +0200 Subject: [PATCH 09/25] WTF --- tasks/eval_harness/evaluate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index abb21c467..1fa59a453 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -319,9 +319,9 @@ def load_ds_checkpoint_and_setup_megatron(args): import deepspeed deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None - - cp_path = args.load - args.load = None + # + # cp_path = args.load + # args.load = None model, _, _ = setup_model_and_optimizer(model_provider) zero_enabled = model[0]._config.zero_enabled model[0]._config.zero_enabled = False From 17a075d2de90f842a4772de5f8adf27921d81574 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 11:59:50 +0200 Subject: [PATCH 10/25] Woops --- tasks/eval_harness/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 1fa59a453..80ebc3b28 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -85,7 +85,7 @@ def loglikelihood(self, requests): # end of text as context context_enc = [self.EOT_TOKEN_ID] else: - context_enc = [self.EOT_TOKEN_ID, self.tokenizer_encode(context)] + context_enc = [self.EOT_TOKEN_ID, *self.tokenizer_encode(context)] continuation_enc = self.tokenizer_encode(continuation) From d45af145321e8d8feffd3cb88586359ff838cf56 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:35:10 +0200 Subject: [PATCH 11/25] Position offset --- tasks/eval_harness/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 80ebc3b28..6ba013dd9 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -197,7 +197,7 @@ def create_model_inputs(self, tokens): prefix_indices=None, loss_on_targets_only=False) - 
position_ids = position_ids -1 + position_ids = position_ids + 2 return (tokens, position_ids, attention_mask), (tokens, loss_mask) From 73a7947604df9c87125929a55d542aa3c035ce79 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:56:17 +0200 Subject: [PATCH 12/25] Add bs eval harness code --- tasks/eval_harness/evaluate.py | 11 +- tasks/eval_harness/evaluate_bsevalharness.py | 445 +++++++++++++++++++ 2 files changed, 449 insertions(+), 7 deletions(-) create mode 100644 tasks/eval_harness/evaluate_bsevalharness.py diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 6ba013dd9..dc8c18ad9 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -222,9 +222,10 @@ def _model_call(self, inps): if output is not None: - output = torch.cat(output, 0)[:len(inps)] - else: - output = None + if self.args.offloadearly: + output = torch.cat([F.log_softmax(o, dim=-1).cpu() for o in output[:len(inps)]], 0) + else: + output = torch.cat(output, 0)[:len(inps)] # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same if args.adaptive_seq_len: @@ -293,7 +294,6 @@ def override_args(args, override_args, skip_keys, skip_if_specified_keys): # We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. def load_ds_checkpoint_and_setup_megatron(args): _print_args = megatron.arguments._print_args - # megatron.arguments._print_args = lambda *_args, **kwarg: None if not os.path.exists(args.load): raise ValueError(f"checkpoint path {args.load} doesn't exit") @@ -319,9 +319,6 @@ def load_ds_checkpoint_and_setup_megatron(args): import deepspeed deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None - # - # cp_path = args.load - # args.load = None model, _, _ = setup_model_and_optimizer(model_provider) zero_enabled = model[0]._config.zero_enabled model[0]._config.zero_enabled = False diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py new file mode 100644 index 000000000..eddcd58a2 --- /dev/null +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -0,0 +1,445 @@ +""" +An evaluate function compatible with https://github.com/bigscience-workshop/lm-evaluation-harness +at commit 2d968c60fc8bd808e5e475ca300781f774d234c1 +Env Setup: +git clone https://github.com/bigscience-workshop/lm-evaluation-harness +cd lm-evaluation-harness +pip install "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon" +pip install -e ".[dev]" +& then: https://github.com/bigscience-workshop/bigscience/blob/12f06bd39221f2e3788524ea86139ac1ac2b1b1a/jz/envs/README.md#creating-production-conda-env +""" + +import logging +import os +import sys +import datetime +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) + +from codecarbon import OfflineEmissionsTracker +from lm_eval import evaluator, tasks +from lm_eval.api import utils +from lm_eval.api.model import CacheHook +from tqdm import tqdm +import torch.nn.functional as F + +from lm_eval.tasks import ALL_TASKS +from pretrain_gpt import model_provider +import numpy as np + +import torch +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_tokenizer +from megatron import mpu +from megatron.training import setup_model_and_optimizer, get_model +from megatron.mpu.mappings 
import gather_from_tensor_model_parallel_region + +from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.p2p_communication import recv_forward, send_forward +import json + +from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +from megatron.model.distributed import DistributedDataParallel as LocalDDP +from megatron.model.module import Float16Module +from deepspeed.runtime.pipe import schedule + + +def setup_example_logger(output_path): + """ + Sets up a logger that will save each example and prediction. + Copied from https://github.com/bigscience-workshop/lm-evaluation-harness/blob/2d968c60fc8bd808e5e475ca300781f774d234c1/main.py#L74 + """ + example_logger = logging.getLogger("examples") + filename = f"./examples-{output_path}.jsonl" + formatter = logging.Formatter("%(message)s") + handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + example_logger.addHandler(handler) + example_logger.setLevel(logging.INFO) + +class EvalHarnessAdaptor: + def __init__(self, model, tokenizer): + args = get_args() + self.args = args + self.model = model + self.tokenizer = tokenizer + self.VOCAB_SIZE = tokenizer.vocab_size + self.EOT_TOKEN_ID = tokenizer.eod + + self._max_length = args.seq_length + + # For ds we split into mini batches and then micro batches to keep pipelining api happy. + # With Megatron we just go to micro_batches directly + self._batch_size = args.micro_batch_size * args.micro_bs_multiplier + + self.cache_hook = CacheHook(None) + self.is_main = args.rank == 0 + self.is_local_main = args.local_rank == 0 + self._device = torch.cuda.current_device() + self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 + self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 + self.adaptive_seq_len = args.adaptive_seq_len + if self.is_data_parallel: + raise NotImplementedError("Data parallelism is currently not supported for evaluation") + + self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + + + def loglikelihood(self, requests): + new_reqs = [] + for context, continuation in requests: + if context == "": + # end of text as context + context_enc = [self.EOT_TOKEN_ID] + else: + context_enc = [self.EOT_TOKEN_ID, *self.tokenizer_encode(context)] + + continuation_enc = self.tokenizer_encode(continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def loglikelihood_rolling(self, requests): + # TODO: Implement caching once we've confirmed the perplexity implementation + # TODO: automatic batch size detection for vectorization + + loglikelihoods = [] + with torch.no_grad(): + for string, in tqdm(requests): + rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows( + token_list=self.tokenizer_encode(string), + prefix_token=self.EOT_TOKEN_ID, + max_seq_len=self.max_length, + context_len=1, + ))) + + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that + string_nll = 
self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + + return loglikelihoods + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + disable_tqdm = disable_tqdm if self.is_main else True + res = [] + res_len = 0 # storing the result length for later + self.model.eval() + with torch.no_grad(): + def _collate(x): + toks = x[1] + x[2] + return (-len(toks), tuple(toks)) + + reord = utils.Reorderer(requests, _collate) + for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): + inps, contlens, inplens, padding_length = [], [], [], None + for _, context_enc, continuation_enc in chunk: + # when too long to fit in context, truncate from the left + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] + , dtype=torch.long).to(self.device) + inplen, = inp.shape + + cont = continuation_enc + + # since in _collate we make sure length is descending, the longest is always the first one. + padding_length = padding_length if padding_length is not None else inplen + if not self.adaptive_seq_len: + padding_length = self.max_length + # pad to length + inp = torch.cat([ + inp, # [seq] + torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq] + ], dim=0) + + inps.append(inp.unsqueeze(0)) + + contlens.append(cont) + inplens.append(inplen) + + logits = self._model_call(torch.cat(inps, dim=0)) + res_len += len(chunk) + if logits is not None: + if self.args.offloadearly: + multi_logits = logits + else: + multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] + + for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): + contlen = len(cont_toks) + logits = logits[inplen - contlen:inplen].unsqueeze(0) # [1, seq, vocab] + greedy_tokens = logits.argmax(dim=-1) + # cont_toks :: [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) + max_equal = (greedy_tokens == cont_toks).all() + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] + answer = (float(logits.sum()), bool(max_equal)) + # partial caching + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + res.append(answer) + + if not mpu.is_pipeline_last_stage(): + # @HACK: To make the eval harness happy on threads that don't have access to the results. + # We just randomly generate some data. + res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests] + + return reord.get_original(res) + + def create_model_inputs(self, tokens): + args = get_args() + + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + self.EOT_TOKEN_ID, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=None, + loss_on_targets_only=False) + + return (tokens, position_ids, attention_mask), (tokens, loss_mask) + + def _model_call(self, inps, count_lens, input): + args = get_args() + + if args.deepspeed: + self.model.set_batch_fn(self.create_model_inputs) + # round up to multiple of micro_batch_size + new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size + padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) + # dummy data iterator for pipelining. 
+ data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) + self.model.micro_batches = len(data_iterator) + + if self.adaptive_seq_len: + # Allow different shapes than the default seq_len to be communicated across pipes + # Without this Deepspeed will hang when trying to receive activations + self.model.reset_activation_shape() + + output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) + + if output is not None: + if self.args.offloadearly: + output = torch.cat([F.log_softmax(o, dim=-1).cpu() for o in output[:len(inps)]], 0) + else: + output = torch.cat(output, 0)[:len(inps)] + + # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same + if args.adaptive_seq_len: + self.model.total_loss = None + else: + # Since the shape of the micro-batch will change + # We need set the correct shapes here + # So that latter pipeline stages knows which shapes to expect. + # Otherwise we will deadlock. + + args.micro_batch_size = len(inps) + args.seq_length = len(inps[0]) + args.max_position_embeddings = args.seq_length + + input_tensor = recv_forward() + + # Forward pass through the model. + unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model.set_input_tensor(input_tensor) + output = self.model(*self.create_model_inputs(inps)[0]) + send_forward(output) + + if mpu.is_pipeline_last_stage(): + return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] + else: + return None + + def tokenizer_encode(self, text): + """Tokenize text *without* adding special tokens.""" + # Splitting this into its own method in case we need to handle special cases for different tokenizers + from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer + if isinstance(self.tokenizer.tokenizer, GPT2Tokenizer): + return self.tokenizer.tokenizer.encode(text) + else: + return self.tokenizer.tokenizer.encode(text, add_special_tokens=False) + + +from megatron.initialize import initialize_megatron +import megatron + +from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint + +def override_args(args, override_args, skip_keys, skip_if_specified_keys): + for k, v in vars(override_args).items(): + if k in skip_keys: + continue + if k in skip_if_specified_keys and getattr(args, k) is not None: + continue + setattr(args, k, v) + + +# Note(Hesslow): +# The model loading is a bit convoluted. +# We want to parse out the model arguments from the checkpoint and use those to initialize megatron-ds. +# +# However megatron-ds expects its arguments on the command line. +# And at that point we don't know them. +# +# Instead we use Jasons way: we load the arguments form the checkpoint and then override _parse_args to return whatever args we want. +# +# If the checkpoint is old, some new arguments may have been introduced and the code will expect these arguments to exist. +# In order to support this we _first_ parse the arguments normally, and then override them with the arguments from the checkpoint. +# Keeping the default-value of newer arguments. +# +# We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. 
+def load_ds_checkpoint_and_setup_megatron(args): + _print_args = megatron.arguments._print_args + + if not os.path.exists(args.load): + raise ValueError(f"checkpoint path {args.load} doesn't exit") + + # stop megatron from reparsing the arguments. + megatron.global_vars._parse_args = lambda *_args, **kwarg: args + megatron.global_vars._GLOBAL_ARGS = args + + initialize_megatron() + torch.distributed.barrier() + + # print final arguments. + _print_args(args) + if args.deepspeed: + + # Hack #3: + # Loading pipelined models in deepspeed with different TP than it was originally trained on fails + # due to a sanity check, that makes sure that all state_dicts that we merge contains attention layers. + # This, however, is not true for pipelining when we will merge the state_dict for the embeddings which + # which does not contain these attention-specific keys. + # + # Deepspeed does however manage to load the model if we just turn off this sanity check. + import deepspeed + deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None + + model, _, _ = setup_model_and_optimizer(model_provider) + zero_enabled = model[0]._config.zero_enabled + model[0]._config.zero_enabled = False + load_checkpoint(model, optimizer=None, lr_scheduler=None) + model[0]._config.zero_enabled = zero_enabled + model = model[0] + else: + model = get_model(model_provider)[0] + # Initialize megatron model using the parsed state dict. + sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), + mpu.get_pipeline_model_parallel_rank(), True) + + model.load_state_dict(sd['model'], strict=True) + + if args.eval_fp32: + model = model.float() + + torch.distributed.barrier() + return model + +def tasks_args(parser): + + results_path_default = f"results-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.json" + + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='Evaluation options') + group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') + group.add_argument('--results_path', type=str, default = results_path_default, help='Path to where the results will be stored.') + group.add_argument('--adaptive_seq_len', default = False, action='store_true', + help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') + group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') + group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') + group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') + group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') + return parser + +from megatron.global_vars import _parse_args + +def main(): + # parse the megatron args. But wait with initalizing megatron. + # avoid printing the arguments, since they will later be overridden. 
+ args = _parse_args(tasks_args) + load_path = args.load + model = load_ds_checkpoint_and_setup_megatron(args) + + args = get_args() + if args.deepspeed and args.adaptive_seq_len: + # adaptive_seq_len hack #1: + # CL automatically enables reset_activation_shape() which allows us to change input shapes + # and it also reshapes the attenion scores in attention_mask_func + args.curriculum_learning = 1 + + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + task_dict = tasks.get_task_dict_promptsource(task_list) + + model.module.activation_checkpoint_interval = 0 + model._compute_loss = False + model.fwd_outputs = [] + + tokenizer = get_tokenizer() + adaptor = EvalHarnessAdaptor(model, tokenizer) + + def add_config(results): + results["config"] = { + "adaptive_seq_len": args.adaptive_seq_len, + "num_fewshot": 0, + "bootstrap_iters": args.bootstrap_iters, + } + return results + + with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): + if args.intermed_results: + global_results = {"results": [], "versions": {}, "table_results": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + examples_path = results_path.replace(".json", "_examples") + setup_example_logger(examples_path) + for task_name, task in task_dict.items(): + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + global_results["results"].extend(results["results"]) + global_results["versions"] = {**global_results["versions"], **results["versions"]} + global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + +if __name__ == '__main__': + main() \ No newline at end of file From f34841cd17a70e2b425deffe52e1fe93f5645195 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 15:05:41 +0200 Subject: [PATCH 13/25] Missing import --- tasks/eval_harness/evaluate_bsevalharness.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index eddcd58a2..88358f27b 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -13,6 +13,9 @@ import os import sys import datetime + +from megatron.checkpointing import load_checkpoint + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) From 
1423bf9f4e6ca7965d757eda0ae3dcb46bb4e8ab Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 15:06:15 +0200 Subject: [PATCH 14/25] Update position ids --- tasks/eval_harness/evaluate_bsevalharness.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 88358f27b..8ac5b2c8c 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -224,6 +224,8 @@ def create_model_inputs(self, tokens): prefix_indices=None, loss_on_targets_only=False) + position_ids = position_ids + 2 + return (tokens, position_ids, attention_mask), (tokens, loss_mask) def _model_call(self, inps, count_lens, input): From fae49cabcf161dd275a1847c5706c2ff74e163b7 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 16:10:57 +0200 Subject: [PATCH 15/25] That pathing breaks everything --- tasks/eval_harness/evaluate_bsevalharness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 8ac5b2c8c..c05128f43 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -54,7 +54,7 @@ def setup_example_logger(output_path): Copied from https://github.com/bigscience-workshop/lm-evaluation-harness/blob/2d968c60fc8bd808e5e475ca300781f774d234c1/main.py#L74 """ example_logger = logging.getLogger("examples") - filename = f"./examples-{output_path}.jsonl" + filename = f"{output_path}.jsonl" formatter = logging.Formatter("%(message)s") handler = logging.FileHandler(filename) handler.setFormatter(formatter) From 6a326afa3c127975bb50f713457a57c4e1721c0a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 16:48:38 +0200 Subject: [PATCH 16/25] add offloading possitility --- tasks/eval_harness/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index dc8c18ad9..3e04f1bfa 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -352,6 +352,7 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') return parser from megatron.global_vars import _parse_args From b2caf4ddbd4994712e62ae8e2382632302369df4 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 5 Jul 2022 18:14:01 +0200 Subject: [PATCH 17/25] Black list all tasks that require greedy_until --- tasks/eval_harness/evaluate_bsevalharness.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index c05128f43..1635d55f1 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ 
b/tasks/eval_harness/evaluate_bsevalharness.py @@ -388,6 +388,15 @@ def main(): # parse the megatron args. But wait with initalizing megatron. # avoid printing the arguments, since they will later be overridden. args = _parse_args(tasks_args) + + # TODO @thomasw21 If all the tasks require `greedy_until` then we early stop as this isn't implemented + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + task_dict = tasks.get_task_dict_promptsource(task_list) + task_dict = {task_name: task for task_name, task in task_dict.items() if task.need_greedy_until is False} + if len(task_dict) == 0: + print_rank_0("Early stopping as `greedy_until` is not implemented yet.") + return + load_path = args.load model = load_ds_checkpoint_and_setup_megatron(args) @@ -398,9 +407,6 @@ def main(): # and it also reshapes the attenion scores in attention_mask_func args.curriculum_learning = 1 - task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') - task_dict = tasks.get_task_dict_promptsource(task_list) - model.module.activation_checkpoint_interval = 0 model._compute_loss = False model.fwd_outputs = [] From 48892b93d5ddcc4337526f5873ecfd1ba07004a6 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 6 Jul 2022 12:13:45 +0200 Subject: [PATCH 18/25] Woops --- tasks/eval_harness/download.py | 15 ++++++++++++++- tasks/eval_harness/evaluate_bsevalharness.py | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index d2abcd83a..f58a1cb17 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -14,7 +14,20 @@ def main(): task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') - tasks.get_task_dict(task_list) + task_and_exceptions = [] + for task in task_list: + print("--------") + print(f"Downloading dataset for task: {task}") + try: + tasks.get_task_dict([task]) + except Exception as e: + task_and_exceptions.append((task, e)) + + for task, exception in task_and_exceptions: + print("=======================================") + print(task) + print(exception) + if __name__ == '__main__': main() diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 1635d55f1..11a913f83 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -228,7 +228,7 @@ def create_model_inputs(self, tokens): return (tokens, position_ids, attention_mask), (tokens, loss_mask) - def _model_call(self, inps, count_lens, input): + def _model_call(self, inps): args = get_args() if args.deepspeed: From 1476691b20740f2d3691f9ce3db152ed8427be3f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 6 Jul 2022 14:42:51 +0200 Subject: [PATCH 19/25] Disable CarbonEmissionTracker --- tasks/eval_harness/evaluate_bsevalharness.py | 52 ++++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 11a913f83..b95a4207c 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -422,35 +422,35 @@ def add_config(results): } return results - with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): - if args.intermed_results: - global_results = {"results": [], "versions": {}, "table_results": {}} - timestamp 
= datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") - # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") - examples_path = results_path.replace(".json", "_examples") - setup_example_logger(examples_path) - for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) - global_results["results"].extend(results["results"]) - global_results["versions"] = {**global_results["versions"], **results["versions"]} - global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} - global_results = add_config(global_results) - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) - with open(results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - with open(results_path_backup, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + # with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): + if args.intermed_results: + global_results = {"results": [], "versions": {}, "table_results": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + examples_path = results_path.replace(".json", "_examples") + setup_example_logger(examples_path) + for task_name, task in task_dict.items(): + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + global_results["results"].extend(results["results"]) + global_results["versions"] = {**global_results["versions"], **results["versions"]} + global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(global_results, indent=2)) - with open(args.results_path, 'w') as outfile: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) if __name__ == '__main__': main() \ No newline at end of file From 5868d295e98a6eb510be4ea78a46895a03fb788c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 6 Jul 2022 18:03:11 +0200 Subject: [PATCH 20/25] Enable CarbonEmissionTracker --- 
tasks/eval_harness/evaluate_bsevalharness.py | 52 ++++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index b95a4207c..11a913f83 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -422,35 +422,35 @@ def add_config(results): } return results - # with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): - if args.intermed_results: - global_results = {"results": [], "versions": {}, "table_results": {}} - timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") - # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") - examples_path = results_path.replace(".json", "_examples") - setup_example_logger(examples_path) - for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) - global_results["results"].extend(results["results"]) - global_results["versions"] = {**global_results["versions"], **results["versions"]} - global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} + with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): + if args.intermed_results: + global_results = {"results": [], "versions": {}, "table_results": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + examples_path = results_path.replace(".json", "_examples") + setup_example_logger(examples_path) + for task_name, task in task_dict.items(): + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + global_results["results"].extend(results["results"]) + global_results["versions"] = {**global_results["versions"], **results["versions"]} + global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) - with open(results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - with open(results_path_backup, 'w') as outfile: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: json.dump(global_results, outfile, indent=4) - else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, 
bootstrap_iters=args.bootstrap_iters) - global_results = add_config(global_results) - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(global_results, indent=2)) - with open(args.results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) if __name__ == '__main__': main() \ No newline at end of file From 94ef27ade0a8c0e18c533375251f6743524eefef Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 00:02:38 +0200 Subject: [PATCH 21/25] Woops --- tasks/eval_harness/evaluate_bsevalharness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 11a913f83..712cec860 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -182,6 +182,7 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) + torch.distributed.barrier() res_len += len(chunk) if logits is not None: if self.args.offloadearly: From f713ff0111a808b9af70ce19fadf0b62c8c60b42 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:31:22 +0200 Subject: [PATCH 22/25] Be very careful of random states --- tasks/eval_harness/evaluate_bsevalharness.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 712cec860..ea98f95f2 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -11,6 +11,7 @@ import logging import os +import random import sys import datetime @@ -209,7 +210,7 @@ def _collate(x): if not mpu.is_pipeline_last_stage(): # @HACK: To make the eval harness happy on threads that don't have access to the results. # We just randomly generate some data. 
- res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests] + res = [(0.5, 0.5) for _ in requests] return reord.get_original(res) @@ -381,6 +382,7 @@ def tasks_args(parser): group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') + group.add_argument('--seed', default=42, type=int, help='Random state seed') return parser from megatron.global_vars import _parse_args @@ -423,6 +425,7 @@ def add_config(results): } return results + random.seed(args.seed) with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): if args.intermed_results: global_results = {"results": [], "versions": {}, "table_results": {}} @@ -434,7 +437,7 @@ def add_config(results): examples_path = results_path.replace(".json", "_examples") setup_example_logger(examples_path) for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) global_results["results"].extend(results["results"]) global_results["versions"] = {**global_results["versions"], **results["versions"]} global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} @@ -446,7 +449,7 @@ def add_config(results): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) From fab9fb965ba3224f9680b35b71effc9b8928a775 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:32:55 +0200 Subject: [PATCH 23/25] Woops --- tasks/eval_harness/evaluate_bsevalharness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index ea98f95f2..f2041cdad 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -210,7 +210,7 @@ def _collate(x): if not mpu.is_pipeline_last_stage(): # @HACK: To make the eval harness happy on threads that don't have access to the results. # We just randomly generate some data. 
- res = [(0.5, 0.5) for _ in requests] + res = [(0.5, True) for _ in requests] return reord.get_original(res) @@ -457,4 +457,4 @@ def add_config(results): json.dump(global_results, outfile, indent=4) if __name__ == '__main__': - main() \ No newline at end of file + main() From 23d3f4ca6c6efc678f6d0e71c2ad7892a1b0e56d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:52:24 +0200 Subject: [PATCH 24/25] Woops --- tasks/eval_harness/evaluate_bsevalharness.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index f2041cdad..e199524d9 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -382,7 +382,6 @@ def tasks_args(parser): group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') - group.add_argument('--seed', default=42, type=int, help='Random state seed') return parser from megatron.global_vars import _parse_args @@ -437,7 +436,7 @@ def add_config(results): examples_path = results_path.replace(".json", "_examples") setup_example_logger(examples_path) for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) global_results["results"].extend(results["results"]) global_results["versions"] = {**global_results["versions"], **results["versions"]} global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} @@ -449,7 +448,7 @@ def add_config(results): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) From c6f9c021ea4cb22273d24498bc8dd1e421384343 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 11:46:06 +0200 Subject: [PATCH 25/25] Remove codecarbon --- tasks/eval_harness/evaluate_bsevalharness.py | 52 ++++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index e199524d9..2b17ad57d 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -20,7 +20,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) -from codecarbon import OfflineEmissionsTracker from lm_eval import evaluator, tasks 
from lm_eval.api import utils from lm_eval.api.model import CacheHook @@ -425,35 +424,34 @@ def add_config(results): return results random.seed(args.seed) - with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): - if args.intermed_results: - global_results = {"results": [], "versions": {}, "table_results": {}} - timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") - # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") - examples_path = results_path.replace(".json", "_examples") - setup_example_logger(examples_path) - for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) - global_results["results"].extend(results["results"]) - global_results["versions"] = {**global_results["versions"], **results["versions"]} - global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} - global_results = add_config(global_results) - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) - with open(results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - with open(results_path_backup, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) + if args.intermed_results: + global_results = {"results": [], "versions": {}, "table_results": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + examples_path = results_path.replace(".json", "_examples") + setup_example_logger(examples_path) + for task_name, task in task_dict.items(): + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) + global_results["results"].extend(results["results"]) + global_results["versions"] = {**global_results["versions"], **results["versions"]} + global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(global_results, indent=2)) - with open(args.results_path, 'w') as outfile: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(global_results, indent=2)) + 
with open(args.results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) if __name__ == '__main__': main()
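A recurring theme of PATCH 04, 10, 11 and 14 is adapting the harness's input construction to OPT checkpoints: the end-of-text token is prepended to every context, and the position ids are shifted by 2, which lines up with the offset built into OPT's learned positional embedding table. The sketch below is a minimal, self-contained illustration of that idea, not code from the patches: it assumes a plain PyTorch setting rather than the Megatron-DeepSpeed pipeline, and `context_ids`, `continuation_ids` and `eos_token_id` are hypothetical stand-ins for the tokenizer wrapper used in the series.

import torch

OPT_POSITION_OFFSET = 2  # assumption: OPT's learned position embeddings are indexed starting at 2

def build_opt_inputs(context_ids, continuation_ids, eos_token_id, max_length=2048):
    """Prepend EOS to the context and shift position ids, mirroring the eval-harness patches."""
    # PATCH 10: always prefix the end-of-text token instead of using it only for empty contexts.
    tokens = [eos_token_id, *context_ids, *continuation_ids]
    # Same truncation rule as _loglikelihood_tokens: keep at most max_length + 1 tokens from the
    # left-truncated sequence, then drop the final target token (it is only needed as a label).
    tokens = tokens[-(max_length + 1):][:-1]
    tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)              # [1, seq]
    # PATCH 11/14: offset the default 0-based positions by 2 so they index the
    # correct rows of OPT's learned positional embedding table.
    position_ids = torch.arange(tokens.size(1), dtype=torch.long).unsqueeze(0) + OPT_POSITION_OFFSET
    return tokens, position_ids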