
Commit 1e40fc6

add requirements file for leaderboard
Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>
1 parent cd47eaa commit 1e40fc6

File tree

6 files changed, +72 −48 lines changed


requirements-leaderboard.txt

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4
+
+# vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct
+vllm<=0.7.3
+torch<=2.5.1
+
+# XXX(osilkin): We use StrEnum in leaderboard, but Python3.10 doesn't have it as part of
+# the standard library, so we have to install it from the older library.
+strenum>=0.4.15; python_version < '3.11'
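
The `strenum` pin backs the conditional import added to src/instructlab/eval/leaderboard.py further down in this commit. A minimal sketch of that compatibility pattern; the `LeaderboardTask` enum here is purely illustrative and not part of the commit:

# Python 3.11+ ships enum.StrEnum; on 3.10 fall back to the `strenum` backport.
try:
    from enum import StrEnum
except ImportError:
    from strenum import StrEnum  # type: ignore[no-redef]


class LeaderboardTask(StrEnum):
    # Illustrative members only; the real task enum lives in leaderboard.py.
    BBH = "leaderboard_bbh"
    IFEVAL = "leaderboard_ifeval"


print(LeaderboardTask.BBH == "leaderboard_bbh")  # True: StrEnum members compare as plain strings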

scripts/evaluate_best_checkpoint.py

Lines changed: 8 additions & 5 deletions
@@ -7,11 +7,13 @@
 --output-file /path/to/output_file
 """

-import json
-import typer
+# Standard
 from pathlib import Path
 from typing import Optional
+import json

+# Third Party
+import typer

 app = typer.Typer()

@@ -42,22 +44,23 @@ def main(
         raise typer.Exit(1)

     typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
     from instructlab.eval.leaderboard import LeaderboardV2Evaluator

     checkpoint_results = {}
     for checkpoint in checkpoint_dirs:
         typer.echo(f"Processing checkpoint: {checkpoint}")
         ckpt_output_file = checkpoint / "leaderboard_results.json"
         evaluator = LeaderboardV2Evaluator(
-            model_path=str(checkpoint), output_file=ckpt_output_file
+            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
         )
         result = evaluator.run()
         checkpoint_results[checkpoint.name] = result
-        typer.echo(f"Checkpoint {checkpoint.name} results: {result['score']}")
+        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")

     # Sort checkpoints by score
     sorted_checkpoints = sorted(
-        checkpoint_results.items(), key=lambda x: x[1]["score"], reverse=True
+        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
     )
     typer.echo("Sorted checkpoints by score:")
     for checkpoint_name, result in sorted_checkpoints:
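
For reference, a minimal standalone sketch of the evaluator call this script now makes. The checkpoint path is illustrative; the constructor arguments and the "overall_score" key match the diff above, which switches the script from reading "score" to "overall_score":

from pathlib import Path

from instructlab.eval.leaderboard import LeaderboardV2Evaluator

checkpoint = Path("/path/to/checkpoint")  # illustrative path

evaluator = LeaderboardV2Evaluator(
    model_path=str(checkpoint),
    output_file=checkpoint / "leaderboard_results.json",
    num_gpus=8,  # now passed explicitly, matching the updated script
)
result = evaluator.run()
print(f"{checkpoint.name}: {result['overall_score']}")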

scripts/test_leaderboard.py

Lines changed: 3 additions & 1 deletion
@@ -4,8 +4,10 @@
 # NOTE: This script requires the leaderboard optional dependencies.
 # Install with: pip install instructlab-eval[leaderboard]

-# First Party
+# Standard
 import json
+
+# First Party
 from instructlab.eval.leaderboard import LeaderboardV2Evaluator

 if __name__ == "__main__":

src/instructlab/eval/leaderboard.py

Lines changed: 47 additions & 41 deletions
@@ -1,11 +1,10 @@
 # Standard
-from enum import StrEnum
+from copy import deepcopy
 from pathlib import Path
 import gc
 import json
 import os
 import typing as t
-from copy import deepcopy

 # Third Party
 from accelerate import Accelerator
@@ -18,6 +17,14 @@
 # Local
 from .evaluator import Evaluator

+# Since StrEnum wasn't part of the STL until Python3.11, we must do this
+try:
+    # Standard
+    from enum import StrEnum
+except ImportError as ie:
+    # Third Party
+    from strenum import StrEnum  # type: ignore[no-redef]
+

 class ParsedScores(t.TypedDict):
     """
@@ -94,7 +101,7 @@ class TaskGrouping(t.TypedDict):
 }

 # 1. Add OpenAI configuration defaults
-DEFAULT_OPENAI_CONFIG = {
+DEFAULT_OPENAI_CONFIG: t.Dict[str, t.Any] = {
     "max_tokens": 768,
     "temperature": 0.0,
     "seed": 1337,
@@ -194,9 +201,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
 def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
     # we need to use torch.multiprocessing to run each task in a separate process,
     # and then combine the results
-    # Third Party
-    import torch.multiprocessing as mp
-
     num_processes = args["num_gpus"]

     # Create the context and queue within the same context
@@ -222,9 +226,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()

     # extract the result which is not None
-    assert len([res for res in results.values() if res is not None]) == 1, (
-        "we expect exactly 1 process to return a results dict properly"
-    )
+    assert (
+        len([res for res in results.values() if res is not None]) == 1
+    ), "we expect exactly 1 process to return a results dict properly"
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict

@@ -290,9 +294,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 24, (
-        "there should be 24 subtasks of bbh run"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 24
+    ), "there should be 24 subtasks of bbh run"
     return parsed_scores


@@ -343,9 +347,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)

-    assert len(scores) == 2, (
-        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-    )
+    assert (
+        len(scores) == 2
+    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
     return {
         "score": sum(scores) / 2,
     }
@@ -369,9 +373,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 3, (
-        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 3
+    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
     return parsed_scores


@@ -382,9 +386,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert len(parsed_scores["subtasks"]) == 7, (
-        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 7
+    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
     return parsed_scores


@@ -451,9 +455,9 @@ def get_scores_from_result_dicts(
     # this is just a sanity check step
     benchmarks_already_covered = set(parsed_scores.keys())
     overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-    assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-        f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-    )
+    assert (
+        len(benchmarks_already_covered & benchmarks_to_parse) == 0
+    ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"

     # now actually add them
     for benchmark in benchmarks_to_parse:
@@ -486,12 +490,15 @@ def validate_output_path(output_file: str) -> None:

         # Test if we can write to the file by opening it in append mode
         # We don't actually write anything
-        output_path.open("a").close()
+        with output_path.open("a", encoding="utf-8") as _:
+            pass

-    except PermissionError:
-        raise ValueError(f"Permission denied: Cannot write to {output_file}")
-    except OSError as e:
-        raise ValueError(f"Invalid output path: {output_file}. Error: {str(e)}")
+    except PermissionError as pe:
+        raise ValueError(f"Permission denied: Cannot write to {output_file}") from pe
+    except OSError as ose:
+        raise ValueError(
+            f"Invalid output path: {output_file}. Error: {str(ose)}"
+        ) from ose


 def validate_leaderboard_v2_tasks(tasks: t.List[str]):
@@ -658,7 +665,7 @@ def save_to_file(self, output_file: t.Optional[str] = None) -> None:
         output_dir = os.path.dirname(output_file)
         if output_dir:
             os.makedirs(output_dir, exist_ok=True)
-        with open(output_file, "w") as f:
+        with open(output_file, "w", encoding="utf-8") as f:
             json.dump(self._results, f, indent=2)

     def run(
@@ -739,15 +746,6 @@ def run(
         # validation logic
         validate_leaderboard_v2_tasks(tasks)

-        # Only validate GPU requirements when not using an API endpoint
-        if not api_endpoint:
-            if not num_gpus:
-                num_gpus = cuda.device_count()
-            if num_gpus <= 0 or num_gpus > cuda.device_count():
-                raise ValueError(
-                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-                )
-
         if output_file:
             validate_output_path(output_file)

@@ -767,6 +765,14 @@ def run(
             openai_results = evaluate_with_openai(args_openai)
             self._lm_eval_results.append(openai_results)
         else:
+            # Only validate GPU requirements when not using an API endpoint
+            if not num_gpus:
+                num_gpus = cuda.device_count()
+            if num_gpus <= 0 or num_gpus > cuda.device_count():
+                raise ValueError(
+                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                )
+
             # Only run local evaluation if not using OpenAI API
             if vllm_tasks := grouped_tasks["vllm"]:
                 args_vllm: LeaderboardArgs = {
@@ -823,11 +829,11 @@ def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:

     # Add base_url if provided
     if base_url:
-        model_args["base_url"] = base_url
+        model_args.update({"base_url": base_url})

     # Add API key if provided
     if api_key:
-        model_args["api_key"] = api_key
+        model_args.update({"api_key": api_key})

     # Add any remaining backend config options
     model_args.update(backend_config)
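
The last two hunks of run() relocate rather than change the GPU check: it now runs only in the else branch, after API-endpoint evaluation has been ruled out, so endpoint-only runs never touch CUDA. A standalone sketch of that logic under those assumptions; resolve_num_gpus is a hypothetical helper name, the real code keeps the check inline:

import typing as t

from torch import cuda


def resolve_num_gpus(num_gpus: t.Optional[int]) -> int:
    # Default to every visible device, then bounds-check, mirroring the
    # validation block that now lives inside the non-API branch of run().
    if not num_gpus:
        num_gpus = cuda.device_count()
    if num_gpus <= 0 or num_gpus > cuda.device_count():
        raise ValueError(
            f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
        )
    return num_gpus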

tests/test_project.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@

 # First Party
 from instructlab.eval.evaluator import Evaluator
+from instructlab.eval.leaderboard import LeaderboardV2Evaluator
 from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator

@@ -14,6 +15,7 @@ def test_evaluator_eps():
         "mmlu_branch": MMLUBranchEvaluator,
         "mt_bench": MTBenchEvaluator,
         "mt_bench_branch": MTBenchBranchEvaluator,
+        "leaderboard_v2": LeaderboardV2Evaluator,
     }
     eps = entry_points(group="instructlab.eval.evaluator")
     found = {}
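
The test above exercises entry-point discovery. A short sketch of the discovery side, assuming the package registers its evaluators under the instructlab.eval.evaluator group in its project metadata:

from importlib.metadata import entry_points

# Load every evaluator class registered under the group the test inspects.
registered = {ep.name: ep.load() for ep in entry_points(group="instructlab.eval.evaluator")}

# After this commit the mapping is expected to include "leaderboard_v2"
# alongside the existing mmlu/mt_bench entries.
print(sorted(registered))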

tox.ini

Lines changed: 3 additions & 1 deletion
@@ -19,7 +19,9 @@ setenv =
 package = wheel
 wheel_build_env = pkg
 # equivalent to `pip install instructlab[cpu]`
-extras = cpu
+extras =
+    cpu
+    leaderboard
 deps =
     pytest
     pytest-asyncio
