117 changes: 93 additions & 24 deletions bench_runner/scripts/bisect.py
@@ -1,5 +1,6 @@
import argparse
import contextlib
import json
from pathlib import Path
import subprocess
import sys
@@ -12,18 +13,27 @@

from bench_runner import flags as mflags
from bench_runner import git
from bench_runner import result
from bench_runner import result as mresult
from bench_runner.scripts import run_benchmarks
from bench_runner.scripts import workflow
from bench_runner.util import PathLike, format_seconds


def format_result(result: float | int, pystats: str | None = None) -> str:
if pystats is None:
return format_seconds(result)
else:
return f"{result:,}"


def _get_result_commandline(
benchmark: str,
good_val: float,
bad_val: float,
pgo: bool,
flags: str,
pystats: str | None,
invert: bool,
repo: PathLike,
) -> list[str]:
repo = Path(repo)
@@ -37,16 +47,18 @@ def _get_result_commandline(
str(bad_val),
str(pgo),
str(flags),
str(pystats),
str(invert),
str(repo.absolute()),
]


def parse_result(benchmark_json: PathLike) -> float:
def parse_timing_result(benchmark_json: PathLike) -> float:
# The name of the benchmark in the JSON file may be different from the one
# used to select the benchmark. Therefore, just take the mean of all the
# benchmarks in the JSON file.
result.clear_contents_cache()
r = result.Result.from_arbitrary_filename(benchmark_json)
mresult.clear_contents_cache()
r = mresult.Result.from_arbitrary_filename(benchmark_json)
timing_data = r.get_timing_data()
return float(np.mean([x.mean() for x in timing_data.values()]))

@@ -55,22 +67,34 @@ def get_result(
benchmark: str,
pgo: bool = False,
flags: str = "",
pystats: str | None = None,
cpython: PathLike = Path("cpython"),
reconfigure: bool = False,
) -> float:
cpython = Path(cpython)
python = cpython / "python"
parsed_flags = mflags.parse_flags(flags)

if pgo or reconfigure:
# Jumping around through commits with PGO can leave stale PGO data
# around, so we need to clean it each time. We also always do it the
# first time in case the *last* bisect run used pgo.
subprocess.run(["make", "clean"], cwd=cpython)

workflow.compile_unix(cpython, mflags.parse_flags(flags), pgo, False, reconfigure)
run_benchmarks.run_benchmarks(cpython / "python", benchmark)
timing = parse_result(run_benchmarks.BENCHMARK_JSON)
workflow.compile_unix(cpython, parsed_flags, pgo, pystats is not None, reconfigure)

return timing
if pystats is None:
run_benchmarks.run_benchmarks(python, benchmark)
return parse_timing_result(run_benchmarks.BENCHMARK_JSON)
else:
run_benchmarks.collect_pystats(python, benchmark)
summarize_stats_path = cpython / "Tools" / "scripts" / "summarize_stats.py"
subprocess.check_output(
[str(python), str(summarize_stats_path), "--json-output", "pystats.json"]
)
with open("pystats.json", "r") as f:
contents = json.load(f)
return int(contents[pystats])


def get_log_file() -> Path:
@@ -103,11 +127,14 @@ def _main(
bad: str,
pgo: bool = False,
flags: str = "",
pystats: str | None = None,
invert: bool = False,
repo: PathLike = Path("."),
cpython: PathLike = Path("cpython"),
):
repo = Path(repo).absolute()
cpython = Path(cpython).absolute()
im = -1 if invert else 1

delete_log()

@@ -117,17 +144,21 @@
)

git.checkout(cpython, good)
good_timing = get_result(benchmark, pgo, flags, cpython=cpython, reconfigure=True)
log(f"KNOWN GOOD ({good[:7]}): {format_seconds(good_timing)}")
good_result = get_result(
benchmark, pgo, flags, pystats, cpython=cpython, reconfigure=True
)
log(f"KNOWN GOOD ({good[:7]}): {format_result(good_result, pystats)}")

git.checkout(cpython, bad)
bad_timing = get_result(benchmark, pgo, flags, cpython=cpython)
log(f"KNOWN BAD ({bad[:7]}): {format_seconds(bad_timing)}")
bad_result = get_result(benchmark, pgo, flags, pystats, cpython=cpython)
log(f"KNOWN BAD ({bad[:7]}): {format_result(bad_result, pystats)}")

if good_timing >= bad_timing:
if im * good_result >= im * bad_result:
show_log()
raise ValueError(
f"Good timing ({good_timing}) must be less than bad timing ({bad_timing})."
f"Good result ({format_result(good_result, pystats)}) "
f"must be {'more' if invert else 'less'} than "
f"bad result ({format_result(bad_result, pystats)})."
)

try:
@@ -138,7 +169,14 @@ def _main(
subprocess.run(
["git", "bisect", "run"]
+ _get_result_commandline(
benchmark, good_timing, bad_timing, pgo, flags, repo
benchmark,
good_result,
bad_result,
pgo,
flags,
pystats,
invert,
repo,
)
)
finally:
@@ -182,10 +220,29 @@ def main():
type=str,
default="",
)
parser.add_argument(
"--pystats",
type=str,
help="Bisect using pystats. Should be the key in the pystats.json file.",
)
parser.add_argument(
"--invert",
action="store_true",
help="Invert the values, i.e. expect the bad value to be lower than "
"the good value.",
)

args = parser.parse_args()

_main(args.benchmark, args.good, args.bad, args.pgo, args.flags)
_main(
args.benchmark,
args.good,
args.bad,
args.pgo,
args.flags,
args.pystats,
args.invert,
)


if __name__ == "__main__":
@@ -197,18 +254,30 @@ def main():
parser.add_argument("bad_val", type=float)
parser.add_argument("pgo", type=str)
parser.add_argument("flags", type=str)
parser.add_argument("pystats", type=str)
parser.add_argument("invert", type=str, choices=["True", "False"])
parser.add_argument("repo", type=str)
args = parser.parse_args()

if args.pystats == "None":
args.pystats = None

invert = args.invert == "True"
im = -1 if invert else 1

mid_point = (args.good_val + args.bad_val) / 2.0

repo = Path(args.repo)
cpython = repo / "cpython"

try:
with contextlib.chdir(repo):
timing = get_result(
args.benchmark, args.pgo == "True", args.flags, cpython=cpython
result = get_result(
args.benchmark,
args.pgo == "True",
args.flags,
args.pystats,
cpython=cpython,
)
except Exception as e:
# If there was any exception, display that exception and traceback and
@@ -218,20 +287,20 @@ def main():

# The confidence is 0.0 at the mid-point, 1.0 at the good and bad values,
# and > 1.0 outside of that.
confidence = abs((timing - mid_point) / ((args.bad_val - args.good_val) / 2.0))
confidence = abs((result - mid_point) / ((args.bad_val - args.good_val) / 2.0))

with contextlib.chdir(repo):
if timing > mid_point:
if im * result > im * mid_point:
log(
f"BISECT BAD ({git.get_git_hash(cpython)[:7]}): "
f"{format_seconds(timing)} (confidence {confidence:.02f})"
f"{format_result(result, args.pystats)} (confidence {confidence:.02f})"
)
print(f"BAD: {timing} vs. ({args.good_val}, {args.bad_val})")
print(f"BAD: {result} vs. ({args.good_val}, {args.bad_val})")
sys.exit(1)
else:
log(
f"BISECT GOOD ({git.get_git_hash(cpython)[:7]}): "
f"{format_seconds(timing)} (confidence {confidence:.02f})"
f"{format_result(result, args.pystats)} (confidence {confidence:.02f})"
)
print(f"GOOD: {timing} vs. ({args.good_val}, {args.bad_val})")
print(f"GOOD: {result} vs. ({args.good_val}, {args.bad_val})")
sys.exit(0)
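
The confidence value computed at the end of bisect.py is simply the result's distance from the midpoint between the known-good and known-bad values, normalized so that the endpoints map to 1.0. A minimal standalone sketch of that arithmetic, using made-up timings (the numbers below are illustrative, not taken from this PR):

    # Sketch of the confidence metric from bisect.py, with hypothetical values.
    # It is 0.0 at the midpoint, 1.0 at the known-good/known-bad endpoints,
    # and greater than 1.0 outside that range.

    def confidence(result: float, good_val: float, bad_val: float) -> float:
        mid_point = (good_val + bad_val) / 2.0
        return abs((result - mid_point) / ((bad_val - good_val) / 2.0))

    # Hypothetical timings: known good 1.0 s, known bad 1.5 s.
    print(confidence(1.25, 1.0, 1.5))   # 0.0 -> exactly at the midpoint, least certain
    print(confidence(1.5, 1.0, 1.5))    # 1.0 -> matches the known-bad value
    print(confidence(1.625, 1.0, 1.5))  # 1.5 -> even slower than the known-bad value

In the script itself, whether a revision is reported as GOOD or BAD comes from comparing the result to that same midpoint, with the sign factor `im` (set to -1 when `--invert` is passed) flipping the comparison so the same logic works when the "bad" value is the lower one.
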
11 changes: 6 additions & 5 deletions bench_runner/scripts/run_benchmarks.py
@@ -131,9 +131,9 @@ def run_benchmarks(
def collect_pystats(
python: PathLike,
benchmarks: str,
fork: str,
ref: str,
individual: bool,
fork: str | None = None,
ref: str | None = None,
individual: bool = True,
flags: Iterable[str] | None = None,
) -> None:
pystats_dir = Path("/tmp/py_stats")
@@ -167,7 +167,7 @@ def collect_pystats(
except NoBenchmarkError:
pass
else:
if individual:
if individual and fork is not None and ref is not None:
Copilot AI (Jun 26, 2025):

Ensure that the new condition correctly captures all intended cases. If the bisect operation requires both fork and ref to be set, consider documenting the expected behavior to avoid ambiguity.

Suggested change (replace the existing condition with):

    # Ensure both fork and ref are set and valid before proceeding.
    # This ensures that the operation can correctly summarize stats for the given benchmark.
    if individual and isinstance(fork, str) and fork.strip() and isinstance(ref, str) and ref.strip():

run_summarize_stats(python, fork, ref, benchmark, flags=flags)

for filename in pystats_dir.iterdir():
@@ -181,7 +181,8 @@ else:
else:
benchmark_links = []

run_summarize_stats(python, fork, ref, "all", benchmark_links, flags=flags)
if fork is not None and ref is not None:
Copilot AI (Jun 26, 2025):

[nitpick] This check duplicates the condition logic from earlier; consider centralizing the validation of fork and ref to improve maintainability and reduce potential inconsistencies.

run_summarize_stats(python, fork, ref, "all", benchmark_links, flags=flags)


def get_perf_lines(files: Iterable[PathLike]) -> Iterable[str]:
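
On the second review comment above (the duplicated fork/ref check in collect_pystats), one way to centralize the validation is a small shared predicate. This is only a sketch of the idea; the helper name below is hypothetical and not part of this PR.

    def _fork_and_ref_set(fork: str | None, ref: str | None) -> bool:
        # True only when both fork and ref are present, non-empty strings.
        return bool(fork and fork.strip()) and bool(ref and ref.strip())

Both call sites could then share the same check, e.g. `if individual and _fork_and_ref_set(fork, ref): ...` for the per-benchmark summaries and `if _fork_and_ref_set(fork, ref): ...` for the "all" summary, so the two conditions cannot drift apart.
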