diff --git a/.claude/settings.local.json b/.claude/settings.local.json index fad0bef0..85414854 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -5,7 +5,8 @@ "Bash(dotnet build:*)", "Bash(ls:*)", "Bash(find:*)", - "Bash(dotnet format:*)" + "Bash(dotnet format:*)", + "Bash(dotnet run:*)" ] } } diff --git a/.claude/skills/bench-fast/skill.md b/.claude/skills/bench-fast/skill.md new file mode 100644 index 00000000..507a02b9 --- /dev/null +++ b/.claude/skills/bench-fast/skill.md @@ -0,0 +1,33 @@ +# bench-fast + +Run a benchmark quickly with minimal iterations to generate assembly code via DisassemblyDiagnoser. + +## Usage + +``` +/bench-fast <benchmark-name> [<runtimes>] +``` + +## Arguments + +- `$ARGUMENTS` - The name of the benchmark class to run (e.g., `LruJustGetOrAdd`, `LfuJustGetOrAdd`, `SketchIncrement`), optionally followed by a list of one or more runtimes (e.g., `net48`, `net9.0` or `net48 net9.0`) + +## Instructions + +Run the specified benchmark from BitFaster.Caching.Benchmarks with minimal iterations using BenchmarkDotNet's command line. + +Parse the arguments: the first argument is the benchmark name, and the optional second argument is the list of runtimes. + +If a runtime arg is specified, execute: + +```bash +dotnet run -c Release --project BitFaster.Caching.Benchmarks --framework net9.0 -- --runtimes <runtimes> --filter "*<benchmark-name>*" -j short --warmupCount 3 --iterationCount 5 -d --disasmDepth 5 +``` + +If no runtime is specified, simply omit that command line arg: + +```bash +dotnet run -c Release --project BitFaster.Caching.Benchmarks --framework net9.0 -- --filter "*<benchmark-name>*" -j short --warmupCount 3 --iterationCount 5 -d --disasmDepth 5 +``` + +The `--warmupCount 3 --iterationCount 5` options reduce warmup and iteration counts for faster execution while still executing the code enough times to JIT optimized code. 
diff --git a/.claude/skills/dump-asm/skill.md b/.claude/skills/dump-asm/skill.md new file mode 100644 index 00000000..02faeca9 --- /dev/null +++ b/.claude/skills/dump-asm/skill.md @@ -0,0 +1,89 @@ +# dump-asm + +Generate assembly code for a benchmark and organize output into a baseline directory named after the current git branch. + +## Usage + +``` +/dump-asm <benchmark-name> [<framework>] +``` + +## Arguments + +- `$ARGUMENTS` - The name of the benchmark class to run (e.g., `LruJustGetOrAdd`, `LfuJustGetOrAdd`, `SketchIncrement`), optionally followed by a target framework (e.g., `net9.0`, `net8.0`, `net6.0`) + +## Instructions + +This skill orchestrates benchmark assembly generation and organizes the output for comparison. + +Parse the arguments: the first argument is the benchmark name, and the optional second argument is the target framework. + +### Step 1: Clean artifacts + +Delete the BenchmarkDotNet.Artifacts directory to ensure a clean run: + +```bash +rm -rf BenchmarkDotNet.Artifacts +``` + +### Step 2: Run benchmark + +Run the bench-fast skill with the provided benchmark name and optional framework to generate assembly code. 
+ +If a framework is specified, execute: + +```bash +dotnet run -c Release --project BitFaster.Caching.Benchmarks --framework <framework> -- --filter "*<benchmark-name>*" -j short --warmupCount 3 --iterationCount 5 -d --disasmDepth 5 +``` + +If no framework is specified, default to `net9.0`: + +```bash +dotnet run -c Release --project BitFaster.Caching.Benchmarks --framework net9.0 -- --filter "*<benchmark-name>*" -j short --warmupCount 3 --iterationCount 5 -d --disasmDepth 5 +``` + +### Step 3: Split assembly files + +Run the split-asm skill to generate individual assembly code files: + +```bash +dotnet run --project C:/repo/splitasm/splitasm -- BenchmarkDotNet.Artifacts/results +``` + +### Step 4: Organize into baseline directory + +Get the current git branch name and convert it to a valid directory name by replacing forward slashes with dashes: + +```bash +git rev-parse --abbrev-ref HEAD | tr '/' '-' +``` + +For example, `users/alexpeck/foo` becomes `users-alexpeck-foo`. + +Create the baseline directory structure preserving the benchmark name and runtime hierarchy. For each benchmark and runtime combination found in `BenchmarkDotNet.Artifacts/results/`: + +1. Extract the short benchmark name from the full benchmark path (e.g., `BitFaster.Caching.Benchmarks.LruJustGetOrAdd` → `LruJustGetOrAdd`) +2. Create the directory `baseline/<branch-name>/<benchmark-name>/<runtime>/` +3. Copy all files from the corresponding `BenchmarkDotNet.Artifacts/results/<full-benchmark-name>/<runtime>/` directory + +The final structure should be: +``` +baseline/ + <branch-name>/ + <benchmark-name>/ + <runtime>/ + <method>-asm.md + <method>-summary.md + ... +``` + +For example: +``` +baseline/ + users-alexpeck-skills/ + LruJustGetOrAdd/ + .NET 6.0.36 (6.0.3624.51421), X64 RyuJIT AVX2/ + FastConcurrentLru-asm.md + FastConcurrentLru-summary.md + ... 
+``` diff --git a/.claude/skills/split-asm/skill.md b/.claude/skills/split-asm/skill.md new file mode 100644 index 00000000..0f0fa882 --- /dev/null +++ b/.claude/skills/split-asm/skill.md @@ -0,0 +1,45 @@ +# split-asm + +Split BenchmarkDotNet assembly markdown files into individual files per benchmark method using splitasm. + +## Usage + +``` +/split-asm [<results-path>] +``` + +## Arguments + +- `$ARGUMENTS` - Optional path to the BenchmarkDotNet results directory. Defaults to `BenchmarkDotNet.Artifacts/results` in the current repository. + +## Instructions + +Run splitasm to break down BenchmarkDotNet assembly markdown files into a single file per benchmark method. This enables using file diffs to compare how code changes affect disassembler output. + +Parse the arguments: the optional first argument is the path to the results directory. + +If a path is specified, execute: + +```bash +dotnet run --project C:/repo/splitasm/splitasm -- <results-path> +``` + +If no path is specified, default to the standard BenchmarkDotNet output location: + +```bash +dotnet run --project C:/repo/splitasm/splitasm -- BenchmarkDotNet.Artifacts/results +``` + +The tool produces: +1. Individual assembly files - one markdown file per benchmarked method containing its assembly code +2. A summary file listing disassembled code size in bytes for each benchmarked method + +Output is organized hierarchically by target benchmark, then by target framework. + +## Prerequisites + +The splitasm repository should be cloned to C:/repo/splitasm. 
If not available, clone from https://github.com/bitfaster/splitasm: + +```bash +cd C:/repo && git clone https://github.com/bitfaster/splitasm.git +``` diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj index f32896d9..dd0665ae 100644 --- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj +++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj @@ -3,7 +3,7 @@ Exe latest - net6.0;net8.0;net9.0 + net48;net6.0;net8.0;net9.0 True true diff --git a/BitFaster.Caching.Benchmarks/DataStructureBenchmarks.cs b/BitFaster.Caching.Benchmarks/DataStructureBenchmarks.cs index 6c0d4ba7..c53ec891 100644 --- a/BitFaster.Caching.Benchmarks/DataStructureBenchmarks.cs +++ b/BitFaster.Caching.Benchmarks/DataStructureBenchmarks.cs @@ -7,10 +7,6 @@ namespace BitFaster.Caching.Benchmarks { -#if Windows - [SimpleJob(RuntimeMoniker.Net48)] -#endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] public class DataStructureBenchmarks { diff --git a/BitFaster.Caching.Benchmarks/DisposerBench.cs b/BitFaster.Caching.Benchmarks/DisposerBench.cs index ed2eaa4c..9de8fb62 100644 --- a/BitFaster.Caching.Benchmarks/DisposerBench.cs +++ b/BitFaster.Caching.Benchmarks/DisposerBench.cs @@ -9,9 +9,7 @@ namespace BitFaster.Caching.Benchmarks // https://github.com/dotnet/runtime/issues/4920 #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 3)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class DisposerBench diff --git a/BitFaster.Caching.Benchmarks/DrainBenchmarks.cs b/BitFaster.Caching.Benchmarks/DrainBenchmarks.cs index 85943cfe..39cb975e 100644 --- a/BitFaster.Caching.Benchmarks/DrainBenchmarks.cs +++ b/BitFaster.Caching.Benchmarks/DrainBenchmarks.cs @@ -8,9 +8,7 @@ namespace 
BitFaster.Caching.Benchmarks { #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 3)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class DrainBenchmarks { diff --git a/BitFaster.Caching.Benchmarks/Lfu/LfuJustGetOrAdd.cs b/BitFaster.Caching.Benchmarks/Lfu/LfuJustGetOrAdd.cs index 5b616d85..cd4528d2 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/LfuJustGetOrAdd.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/LfuJustGetOrAdd.cs @@ -10,10 +10,6 @@ namespace BitFaster.Caching.Benchmarks { -#if Windows - [SimpleJob(RuntimeMoniker.Net48)] -#endif - [SimpleJob(RuntimeMoniker.Net60)] //[DisassemblyDiagnoser(printSource: true, maxDepth: 5)] [MemoryDiagnoser(displayGenColumns: false)] // [HardwareCounters(HardwareCounter.LlcMisses, HardwareCounter.CacheMisses)] // Requires Admin https://adamsitnik.com/Hardware-Counters-Diagnoser/ diff --git a/BitFaster.Caching.Benchmarks/Lru/LruAsyncGet.cs b/BitFaster.Caching.Benchmarks/Lru/LruAsyncGet.cs index 41f81234..f668582d 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruAsyncGet.cs +++ b/BitFaster.Caching.Benchmarks/Lru/LruAsyncGet.cs @@ -9,10 +9,6 @@ namespace BitFaster.Caching.Benchmarks.Lru /// /// Verify 0 allocs for GetOrAddAsync cache hits. 
/// -#if Windows - [SimpleJob(RuntimeMoniker.Net48)] -#endif - [SimpleJob(RuntimeMoniker.Net60)] // [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] // Unstable [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] diff --git a/BitFaster.Caching.Benchmarks/Lru/LruCycleBench.cs b/BitFaster.Caching.Benchmarks/Lru/LruCycleBench.cs index 995848a8..d5458a9e 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruCycleBench.cs +++ b/BitFaster.Caching.Benchmarks/Lru/LruCycleBench.cs @@ -24,9 +24,7 @@ namespace BitFaster.Caching.Benchmarks.Lru //| ClassicLru | 16.35 us | 0.091 us | 0.076 us | 0.72 | 4 KB | 3.2959 | 14 KB | #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class LruCycleBench diff --git a/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAdd.cs b/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAdd.cs index 5b80ab9b..b43b3e3b 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAdd.cs +++ b/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAdd.cs @@ -32,9 +32,7 @@ namespace BitFaster.Caching.Benchmarks //| ExtensionsMemoryCacheGet | 93.188 ns | 0.2321 ns | 0.2171 ns | 11.85 | 0.07 | 78 B | 0.0055 | 24 B | #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] // [HardwareCounters(HardwareCounter.LlcMisses, HardwareCounter.CacheMisses)] // Requires Admin https://adamsitnik.com/Hardware-Counters-Diagnoser/ // [ThreadingDiagnoser] // Requires .NET Core diff --git a/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAddGuid.cs b/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAddGuid.cs index 118fe987..9cfc2764 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAddGuid.cs +++ 
b/BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAddGuid.cs @@ -16,9 +16,7 @@ namespace BitFaster.Caching.Benchmarks #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] // [HardwareCounters(HardwareCounter.LlcMisses, HardwareCounter.CacheMisses)] // Requires Admin https://adamsitnik.com/Hardware-Counters-Diagnoser/ // [ThreadingDiagnoser] // Requires .NET Core diff --git a/BitFaster.Caching.Benchmarks/Lru/LruJustTryGet.cs b/BitFaster.Caching.Benchmarks/Lru/LruJustTryGet.cs index bfd590fc..f988ef69 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruJustTryGet.cs +++ b/BitFaster.Caching.Benchmarks/Lru/LruJustTryGet.cs @@ -21,9 +21,7 @@ namespace BitFaster.Caching.Benchmarks.Lru //| FastConcurrentTLru | 25.350 ns | 0.3301 ns | 0.3088 ns | 5.66 | 0.08 | 546 B | - | #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class LruJustTryGet diff --git a/BitFaster.Caching.Benchmarks/Lru/LruMultiGet.cs b/BitFaster.Caching.Benchmarks/Lru/LruMultiGet.cs index b5740219..8f907ff0 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruMultiGet.cs +++ b/BitFaster.Caching.Benchmarks/Lru/LruMultiGet.cs @@ -26,9 +26,7 @@ namespace BitFaster.Caching.Benchmarks.Lru //| MemoryCache | 117.075 ns | 1.7664 ns | 1.5658 ns | 13.96 | 0.18 | 94 B | 0.0073 | 32 B | #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class LruMultiGet diff --git a/BitFaster.Caching.Benchmarks/Lru/LruZipDistribution.cs b/BitFaster.Caching.Benchmarks/Lru/LruZipDistribution.cs index 
9528aeaa..54274d51 100644 --- a/BitFaster.Caching.Benchmarks/Lru/LruZipDistribution.cs +++ b/BitFaster.Caching.Benchmarks/Lru/LruZipDistribution.cs @@ -23,9 +23,7 @@ namespace BitFaster.Caching.Benchmarks.Lru //| ConcurrentTLru | 169.7 ns | 0.86 ns | 0.80 ns | 1.52 | 0.02 | 0.0098 | 5,982 B | 43 B | #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class LruZipDistribution diff --git a/BitFaster.Caching.Benchmarks/Lru/TLruTimeBenchmark.cs b/BitFaster.Caching.Benchmarks/Lru/TLruTimeBenchmark.cs index 37c09e91..57555f60 100644 --- a/BitFaster.Caching.Benchmarks/Lru/TLruTimeBenchmark.cs +++ b/BitFaster.Caching.Benchmarks/Lru/TLruTimeBenchmark.cs @@ -9,10 +9,6 @@ namespace BitFaster.Caching.Benchmarks.Lru /// /// Compare different implementations of the TLRU policy. In particular, which clock impl is fastest? 
/// -#if Windows - [SimpleJob(RuntimeMoniker.Net48)] -#endif - [SimpleJob(RuntimeMoniker.Net60)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class TLruTimeBenchmark { diff --git a/BitFaster.Caching.Benchmarks/Program.cs b/BitFaster.Caching.Benchmarks/Program.cs index aa332e3c..7c19e042 100644 --- a/BitFaster.Caching.Benchmarks/Program.cs +++ b/BitFaster.Caching.Benchmarks/Program.cs @@ -1,4 +1,9 @@ -using BenchmarkDotNet.Running; +using System; +using System.Collections.Generic; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Environments; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Running; namespace BitFaster.Caching.Benchmarks { @@ -6,7 +11,35 @@ class Program { static void Main(string[] args) { - BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); + BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, GetGlobalConfig(args)); + } + + // This gives a default where we run both net48 and net9.0 unless overridden on the command line. 
+ static IConfig GetGlobalConfig(string[] args) + { + // An explicit runtime selection on the command line wins: besides --runtimes and --r, also recognise BenchmarkDotNet's short option -r and the inline --runtimes=value form, so the default jobs below are not added on top of the requested runtimes. + foreach (var a in args) + { + if (a == "--runtimes" || a == "-r" || a == "--r" || a.StartsWith("--runtimes=", StringComparison.Ordinal)) + { + return DefaultConfig.Instance; + } + } + + // Otherwise default to net9.0, plus net48 when compiled with the Windows symbol. + return DefaultConfig.Instance +#if Windows + .AddJob( + Job.Default + .WithRuntime(ClrRuntime.Net48) + .WithId("net48")) +#endif + .AddJob( + Job.Default + .WithRuntime(CoreRuntime.Core90) + .WithId("net9.0") + .AsDefault()); + } } } diff --git a/BitFaster.Caching.Benchmarks/TimeBenchmarks.cs b/BitFaster.Caching.Benchmarks/TimeBenchmarks.cs index 4547d169..a5f7aafd 100644 --- a/BitFaster.Caching.Benchmarks/TimeBenchmarks.cs +++ b/BitFaster.Caching.Benchmarks/TimeBenchmarks.cs @@ -8,10 +8,7 @@ namespace BitFaster.Caching.Benchmarks { #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 5)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] - [SimpleJob(RuntimeMoniker.Net90)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class TimeBenchmarks { diff --git a/BitFaster.Caching.Benchmarks/ValueFactoryBenchmarks.cs b/BitFaster.Caching.Benchmarks/ValueFactoryBenchmarks.cs index 306180be..c4888855 100644 --- a/BitFaster.Caching.Benchmarks/ValueFactoryBenchmarks.cs +++ b/BitFaster.Caching.Benchmarks/ValueFactoryBenchmarks.cs @@ -7,9 +7,7 @@ namespace BitFaster.Caching.Benchmarks { #if Windows [DisassemblyDiagnoser(printSource: true, maxDepth: 3)] - [SimpleJob(RuntimeMoniker.Net48)] #endif - [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] public class ValueFactoryBenchmarks