feat: add hash for configured ratings

crisbeto · crisbeto · commit ec906f3d3edd · 2025-12-04T11:50:35.000+01:00
Adds a hash that can be used to easily determine if the set of ratings and categories changed between different runs.
diff --git a/runner/orchestration/generate-eval-task.ts b/runner/orchestration/generate-eval-task.ts
@@ -18,6 +18,8 @@ import {attemptBuildAndTest} from './build-serve-test-loop.js';
 import {rateGeneratedCode} from '../ratings/rate-code.js';
 import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
 import assert from 'node:assert';
+import {Rating} from '../ratings/rating-types.js';
+import {getSha256Hash} from '../utils/hashing.js';
 
 /**
  * Creates and executes a task to generate or load code for a given prompt,
@@ -189,6 +191,7 @@ export async function startEvaluationTask(
       },
       outputFiles: attempt.outputFiles,
       finalAttempt: attempt,
+      ratingHash: getRatingHash(promptDef.ratings),
       score,
       repairAttempts: attempt.repairAttempts,
       attemptDetails,
@@ -203,3 +206,15 @@ export async function startEvaluationTask(
   await cleanup();
   return results;
 }
+
+/** Hashes the configured ratings; the result does not depend on rating or label order. */
+function getRatingHash(ratings: Rating[]): string {
+  const parts = ratings.map(rating => {
+    // Parenthesize the fallback and sort a copy so the rating's own label array isn't mutated.
+    const labels = [...(rating.groupingLabels ?? [])].sort().join(',');
+    return `${rating.category};${rating.id};${rating.scoreReduction};${labels}`;
+  });
+
+  // Sort the per-rating strings so the order of `ratings` does not affect the hash.
+  return getSha256Hash(parts.sort().join('|'));
+}
diff --git a/runner/orchestration/grouping.ts b/runner/orchestration/grouping.ts
@@ -1,9 +1,8 @@
-import {createHash} from 'crypto';
-import type {LlmRunner} from '../codegen/llm-runner.js';
 import type {Environment} from '../configuration/environment.js';
 import {calculateBuildAndCheckStats} from '../ratings/stats.js';
 import type {AssessmentResult, RunGroup, RunInfo} from '../shared-interfaces.js';
 import {RunnerName} from '../codegen/runner-creation.js';
+import {getSha256Hash} from '../utils/hashing.js';
 
 /** Generates a unique grouping ID for a run. */
 export function getRunGroupId(
@@ -30,7 +29,7 @@ export function getRunGroupId(
     `${options.labels?.sort().join('/')}/${options.model}/${options.runner}`;
 
   // The group string above can get long. Hash it to something shorter and fixed length.
-  return createHash('sha256').update(group).digest('hex');
+  return getSha256Hash(group);
 }
 
 /**
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
@@ -535,6 +535,8 @@ export interface AssessmentResult {
   repairAttempts: number;
   /** An array detailing each attempt (initial and repairs) made for this prompt. */
   attemptDetails: AttemptDetails[];
+  /** Hash that can be used to determine if the set of ratings changed between assessment runs. */
+  ratingHash?: string;
   /** Pre-computed user journeys. */
   userJourneys?: UserJourneysResult;
   /** The number of repair attempts made after the axe initial failures. */
diff --git a/runner/utils/hashing.ts b/runner/utils/hashing.ts
@@ -0,0 +1,8 @@
+import {createHash} from 'node:crypto';
+
+/** Computes the SHA-256 digest of the given string and returns it hex-encoded. */
+export function getSha256Hash(value: string): string {
+  const hasher = createHash('sha256');
+  hasher.update(value);
+  return hasher.digest('hex');
+}