feat: add validation that ratings didn't change

crisbeto · crisbeto · commit 115aaa23ba69 · 2025-12-04T13:12:04.000+01:00
Adds a `ratingHash` on the environment that is generated from the current set of ratings. In addition, includes an `expectedRatingHash` field on the environment config which can be used to verify that an environment has a specific hash before it is executed. This is useful to ensure that the ratings stay stable between runs and `web-codegen-scorer` releases.
diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
@@ -91,6 +91,13 @@ export const environmentConfigSchema = z.object({
       }),
     )
     .optional(),
+
+  /**
+   * When an environment is created, it generates a hash based on the configured ratings.
+   * This field is used to validate that the generated hash matches a pre-defined one.
+   * It's useful to ensure that the set of ratings hasn't changed between two runs.
+   */
+  expectedRatingHash: z.string().optional(),
 });
 
 /**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
@@ -15,6 +15,12 @@ import {lazy} from '../utils/lazy-creation.js';
 import {EnvironmentConfig} from './environment-config.js';
 import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {renderPromptTemplate} from './prompt-templating.js';
+import {getSha256Hash} from '../utils/hashing.js';
+
+interface CategoryConfig {
+  name: string;
+  maxPoints: number;
+}
 
 /** Represents a single prompt evaluation environment. */
 export class Environment {
@@ -40,10 +46,18 @@ export class Environment {
   readonly promptTimeoutMinutes: number | undefined;
   /** Configuration for the individual rating categories. */
   readonly ratingCategories: {
-    [RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
-    [RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
-    [RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.HIGH_IMPACT]: CategoryConfig;
+    [RatingCategory.MEDIUM_IMPACT]: CategoryConfig;
+    [RatingCategory.LOW_IMPACT]: CategoryConfig;
   };
+  /**
+   * Hash of the environment-level ratings. Can be used to
+   * validate that the ratings haven't changed between runs.
+   */
+  readonly ratingHash: string;
+
+  /** Ratings configured at the environment level. */
+  private readonly ratings: Rating[];
 
   constructor(
     rootPath: string,
@@ -71,12 +85,15 @@ export class Environment {
     this.isBuiltIn = rootPath.includes('node_modules');
     this.executor = config.executor;
     this.promptTimeoutMinutes = config.promptTimeoutMinutes;
+    this.ratings = this.resolveRatings(config);
+    this.ratingHash = this.getRatingHash(this.ratings);
     this.ratingCategories = this.getRatingCategories(config);
+    this.validateRatingHash(this.ratingHash, config);
   }
 
   /** Prompts that should be executed as a part of the evaluation. */
   executablePrompts = lazy(async () => {
-    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
+    return this.resolveExecutablePrompts(this.config.executablePrompts);
   });
 
   systemPromptGeneration = lazy(async () => {
@@ -178,27 +195,9 @@ export class Environment {
    */
   private async resolveExecutablePrompts(
     prompts: EnvironmentConfig['executablePrompts'],
-    config: EnvironmentConfig,
   ): Promise<RootPromptDefinition[]> {
     const result: Promise<RootPromptDefinition>[] = [];
-    let envRatings: Rating[];
-
-    if (config.ratingOverrides) {
-      Object.keys(config.ratingOverrides).forEach(id => {
-        if (!config.ratings.some(rating => rating.id === id)) {
-          throw new UserFacingError(
-            `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
-          );
-        }
-      });
-
-      envRatings = config.ratings.map(rating => {
-        const override = config.ratingOverrides![rating.id];
-        return override ? {...rating, ...override} : rating;
-      });
-    } else {
-      envRatings = config.ratings;
-    }
+    const envRatings = this.ratings;
 
     for (const def of prompts) {
       if (def instanceof MultiStepPrompt) {
@@ -378,6 +377,25 @@ export class Environment {
     return result;
   }
 
+  private resolveRatings(config: EnvironmentConfig) {
+    if (!config.ratingOverrides) {
+      return config.ratings;
+    }
+
+    Object.keys(config.ratingOverrides).forEach(id => {
+      if (!config.ratings.some(rating => rating.id === id)) {
+        throw new UserFacingError(
+          `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
+        );
+      }
+    });
+
+    return config.ratings.map(rating => {
+      const override = config.ratingOverrides![rating.id];
+      return override ? {...rating, ...override} : rating;
+    });
+  }
+
   private getRatingCategories(config: EnvironmentConfig) {
     const overrides = config.categoryOverrides;
 
@@ -399,4 +417,29 @@ export class Environment {
       },
     };
   }
+
+  /** Builds an order-independent SHA-256 hash from each rating's category, ID, score reduction and labels. */
+  private getRatingHash(ratings: Rating[]): string {
+    const parts = ratings.map(rating => {
+      // The fallback must be parenthesized: `a || [].sort().join(',')` only sorts the empty literal.
+      // Sort a copy so that the configured `groupingLabels` array is not mutated.
+      const labels = [...(rating.groupingLabels || [])].sort().join(',');
+      return `${rating.category};${rating.id};${rating.scoreReduction};${labels}`;
+    });
+
+    return getSha256Hash(parts.sort().join('|'));
+  }
+
+  /** Throws a `UserFacingError` if `config.expectedRatingHash` is set and differs from `currentHash`. */
+  private validateRatingHash(currentHash: string, config: EnvironmentConfig) {
+    const expected = config.expectedRatingHash;
+    if (!expected || expected === currentHash) return;
+    const message = [
+      `Rating hash for environment "${this.displayName}" does not match the expectation.`,
+      `Expected: ${expected}`,
+      `Actual: ${currentHash}`,
+      `Either update the \`expectedRatingHash\` field in the config or revert the ratings back to their previous configuration`,
+    ].join('\n');
+    throw new UserFacingError(message);
+  }
 }
diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts
@@ -89,5 +89,6 @@ export async function prepareSummary(
       id: executorInfo.id,
       displayName: executorInfo.displayName,
     },
+    ratingHash: env.ratingHash,
   } satisfies RunSummary;
 }
diff --git a/runner/orchestration/grouping.ts b/runner/orchestration/grouping.ts
@@ -1,9 +1,8 @@
-import {createHash} from 'crypto';
-import type {LlmRunner} from '../codegen/llm-runner.js';
 import type {Environment} from '../configuration/environment.js';
 import {calculateBuildAndCheckStats} from '../ratings/stats.js';
 import type {AssessmentResult, RunGroup, RunInfo} from '../shared-interfaces.js';
 import {RunnerName} from '../codegen/runner-creation.js';
+import {getSha256Hash} from '../utils/hashing.js';
 
 /** Generates a unique grouping ID for a run. */
 export function getRunGroupId(
@@ -30,7 +29,7 @@ export function getRunGroupId(
     `${options.labels?.sort().join('/')}/${options.model}/${options.runner}`;
 
   // The group string above can get long. Hash it to something shorter and fixed length.
-  return createHash('sha256').update(group).digest('hex');
+  return getSha256Hash(group);
 }
 
 /**
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
@@ -442,6 +442,13 @@ export interface RunSummary {
    * Optional since some older reports might not have it.
    */
   runner?: CodegenRunnerInfo;
+
+  /**
+   * Hash of the environment-level ratings. Can be used to
+   * validate that the ratings haven't changed between runs.
+   * This field is optional, because older reports might not have it.
+   */
+  ratingHash?: string;
 }
 
 /**
diff --git a/runner/utils/hashing.ts b/runner/utils/hashing.ts
@@ -0,0 +1,8 @@
+import {createHash} from 'node:crypto';
+
+/**
+ * Returns a sha-256 hash of a string.
+ */
+export function getSha256Hash(value: string): string {
+  return createHash('sha256').update(value).digest('hex');
+}

Original file line number	Diff line number	Diff line change
`@@ -89,5 +89,6 @@ export async function prepareSummary(`
`89`	`89`	`id: executorInfo.id,`
`90`	`90`	`displayName: executorInfo.displayName,`
`91`	`91`	`},`
	`92`	`+ ratingHash: env.ratingHash,`
`92`	`93`	`} satisfies RunSummary;`
`93`	`94`	`}`