Skip to content

Commit 115aaa2

Browse files
committed
feat: add validation that ratings didn't change
Adds a `ratingHash` on the environment that is generated from the current set of ratings. In addition, includes an `expectedRatingHash` field on the environment config which can be used to verify that an environment has a specific hash before it is executed. This is useful to ensure that the ratings stay stable between runs and `web-codegen-scorer` releases.
1 parent df47768 commit 115aaa2

File tree

6 files changed

+91
-26
lines changed

6 files changed

+91
-26
lines changed

runner/configuration/environment-config.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,13 @@ export const environmentConfigSchema = z.object({
9191
}),
9292
)
9393
.optional(),
94+
95+
/**
96+
* When an environment is created, it generates a hash based on the configured ratings.
97+
* This field is used to validate that the generated hash matches a pre-defined one.
98+
* It's useful to ensure that the set of ratings hasn't changed between two runs.
99+
*/
100+
expectedRatingHash: z.string().optional(),
94101
});
95102

96103
/**

runner/configuration/environment.ts

Lines changed: 66 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ import {lazy} from '../utils/lazy-creation.js';
1515
import {EnvironmentConfig} from './environment-config.js';
1616
import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
1717
import {renderPromptTemplate} from './prompt-templating.js';
18+
import {getSha256Hash} from '../utils/hashing.js';
19+
20+
/**
 * Shape of the configuration for a single rating category. `Environment.ratingCategories`
 * holds one of these for each of the high-, medium- and low-impact categories.
 */
interface CategoryConfig {
  /** Name of the category (presumably the label shown in reports; confirm against usages). */
  name: string;
  /** Points value of the category (presumably the most it can contribute to a score; confirm). */
  maxPoints: number;
}
1824

1925
/** Represents a single prompt evaluation environment. */
2026
export class Environment {
@@ -40,10 +46,18 @@ export class Environment {
4046
readonly promptTimeoutMinutes: number | undefined;
4147
/** Configuration for the individual rating categories. */
4248
readonly ratingCategories: {
43-
[RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
44-
[RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
45-
[RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
49+
[RatingCategory.HIGH_IMPACT]: CategoryConfig;
50+
[RatingCategory.MEDIUM_IMPACT]: CategoryConfig;
51+
[RatingCategory.LOW_IMPACT]: CategoryConfig;
4652
};
53+
/**
54+
* Hash of the environment-level ratings. Can be used to
55+
* validate that the ratings haven't changed between runs.
56+
*/
57+
readonly ratingHash: string;
58+
59+
/** Ratings configured at the environment level. */
60+
private readonly ratings: Rating[];
4761

4862
constructor(
4963
rootPath: string,
@@ -71,12 +85,15 @@ export class Environment {
7185
this.isBuiltIn = rootPath.includes('node_modules');
7286
this.executor = config.executor;
7387
this.promptTimeoutMinutes = config.promptTimeoutMinutes;
88+
this.ratings = this.resolveRatings(config);
89+
this.ratingHash = this.getRatingHash(this.ratings);
7490
this.ratingCategories = this.getRatingCategories(config);
91+
this.validateRatingHash(this.ratingHash, config);
7592
}
7693

7794
/** Prompts that should be executed as a part of the evaluation. */
7895
executablePrompts = lazy(async () => {
79-
return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
96+
return this.resolveExecutablePrompts(this.config.executablePrompts);
8097
});
8198

8299
systemPromptGeneration = lazy(async () => {
@@ -178,27 +195,9 @@ export class Environment {
178195
*/
179196
private async resolveExecutablePrompts(
180197
prompts: EnvironmentConfig['executablePrompts'],
181-
config: EnvironmentConfig,
182198
): Promise<RootPromptDefinition[]> {
183199
const result: Promise<RootPromptDefinition>[] = [];
184-
let envRatings: Rating[];
185-
186-
if (config.ratingOverrides) {
187-
Object.keys(config.ratingOverrides).forEach(id => {
188-
if (!config.ratings.some(rating => rating.id === id)) {
189-
throw new UserFacingError(
190-
`Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
191-
);
192-
}
193-
});
194-
195-
envRatings = config.ratings.map(rating => {
196-
const override = config.ratingOverrides![rating.id];
197-
return override ? {...rating, ...override} : rating;
198-
});
199-
} else {
200-
envRatings = config.ratings;
201-
}
200+
const envRatings = this.ratings;
202201

203202
for (const def of prompts) {
204203
if (def instanceof MultiStepPrompt) {
@@ -378,6 +377,25 @@ export class Environment {
378377
return result;
379378
}
380379

380+
/**
 * Produces the environment-level ratings with any `ratingOverrides` from the config applied.
 *
 * @param config Environment configuration supplying `ratings` and, optionally, `ratingOverrides`.
 * @returns `config.ratings` itself when no overrides are configured. Otherwise a new array in
 *   which each overridden rating is a shallow copy with the override's properties spread on top,
 *   and ratings without an override are passed through as-is.
 * @throws UserFacingError if an override refers to an ID that is not present in `config.ratings`.
 */
private resolveRatings(config: EnvironmentConfig) {
  // Capturing the overrides in a constant lets the compiler narrow them inside
  // the callbacks below, so no non-null assertion is necessary.
  const overrides = config.ratingOverrides;

  if (!overrides) {
    return config.ratings;
  }

  // Reject overrides that don't correspond to a configured rating before applying any of them.
  for (const id of Object.keys(overrides)) {
    const isConfigured = config.ratings.some(rating => rating.id === id);

    if (!isConfigured) {
      throw new UserFacingError(
        `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
      );
    }
  }

  return config.ratings.map(rating => {
    const override = overrides[rating.id];
    return override ? {...rating, ...override} : rating;
  });
}
398+
381399
private getRatingCategories(config: EnvironmentConfig) {
382400
const overrides = config.categoryOverrides;
383401

@@ -399,4 +417,29 @@ export class Environment {
399417
},
400418
};
401419
}
420+
421+
/**
 * Computes a SHA-256 hash that identifies a set of ratings.
 *
 * Each rating contributes its category, ID, score reduction and grouping labels. Both the
 * labels within a rating and the per-rating entries are sorted before hashing, so the result
 * does not depend on the order in which ratings or labels were configured.
 *
 * NOTE(review): labels are now actually sorted, so an `expectedRatingHash` recorded for a
 * rating whose labels were configured in non-sorted order will need to be regenerated.
 *
 * @param ratings Ratings to include in the hash. The array and its labels are not mutated.
 * @returns Hex-encoded hash of the ratings.
 */
private getRatingHash(ratings: Rating[]): string {
  const parts = ratings.map(rating => {
    // The fallback has to be parenthesized: `labels || [].sort().join(',')` sorts and joins
    // only the empty fallback array and leaves real labels in their configured order.
    // Sorting a copy also keeps the rating's own label array untouched.
    const labels = [...(rating.groupingLabels ?? [])].sort().join(',');

    return `${rating.category};${rating.id};${rating.scoreReduction};${labels}`;
  });

  return getSha256Hash(parts.sort().join('|'));
}
432+
433+
/**
 * Checks a computed rating hash against the `expectedRatingHash` from the config.
 * Does nothing when the config has no (or an empty) `expectedRatingHash`.
 *
 * @param currentHash Hash computed from the currently configured ratings.
 * @param config Environment configuration that may carry an `expectedRatingHash`.
 * @throws UserFacingError if an expected hash is configured and differs from `currentHash`.
 */
private validateRatingHash(currentHash: string, config: EnvironmentConfig) {
  const expectedHash = config.expectedRatingHash;

  if (!expectedHash || expectedHash === currentHash) {
    return;
  }

  throw new UserFacingError(
    [
      `Rating hash for environment "${this.displayName}" does not match the expectation.`,
      `Expected: ${expectedHash}`,
      // Report the value that was actually compared, not the instance field.
      `Actual: ${currentHash}`,
      `Either update the \`expectedRatingHash\` field in the config or revert the ratings back to their previous configuration`,
    ].join('\n'),
  );
}
402445
}

runner/orchestration/generate-summary.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,5 +89,6 @@ export async function prepareSummary(
8989
id: executorInfo.id,
9090
displayName: executorInfo.displayName,
9191
},
92+
ratingHash: env.ratingHash,
9293
} satisfies RunSummary;
9394
}

runner/orchestration/grouping.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
import {createHash} from 'crypto';
2-
import type {LlmRunner} from '../codegen/llm-runner.js';
31
import type {Environment} from '../configuration/environment.js';
42
import {calculateBuildAndCheckStats} from '../ratings/stats.js';
53
import type {AssessmentResult, RunGroup, RunInfo} from '../shared-interfaces.js';
64
import {RunnerName} from '../codegen/runner-creation.js';
5+
import {getSha256Hash} from '../utils/hashing.js';
76

87
/** Generates a unique grouping ID for a run. */
98
export function getRunGroupId(
@@ -30,7 +29,7 @@ export function getRunGroupId(
3029
`${options.labels?.sort().join('/')}/${options.model}/${options.runner}`;
3130

3231
// The group string above can get long. Hash it to something shorter and fixed length.
33-
return createHash('sha256').update(group).digest('hex');
32+
return getSha256Hash(group);
3433
}
3534

3635
/**

runner/shared-interfaces.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,13 @@ export interface RunSummary {
442442
* Optional since some older reports might not have it.
443443
*/
444444
runner?: CodegenRunnerInfo;
445+
446+
/**
447+
* Hash of the environment-level ratings. Can be used to
448+
* validate that the ratings haven't changed between runs.
449+
* This field is optional, because older reports might not have it.
450+
*/
451+
ratingHash?: string;
445452
}
446453

447454
/**

runner/utils/hashing.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import {createHash} from 'node:crypto';
2+
3+
/**
4+
* Returns a sha-256 hash of a string.
5+
*/
6+
/**
 * Returns the SHA-256 digest of a string.
 *
 * @param value Text to hash. It is encoded as UTF-8 before hashing.
 * @returns The digest as a 64-character lowercase hexadecimal string.
 */
export function getSha256Hash(value: string): string {
  // UTF-8 is already the default input encoding for strings; it is spelled out
  // here so the digest's dependence on the encoding is visible at the call.
  return createHash('sha256').update(value, 'utf8').digest('hex');
}

0 commit comments

Comments
 (0)