Skip to content

Commit 9cfd7f5

Browse files
committed
feat: support passing metadata to prompts
* This allows more advanced custom ratings that leverage metadata tied to specific app prompts, e.g. golden-response comparisons.
1 parent 71eab90 commit 9cfd7f5

File tree

11 files changed

+132
-33
lines changed

11 files changed

+132
-33
lines changed

examples/environments/remote_env/config.js

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,51 @@
44
* @import {EnvironmentConfig} from 'web-codegen-scorer';
55
*/
66

7-
import {getBuiltInRatings} from 'web-codegen-scorer';
7+
import {
8+
EvalPromptWithMetadata,
9+
getBuiltInRatings,
10+
RatingKind,
11+
RatingCategory,
12+
RatingState,
13+
} from 'web-codegen-scorer';
814
import {FakeRemoteExecutor} from './fake-executor';
915

1016
/** @type {EnvironmentConfig} */
export default {
  displayName: 'Remote Env (example)',
  clientSideFramework: 'angular',
  ratings: [
    // Keep all of the standard ratings and append a custom one below.
    ...getBuiltInRatings(),
    {
      name: 'Test Metadata Rating',
      id: 'test-metadata-rating',
      kind: RatingKind.PER_BUILD,
      category: RatingCategory.MEDIUM_IMPACT,
      description: 'Testing the metadata of prompts',
      scoreReduction: '100%',
      // Demonstrates prompt metadata: passes only when the generated code
      // contains the `goldenURL` attached to the prompt below.
      rate: ctx => {
        // Metadata is untyped on the rating context; cast it to the shape
        // supplied in `executablePrompts` below.
        const metadata = /** @type {{goldenURL: string}} */ (ctx.prompt.metadata);
        const found = ctx.generatedFiles.some(f => f.code.includes(metadata.goldenURL));

        return {
          state: RatingState.EXECUTED,
          // Full score when the golden URL appears in any generated file.
          coefficient: found ? 1 : 0,
          message: found ? `${metadata.goldenURL} found!` : `${metadata.goldenURL} not found!`,
        };
      },
    },
  ],
  generationSystemPrompt: './system-instructions.md',
  executablePrompts: [
    // Prompt carrying the metadata that the custom rating above reads at rate time.
    new EvalPromptWithMetadata(
      'test-app',
      `Create the Angular documentation website. Make sure you add a link to \`angular.dev\` in there.`,
      {
        metadata: {
          goldenURL: 'angular.dev',
        },
      },
    ),
  ],
  executor: new FakeRemoteExecutor(),
};

examples/environments/remote_env/fake-executor.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export class FakeRemoteExecutor implements Executor {
2626
async performFakeLlmRequest(): Promise<LlmResponse> {
2727
return {
2828
success: true,
29-
outputFiles: [{code: 'Works!', filePath: 'main.ts'}],
29+
outputFiles: [{code: 'angular.dev Works', filePath: 'main.ts'}],
3030
reasoning: '',
3131
errors: [],
3232
usage: {inputTokens: 0, totalTokens: 0, outputTokens: 0},

runner/configuration/environment-config.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import z from 'zod';
22
import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
33
import {UserFacingError} from '../utils/errors.js';
44
import {ratingSchema} from '../ratings/rating-types.js';
5-
import {MultiStepPrompt} from './multi-step-prompt.js';
5+
import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
66
import {executorSchema} from '../orchestration/executors/executor.js';
77
import {
88
LocalExecutorConfig,
@@ -46,6 +46,8 @@ export const environmentConfigSchema = z.object({
4646
ratings: z.array(ratingSchema).optional(),
4747
}),
4848
z.custom<MultiStepPrompt>(data => data instanceof MultiStepPrompt),
49+
z.custom<EvalPrompt>(data => data instanceof EvalPrompt),
50+
z.custom<EvalPromptWithMetadata<unknown>>(data => data instanceof EvalPromptWithMetadata),
4951
]),
5052
),
5153
/**

runner/configuration/environment.ts

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import {UserFacingError} from '../utils/errors.js';
1313
import {generateId} from '../utils/id-generation.js';
1414
import {lazy} from '../utils/lazy-creation.js';
1515
import {EnvironmentConfig} from './environment-config.js';
16-
import {MultiStepPrompt} from './multi-step-prompt.js';
16+
import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
1717
import {renderPromptTemplate} from './prompt-templating.js';
1818

1919
/** Represents a single prompt evaluation environment. */
@@ -176,6 +176,18 @@ export class Environment {
176176
for (const def of prompts) {
177177
if (def instanceof MultiStepPrompt) {
178178
result.push(this.getMultiStepPrompt(def, envRatings));
179+
} else if (def instanceof EvalPromptWithMetadata) {
180+
result.push(
181+
Promise.resolve({
182+
name: def.name,
183+
kind: 'single',
184+
prompt: def.text,
185+
ratings: [...envRatings, ...(def.opts.extraRatings ?? [])],
186+
systemPromptType: 'generation',
187+
contextFilePatterns: def.opts.contextFilePatterns ?? [],
188+
metadata: def.opts.metadata,
189+
} satisfies PromptDefinition),
190+
);
179191
} else {
180192
let path: string;
181193
let ratings: Rating[];
@@ -198,6 +210,7 @@ export class Environment {
198210
relativePath,
199211
ratings,
200212
/* isEditing */ false,
213+
undefined,
201214
),
202215
),
203216
);
@@ -216,12 +229,13 @@ export class Environment {
216229
* @param ratings Ratings to run against the definition.
217230
* @param isEditing Whether this is an editing or generation step.
218231
*/
219-
private async getStepPromptDefinition(
232+
private async getStepPromptDefinition<Metadata>(
220233
name: string,
221234
relativePath: string,
222235
ratings: Rating[],
223236
isEditing: boolean,
224-
): Promise<PromptDefinition> {
237+
metadata: Metadata,
238+
): Promise<PromptDefinition<Metadata>> {
225239
const {result, contextFiles} = await this.renderEnvironmentPrompt(relativePath);
226240

227241
return {
@@ -231,7 +245,8 @@ export class Environment {
231245
ratings,
232246
systemPromptType: isEditing ? 'editing' : 'generation',
233247
contextFilePatterns: contextFiles,
234-
} satisfies PromptDefinition;
248+
metadata,
249+
} satisfies PromptDefinition<Metadata>;
235250
}
236251

237252
/**
@@ -284,6 +299,7 @@ export class Environment {
284299
ratings.unshift(...def.stepRatings[current.name]);
285300
}
286301

302+
const stepMetadata = def.stepMetadata[current.name];
287303
const stepNum = parseInt(match[1]);
288304
if (stepNum === 0) {
289305
throw new UserFacingError('Multi-step prompts start with `step-1`.');
@@ -293,6 +309,7 @@ export class Environment {
293309
join(def.directoryPath, current.name),
294310
ratings,
295311
/*isEditing */ stepNum !== 1,
312+
stepMetadata,
296313
);
297314

298315
stepValues[step.name] = stepNum;

runner/configuration/multi-step-prompt.ts

Lines changed: 0 additions & 9 deletions
This file was deleted.

runner/configuration/prompts.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import {Rating} from '../ratings/rating-types.js';
2+
3+
export interface EvalPromptOptions<M> {
4+
metadata: M;
5+
contextFilePatterns?: string[];
6+
extraRatings?: Rating[];
7+
}
8+
9+
/** Definition of a single-step prompt with metadata. */
10+
export class EvalPromptWithMetadata<Metadata> {
11+
constructor(
12+
readonly name: string,
13+
readonly text: string,
14+
readonly opts: EvalPromptOptions<Metadata>,
15+
) {}
16+
}
17+
18+
/** Definition of a single-step prompt. */
19+
export class EvalPrompt extends EvalPromptWithMetadata<undefined> {
20+
constructor(
21+
name: string,
22+
text: string,
23+
opts: Omit<EvalPromptOptions<undefined>, 'metadata'> = {},
24+
) {
25+
super(name, text, {...opts, metadata: undefined});
26+
}
27+
}
28+
29+
/** Definition of a multi-step prompt. */
30+
export class MultiStepPrompt {
31+
constructor(
32+
readonly directoryPath: string,
33+
readonly stepRatings: Record<string, Rating[]> = {},
34+
readonly stepMetadata: Record<string, unknown> = {},
35+
) {}
36+
}

runner/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export * from './ratings/built-in.js';
99
export * from './ratings/rating-types.js';
1010
export * from './ratings/built-in-ratings/index.js';
1111
export {calculateBuildAndCheckStats, isPositiveScore} from './ratings/stats.js';
12-
export {MultiStepPrompt} from './configuration/multi-step-prompt.js';
12+
export {MultiStepPrompt, EvalPrompt, EvalPromptWithMetadata} from './configuration/prompts.js';
1313
export {
1414
BuildErrorType,
1515
BuildResultStatus,

runner/ratings/built-in-ratings/sufficient-generated-files-rating.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ export const sufficientGeneratedFilesRating: PerBuildRating = {
88
id: 'common-generated-file-count',
99
scoreReduction: '100%',
1010
kind: RatingKind.PER_BUILD,
11-
rate: ({generatedFileCount}) => ({
11+
rate: ({generatedFiles}) => ({
1212
state: RatingState.EXECUTED,
13-
coefficient: generatedFileCount > 0 ? 1 : 0,
13+
coefficient: generatedFiles.length > 0 ? 1 : 0,
1414
}),
1515
};

runner/ratings/rate-code.ts

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,19 +92,20 @@ export async function rateGeneratedCode(
9292
try {
9393
if (current.kind === RatingKind.PER_BUILD) {
9494
result = runPerBuildRating(
95+
currentPromptDef,
9596
current,
9697
buildResult,
9798
serveTestingResult,
9899
repairAttempts,
99100
testResult,
100101
testRepairAttempts,
101-
outputFiles.length,
102+
outputFiles,
102103
axeRepairAttempts,
103104
ratingsResult,
104105
);
105106
} else if (current.kind === RatingKind.PER_FILE) {
106107
categorizedFiles ??= splitFilesIntoCategories(outputFiles);
107-
result = await runPerFileRating(current, categorizedFiles, ratingsResult);
108+
result = await runPerFileRating(currentPromptDef, current, categorizedFiles, ratingsResult);
108109
} else if (current.kind === RatingKind.LLM_BASED) {
109110
result = await runLlmBasedRating(
110111
environment,
@@ -174,25 +175,27 @@ export async function rateGeneratedCode(
174175
}
175176

176177
function runPerBuildRating(
178+
prompt: PromptDefinition,
177179
rating: PerBuildRating,
178180
buildResult: BuildResult,
179181
serveResult: ServeTestingResult | null,
180182
repairAttempts: number,
181183
testResult: TestExecutionResult | null,
182184
testRepairAttempts: number,
183-
generatedFileCount: number,
185+
generatedFiles: LlmResponseFile[],
184186
axeRepairAttempts: number,
185187
ratingsResult: RatingsResult,
186188
): IndividualAssessment | SkippedIndividualAssessment {
187189
const rateResult = rating.rate({
188190
buildResult,
189191
serveResult,
190192
repairAttempts,
191-
generatedFileCount,
193+
generatedFiles,
192194
axeRepairAttempts,
193-
ratingsResult,
194195
testResult,
195196
testRepairAttempts,
197+
ratingsResult,
198+
prompt,
196199
});
197200

198201
// If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
@@ -208,6 +211,7 @@ function runPerBuildRating(
208211
}
209212

210213
async function runPerFileRating(
214+
prompt: PromptDefinition,
211215
rating: PerFileRating,
212216
categorizedFiles: CategorizedFiles,
213217
ratingsResult: RatingsResult,
@@ -240,7 +244,7 @@ async function runPerFileRating(
240244
// Remove comments from the code to avoid false-detection of bad patterns.
241245
// Some keywords like `NgModule` can be used in code comments.
242246
const code = removeComments(file.code, contentType);
243-
const result = await rating.rate(code, file.filePath, ratingsResult);
247+
const result = await rating.rate(code, file.filePath, {prompt, ratingsResult});
244248
let coeff: number;
245249

246250
if (typeof result === 'number') {

runner/ratings/rating-types.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ export const CATEGORY_NAMES = {
4646
[RatingCategory.LOW_IMPACT]: 'Low Impact',
4747
};
4848

49+
/**
 * Context fields shared by both the per-build and the per-file rating
 * `rate` callback signatures.
 */
const ratingCommonContextFields = {
  // Results of ratings that have already executed (record of assessments).
  ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
  // The prompt definition being rated, including any user-supplied metadata.
  prompt: z.custom<PromptDefinition>(),
};
53+
4954
const ratingSchemaCommonFields = {
5055
category: z.custom<RatingCategory>(),
5156
scoreReduction: z.custom<`${number}%`>(),
@@ -63,13 +68,13 @@ const perBuildRatingSchema = z
6368
.args(
6469
z.strictObject({
6570
buildResult: z.custom<BuildResult>(),
71+
generatedFiles: z.custom<LlmResponseFile[]>(),
6672
serveResult: z.custom<ServeTestingResult | null>(),
6773
repairAttempts: z.number(),
6874
testResult: z.custom<TestExecutionResult | null>(),
6975
testRepairAttempts: z.number(),
7076
axeRepairAttempts: z.number(),
71-
generatedFileCount: z.number(),
72-
ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
77+
...ratingCommonContextFields,
7378
}),
7479
)
7580
.returns(z.custom<PerBuildRatingResult>()),
@@ -83,9 +88,9 @@ const perFileRatingSchema = z
8388
rate: z
8489
.function()
8590
.args(
86-
z.string(),
87-
z.string().optional(),
88-
z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
91+
z.string().describe('Code'),
92+
z.string().optional().describe('File path'),
93+
z.object(ratingCommonContextFields).describe('Context'),
8994
)
9095
.returns(z.custom<PerFileRatingResult>()),
9196
filter: z.union([

0 commit comments

Comments
 (0)