Test Results
@@ -503,19 +609,7 @@
Failed Tests
}
Generated Code
-
- @for (file of attempt.outputFiles; track file) {
-
{{ file.filePath }}
-
-
-
-
-
- }
+
}
}
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.scss b/report-app/src/app/pages/report-viewer/report-viewer.scss
index fa6eb4e2..effb37f1 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.scss
+++ b/report-app/src/app/pages/report-viewer/report-viewer.scss
@@ -190,6 +190,17 @@ lighthouse-category + lighthouse-category {
align-items: center;
}
+.chart-title-tooltip-icon {
+ font-size: 18px;
+ cursor: help;
+}
+
+.chart-title-right-label {
+ margin-left: auto;
+ font-size: 0.9rem;
+ font-weight: 500;
+}
+
.axe-violations ul {
padding: 0px 20px;
}
@@ -246,3 +257,18 @@ lighthouse-category + lighthouse-category {
.hidden {
visibility: hidden;
}
+
+.failed-prompt-button.neutral-button {
+ font-size: 0.8rem;
+ padding: 0.8rem;
+ height: 1.4rem;
+ margin: 0.5rem 0;
+ background-color: var(--button-active-bg-color);
+ color: color-mix(in srgb, var(--accent-yellow) 75%, transparent);
+ border-color: var(--button-active-bg-color);
+ font-weight: bold;
+
+ &:active, &:hover {
+ background-color: transparent;
+ }
+}
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts
index f09f7a16..d58dc5f4 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.ts
+++ b/report-app/src/app/pages/report-viewer/report-viewer.ts
@@ -1,19 +1,12 @@
import {Clipboard} from '@angular/cdk/clipboard';
import {DatePipe, DecimalPipe} from '@angular/common';
import {HttpClient} from '@angular/common/http';
-import {
- afterNextRender,
- Component,
- computed,
- ElementRef,
- inject,
- input,
- resource,
- signal,
- viewChild,
-} from '@angular/core';
+import {afterNextRender, Component, computed, inject, input, resource, signal} from '@angular/core';
import {NgxJsonViewerModule} from 'ngx-json-viewer';
-import {BuildErrorType} from '../../../../../runner/workers/builder/builder-types';
+import {
+ BuildErrorType,
+ BuildResultStatus,
+} from '../../../../../runner/workers/builder/builder-types';
import {
AssessmentResult,
AssessmentResultFromReportServer,
@@ -45,6 +38,11 @@ import {ProviderLabel} from '../../shared/provider-label';
import {AiAssistant} from '../../shared/ai-assistant/ai-assistant';
import {LighthouseCategory} from './lighthouse-category';
import {MultiSelect} from '../../shared/multi-select/multi-select';
+import {FileCodeViewer} from '../../shared/file-code-viewer/file-code-viewer';
+import {
+ calculateAverageRepairAttempts,
+ createRepairAttemptGraphData,
+} from './repair-attempt-graph-builder';
const localReportRegex = /-l\d+$/;
@@ -63,6 +61,7 @@ const localReportRegex = /-l\d+$/;
AiAssistant,
LighthouseCategory,
MultiSelect,
+ FileCodeViewer,
],
templateUrl: './report-viewer.html',
styleUrls: ['./report-viewer.scss'],
@@ -281,6 +280,39 @@ export class ReportViewer {
];
}
+ protected hasSuccessfulResultWithMoreThanOneBuildAttempt = computed(() => {
+ if (!this.selectedReport.hasValue()) {
+ return false;
+ }
+ for (const result of this.selectedReport.value().results) {
+ if (
+ result.finalAttempt.buildResult.status === BuildResultStatus.SUCCESS &&
+ result.repairAttempts > 1
+ ) {
+ return true;
+ }
+ }
+ return false;
+ });
+
+ protected averageRepairAttempts = computed
(() => {
+ const report = this.selectedReportWithSortedResults();
+ if (!report) {
+ return null;
+ }
+
+ return calculateAverageRepairAttempts(report);
+ });
+
+ protected repairAttemptsAsGraphData = computed(() => {
+ const report = this.selectedReportWithSortedResults();
+ if (!report) {
+ return [];
+ }
+
+ return createRepairAttemptGraphData(report);
+ });
+
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
return [
{
diff --git a/report-app/src/app/services/app-color-mode.ts b/report-app/src/app/services/app-color-mode.ts
index 00d347bb..50f8d15b 100644
--- a/report-app/src/app/services/app-color-mode.ts
+++ b/report-app/src/app/services/app-color-mode.ts
@@ -20,7 +20,7 @@ export class AppColorMode {
} catch {}
if (!colorMode) {
- colorMode = matchMedia('(prefers-color-scheme: dark)') ? 'dark' : 'light';
+ colorMode = matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
}
this.setColorMode(colorMode);
diff --git a/report-app/src/app/shared/code-viewer.ts b/report-app/src/app/shared/code-viewer.ts
index 9c3e8c7f..b834d1a2 100644
--- a/report-app/src/app/shared/code-viewer.ts
+++ b/report-app/src/app/shared/code-viewer.ts
@@ -32,7 +32,7 @@ import {AppColorMode} from '../services/app-color-mode';
content: counter(step);
counter-increment: step;
width: 1rem;
- margin-right: 1.5rem;
+ margin-right: 0.5rem;
display: inline-block;
text-align: right;
color: rgba(115, 138, 148, 0.4);
diff --git a/report-app/src/app/shared/file-code-viewer/file-code-viewer.html b/report-app/src/app/shared/file-code-viewer/file-code-viewer.html
new file mode 100644
index 00000000..2fee117e
--- /dev/null
+++ b/report-app/src/app/shared/file-code-viewer/file-code-viewer.html
@@ -0,0 +1,52 @@
+
+
+
+ @if (selectedFile(); as file) {
+ @let iconOptions = { isDirectory: false };
+
+
+ {{ getFileIcon(file.path, iconOptions) }}
+ {{ file.name }}
+
+
+
+
+ }
+
+
diff --git a/report-app/src/app/shared/file-code-viewer/file-code-viewer.scss b/report-app/src/app/shared/file-code-viewer/file-code-viewer.scss
new file mode 100644
index 00000000..71d8bb24
--- /dev/null
+++ b/report-app/src/app/shared/file-code-viewer/file-code-viewer.scss
@@ -0,0 +1,139 @@
+:host {
+ --bg-color: #1e1e1e;
+ --sidebar-bg: #252526;
+ --border-color: #333;
+ --text-color: #cccccc;
+ --accent-color: #37373d;
+ --hover-bg: #2a2d2e;
+ --padding-base: 1rem;
+
+ display: block;
+ position: relative;
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+}
+
+.editor-container {
+ display: flex;
+ height: 40rem;
+ overflow: hidden;
+ background-color: var(--bg-color);
+ border-radius: 0.5rem;
+ border: 1px solid var(--border-color);
+}
+
+.sidebar {
+ width: 15rem;
+ max-width: 25rem;
+ background-color: var(--sidebar-bg);
+ display: flex;
+ flex-direction: column;
+ border-right: 1px solid var(--border-color);
+ flex-shrink: 0;
+ overflow: hidden;
+}
+
+.sidebar-header {
+ padding: calc(var(--padding-base) / 2) var(--padding-base);
+ font-weight: 600;
+ position: sticky;
+ top: 0;
+ background-color: var(--sidebar-bg);
+ z-index: 1;
+ text-transform: uppercase;
+ font-size: 0.7rem;
+ color: var(--text-color);
+}
+
+.file-list {
+ list-style: none;
+ padding: 0;
+ margin: 0;
+ flex-grow: 1;
+ overflow-y: auto;
+}
+
+.file-item {
+ display: flex;
+ align-items: center;
+ gap: 0.25rem;
+ padding: 0.25rem var(--padding-base);
+ cursor: pointer;
+ transition: background-color 0.2s;
+ color: var(--text-color);
+ font-size: 0.85rem;
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+
+ &:hover {
+ background-color: var(--hover-bg);
+ }
+
+ &.selected {
+ background-color: var(--accent-color);
+ color: white;
+ }
+}
+
+.collapse-indicator {
+ transition: transform 0.2s;
+}
+
+.file-item.expanded .collapse-indicator {
+ transform: rotate(90deg);
+}
+
+.content {
+ flex-grow: 1;
+ overflow: hidden;
+ display: flex;
+ flex-direction: column;
+}
+
+.tabs {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ background-color: var(--sidebar-bg);
+ flex-shrink: 0;
+}
+
+.tab {
+ display: flex;
+ align-items: center;
+ gap: 0.5rem;
+ padding: calc(var(--padding-base) / 2) var(--padding-base);
+ background-color: var(--bg-color);
+ color: var(--text-color);
+ border-right: 1px solid var(--sidebar-bg);
+ font-size: 0.85rem;
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+}
+
+.copy-button {
+ background-color: var(--border-color);
+ color: var(--text-color);
+ border: 1px solid #444;
+ border-radius: 0.25rem;
+ padding: 0.25rem 0.5rem;
+ cursor: pointer;
+ margin-right: 0.5rem;
+ display: flex;
+ align-items: center;
+ gap: 0.25rem;
+
+ &:hover {
+ background-color: #444;
+ }
+}
+
+.material-symbols-outlined {
+ font-size: 1rem;
+}
+
+.code-container {
+ flex-grow: 1;
+ overflow: auto;
+}
diff --git a/report-app/src/app/shared/file-code-viewer/file-code-viewer.ts b/report-app/src/app/shared/file-code-viewer/file-code-viewer.ts
new file mode 100644
index 00000000..290610d7
--- /dev/null
+++ b/report-app/src/app/shared/file-code-viewer/file-code-viewer.ts
@@ -0,0 +1,84 @@
+import {Component, computed, inject, input, linkedSignal} from '@angular/core';
+import {LlmResponseFile} from '../../../../../runner/shared-interfaces';
+import {CodeViewer} from '../code-viewer';
+import {Clipboard} from '@angular/cdk/clipboard';
+import {FileTreeNode, TreeNode, buildFileTree} from './file-tree';
+
+@Component({
+ selector: 'app-file-code-viewer',
+ templateUrl: './file-code-viewer.html',
+ styleUrl: './file-code-viewer.scss',
+ imports: [CodeViewer],
+})
+export class FileCodeViewer {
+ private readonly clipboard = inject(Clipboard);
+ readonly files = input.required();
+
+ private readonly fileTree = computed(() => buildFileTree(this.files()));
+
+ readonly flatTree = computed(() => {
+ const tree = this.fileTree();
+ const flatten = (nodes: TreeNode[]): TreeNode[] => {
+ let flat: TreeNode[] = [];
+ for (const node of nodes) {
+ flat.push(node);
+ if (node.isDirectory) {
+ flat = flat.concat(flatten(node.children));
+ }
+ }
+ return flat;
+ };
+ return flatten(tree);
+ });
+
+ readonly selectedFile = linkedSignal(() =>
+ this.flatTree().find(f => !f.isDirectory),
+ );
+
+ toggleNode(node: TreeNode): void {
+ if (node.isDirectory) {
+ node.isExpanded.update(e => !e);
+ } else {
+ this.selectedFile.set(node);
+ }
+ }
+
+ copyCode(): void {
+ const fileNode = this.selectedFile();
+ if (fileNode?.file) {
+ if (!this.clipboard.copy(fileNode.file.code)) {
+ alert('Failed to copy code to clipboard.');
+ }
+ }
+ }
+
+ getFileIcon(filePath: string, options: {isDirectory: boolean; isExpanded?: boolean}): string {
+ if (options.isDirectory) {
+ return options.isExpanded ? 'folder_open' : 'folder';
+ }
+ const extension = filePath.split('.').pop();
+ switch (extension) {
+ case 'html':
+ return 'html';
+ case 'ts':
+ return 'javascript';
+ case 'css':
+ return 'css';
+ case 'scss':
+ return 'css';
+ default:
+ return 'article';
+ }
+ }
+
+ isNodeVisible(node: TreeNode): boolean {
+ let current = node.parent;
+ while (current && current.path) {
+ if (!current.isExpanded()) {
+ return false;
+ }
+ current = current.parent;
+ }
+ return true;
+ }
+}
diff --git a/report-app/src/app/shared/file-code-viewer/file-tree.ts b/report-app/src/app/shared/file-code-viewer/file-tree.ts
new file mode 100644
index 00000000..3982e4b9
--- /dev/null
+++ b/report-app/src/app/shared/file-code-viewer/file-tree.ts
@@ -0,0 +1,127 @@
+import {signal, WritableSignal} from '@angular/core';
+import {LlmResponseFile} from '../../../../../runner/shared-interfaces';
+
+export interface DirectoryTreeNode {
+ isDirectory: true;
+ name: string;
+ path: string;
+ isExpanded: WritableSignal;
+ children: (FileTreeNode | DirectoryTreeNode)[];
+ depth: number;
+ parent?: DirectoryTreeNode;
+}
+
+export interface FileTreeNode {
+ isDirectory: false;
+ name: string;
+ path: string;
+ file: LlmResponseFile;
+ depth: number;
+ parent?: DirectoryTreeNode;
+}
+
+export type TreeNode = FileTreeNode | DirectoryTreeNode;
+
+/** Creates a directory tree node. */
+function createDirectoryNode(
+ name: string,
+ path: string,
+ depth: number,
+ parent: DirectoryTreeNode | undefined,
+): DirectoryTreeNode {
+ return {
+ isDirectory: true,
+ name,
+ path,
+ isExpanded: signal(true),
+ children: [],
+ depth,
+ parent,
+ };
+}
+
+/** Creates a file tree node. */
+function createFileNode(
+ name: string,
+ path: string,
+ depth: number,
+ parent: DirectoryTreeNode | undefined,
+ file: LlmResponseFile,
+): FileTreeNode {
+ return {
+ isDirectory: false,
+ name,
+ path,
+ depth,
+ parent,
+ file,
+ };
+}
+
+/** Recursively sorts the tree, directories first, then alphabetically. */
+function sortTree(nodes: TreeNode[]): void {
+ nodes.sort((a, b) => {
+ if (a.isDirectory !== b.isDirectory) {
+ return a.isDirectory ? -1 : 1;
+ }
+ return a.name.localeCompare(b.name);
+ });
+ nodes.forEach(node => {
+ if (node.isDirectory) {
+ sortTree(node.children);
+ }
+ });
+}
+
+/** Ensures all parent directories for a given path exist in the tree. */
+function ensureDirectoryPath(pathParts: string[], nodeMap: Map): void {
+ let currentPath = '';
+ let parentNode = nodeMap.get('')!; // Start from root
+
+ for (const part of pathParts) {
+ currentPath = currentPath ? `${currentPath}/${part}` : part;
+
+ if (!nodeMap.has(currentPath)) {
+ assertIsDirectory(parentNode, 'Expected parent node to be a directory.');
+ const newNode = createDirectoryNode(part, currentPath, parentNode.depth + 1, parentNode);
+ parentNode.children.push(newNode);
+ nodeMap.set(currentPath, newNode);
+ }
+ parentNode = nodeMap.get(currentPath)!;
+ }
+}
+
+/** Builds the file tree from a flat list of files. */
+export function buildFileTree(files: LlmResponseFile[]): TreeNode[] {
+ const root = createDirectoryNode('root', '', -1, undefined);
+ const nodeMap = new Map([['', root]]);
+
+ for (const file of files) {
+ const pathParts = file.filePath.split('/');
+ const fileName = pathParts.pop()!;
+ const directoryPath = pathParts.join('/');
+
+ ensureDirectoryPath(pathParts, nodeMap);
+
+ const parentNode = nodeMap.get(directoryPath)!;
+ assertIsDirectory(parentNode, 'Expected parent node to be a directory.');
+
+ const fileNode = createFileNode(
+ fileName,
+ file.filePath,
+ parentNode.depth + 1,
+ parentNode,
+ file,
+ );
+ parentNode.children.push(fileNode);
+ }
+
+ sortTree(root.children);
+ return root.children;
+}
+
+function assertIsDirectory(n: TreeNode, failureMessage: string): asserts n is DirectoryTreeNode {
+ if (!n.isDirectory) {
+ throw new Error(failureMessage);
+ }
+}
diff --git a/report-app/src/app/shared/provider-label.ts b/report-app/src/app/shared/provider-label.ts
index b3ab5afc..786e7013 100644
--- a/report-app/src/app/shared/provider-label.ts
+++ b/report-app/src/app/shared/provider-label.ts
@@ -9,6 +9,7 @@ const exactMatches: Record = {
'gemini-cli': 'gemini.webp',
genkit: 'genkit.png',
codex: 'open-ai.png',
+ 'ai-sdk': 'ai-sdk.png',
};
@Component({
@@ -30,7 +31,8 @@ const exactMatches: Record = {
height: 24px;
}
- :host-context(.dark-mode) :host(.genkit) .logo {
+ :host-context(.dark-mode) :host(.genkit) .logo,
+ :host-context(.dark-mode) :host(.next) .logo {
filter: invert(1);
}
`,
@@ -67,7 +69,7 @@ export class ProviderLabel {
function getModelLogoURL(id: string): string | null {
if (id.startsWith('gemini')) {
return 'gemini.webp';
- } else if (id.startsWith('openai')) {
+ } else if (id.startsWith('openai') || id.startsWith('gpt')) {
return 'open-ai.png';
} else if (id.startsWith('claude')) {
return 'claude.png';
diff --git a/report-app/src/app/shared/scoring.ts b/report-app/src/app/shared/scoring.ts
index f609b5ec..9081694f 100644
--- a/report-app/src/app/shared/scoring.ts
+++ b/report-app/src/app/shared/scoring.ts
@@ -8,6 +8,12 @@ export enum ScoreCssVariable {
good = 'var(--status-fill-good)',
poor = 'var(--status-fill-poor)',
neutral = 'var(--status-fill-neutral)',
+ // When we need a more refined gradient between "good" and "poor".
+ mediocre1 = 'var(--status-fill-mediocre-1)',
+ mediocre2 = 'var(--status-fill-mediocre-2)',
+ mediocre3 = 'var(--status-fill-mediocre-3)',
+ mediocre4 = 'var(--status-fill-mediocre-4)',
+ mediocre5 = 'var(--status-fill-mediocre-5)',
}
const CACHED_COLORS = {
diff --git a/report-app/src/app/shared/styles/callouts.scss b/report-app/src/app/shared/styles/callouts.scss
index 84bc52c1..71fec1b4 100644
--- a/report-app/src/app/shared/styles/callouts.scss
+++ b/report-app/src/app/shared/styles/callouts.scss
@@ -11,6 +11,7 @@
font-family: monospace;
font-size: 0.75rem;
white-space: pre-wrap;
+ overflow: scroll;
}
&.neutral {
diff --git a/report-app/src/app/shared/styles/tooltip.scss b/report-app/src/app/shared/styles/tooltip.scss
index 7b634c38..d03df688 100644
--- a/report-app/src/app/shared/styles/tooltip.scss
+++ b/report-app/src/app/shared/styles/tooltip.scss
@@ -28,6 +28,7 @@
&.multiline-tooltip::before {
white-space: normal;
+ width: max-content;
max-width: 400px;
}
diff --git a/report-app/src/app/shared/visualization/stacked-bar-chart/stacked-bar-chart.scss b/report-app/src/app/shared/visualization/stacked-bar-chart/stacked-bar-chart.scss
index d3f6a381..16027543 100644
--- a/report-app/src/app/shared/visualization/stacked-bar-chart/stacked-bar-chart.scss
+++ b/report-app/src/app/shared/visualization/stacked-bar-chart/stacked-bar-chart.scss
@@ -37,6 +37,7 @@
font-weight: 500;
transition: filter 0.2s ease-in-out;
min-width: 50px;
+ user-select: none;
&:first-child {
border-top-left-radius: 8px;
@@ -55,8 +56,9 @@
.legend {
display: flex;
- justify-content: center;
- gap: 1.5rem;
+ flex-wrap: wrap;
+ justify-content: flex-start;
+ column-gap: 1.5rem;
}
.legend-item {
@@ -65,6 +67,7 @@
font-size: 14px;
color: var(--text-secondary);
white-space: nowrap;
+ margin-top: 0.5rem;
}
.legend-color {
diff --git a/report-app/src/styles.scss b/report-app/src/styles.scss
index 3d0bcb2e..68341d6d 100644
--- a/report-app/src/styles.scss
+++ b/report-app/src/styles.scss
@@ -30,7 +30,14 @@
--status-fill-great: #84cc16;
--status-fill-good: #f59e0b;
--status-fill-poor: #ef4444;
- --status-fill-neutral: #eff6ff;
+ --status-fill-neutral: #aaa;
+
+ /* When we need a more gradient spread of "meh". */
+ --status-fill-mediocre-1: #fbbc04; /* Yellow 500 */
+ --status-fill-mediocre-2: #f9ab00; /* Yellow 600 */
+ --status-fill-mediocre-3: #f29900; /* Yellow 700 */
+ --status-fill-mediocre-4: #ea8600; /* Yellow 800 */
+ --status-fill-mediocre-5: #e37400; /* Yellow 900 */
--status-text-excellent: #0c855d;
--status-text-great: #0c855d; // TODO: do we want to differentiate from `excellent`?
diff --git a/runner/codegen/ai-sdk/ai-sdk-model-options.ts b/runner/codegen/ai-sdk/ai-sdk-model-options.ts
new file mode 100644
index 00000000..65320553
--- /dev/null
+++ b/runner/codegen/ai-sdk/ai-sdk-model-options.ts
@@ -0,0 +1,16 @@
+import {AnthropicProviderOptions} from '@ai-sdk/anthropic';
+import {GoogleGenerativeAIProviderOptions} from '@ai-sdk/google';
+import {XaiProviderOptions} from '@ai-sdk/xai';
+import {OpenAIResponsesProviderOptions} from '@ai-sdk/openai';
+import {LanguageModelV3, SharedV3ProviderOptions} from '@ai-sdk/provider';
+
+export type AiSdkModelOptions = {
+ model: LanguageModelV3;
+ providerOptions:
+ | {anthropic: AnthropicProviderOptions}
+ | {google: GoogleGenerativeAIProviderOptions}
+ | {openai: OpenAIResponsesProviderOptions}
+ | {xai: XaiProviderOptions}
+ // This supports extensions of `AISdkRunner` for custom model providers.
+ | SharedV3ProviderOptions;
+};
diff --git a/runner/codegen/ai-sdk/ai-sdk-runner.ts b/runner/codegen/ai-sdk/ai-sdk-runner.ts
new file mode 100644
index 00000000..76cc694f
--- /dev/null
+++ b/runner/codegen/ai-sdk/ai-sdk-runner.ts
@@ -0,0 +1,261 @@
+import {
+ FilePart,
+ generateText,
+ ModelMessage,
+ Output,
+ SystemModelMessage,
+ TextPart,
+ ToolSet,
+} from 'ai';
+import {createMCPClient, MCPClient} from '@ai-sdk/mcp';
+import {Experimental_StdioMCPTransport as StdioClientTransport} from '@ai-sdk/mcp/mcp-stdio';
+import z from 'zod';
+import {combineAbortSignals} from '../../utils/abort-signal.js';
+import {callWithTimeout} from '../../utils/timeout.js';
+import {
+ LlmRunner,
+ LocalLlmConstrainedOutputGenerateRequestOptions,
+ LocalLlmConstrainedOutputGenerateResponse,
+ LocalLlmGenerateFilesRequestOptions,
+ LocalLlmGenerateFilesResponse,
+ LocalLlmGenerateTextRequestOptions,
+ LocalLlmGenerateTextResponse,
+ McpServerDetails,
+ McpServerOptions,
+ PromptDataMessage,
+} from '../llm-runner.js';
+import {ANTHROPIC_MODELS, getAiSdkModelOptionsForAnthropic} from './anthropic.js';
+import {getAiSdkModelOptionsForGoogle, GOOGLE_MODELS} from './google.js';
+import {getAiSdkModelOptionsForOpenAI, OPENAI_MODELS} from './openai.js';
+import {AiSdkModelOptions} from './ai-sdk-model-options.js';
+import {getAiSdkModelOptionsForXai, XAI_MODELS} from './xai.js';
+
+const SUPPORTED_MODELS = [
+ ...GOOGLE_MODELS,
+ ...ANTHROPIC_MODELS,
+ ...OPENAI_MODELS,
+ ...XAI_MODELS,
+] as const;
+
+// Increased to a very high value as we rely on an actual timeout
+// that aborts stuck LLM requests. WCS is targeting stability here;
+// even if it involves many exponential backoff-waiting.
+const DEFAULT_MAX_RETRIES = 100000;
+
+export class AiSdkRunner implements LlmRunner {
+ readonly displayName = 'AI SDK';
+ readonly id = 'ai-sdk';
+ readonly hasBuiltInRepairLoop = true;
+ private mcpClients: MCPClient[] | null = null;
+
+ async generateText(
+ options: LocalLlmGenerateTextRequestOptions,
+ ): Promise {
+ const response = await this._wrapRequestWithTimeoutAndRateLimiting(options, async abortSignal =>
+ generateText({
+ ...(await this.getAiSdkModelOptions(options)),
+ abortSignal: abortSignal,
+ messages: this.convertRequestToMessagesList(options),
+ maxRetries: DEFAULT_MAX_RETRIES,
+ tools: await this.getTools(),
+ }),
+ );
+
+ return {
+ reasoning: response.reasoningText ?? '',
+ text: response.text,
+ usage: {
+ inputTokens: response.usage.inputTokens ?? 0,
+ outputTokens: response.usage.outputTokens ?? 0,
+ thinkingTokens: response.usage.reasoningTokens ?? 0,
+ totalTokens: response.usage.totalTokens ?? 0,
+ },
+ // TODO: Consider supporting `toolLogs` and MCP here.
+ };
+ }
+
+ async generateConstrained(
+ options: LocalLlmConstrainedOutputGenerateRequestOptions,
+ ): Promise> {
+ const response = await this._wrapRequestWithTimeoutAndRateLimiting(options, async abortSignal =>
+ generateText({
+ ...(await this.getAiSdkModelOptions(options)),
+ messages: this.convertRequestToMessagesList(options),
+ output: Output.object>({schema: options.schema}),
+ abortSignal: abortSignal,
+ maxRetries: DEFAULT_MAX_RETRIES,
+ tools: await this.getTools(),
+ }),
+ );
+
+ return {
+ reasoning: response.reasoning.map(r => r.text).join('\n') ?? '',
+ output: response.output,
+ usage: {
+ inputTokens: response.usage.inputTokens ?? 0,
+ outputTokens: response.usage.outputTokens ?? 0,
+ thinkingTokens: response.usage.reasoningTokens ?? 0,
+ totalTokens: response.usage.totalTokens ?? 0,
+ },
+ // TODO: Consider supporting `toolLogs` and MCP here.
+ };
+ }
+
+ async generateFiles(
+ options: LocalLlmGenerateFilesRequestOptions,
+ ): Promise {
+ const response = await this.generateConstrained({
+ ...options,
+ prompt: options.context.executablePrompt,
+ systemPrompt: options.context.systemInstructions,
+ schema: z.object({
+ outputFiles: z.array(
+ z.object({
+ filePath: z.string().describe('Name of the file that is being changed'),
+ code: z.string().describe('New code of the file'),
+ }),
+ ),
+ }),
+ });
+
+ return {
+ files: response.output?.outputFiles ?? [],
+ reasoning: response.reasoning,
+ usage: response.usage,
+ // TODO: Consider supporting `toolLogs` and MCP here.
+ };
+ }
+
+ getSupportedModels(): string[] {
+ return [...SUPPORTED_MODELS];
+ }
+
+ async dispose(): Promise {
+ if (this.mcpClients) {
+ for (const client of this.mcpClients) {
+ try {
+ await client.close();
+ } catch (error) {
+ console.error(`Failed to close MCP client`, error);
+ }
+ }
+ }
+ }
+
+ async startMcpServerHost(
+ _hostName: string,
+ servers: McpServerOptions[],
+ ): Promise {
+ const details: McpServerDetails = {resources: [], tools: []};
+
+ for (const server of servers) {
+ const client = await createMCPClient({
+ transport: new StdioClientTransport({
+ command: server.command,
+ args: server.args,
+ env: server.env,
+ }),
+ });
+
+ const [resources, tools] = await Promise.all([client.listResources(), client.tools()]);
+ resources.resources.forEach(r => details.resources.push(r.name));
+ details.tools.push(...Object.keys(tools));
+ this.mcpClients ??= [];
+ this.mcpClients.push(client);
+ }
+
+ return details;
+ }
+
+ private async _wrapRequestWithTimeoutAndRateLimiting(
+ request: LocalLlmGenerateTextRequestOptions | LocalLlmConstrainedOutputGenerateRequestOptions,
+ fn: (abortSignal: AbortSignal) => Promise,
+ ): Promise {
+ // TODO: Check if rate-limiting is actually necessary here. AI SDK
+ // seems to do retrying on its own.
+
+ if (request.timeout === undefined) {
+ return await fn(request.abortSignal);
+ }
+ return callWithTimeout(
+ request.timeout.description,
+ abortSignal => fn(combineAbortSignals(abortSignal, request.abortSignal)),
+ request.timeout.durationInMins,
+ );
+ }
+
+ protected async getAiSdkModelOptions(
+ request: LocalLlmGenerateTextRequestOptions,
+ ): Promise {
+ const result =
+ (await getAiSdkModelOptionsForGoogle(request.model)) ??
+ (await getAiSdkModelOptionsForAnthropic(request.model)) ??
+ (await getAiSdkModelOptionsForOpenAI(request.model)) ??
+ (await getAiSdkModelOptionsForXai(request.model));
+ if (result === null) {
+ throw new Error(`Unexpected unsupported model: ${request.model}`);
+ }
+ return result;
+ }
+
+ protected convertRequestToMessagesList(
+ request: LocalLlmConstrainedOutputGenerateRequestOptions | LocalLlmGenerateTextRequestOptions,
+ ): ModelMessage[] {
+ return [
+ // System prompt message.
+ ...(request.systemPrompt !== undefined
+ ? [
+ {
+ role: 'system',
+ content: request.systemPrompt,
+ } satisfies SystemModelMessage,
+ ]
+ : []),
+ // Optional additional messages
+ ...this.toAiSDKMessage(request.messages ?? []),
+ // The main message.
+ {role: 'user', content: [{type: 'text', text: request.prompt}]},
+ ];
+ }
+
+ protected toAiSDKMessage(messages: PromptDataMessage[]): ModelMessage[] {
+ const result: ModelMessage[] = [];
+
+ for (const message of messages) {
+ if (message.role === 'model') {
+ result.push({
+ role: 'assistant',
+ content: message.content.map(c =>
+ 'media' in c
+ ? ({type: 'file', data: c.media.url, mediaType: 'image/png'} satisfies FilePart)
+ : ({type: 'text', text: c.text} satisfies TextPart),
+ ),
+ });
+ } else if (message.role === 'user') {
+ result.push({
+ role: 'user',
+ content: message.content.map(c =>
+ 'media' in c
+ ? ({type: 'file', data: c.media.url, mediaType: 'image/png'} satisfies FilePart)
+ : ({type: 'text', text: c.text} satisfies TextPart),
+ ),
+ });
+ }
+ }
+ return result;
+ }
+
+ private async getTools(): Promise {
+ let tools: ToolSet | undefined;
+
+ if (this.mcpClients) {
+ for (const client of this.mcpClients) {
+ const clientTools = (await client.tools()) as ToolSet;
+ tools ??= {};
+ Object.keys(clientTools).forEach(name => (tools![name] = clientTools[name]));
+ }
+ }
+
+ return tools;
+ }
+}
diff --git a/runner/codegen/ai-sdk/anthropic.ts b/runner/codegen/ai-sdk/anthropic.ts
new file mode 100644
index 00000000..3f58ef35
--- /dev/null
+++ b/runner/codegen/ai-sdk/anthropic.ts
@@ -0,0 +1,68 @@
+import {createAnthropic, AnthropicProviderOptions} from '@ai-sdk/anthropic';
+import {wrapLanguageModel} from 'ai';
+import {anthropicThinkingWithStructuredResponseMiddleware} from './anthropic_thinking_patch.js';
+import {AiSdkModelOptions} from './ai-sdk-model-options.js';
+
+export const ANTHROPIC_MODELS = [
+ 'claude-opus-4.1-no-thinking',
+ 'claude-opus-4.1-with-thinking-16k',
+ 'claude-opus-4.1-with-thinking-32k',
+ 'claude-opus-4.5-no-thinking',
+ 'claude-opus-4.5-with-thinking-16k',
+ 'claude-opus-4.5-with-thinking-32k',
+ 'claude-sonnet-4.5-no-thinking',
+ 'claude-sonnet-4.5-with-thinking-16k',
+ 'claude-sonnet-4.5-with-thinking-32k',
+] as const;
+
+export async function getAiSdkModelOptionsForAnthropic(
+ rawModelName: string,
+): Promise {
+ const modelName = rawModelName as (typeof ANTHROPIC_MODELS)[number];
+ const provideModel = createAnthropic({apiKey: process.env['ANTHROPIC_API_KEY']});
+
+ switch (modelName) {
+ case 'claude-opus-4.1-no-thinking':
+ case 'claude-opus-4.1-with-thinking-16k':
+ case 'claude-opus-4.1-with-thinking-32k':
+ case 'claude-opus-4.5-no-thinking':
+ case 'claude-opus-4.5-with-thinking-16k':
+ case 'claude-opus-4.5-with-thinking-32k':
+ case 'claude-sonnet-4.5-no-thinking':
+ case 'claude-sonnet-4.5-with-thinking-16k':
+ case 'claude-sonnet-4.5-with-thinking-32k': {
+ const thinkingEnabled = modelName.includes('-with-thinking');
+ const thinkingBudget = !thinkingEnabled
+ ? undefined
+ : modelName.endsWith('-32k')
+ ? 32_000
+ : 16_000;
+ let apiModelName = 'claude-sonnet-4-5';
+ if (modelName.includes('opus-4.1')) {
+ apiModelName = 'claude-opus-4-1';
+ } else if (modelName.includes('opus-4.5')) {
+ apiModelName = 'claude-opus-4-5';
+ }
+ const model = provideModel(apiModelName);
+ return {
+ model: thinkingEnabled
+ ? wrapLanguageModel({
+ model,
+ middleware: anthropicThinkingWithStructuredResponseMiddleware,
+ })
+ : model,
+ providerOptions: {
+ anthropic: {
+ sendReasoning: thinkingEnabled,
+ thinking: {
+ type: thinkingEnabled ? 'enabled' : 'disabled',
+ budgetTokens: thinkingBudget,
+ },
+ } satisfies AnthropicProviderOptions,
+ },
+ };
+ }
+ default:
+ return null;
+ }
+}
diff --git a/runner/codegen/ai-sdk/anthropic_thinking_patch.ts b/runner/codegen/ai-sdk/anthropic_thinking_patch.ts
new file mode 100644
index 00000000..0a8d67ff
--- /dev/null
+++ b/runner/codegen/ai-sdk/anthropic_thinking_patch.ts
@@ -0,0 +1,49 @@
+import type {LanguageModelV3Middleware} from '@ai-sdk/provider';
+
+/**
+ * Middleware for Anthropic AI SDK models that is necessary for enabling
+ * thinking mode + structured responses.
+ *
+ * This is necessary because Anthropic would be used with enforced tool usage
+ * by default with `generateText()`. This is a workaround that makes the tool
+ * optional: https://github.com/vercel/ai/issues/9351, https://github.com/vercel/ai/issues/11227.
+ */
+export const anthropicThinkingWithStructuredResponseMiddleware: LanguageModelV3Middleware = {
+ specificationVersion: 'v3',
+ transformParams: ({params}) => {
+ if (params.responseFormat?.type === 'json' && params.responseFormat.schema) {
+ params.tools = [
+ {
+ type: 'function',
+ description: 'Respond with a JSON object for the structured output/answer.',
+ inputSchema: params.responseFormat.schema,
+ name: 'json',
+ },
+ ];
+ params.toolChoice = {type: 'auto'};
+ params.responseFormat = {type: 'text'};
+ params.prompt.push({
+ role: 'user',
+ content: [
+ {
+ type: 'text',
+ text: 'Use the `json` tool to provide the structured output/answer. No other text is needed.',
+ },
+ ],
+ });
+ }
+ return Promise.resolve(params);
+ },
+ wrapGenerate: async ({doGenerate}) => {
+ const result = await doGenerate();
+
+ // Extract the JSON tool call (conforming to the schema) and return it as text response.
+ for (const r of result.content) {
+ if (r.type === 'tool-call' && r.toolName === 'json') {
+ result.content.push({type: 'text', text: r.input});
+ }
+ }
+
+ return result;
+ },
+};
diff --git a/runner/codegen/ai-sdk/google.ts b/runner/codegen/ai-sdk/google.ts
new file mode 100644
index 00000000..4683653d
--- /dev/null
+++ b/runner/codegen/ai-sdk/google.ts
@@ -0,0 +1,72 @@
+import {createGoogleGenerativeAI, GoogleGenerativeAIProviderOptions} from '@ai-sdk/google';
+import {AiSdkModelOptions} from './ai-sdk-model-options.js';
+
+export const GOOGLE_MODELS = [
+ 'gemini-2.5-flash-lite',
+ 'gemini-2.5-flash',
+ 'gemini-2.5-flash-no-thinking',
+ 'gemini-2.5-flash-with-thinking-16k',
+ 'gemini-2.5-flash-with-thinking-24k',
+ 'gemini-2.5-pro',
+ 'gemini-3-pro-preview',
+] as const;
+
+export async function getAiSdkModelOptionsForGoogle(
+ rawModelName: string,
+): Promise<AiSdkModelOptions | null> {
+ const modelName = rawModelName as (typeof GOOGLE_MODELS)[number];
+ const provideModel = createGoogleGenerativeAI({apiKey: process.env['GEMINI_API_KEY']});
+
+ switch (modelName) {
+ case 'gemini-2.5-flash-lite':
+ case 'gemini-2.5-flash':
+ case 'gemini-2.5-pro':
+ case 'gemini-3-pro-preview':
+ return {
+ model: provideModel(modelName),
+ providerOptions: {
+ google: {
+ thinkingConfig: {
+ includeThoughts: true,
+ },
+ } satisfies GoogleGenerativeAIProviderOptions,
+ },
+ };
+ case 'gemini-2.5-flash-no-thinking': {
+ return {
+ model: provideModel('gemini-2.5-flash'),
+ providerOptions: {
+ google: {
+ thinkingConfig: {
+ thinkingBudget: 0,
+ },
+ },
+ },
+ };
+ }
+ case 'gemini-2.5-flash-with-thinking-16k':
+ case 'gemini-2.5-flash-with-thinking-24k':
+ let thinkingBudget: number;
+ if (modelName.endsWith('-16k')) {
+ thinkingBudget = 16_000;
+ } else if (modelName.endsWith('-24k')) {
+ thinkingBudget = 24_000;
+ } else {
+ throw new Error(`Unexpected model: ${modelName}`);
+ }
+
+ return {
+ model: provideModel('gemini-2.5-flash'),
+ providerOptions: {
+ google: {
+ thinkingConfig: {
+ thinkingBudget: thinkingBudget,
+ includeThoughts: true,
+ },
+ } satisfies GoogleGenerativeAIProviderOptions,
+ },
+ };
+ default:
+ return null;
+ }
+}
diff --git a/runner/codegen/ai-sdk/openai.ts b/runner/codegen/ai-sdk/openai.ts
new file mode 100644
index 00000000..b0427f71
--- /dev/null
+++ b/runner/codegen/ai-sdk/openai.ts
@@ -0,0 +1,42 @@
+import {createOpenAI, OpenAIResponsesProviderOptions} from '@ai-sdk/openai';
+import {AiSdkModelOptions} from './ai-sdk-model-options.js';
+
+export const OPENAI_MODELS = [
+ 'gpt-5.1-no-thinking',
+ 'gpt-5.1-thinking-low',
+ 'gpt-5.1-thinking-high',
+ 'gpt-5.1-thinking-medium',
+] as const;
+
+export async function getAiSdkModelOptionsForOpenAI(
+ rawModelName: string,
+): Promise<AiSdkModelOptions | null> {
+ const provideModel = createOpenAI({apiKey: process.env['OPENAI_API_KEY']});
+ const modelName = rawModelName as (typeof OPENAI_MODELS)[number];
+
+ switch (modelName) {
+ case 'gpt-5.1-no-thinking':
+ case 'gpt-5.1-thinking-low':
+ case 'gpt-5.1-thinking-medium':
+ case 'gpt-5.1-thinking-high':
+ let reasoningEffort: string = 'none';
+ if (modelName === 'gpt-5.1-thinking-high') {
+ reasoningEffort = 'high';
+ } else if (modelName === 'gpt-5.1-thinking-medium') {
+ reasoningEffort = 'medium';
+ } else if (modelName === 'gpt-5.1-thinking-low') {
+ reasoningEffort = 'low';
+ }
+ return {
+ model: provideModel('gpt-5.1'),
+ providerOptions: {
+ openai: {
+ reasoningEffort,
+ reasoningSummary: 'detailed',
+ } satisfies OpenAIResponsesProviderOptions,
+ },
+ };
+ default:
+ return null;
+ }
+}
diff --git a/runner/codegen/ai-sdk/xai.ts b/runner/codegen/ai-sdk/xai.ts
new file mode 100644
index 00000000..0e0ee80e
--- /dev/null
+++ b/runner/codegen/ai-sdk/xai.ts
@@ -0,0 +1,28 @@
+import {createXai, XaiProviderOptions} from '@ai-sdk/xai';
+import {AiSdkModelOptions} from './ai-sdk-model-options.js';
+
+export const XAI_MODELS = ['grok-4', 'grok-code-fast-1'] as const;
+
+export async function getAiSdkModelOptionsForXai(
+ rawModelName: string,
+): Promise<AiSdkModelOptions | null> {
+ const provideModel = createXai({apiKey: process.env['XAI_API_KEY']});
+ const modelName = rawModelName as (typeof XAI_MODELS)[number];
+
+ switch (modelName) {
+ case 'grok-4':
+ case 'grok-code-fast-1':
+ const reasoningEffort = modelName === 'grok-4' ? 'high' : 'low';
+
+ return {
+ model: provideModel(modelName),
+ providerOptions: {
+ xai: {
+ reasoningEffort,
+ } satisfies XaiProviderOptions,
+ },
+ };
+ default:
+ return null;
+ }
+}
diff --git a/runner/codegen/claude-code-runner.ts b/runner/codegen/claude-code-runner.ts
index ed742e76..58592427 100644
--- a/runner/codegen/claude-code-runner.ts
+++ b/runner/codegen/claude-code-runner.ts
@@ -11,6 +11,11 @@ import {BaseCliAgentRunner} from './base-cli-agent-runner.js';
 const MODEL_MAPPING: Record<string, string> = {
'claude-4.0-sonnet': 'claude-sonnet-4-20250514',
'claude-3.5-haiku': 'claude-3-5-haiku-latest',
+ 'claude-4.5-sonnet': 'claude-sonnet-4-5-20250929',
+ 'claude-4.0-opus': 'claude-opus-4-20250514',
+ 'claude-4.5-opus': 'claude-opus-4-5-20251101',
+ 'claude-4.5-haiku': 'claude-haiku-4-5-20251001',
+ 'claude-4.6-opus': 'claude-opus-4-6',
};
/** Runner that generates code using the Claude Code. */
diff --git a/runner/codegen/gemini-cli-runner.ts b/runner/codegen/gemini-cli-runner.ts
index fcd47e18..bfa7ba4a 100644
--- a/runner/codegen/gemini-cli-runner.ts
+++ b/runner/codegen/gemini-cli-runner.ts
@@ -4,7 +4,12 @@ import {existsSync, mkdirSync} from 'fs';
import {writeFile} from 'fs/promises';
import {BaseCliAgentRunner} from './base-cli-agent-runner.js';
-const SUPPORTED_MODELS = ['gemini-2.5-pro', 'gemini-2.5-flash', 'gemini-2.5-flash-lite'];
+const SUPPORTED_MODELS = [
+ 'gemini-3-pro-preview',
+ 'gemini-2.5-pro',
+ 'gemini-2.5-flash',
+ 'gemini-2.5-flash-lite',
+];
/** Runner that generates code using the Gemini CLI. */
export class GeminiCliRunner extends BaseCliAgentRunner implements LlmRunner {
diff --git a/runner/codegen/genkit/genkit-logger.ts b/runner/codegen/genkit/genkit-logger.ts
deleted file mode 100644
index e4e15072..00000000
--- a/runner/codegen/genkit/genkit-logger.ts
+++ /dev/null
@@ -1,59 +0,0 @@
-import {logger} from 'genkit/logging';
-
-const defaultLogger = logger.defaultLogger;
-
-/** Custom logger for Genkit. */
-export class GenkitLogger {
- private pendingLogs: unknown[] = [];
- private isCapturingLogs = false;
- level = defaultLogger.level;
-
- info(...args: unknown[]): void {
- this.maybeCapture(args);
- defaultLogger.info(...args);
- }
-
- debug(...args: unknown[]): void {
- this.maybeCapture(args);
- defaultLogger.debug(...args);
- }
-
- error(...args: unknown[]): void {
- this.maybeCapture(args);
- defaultLogger.error(...args);
- }
-
- warn(...args: unknown[]): void {
- this.maybeCapture(args);
- defaultLogger.warn(...args);
- }
-
- shouldLog(targetLevel: string): boolean {
- return this.level === targetLevel;
- }
-
- startCapturingLogs(): void {
- if (this.isCapturingLogs) {
- throw new Error('Logger is already capturing logs');
- }
-
- this.isCapturingLogs = true;
- }
-
- flushCapturedLogs(): unknown[] {
- if (!this.isCapturingLogs) {
- throw new Error('Logger is not capturing logs');
- }
-
- const logs = this.pendingLogs;
- this.pendingLogs = [];
- this.isCapturingLogs = false;
- return logs;
- }
-
- private maybeCapture(logs: unknown[]): void {
- if (this.isCapturingLogs) {
- this.pendingLogs.push(...logs);
- }
- }
-}
diff --git a/runner/codegen/genkit/genkit-runner.ts b/runner/codegen/genkit/genkit-runner.ts
deleted file mode 100644
index 001f6583..00000000
--- a/runner/codegen/genkit/genkit-runner.ts
+++ /dev/null
@@ -1,341 +0,0 @@
-import {
- Action,
- DynamicResourceAction,
- GenerateResponse,
- genkit,
- ModelReference,
- ToolAction,
-} from 'genkit';
-import {GenkitMcpHost, McpServerConfig, createMcpHost} from '@genkit-ai/mcp';
-import {GenkitPlugin, GenkitPluginV2} from 'genkit/plugin';
-import {z} from 'zod';
-import {
- McpServerOptions,
- LocalLlmConstrainedOutputGenerateRequestOptions,
- LocalLlmConstrainedOutputGenerateResponse,
- LlmRunner,
- LocalLlmGenerateFilesResponse,
- LocalLlmGenerateTextResponse,
- LocalLlmGenerateTextRequestOptions,
- LocalLlmGenerateFilesRequestOptions,
- McpServerDetails,
-} from '../llm-runner.js';
-import {setTimeout} from 'node:timers/promises';
-import {callWithTimeout} from '../../utils/timeout.js';
-import {logger} from 'genkit/logging';
-import {GenkitLogger} from './genkit-logger.js';
-import {MODEL_PROVIDERS} from './models.js';
-import {UserFacingError} from '../../utils/errors.js';
-import {GenkitModelProvider, PromptDataForCounting} from './model-provider.js';
-import {ToolLogEntry} from '../../shared-interfaces.js';
-import {combineAbortSignals} from '../../utils/abort-signal.js';
-import {toToolDefinition} from 'genkit/tool';
-
-const globalLogger = new GenkitLogger();
-logger.init(globalLogger);
-
-/**
- * Gets the name of a Genkit action.
- */
-function getActionName(action: Action): string {
- return toToolDefinition(action).name;
-}
-
-/** Runner that uses the Genkit API under the hood. */
-export class GenkitRunner implements LlmRunner {
- readonly id = 'genkit';
- readonly displayName = 'Genkit';
- readonly hasBuiltInRepairLoop = false;
- private readonly genkitInstance = this.getGenkitInstance();
- private mcpHost: GenkitMcpHost | null = null;
- private toolLogs: ToolLogEntry[] = [];
-
- async generateConstrained(
- options: LocalLlmConstrainedOutputGenerateRequestOptions,
- ): Promise> {
- const {provider, model} = this.resolveModel(options.model);
- const result = await this._genkitRequest(provider, model, options);
-
- return {
- output: result.output,
- usage: result.usage,
- reasoning: result.reasoning,
- };
- }
-
- async generateFiles(
- options: LocalLlmGenerateFilesRequestOptions,
- ): Promise {
- const requestOptions: LocalLlmConstrainedOutputGenerateRequestOptions = {
- ...options,
- prompt: options.context.executablePrompt,
- systemPrompt: options.context.systemInstructions,
- schema: z.object({
- outputFiles: z.array(
- z.object({
- filePath: z.string().describe('Name of the file that is being changed'),
- code: z.string().describe('New code of the file'),
- }),
- ),
- }),
- };
-
- const {provider, model} = this.resolveModel(options.model);
- const result = await this._genkitRequest(provider, model, requestOptions);
- const files = result.output.outputFiles || [];
-
- if (!provider.validateGeneratedFiles(files)) {
- throw new Error(`Invalid files generated by model "${options.model}"`);
- }
-
- return {
- files,
- usage: result.usage,
- reasoning: result.reasoning,
- toolLogs: this.flushToolLogs(),
- };
- }
-
- flushToolLogs(): ToolLogEntry[] {
- return this.toolLogs.splice(0);
- }
-
- async generateText(
- options: LocalLlmGenerateTextRequestOptions,
- ): Promise {
- const {provider, model} = this.resolveModel(options.model);
- const result = await this._genkitRequest(provider, model, options);
-
- return {
- text: result.text,
- usage: result.usage,
- reasoning: result.reasoning,
- toolLogs: this.flushToolLogs(),
- };
- }
-
- getSupportedModels(): string[] {
- return MODEL_PROVIDERS.flatMap(p => p.getSupportedModels());
- }
-
- getSupportedModelsWithAPIKey(): string[] {
- return MODEL_PROVIDERS.filter(p => p.getApiKey() !== null).flatMap(p => p.getSupportedModels());
- }
-
- private async _genkitRequest(
- provider: GenkitModelProvider,
- model: ModelReference,
- options: LocalLlmGenerateTextRequestOptions | LocalLlmConstrainedOutputGenerateRequestOptions,
- ) {
- return await rateLimitLLMRequest(
- provider,
- model,
- {messages: options.messages || [], prompt: options.prompt},
- () => {
- const schema = (options as Partial).schema;
- const performRequest = async (abortSignal: AbortSignal) => {
- let tools: ToolAction[] | undefined;
- let resources: DynamicResourceAction[] | undefined;
-
- if (!options.skipMcp && this.mcpHost) {
- [tools, resources] = await Promise.all([
- this.mcpHost.getActiveTools(this.genkitInstance),
- this.mcpHost.getActiveResources(this.genkitInstance),
- ]);
- }
-
- const response = await this.genkitInstance.generate({
- prompt: options.prompt,
- system: options.systemPrompt,
- model,
- output: schema
- ? {
- // Note that the schema needs to be cast to `any`, because allowing its type to
- // be inferred ends up causing `TS2589: Type instantiation is excessively deep and possibly infinite.`,
- // most likely due to how the Genkit type inferrence is set up. This doesn't affect
- // the return type since it was already `ZodTypeAny` which coerces to `any`.
- schema: schema as any,
- constrained: true,
- }
- : undefined,
- config: provider.getModelSpecificConfig(
- {
- includeThoughts: options.thinkingConfig?.includeThoughts ?? false,
- },
- options.model,
- ),
- messages: options.messages,
- tools,
- resources,
- abortSignal,
- });
-
- this._logToolUsage(response);
-
- return response;
- };
-
- return options.timeout
- ? callWithTimeout(
- options.timeout.description,
- timeoutAbortSignal =>
- performRequest(combineAbortSignals(timeoutAbortSignal, options.abortSignal)),
- options.timeout.durationInMins,
- )
- : performRequest(options.abortSignal);
- },
- );
- }
-
- private _logToolUsage(response: GenerateResponse) {
- const toolRequests = new Map();
- const toolResponses = new Map();
-
- if (response.request?.messages) {
- for (const message of response.request.messages) {
- if (!message.content) {
- continue;
- }
- for (const contentPart of message.content) {
- if (contentPart.toolRequest) {
- toolRequests.set(contentPart.toolRequest.ref || '0', contentPart.toolRequest);
- } else if (contentPart.toolResponse) {
- toolResponses.set(contentPart.toolResponse.ref || '0', contentPart.toolResponse);
- }
- }
- }
- }
-
- for (const [ref, toolRequest] of toolRequests.entries()) {
- const toolResponse = toolResponses.get(ref);
- if (toolResponse) {
- this.toolLogs.push({
- request: toolRequest,
- response: toolResponse,
- });
- }
- }
- }
-
- async startMcpServerHost(
- hostName: string,
- servers: McpServerOptions[],
- ): Promise {
- if (this.mcpHost !== null) {
- throw new Error('MCP host is already started');
- }
-
- const mcpServers = servers.reduce(
- (result, current) => {
- const {name, ...config} = current;
- result[name] = config;
-
- return result;
- },
- {} as Record,
- );
-
- globalLogger.startCapturingLogs();
- this.mcpHost = createMcpHost({name: hostName, mcpServers});
- const tools = await this.mcpHost.getActiveTools(this.genkitInstance);
- const resources = await this.mcpHost.getActiveResources(this.genkitInstance);
- return {
- tools: tools.map(getActionName),
- resources: resources.map(getActionName),
- };
- }
-
- flushMcpServerLogs(): string[] {
- return globalLogger
- .flushCapturedLogs()
- .filter((log): log is string => typeof log === 'string' && log.includes('[MCP'));
- }
-
- async dispose() {
- try {
- await this.mcpHost?.close();
- } catch (error) {
- console.error(`Failed to close MCP host`, error);
- }
- }
-
- private resolveModel(name: string) {
- for (const provider of MODEL_PROVIDERS) {
- const model = provider.createModel(name);
-
- if (model) {
- return {provider: provider as GenkitModelProvider, model};
- }
- }
-
- throw new UserFacingError(
- `Unrecognized model '${name}'. The configured models are:\n` +
- this.getSupportedModels()
- .map(m => `- ${m}`)
- .join('\n'),
- );
- }
-
- /** Gets a Genkit instance configured with the currently-available providers. */
- private getGenkitInstance() {
- const plugins: (GenkitPlugin | GenkitPluginV2)[] = [];
- const environmentVars: string[] = [];
-
- for (const provider of MODEL_PROVIDERS) {
- const plugin = provider.getPlugin();
- environmentVars.push(provider.apiKeyVariableName);
-
- if (plugin) {
- plugins.push(plugin);
- }
- }
-
- if (plugins.length === 0) {
- throw new UserFacingError(
- `No LLM providers have been configured. You must set at least one of the ` +
- `following environment variables:\n` +
- environmentVars.map(e => `- ${e}`).join('\n'),
- );
- }
-
- return genkit({plugins});
- }
-}
-
-/**
- * Invokes the LLM request function with respect to potential model rate limits.
- */
-async function rateLimitLLMRequest(
- provider: GenkitModelProvider,
- model: ModelReference,
- prompt: string | PromptDataForCounting,
- requestFn: () => Promise,
- retryCount = 0,
-): Promise {
- if (typeof prompt === 'string') {
- prompt = {messages: [], prompt};
- }
-
- provider.rateLimit(prompt, model);
-
- try {
- return await requestFn();
- } catch (e: unknown) {
- if (typeof e === 'object') {
- // If we know it's a rate-limitation error, re-queue but with a linear backoff.
- if (
- e?.constructor?.name === 'RateLimitError' || // From `openai`
- e?.constructor?.name === 'GoogleGenerativeAIFetchError' // From `Gemini`.
- ) {
- if (retryCount === 10) {
- throw e;
- }
- // Exponential backoff with randomness to avoid retrying at the same times with other requests.
- const backoffSeconds = (25 + 10 * 1.35 ** retryCount++) * (0.8 + Math.random() * 0.4);
- await setTimeout(1000 * backoffSeconds);
- return rateLimitLLMRequest(provider, model, prompt, requestFn, retryCount);
- }
- }
- throw e;
- }
-}
diff --git a/runner/codegen/genkit/model-provider.ts b/runner/codegen/genkit/model-provider.ts
deleted file mode 100644
index f16f48fb..00000000
--- a/runner/codegen/genkit/model-provider.ts
+++ /dev/null
@@ -1,69 +0,0 @@
-import {ModelReference} from 'genkit';
-import {GenkitPlugin, GenkitPluginV2} from 'genkit/plugin';
-import {RateLimiter} from 'limiter';
-import {PromptDataMessage} from '../llm-runner.js';
-import {LlmResponseFile} from '../../shared-interfaces.js';
-
-export interface RateLimitConfig {
- requestPerMinute: RateLimiter;
- tokensPerMinute: RateLimiter;
- countTokens(prompt: PromptDataForCounting): Promise;
-}
-
-export interface PromptDataForCounting {
- prompt: string;
- messages: PromptDataMessage[];
-}
-
-/** Abstraction around an LLM provider. */
-export abstract class GenkitModelProvider {
- abstract readonly apiKeyVariableName: string;
- protected abstract readonly models: Record ModelReference>;
- protected abstract readonly rateLimitConfig: Record;
-
- /** Creates a model instance, if the the provider supports the model. */
- createModel(name: string): ModelReference | null {
- return this.supportsModel(name) ? this.models[name]() : null;
- }
-
- /** Returns whether the provider supports a specific model. */
- supportsModel(name: string): boolean {
- return this.models.hasOwnProperty(name);
- }
-
- /** Gets the names of all models supported by the provider. */
- getSupportedModels(): string[] {
- return Object.keys(this.models);
- }
-
- /** Gets the API key associated with this provider. */
- getApiKey(): string | null {
- return process.env[this.apiKeyVariableName] || null;
- }
-
- /** Gets a Genkit plugin that can be used to query the provider. */
- getPlugin(): GenkitPlugin | GenkitPluginV2 | null {
- const key = this.getApiKey();
- return key ? this.pluginFactory(key) : null;
- }
-
- /**
- * Checks whether files generated by the LLM are valid.
- * If not, the Genkit runner will throw an error.
- */
- abstract validateGeneratedFiles(files: LlmResponseFile[]): boolean;
-
- protected abstract pluginFactory(apiKey: string): GenkitPlugin | GenkitPluginV2;
-
- abstract getModelSpecificConfig(opts: {includeThoughts?: boolean}, modelName: string): object;
-
- async rateLimit(prompt: PromptDataForCounting, model: ModelReference): Promise {
- const config = this.rateLimitConfig[model.name];
-
- if (config) {
- await config.requestPerMinute.removeTokens(1);
- const tokenCount = (await config.countTokens(prompt)) ?? 0;
- await config.tokensPerMinute.removeTokens(tokenCount);
- }
- }
-}
diff --git a/runner/codegen/genkit/models.ts b/runner/codegen/genkit/models.ts
deleted file mode 100644
index beb9da57..00000000
--- a/runner/codegen/genkit/models.ts
+++ /dev/null
@@ -1,11 +0,0 @@
-import {GeminiModelProvider} from './providers/gemini.js';
-import {ClaudeModelProvider} from './providers/claude.js';
-import {OpenAiModelProvider} from './providers/open-ai.js';
-import {GrokModelProvider} from './providers/grok.js';
-
-export const MODEL_PROVIDERS = [
- new GeminiModelProvider(),
- new ClaudeModelProvider(),
- new OpenAiModelProvider(),
- new GrokModelProvider(),
-];
diff --git a/runner/codegen/genkit/providers/claude.ts b/runner/codegen/genkit/providers/claude.ts
deleted file mode 100644
index 42d6d422..00000000
--- a/runner/codegen/genkit/providers/claude.ts
+++ /dev/null
@@ -1,83 +0,0 @@
-import {Anthropic} from '@anthropic-ai/sdk';
-import {GenkitPlugin} from 'genkit/plugin';
-import {GenkitModelProvider, PromptDataForCounting, RateLimitConfig} from '../model-provider.js';
-import {anthropic} from 'genkitx-anthropic';
-import {claude35Haiku, claude4Sonnet} from 'genkitx-anthropic';
-import {lazy} from '../../../utils/lazy-creation.js';
-import {RateLimiter} from 'limiter';
-
-export class ClaudeModelProvider extends GenkitModelProvider {
- readonly apiKeyVariableName = 'ANTHROPIC_API_KEY';
-
- protected readonly models = {
- 'claude-4.0-sonnet': () => claude4Sonnet,
- 'claude-3.5-haiku': () => claude35Haiku,
- };
-
- protected rateLimitConfig: Record = {
- // See: https://docs.anthropic.com/en/api/rate-limits#tier-2
- 'anthropic/claude-4-sonnet': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 1000,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 40_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: prompt => this.countClaudeTokens(prompt),
- },
- };
-
- getModelSpecificConfig(): object {
- // TODO: Add thinking output for Claude.
- return {};
- }
-
- validateGeneratedFiles(): boolean {
- return true;
- }
-
- private anthropicApi = lazy(() => {
- return new Anthropic({apiKey: this.getApiKey() || undefined});
- });
-
- protected pluginFactory(apiKey: string): GenkitPlugin {
- return anthropic({apiKey});
- }
-
- private async countClaudeTokens(prompt: PromptDataForCounting): Promise {
- const sonnetPrompt: string | Anthropic.Messages.MessageParam[] = [];
- for (const part of prompt.messages) {
- for (const c of part.content) {
- sonnetPrompt.push({
- role: part.role === 'user' ? 'user' : 'assistant',
- content:
- 'media' in c
- ? [
- {
- source: {
- media_type: 'image/png',
- data: c.media.base64PngImage,
- type: 'base64',
- },
- type: 'image',
- },
- ]
- : c.text,
- });
- }
- }
- const messages: Anthropic.Messages.MessageParam[] = [
- ...sonnetPrompt,
- {content: prompt.prompt, role: 'user'},
- ];
-
- return (
- await this.anthropicApi().messages.countTokens({
- model: 'claude-sonnet-4-0',
- messages,
- })
- ).input_tokens;
- }
-}
diff --git a/runner/codegen/genkit/providers/gemini.ts b/runner/codegen/genkit/providers/gemini.ts
deleted file mode 100644
index 30c3713a..00000000
--- a/runner/codegen/genkit/providers/gemini.ts
+++ /dev/null
@@ -1,111 +0,0 @@
-import {GenkitPlugin} from 'genkit/plugin';
-import {googleAI} from '@genkit-ai/googleai';
-import {GenkitModelProvider, PromptDataForCounting, RateLimitConfig} from '../model-provider.js';
-import {lazy} from '../../../utils/lazy-creation.js';
-import {GoogleGenAI, Part} from '@google/genai';
-import {RateLimiter} from 'limiter';
-import {LlmResponseFile} from '../../../shared-interfaces.js';
-
-export class GeminiModelProvider extends GenkitModelProvider {
- readonly apiKeyVariableName = 'GEMINI_API_KEY';
-
- private geminiAPI = lazy(() => new GoogleGenAI({apiKey: this.getApiKey() || undefined}));
-
- protected models = {
- 'gemini-2.5-pro': () => googleAI.model('gemini-2.5-pro'),
- 'gemini-2.5-flash': () => googleAI.model('gemini-2.5-flash'),
- 'gemini-2.5-flash-lite': () => googleAI.model('gemini-2.5-flash-lite'),
- };
-
- protected rateLimitConfig: Record = {
- // See: https://ai.google.dev/gemini-api/docs/rate-limits#tier-1
- // 150 per minute requests is Gemini Pro's limit right now.
- 'googleai/gemini-2.5-pro': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 150,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 2_000_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: prompt => this.countGeminiTokens(prompt, 'gemini-2.5-pro'),
- },
- // See: https://ai.google.dev/gemini-api/docs/rate-limits#tier-1
- // 1000 per minute requests is Gemini Flash's limit right now.
- 'googleai/gemini-2.5-flash': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 1000,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 1_000_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: prompt => this.countGeminiTokens(prompt, 'gemini-2.5-flash'),
- },
- 'googleai/gemini-2.5-flash-lite': {
- // See: https://ai.google.dev/gemini-api/docs/rate-limits#tier-1
- // 1000 per minute requests is Gemini Flash Lite's limit right now.
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 4000,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 4_000_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: prompt => this.countGeminiTokens(prompt, 'gemini-2.5-flash-lite'),
- },
- };
-
- protected pluginFactory(apiKey: string): GenkitPlugin {
- return googleAI({apiKey});
- }
-
- getModelSpecificConfig(opts: {includeThoughts?: boolean}): object {
- return {thinkingConfig: {includeThoughts: opts.includeThoughts}};
- }
-
- validateGeneratedFiles(files: LlmResponseFile[]): boolean {
- // Gemini responses occasionally get truncated on `class=`.
- // Consider these cases as invalid so they don't influence the results.
- return files.length === 0 || !files.some(file => file.code.trim().endsWith('class='));
- }
-
- private async countGeminiTokens(
- prompt: PromptDataForCounting,
- modelName: string,
- ): Promise {
- const contents = [
- ...prompt.messages.map(m => ({
- role: m.role,
- parts: m.content.map(c => {
- return 'text' in c
- ? ({text: c.text} satisfies Part)
- : ({
- inlineData: {
- data: c.media.base64PngImage,
- mimeType: 'image/png',
- },
- } satisfies Part);
- }),
- })),
- {role: 'user', parts: [{text: prompt.prompt}]},
- ];
-
- try {
- // Note: This is a separate API and doesn't contribute to our model requests/limits!
- return (
- (
- await this.geminiAPI().models.countTokens({
- model: modelName,
- contents,
- })
- ).totalTokens ?? null
- );
- } catch (e: unknown) {
- return null;
- }
- }
-}
diff --git a/runner/codegen/genkit/providers/grok.ts b/runner/codegen/genkit/providers/grok.ts
deleted file mode 100644
index f457edd5..00000000
--- a/runner/codegen/genkit/providers/grok.ts
+++ /dev/null
@@ -1,110 +0,0 @@
-import {xAI} from '@genkit-ai/compat-oai/xai';
-import {GenkitPlugin, GenkitPluginV2} from 'genkit/plugin';
-import {RateLimiter} from 'limiter';
-import fetch from 'node-fetch';
-import {GenkitModelProvider, PromptDataForCounting, RateLimitConfig} from '../model-provider.js';
-
-export class GrokModelProvider extends GenkitModelProvider {
- readonly apiKeyVariableName = 'XAI_API_KEY';
-
- protected readonly models = {
- 'grok-4': () => xAI.model('grok-4'),
- 'grok-code-fast-1': () => xAI.model('grok-code-fast-1'),
- };
-
- private async countTokensWithXaiApi(prompt: PromptDataForCounting): Promise {
- const apiKey = this.getApiKey();
- if (!apiKey) {
- return null;
- }
-
- try {
- // Use xAI's tokenize API for accurate token counting
- const messages = this.genkitPromptToXaiFormat(prompt);
- const text = messages.map(m => `${m.role}: ${m.content}`).join('\n');
-
- const response = await fetch('https://api.x.ai/v1/tokenize', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- Authorization: `Bearer ${apiKey}`,
- },
- body: JSON.stringify({text}),
- });
-
- if (response.ok) {
- const data = (await response.json()) as {tokens: unknown[]};
- return data.tokens?.length || 0;
- }
- return null;
- } catch (error) {
- console.warn('Failed to count tokens using xAI API', error);
- return null;
- }
- }
-
- private async countTokensForModel(
- _modelName: string,
- prompt: PromptDataForCounting,
- ): Promise {
- const xaiTokenCount = await this.countTokensWithXaiApi(prompt);
- if (xaiTokenCount !== null) {
- return xaiTokenCount;
- }
- return 0;
- }
-
- protected rateLimitConfig: Record = {
- // XAI Grok rate limits https://docs.x.ai/docs/models
- 'xai/grok-4': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 480,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 2_000_000 * 0.75,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side
- }),
- countTokens: prompt => this.countTokensForModel('grok-4', prompt),
- },
- 'xai/grok-code-fast-1': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 480,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 2_000_000 * 0.75,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side
- }),
- countTokens: prompt => this.countTokensForModel('grok-code-fast-1', prompt),
- },
- };
-
- protected pluginFactory(apiKey: string): GenkitPlugin | GenkitPluginV2 {
- return xAI({apiKey});
- }
-
- getModelSpecificConfig(): object {
- // Grok doesn't require special configuration at this time
- return {};
- }
-
- validateGeneratedFiles(): boolean {
- return true;
- }
-
- private genkitPromptToXaiFormat(
- prompt: PromptDataForCounting,
- ): Array<{role: string; content: string}> {
- const xaiPrompt: Array<{role: string; content: string}> = [];
- for (const part of prompt.messages) {
- for (const c of part.content) {
- xaiPrompt.push({
- role: part.role,
- content: 'media' in c ? c.media.url : c.text,
- });
- }
- }
- return [...xaiPrompt, {role: 'user', content: prompt.prompt}];
- }
-}
diff --git a/runner/codegen/genkit/providers/open-ai.ts b/runner/codegen/genkit/providers/open-ai.ts
deleted file mode 100644
index 0b62ffda..00000000
--- a/runner/codegen/genkit/providers/open-ai.ts
+++ /dev/null
@@ -1,97 +0,0 @@
-import {GenkitPluginV2} from 'genkit/plugin';
-import {openAI} from '@genkit-ai/compat-oai/openai';
-import {RateLimiter} from 'limiter';
-import {GenkitModelProvider, PromptDataForCounting, RateLimitConfig} from '../model-provider.js';
-import {encoding_for_model} from 'tiktoken';
-
-export class OpenAiModelProvider extends GenkitModelProvider {
- readonly apiKeyVariableName = 'OPENAI_API_KEY';
-
- protected readonly models = {
- 'openai-o3': () => openAI.model('o3'),
- 'openai-o4-mini': () => openAI.model('o4-mini'),
- 'openai-gpt-5': () => openAI.model('gpt-5'),
- };
-
- private countTokensForModel(
- modelName: Parameters<typeof encoding_for_model>[0],
- prompt: PromptDataForCounting,
- ): number {
- const encoding = encoding_for_model(modelName);
- try {
- const messages = this.genkitPromptToOpenAi(prompt);
- const text = messages.map(m => `${m.role}: ${m.content}`).join('\n');
- const tokens = encoding.encode(text);
- return tokens.length;
- } finally {
- encoding.free();
- }
- }
-
- protected rateLimitConfig: Record<string, RateLimitConfig> = {
- // See: https://platform.openai.com/docs/models/o3
- 'openai/o3': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 500,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 30_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: async prompt => this.countTokensForModel('gpt-4o', prompt),
- },
- // See https://platform.openai.com/docs/models/o4-mini
- 'openai/o4-mini': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 1000,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 100_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: async prompt => this.countTokensForModel('gpt-4o-mini', prompt),
- },
- // See: https://platform.openai.com/docs/models/gpt-5
- 'openai/gpt-5': {
- requestPerMinute: new RateLimiter({
- tokensPerInterval: 500,
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- tokensPerMinute: new RateLimiter({
- tokensPerInterval: 30_000 * 0.75, // *0.75 to be more resilient to token count deviations
- interval: 1000 * 60 * 1.5, // Refresh tokens after 1.5 minutes to be on the safe side.
- }),
- countTokens: async prompt => this.countTokensForModel('gpt-5', prompt),
- },
- };
-
- protected pluginFactory(apiKey: string): GenkitPluginV2 {
- return openAI({apiKey, maxRetries: 0});
- }
-
- getModelSpecificConfig(): object {
- // TODO: Add thinking output for OpenAI
- return {};
- }
-
- validateGeneratedFiles(): boolean {
- return true;
- }
-
- private genkitPromptToOpenAi(
- prompt: PromptDataForCounting,
- ): Array<{role: string; content: string}> {
- const openAiPrompt: Array<{role: string; content: string}> = [];
- for (const part of prompt.messages) {
- for (const c of part.content) {
- openAiPrompt.push({
- role: part.role,
- content: 'media' in c ? c.media.url : c.text,
- });
- }
- }
- return [...openAiPrompt, {role: 'user', content: prompt.prompt}];
- }
-}
diff --git a/runner/codegen/llm-runner.ts b/runner/codegen/llm-runner.ts
index ba1cd83a..6ff830b8 100644
--- a/runner/codegen/llm-runner.ts
+++ b/runner/codegen/llm-runner.ts
@@ -136,7 +136,7 @@ export interface LocalLlmConstrainedOutputGenerateResponse | null;
/** Token usage data, if available. */
- usage?: Partial<Usage>;
+ usage?: Usage;
/** Reasoning messages from the LLM. */
reasoning: string;
}
@@ -144,7 +144,7 @@ export interface LocalLlmConstrainedOutputGenerateResponse;
+ usage?: Usage;
/** Reasoning messages from the LLM. */
reasoning: string;
/** Tool requests and responses. */
diff --git a/runner/codegen/noop-unimplemented-runner.ts b/runner/codegen/noop-unimplemented-runner.ts
new file mode 100644
index 00000000..fec46915
--- /dev/null
+++ b/runner/codegen/noop-unimplemented-runner.ts
@@ -0,0 +1,32 @@
+import {
+ LlmRunner,
+ LocalLlmConstrainedOutputGenerateResponse,
+ LocalLlmGenerateFilesResponse,
+ LocalLlmGenerateTextResponse,
+} from './llm-runner.js';
+
+/**
+ * Noop runner that is useful for creating a `LocalExecutor`
+ * that doesn't leverage a runner specified to WCS.
+ */
+export class NoopUnimplementedRunner implements LlmRunner {
+ displayName = 'noop-unimplemented';
+ id = 'noop-unimplemented';
+ hasBuiltInRepairLoop = true;
+
+ generateFiles(): Promise<LocalLlmGenerateFilesResponse> {
+ throw new Error('Method not implemented.');
+ }
+ generateText(): Promise<LocalLlmGenerateTextResponse> {
+ throw new Error('Method not implemented.');
+ }
+ generateConstrained(): Promise<LocalLlmConstrainedOutputGenerateResponse<unknown>> {
+ throw new Error('Method not implemented.');
+ }
+ getSupportedModels(): string[] {
+ throw new Error('Method not implemented.');
+ }
+ async dispose(): Promise<void> {
+ throw new Error('Method not implemented.');
+ }
+}
diff --git a/runner/codegen/runner-creation.ts b/runner/codegen/runner-creation.ts
index ccfee96a..481aab18 100644
--- a/runner/codegen/runner-creation.ts
+++ b/runner/codegen/runner-creation.ts
@@ -1,14 +1,16 @@
import {UserFacingError} from '../utils/errors.js';
import type {GeminiCliRunner} from './gemini-cli-runner.js';
import type {ClaudeCodeRunner} from './claude-code-runner.js';
-import type {GenkitRunner} from './genkit/genkit-runner.js';
import type {CodexRunner} from './codex-runner.js';
+import type {NoopUnimplementedRunner} from './noop-unimplemented-runner.js';
+import {AiSdkRunner} from './ai-sdk/ai-sdk-runner.js';
interface AvailableRunners {
- genkit: GenkitRunner;
+ 'ai-sdk': AiSdkRunner;
'gemini-cli': GeminiCliRunner;
'claude-code': ClaudeCodeRunner;
'codex': CodexRunner;
+ 'noop-unimplemented': NoopUnimplementedRunner;
}
/** Names of supported runners. */
@@ -16,14 +18,10 @@ export type RunnerName = keyof AvailableRunners;
/** Creates an `LlmRunner` based on a name. */
export async function getRunnerByName<T extends RunnerName>(name: T): Promise<AvailableRunners[T]> {
- // Note that we lazily import and resolve the runners here, because their imports
- // might have side effects. E.g. Genkit installs a listener on the process exiting
- // in order to kill pending instances and log "Closing all Genkit instances".
- // We don't want to trigger those side effects unless we actually need them.
switch (name) {
- case 'genkit':
- return import('./genkit/genkit-runner.js').then(
- m => new m.GenkitRunner() as AvailableRunners[T],
+ case 'ai-sdk':
+ return import('./ai-sdk/ai-sdk-runner.js').then(
+ m => new m.AiSdkRunner() as AvailableRunners[T],
);
case 'gemini-cli':
return import('./gemini-cli-runner.js').then(
@@ -35,6 +33,10 @@ export async function getRunnerByName(name: T): Promise new m.CodexRunner() as AvailableRunners[T]);
+ case 'noop-unimplemented':
+ return import('./noop-unimplemented-runner.js').then(
+ m => new m.NoopUnimplementedRunner() as AvailableRunners[T],
+ );
default:
throw new UserFacingError(`Unsupported runner ${name}`);
}
diff --git a/runner/configuration/constants.ts b/runner/configuration/constants.ts
index 422a2ddb..f22ae3af 100644
--- a/runner/configuration/constants.ts
+++ b/runner/configuration/constants.ts
@@ -17,9 +17,21 @@ export const DEFAULT_MODEL_NAME = 'gemini-2.5-pro'; // slower than `flash`, but
*/
export const DEFAULT_AUTORATER_MODEL_NAME = 'gemini-2.5-flash'; // use less expensive model
+/** Model used for AI summarization by default. */
+export const DEFAULT_SUMMARY_MODEL = 'gemini-2.5-flash-lite';
+
/** Name of the root folder where we store LLM-generated code for debugging */
export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
+/**
+ * Path for WCS temporary directories.
+ *
+ * We want temporary workspaces to be nested inside the root project to
+ * better support symlinked node modules. E.g. Turbopack will otherwise
+ * fail if symlinked node modules are not reachable via parent directories.
+ */
+export const WCS_BASE_TMP_DIR = join(rootDir, 'tmp-workspaces');
+
/**
* Number of times we'll try to ask LLM to repair a build failure,
* providing the build output and the code that causes the problem.
@@ -31,7 +43,10 @@ export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1;
* Number of times we'll try to ask LLM to repair test failures
* E.g. Axe violations, or test command failures
*/
-export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
+export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 0;
+
+/** Default number of retries when a prompt evaluation timed out. */
+export const DEFAULT_PROMPT_TIMEOUT_RETRIES = 1;
/** Name of the folder where we store all generated reports */
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index 87a3e382..303ba6a8 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -1,13 +1,21 @@
import z from 'zod';
import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
import {UserFacingError} from '../utils/errors.js';
-import {ratingSchema} from '../ratings/rating-types.js';
+import {RatingCategory, ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
import {executorSchema} from '../orchestration/executors/executor.js';
import {
LocalExecutorConfig,
localExecutorConfigSchema,
} from '../orchestration/executors/local-executor-config.js';
+import {
+ LlmResponseFile,
+ PromptDefinition,
+ RatingContextFilter,
+ ReportContextFilter,
+} from '../shared-interfaces.js';
+import type {Environment} from './environment.js';
+import type {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
export const environmentConfigSchema = z.object({
/** Display name for the environment. */
@@ -21,6 +29,11 @@ export const environmentConfigSchema = z.object({
clientSideFramework: z.string(),
/** Ratings to run when evaluating the environment. */
ratings: z.array(ratingSchema),
+ /**
+ * Map used to override fields for specific ratings. The key is the unique ID of
+ * the rating and the value are the override fields.
+ */
+ ratingOverrides: z.record(z.string(), ratingOverrideSchema).optional(),
/** Path to the prompt used by the LLM for generating files. */
generationSystemPrompt: z.string(),
/**
@@ -57,8 +70,16 @@ export const environmentConfigSchema = z.object({
fullStackFramework: z.string().optional(),
/** Path to the prompt to use when rating code. */
codeRatingPrompt: z.string().optional(),
+ /** Path to the prompt to use when rating screenshots. */
+ visualRatingPrompt: z.string().optional(),
/** When enabled, the system prompts for this environment won't be included in the report. */
classifyPrompts: z.boolean().optional(),
+ /**
+ * Timeout in minutes for a single prompt evaluation.
+ *
+ * E.g. if a single app takes longer than 10min, it will be aborted.
+ */
+ promptTimeoutMinutes: z.number().optional(),
/** Executor to be used for this environment. */
executor: executorSchema
.optional()
@@ -66,6 +87,60 @@ export const environmentConfigSchema = z.object({
'Executor to be used for this environment. ' +
'If unset, a local executor is derived from the full environment configuration.',
),
+
+ /**
+ * Map used to override fields for specific rating categories. The key is the unique ID of
+ * the category and the value are the override fields.
+ */
+ categoryOverrides: z
+ .record(
+ z.custom(),
+ z.object({
+ name: z.string().optional(),
+ maxPoints: z.number().optional(),
+ }),
+ )
+ .optional(),
+
+ /**
+ * When an environment is created, it generates a hash based on the configured ratings.
+ * This field is used to validate that the generated hash matches a pre-defined one.
+ * It's useful to ensure that the set of ratings hasn't changed between two runs.
+ */
+ expectedRatingHash: z.string().optional(),
+
+ /**
+ * Prompts to use when for additional analysis of the eval results.
+ */
+ analysisPrompts: z
+ .array(
+ z.object({
+ name: z.string(),
+ path: z.string(),
+ model: z.string().optional(),
+ reportsFilter: z
+ .enum([ReportContextFilter.AllReports, ReportContextFilter.NonPerfectReports])
+ .optional(),
+ ratingsFilter: z
+ .enum([RatingContextFilter.AllRatings, RatingContextFilter.NonPerfectRatings])
+ .optional(),
+ }),
+ )
+ .optional(),
+
+ /**
+ * Function that can be used to augment prompts before they're evaluated.
+ */
+ augmentExecutablePrompt: z
+ .function(z.tuple([z.custom()]), z.promise(z.string()))
+ .optional(),
+
+ /**
+ * Function that can be used to augment generated files before they're evaluated.
+ */
+ augmentGeneratedFile: z
+ .function(z.tuple([z.custom>()]), z.string())
+ .optional(),
});
/**
@@ -75,6 +150,16 @@ export const environmentConfigSchema = z.object({
export type EnvironmentConfig = z.infer &
Partial;
+/** Context passed to the `augmentExecutablePrompt` function. */
+export interface PromptAugmentationContext {
+ /** Definition being augmented. */
+ promptDef: PromptDefinition;
+ /** Environment running the evaluation. */
+ environment: Environment;
+ /** Runner that the user can use for augmentation. */
+ runner: AiSdkRunner;
+}
+
/** Asserts that the specified data is a valid environment config. */
export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
const validationResult = environmentConfigSchema
@@ -86,8 +171,6 @@ export function assertIsEnvironmentConfig(value: unknown): asserts value is Envi
.safeParse(value);
if (!validationResult.success) {
- // TODO: we can use `z.prettifyError` once we update to zod v4,
- // but last time the update caused some issues with Genkit.
const message = fromError(validationResult.error, {
messageBuilder: createMessageBuilder({
prefix: 'Environment parsing failed:',
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index a88acb1f..4af69f06 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -2,19 +2,39 @@ import {readdirSync, readFileSync, statSync} from 'fs';
import {basename, extname, join, resolve} from 'path';
import {globSync} from 'tinyglobby';
import {Executor} from '../orchestration/executors/executor.js';
-import {Rating} from '../ratings/rating-types.js';
+import {Rating, RatingCategory} from '../ratings/rating-types.js';
import {
FrameworkInfo,
+ LlmResponseFile,
MultiStepPromptDefinition,
PromptDefinition,
+ RatingContextFilter,
+ ReportContextFilter,
RootPromptDefinition,
} from '../shared-interfaces.js';
import {UserFacingError} from '../utils/errors.js';
import {generateId} from '../utils/id-generation.js';
import {lazy} from '../utils/lazy-creation.js';
-import {EnvironmentConfig} from './environment-config.js';
+import {EnvironmentConfig, PromptAugmentationContext} from './environment-config.js';
import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
import {renderPromptTemplate} from './prompt-templating.js';
+import {getSha256Hash} from '../utils/hashing.js';
+import {DEFAULT_SUMMARY_MODEL} from './constants.js';
+import type {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
+import {getRunnerByName} from '../codegen/runner-creation.js';
+
+interface CategoryConfig {
+ name: string;
+ maxPoints: number;
+}
+
+interface AnalysisPrompt {
+ name: string;
+ prompt: string;
+ model: string;
+ reportsFilter: ReportContextFilter;
+ ratingsFilter: RatingContextFilter;
+}
/** Represents a single prompt evaluation environment. */
export class Environment {
@@ -30,12 +50,44 @@ export class Environment {
readonly clientSideFramework: FrameworkInfo;
/** Path from which to read the code rating prompt. */
readonly codeRatingPromptPath: string | null;
+ /** Path from which to read the visual rating prompt. */
+ readonly visualRatingPromptPath: string | null;
/** Whether the prompts should be removed from the final report. */
readonly classifyPrompts: boolean;
/** Whether this is one of the built-in environment that come with the runner. */
readonly isBuiltIn: boolean;
/** Configured executor. */
readonly executor: Executor;
+ /** Timeout for a single eval prompt in minutes. */
+ readonly promptTimeoutMinutes: number | undefined;
+ /** Configuration for the individual rating categories. */
+ readonly ratingCategories: {
+ [RatingCategory.HIGH_IMPACT]: CategoryConfig;
+ [RatingCategory.MEDIUM_IMPACT]: CategoryConfig;
+ [RatingCategory.LOW_IMPACT]: CategoryConfig;
+ };
+ /**
+ * Hash of the environment-level ratings. Can be used to
+ * validate that the ratings haven't changed between runs.
+ */
+ readonly ratingHash: string;
+
+ /** Additional analysis prompts defined by the user. */
+ readonly analysisPrompts: AnalysisPrompt[];
+
+ /** Ratings configured at the environment level. */
+ private readonly ratings: Rating[];
+
+ /** User-configured function used to augment prompts. */
+ private readonly augmentExecutablePrompt:
+ | ((context: PromptAugmentationContext) => Promise<string>)
+ | null;
+
+ /** Runner that user can use to access an LLM to augment prompts. */
+ private augmentationRunner: AiSdkRunner | null = null;
+
+ /** User-provided callback for augmenting the LLM-generated files. */
+ private readonly augmentFileCallback: ((file: LlmResponseFile) => string) | null;
constructor(
rootPath: string,
@@ -59,28 +111,39 @@ export class Environment {
this.codeRatingPromptPath = config.codeRatingPrompt
? join(rootPath, config.codeRatingPrompt)
: null;
+ this.visualRatingPromptPath = config.visualRatingPrompt
+ ? join(rootPath, config.visualRatingPrompt)
+ : null;
this.classifyPrompts = config.classifyPrompts ?? false;
this.isBuiltIn = rootPath.includes('node_modules');
this.executor = config.executor;
+ this.promptTimeoutMinutes = config.promptTimeoutMinutes;
+ this.ratingCategories = this.getRatingCategories(config);
+ this.ratings = this.resolveRatings(config);
+ this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories);
+ this.analysisPrompts = this.resolveAnalysisPrompts(config);
+ this.augmentExecutablePrompt = config.augmentExecutablePrompt || null;
+ this.augmentFileCallback = config.augmentGeneratedFile || null;
+ this.validateRatingHash(this.ratingHash, config);
}
/** Prompts that should be executed as a part of the evaluation. */
- executablePrompts = lazy(async () => {
- return this.resolveExecutablePrompts(this.config.executablePrompts, this.config.ratings);
+ readonly executablePrompts = lazy(async () => {
+ return this.resolveExecutablePrompts(this.config.executablePrompts);
});
- systemPromptGeneration = lazy(async () => {
+ readonly systemPromptGeneration = lazy(async () => {
return (await this.renderSystemPrompt(this.config.generationSystemPrompt)).result;
});
- systemPromptRepair = lazy(async () => {
+ readonly systemPromptRepair = lazy(async () => {
if (!this.config.repairSystemPrompt) {
return 'Please fix the given errors and return the corrected code.';
}
return (await this.renderSystemPrompt(this.config.repairSystemPrompt)).result;
});
- systemPromptEditing = lazy(async () => {
+ readonly systemPromptEditing = lazy(async () => {
if (!this.config.editingSystemPrompt) {
return this.systemPromptGeneration();
}
@@ -138,6 +201,21 @@ export class Environment {
});
}
+ /** Augments response files based on the user's configuration. */
+ augmentResponseFiles(files: LlmResponseFile[]): void {
+ if (this.augmentFileCallback) {
+ files.forEach(file => (file.code = this.augmentFileCallback!(file)));
+ }
+ }
+
+ async destroy(): Promise<void> {
+ await this.executor.destroy();
+
+ if (this.augmentationRunner) {
+ await this.augmentationRunner.dispose();
+ }
+ }
+
/**
* Gets the readable display name of a framework, based on its ID.
* @param id ID to be resolved.
@@ -163,21 +241,20 @@ export class Environment {
/**
* Resolves the prompt configuration into prompt definitions.
- * @param rootPath Root path of the project.
* @param prompts Prompts to be resolved.
- * @param envRatings Environment-level ratings.
+ * @param config Configuration for the environment.
*/
private async resolveExecutablePrompts(
- prompts: EnvironmentConfig['executablePrompts'],
- envRatings: Rating[],
+ definitions: EnvironmentConfig['executablePrompts'],
): Promise {
- const result: Promise[] = [];
+ const promptPromises: Promise<RootPromptDefinition>[] = [];
+ const envRatings = this.ratings;
- for (const def of prompts) {
+ for (const def of definitions) {
if (def instanceof MultiStepPrompt) {
- result.push(this.getMultiStepPrompt(def, envRatings));
+ promptPromises.push(this.getMultiStepPrompt(def, envRatings));
} else if (def instanceof EvalPromptWithMetadata) {
- result.push(
+ promptPromises.push(
Promise.resolve({
name: def.name,
kind: 'single',
@@ -202,10 +279,10 @@ export class Environment {
name = def.name;
}
- result.push(
+ promptPromises.push(
...globSync(path, {cwd: this.rootPath}).map(
async relativePath =>
- await this.getStepPromptDefinition(
+ await this.getSinglePromptDefinition(
name ?? basename(relativePath, extname(relativePath)),
relativePath,
ratings,
@@ -217,11 +294,39 @@ export class Environment {
}
}
- return Promise.all(result);
+ const prompts = await Promise.all(promptPromises);
+
+ if (this.augmentExecutablePrompt) {
+ const augmentationPromises: Promise[] = [];
+ const updatePrompt = (promptDef: PromptDefinition) => {
+ augmentationPromises.push(
+ this.augmentExecutablePrompt!({
+ promptDef,
+ environment: this,
+ runner: this.augmentationRunner!,
+ }).then(text => (promptDef.prompt = text)),
+ );
+ };
+ this.augmentationRunner ??= await getRunnerByName('ai-sdk');
+
+ for (const rootPrompt of prompts) {
+ if (rootPrompt.kind === 'multi-step') {
+ for (const promptDef of rootPrompt.steps) {
+ updatePrompt(promptDef);
+ }
+ } else {
+ updatePrompt(rootPrompt);
+ }
+ }
+
+ await Promise.all(augmentationPromises);
+ }
+
+ return prompts;
}
/**
- * Creates a prompt definition for a given step.
+ * Creates a prompt definition for a single prompt.
*
* @param name Name of the prompt.
* @param rootPath Root path of the project.
@@ -229,14 +334,14 @@ export class Environment {
* @param ratings Ratings to run against the definition.
* @param isEditing Whether this is an editing or generation step.
*/
- private async getStepPromptDefinition(
+ private async getSinglePromptDefinition(
name: string,
relativePath: string,
ratings: Rating[],
isEditing: boolean,
metadata: Metadata,
): Promise> {
- const {result, contextFiles} = await this.renderEnvironmentPrompt(relativePath);
+ const {result, contextFiles} = this.renderEnvironmentPrompt(relativePath);
return {
name: name,
@@ -304,11 +409,11 @@ export class Environment {
if (stepNum === 0) {
throw new UserFacingError('Multi-step prompts start with `step-1`.');
}
- const step = await this.getStepPromptDefinition(
+ const step = await this.getSinglePromptDefinition(
`${name}-step-${stepNum}`,
join(def.directoryPath, current.name),
ratings,
- /*isEditing */ stepNum !== 1,
+ /* isEditing */ stepNum !== 1,
stepMetadata,
);
@@ -334,13 +439,13 @@ export class Environment {
}
/** Renders a prompt from a path relative to the environment config. */
- private async renderEnvironmentPrompt(relativePath: string) {
+ private renderEnvironmentPrompt(relativePath: string) {
const path = resolve(this.rootPath, relativePath);
return this.renderPrompt(readFileSync(path, 'utf8'), path);
}
private async renderSystemPrompt(relativePath: string) {
- const result = await this.renderEnvironmentPrompt(relativePath);
+ const result = this.renderEnvironmentPrompt(relativePath);
// Optional hooks for post processing environment system prompts. Useful for e.g.
// supporting `@` references from Gemini CLI or inside g3.
@@ -350,4 +455,92 @@ export class Environment {
return result;
}
+
+ private resolveRatings(config: EnvironmentConfig) {
+ if (!config.ratingOverrides) {
+ return config.ratings;
+ }
+
+ Object.keys(config.ratingOverrides).forEach(id => {
+ if (!config.ratings.some(rating => rating.id === id)) {
+ throw new UserFacingError(
+ `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
+ );
+ }
+ });
+
+ return config.ratings.map(rating => {
+ const override = config.ratingOverrides![rating.id];
+ return override ? {...rating, ...override} : rating;
+ });
+ }
+
+ private getRatingCategories(config: EnvironmentConfig) {
+ const overrides = config.categoryOverrides;
+
+ return {
+ [RatingCategory.HIGH_IMPACT]: {
+ name: 'High Impact',
+ maxPoints: 60,
+ ...overrides?.[RatingCategory.HIGH_IMPACT],
+ },
+ [RatingCategory.MEDIUM_IMPACT]: {
+ name: 'Medium Impact',
+ maxPoints: 30,
+ ...overrides?.[RatingCategory.MEDIUM_IMPACT],
+ },
+ [RatingCategory.LOW_IMPACT]: {
+ name: 'Low Impact',
+ maxPoints: 10,
+ ...overrides?.[RatingCategory.LOW_IMPACT],
+ },
+ };
+ }
+
+ private getRatingHash(
+ ratings: Rating[],
+ categories: Record,
+ ): string {
+ const parts: string[] = [];
+
+ for (const rating of ratings) {
+ parts.push(
+ `${rating.category};${categories[rating.category]?.maxPoints};` +
+ `${rating.id};${rating.scoreReduction};${[...(rating.groupingLabels ?? [])].sort().join(',')}`,
+ );
+ }
+
+ return getSha256Hash(parts.sort().join('|'));
+ }
+
+ private validateRatingHash(currentHash: string, config: EnvironmentConfig) {
+ if (config.expectedRatingHash && config.expectedRatingHash !== currentHash) {
+ throw new UserFacingError(
+ [
+ `Rating hash for environment "${this.displayName}" does not match the expectation.`,
+ `Expected: ${config.expectedRatingHash}`,
+ `Actual: ${this.ratingHash}`,
+ `Either update the \`expectedRatingHash\` field in the config or revert the ratings back to their previous configuration`,
+ ].join('\n'),
+ );
+ }
+ }
+
+ private resolveAnalysisPrompts(config: EnvironmentConfig): AnalysisPrompt[] {
+ const result: AnalysisPrompt[] = [];
+
+ config.analysisPrompts?.forEach(({name, path, model, reportsFilter, ratingsFilter}) => {
+ const prompt = this.renderEnvironmentPrompt(path).result;
+
+ result.push({
+ name,
+ prompt,
+ model: model || DEFAULT_SUMMARY_MODEL,
+ reportsFilter: reportsFilter ?? ReportContextFilter.NonPerfectReports,
+ ratingsFilter: ratingsFilter ?? RatingContextFilter.NonPerfectRatings,
+ });
+ });
+
+ return result;
+ }
}
diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
index 89b561f4..c99d6001 100644
--- a/runner/eval-cli.ts
+++ b/runner/eval-cli.ts
@@ -6,6 +6,8 @@ import {
DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS,
DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
DEFAULT_MODEL_NAME,
+ DEFAULT_PROMPT_TIMEOUT_RETRIES,
+ REPORTS_ROOT_DIR,
} from './configuration/constants.js';
import {generateCodeAndAssess} from './orchestration/generate.js';
import {logReportToConsole, writeReportToDisk} from './reporting/report-logging.js';
@@ -21,7 +23,7 @@ export const EvalModule = {
interface Options {
environment?: string;
- model: string;
+ model: string[];
runner: RunnerName;
local: boolean;
limit: number;
@@ -42,6 +44,7 @@ interface Options {
skipLighthouse?: boolean;
maxTestRepairAttempts?: number;
maxBuildRepairAttempts?: number;
+ promptTimeoutRetries?: number;
}
function builder(argv: Argv): Argv {
@@ -54,14 +57,15 @@ function builder(argv: Argv): Argv {
})
.option('model', {
type: 'string',
- default: DEFAULT_MODEL_NAME,
- descript: 'Model to use when generating code',
+ array: true,
+ default: [DEFAULT_MODEL_NAME],
+ description: 'Model(s) to use when generating code',
})
// Option is a noop right now when using a remote environment.
.option('runner', {
type: 'string',
- default: 'genkit' as const,
- choices: ['genkit', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
+ default: 'ai-sdk' as const,
+ choices: ['ai-sdk', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
description: 'Runner to use to execute the eval',
})
.option('local', {
@@ -168,6 +172,12 @@ function builder(argv: Argv): Argv {
description:
'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
})
+ .option('prompt-timeout-retries', {
+ type: 'number',
+ default: DEFAULT_PROMPT_TIMEOUT_RETRIES,
+ description:
+ 'Maximum number of times to retry a prompt evaluation after it fails due to a timeout.',
+ })
.strict()
.version(false)
.help()
@@ -197,43 +207,60 @@ async function handler(cliArgs: Arguments): Promise {
process.on('SIGTERM', () => abortCtrl.abort());
process.on('exit', () => abortCtrl.abort());
- try {
- const runInfo = await generateCodeAndAssess({
- runner: cliArgs.runner,
- model: cliArgs.model,
- environmentConfigPath: BUILT_IN_ENVIRONMENTS.get(cliArgs.environment) || cliArgs.environment,
- localMode: cliArgs.local,
- limit: cliArgs.limit,
- concurrency: cliArgs.concurrency as number,
- reportName: cliArgs.reportName!,
- skipScreenshots: !!cliArgs.skipScreenshots,
- startMcp: cliArgs.mcp,
- ragEndpoint: cliArgs.ragEndpoint,
- outputDirectory: cliArgs.outputDirectory,
- promptFilter: cliArgs.promptFilter,
- labels: cliArgs.labels || [],
- skipAxeTesting: !!cliArgs.skipAxeTesting,
- enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
- enableAutoCsp: cliArgs.enableAutoCsp,
- logging: cliArgs.logging,
- autoraterModel: cliArgs.autoraterModel,
- skipAiSummary: cliArgs.skipAiSummary,
- skipLighthouse: cliArgs.skipLighthouse,
- maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
- maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
- abortSignal: abortCtrl.signal,
- });
+ const models = cliArgs.model;
+ const baseReportName = cliArgs.reportName!;
+
+ for (const model of models) {
+ const reportName =
+ models.length > 1
+ ? `${baseReportName}--${model.replace(/[^a-zA-Z0-9-]/g, '-')}`
+ : baseReportName;
+
+ if (models.length > 1) {
+ console.log(chalk.cyan(`\nStarting evaluation with model: ${model}\n`));
+ }
+
+ try {
+ const runInfo = await generateCodeAndAssess({
+ runner: cliArgs.runner,
+ model,
+ environment: {
+ configPath: BUILT_IN_ENVIRONMENTS.get(cliArgs.environment) || cliArgs.environment,
+ },
+ localMode: cliArgs.local,
+ limit: cliArgs.limit,
+ concurrency: cliArgs.concurrency as number,
+ reportName,
+ skipScreenshots: !!cliArgs.skipScreenshots,
+ startMcp: cliArgs.mcp,
+ ragEndpoint: cliArgs.ragEndpoint,
+ outputDirectory: cliArgs.outputDirectory,
+ promptFilter: cliArgs.promptFilter,
+ labels: cliArgs.labels || [],
+ skipAxeTesting: !!cliArgs.skipAxeTesting,
+ enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
+ enableAutoCsp: cliArgs.enableAutoCsp,
+ logging: cliArgs.logging,
+ autoraterModel: cliArgs.autoraterModel,
+ skipAiSummary: cliArgs.skipAiSummary,
+ skipLighthouse: cliArgs.skipLighthouse,
+ maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
+ maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
+ promptTimeoutRetries: cliArgs.promptTimeoutRetries,
+ abortSignal: abortCtrl.signal,
+ });
- logReportToConsole(runInfo);
- await writeReportToDisk(runInfo, runInfo.details.summary.environmentId);
- } catch (error: unknown) {
- if (error instanceof UserFacingError) {
- console.error(chalk.red(error.message));
- } else {
- console.error(chalk.red('An error occurred during the assessment process:'));
- console.error(chalk.red(error));
- if (process.env.DEBUG === '1' && (error as Partial).stack) {
- console.error(chalk.red((error as Error).stack));
+ logReportToConsole(runInfo);
+ await writeReportToDisk(runInfo, runInfo.details.summary.environmentId, REPORTS_ROOT_DIR);
+ } catch (error: unknown) {
+ if (error instanceof UserFacingError) {
+ console.error(chalk.red(error.message));
+ } else {
+ console.error(chalk.red('An error occurred during the assessment process:'));
+ console.error(chalk.red(error));
+ if (process.env.DEBUG === '1' && (error as Partial).stack) {
+ console.error(chalk.red((error as Error).stack));
+ }
}
}
}
diff --git a/runner/index.ts b/runner/index.ts
index 3c0e1f48..20caf85d 100644
--- a/runner/index.ts
+++ b/runner/index.ts
@@ -37,7 +37,6 @@ export {
type McpServerOptions,
type PromptDataMessage,
} from './codegen/llm-runner.js';
-export {GenkitRunner} from './codegen/genkit/genkit-runner.js';
export {GeminiCliRunner} from './codegen/gemini-cli-runner.js';
export {getRunnerByName, type RunnerName} from './codegen/runner-creation.js';
export {getEnvironmentByPath} from './configuration/environment-resolution.js';
@@ -49,3 +48,8 @@ export {NoopProgressLogger} from './progress/noop-progress-logger.js';
export {TextProgressLogger} from './progress/text-progress-logger.js';
export {type ServeTestingResult} from './workers/serve-testing/worker-types.js';
export {replaceAtReferencesInPrompt} from './utils/prompt-at-references.js';
+export {extractRubrics} from './utils/extract-rubrics.js';
+export {combineReports} from './utils/combine-reports.mjs';
+export {writeReportToDisk} from './reporting/report-logging.js';
+export {AiSdkRunner} from './codegen/ai-sdk/ai-sdk-runner.js';
+export {type AiSdkModelOptions as AiSDKModelOptions} from './codegen/ai-sdk/ai-sdk-model-options.js';
diff --git a/runner/init-cli.ts b/runner/init-cli.ts
index c5a622ca..8824e911 100644
--- a/runner/init-cli.ts
+++ b/runner/init-cli.ts
@@ -1,12 +1,10 @@
import {Argv, CommandModule, Options} from 'yargs';
-import {input, confirm} from '@inquirer/prompts';
-import chalk from 'chalk';
+import {input} from '@inquirer/prompts';
import {join, relative, dirname} from 'path';
import {cp} from 'fs/promises';
import {formatTitleCard} from './reporting/format.js';
import {generateId} from './utils/id-generation.js';
import {safeWriteFile, toProcessAbsolutePath} from './file-system-utils.js';
-import {MODEL_PROVIDERS} from './codegen/genkit/models.js';
export const InitModule = {
builder,
@@ -55,22 +53,6 @@ async function getAnswers(): Promise {
// Add some spaces at the end to align to the text of the line above.
const newLineSeparator = '\n ';
- const apiKeyVariables = MODEL_PROVIDERS.map(p => p.apiKeyVariableName);
-
- if (!apiKeyVariables.some(name => process.env[name])) {
- const hasConfirmed = await confirm({
- message: chalk.red(
- `Could not detect an API key in any of the following environment variables: ${apiKeyVariables.join(', ')}` +
- newLineSeparator +
- 'You may not be able to run the evals. Do you want to continue generating an environment anyway?',
- ),
- });
-
- if (!hasConfirmed) {
- return null;
- }
- }
-
const displayName = await input({
message: 'What will be the name of your environment?',
required: true,
diff --git a/runner/orchestration/build-serve-test-loop.ts b/runner/orchestration/build-serve-test-loop.ts
index 2c577bcd..884c90c5 100644
--- a/runner/orchestration/build-serve-test-loop.ts
+++ b/runner/orchestration/build-serve-test-loop.ts
@@ -4,12 +4,14 @@ import {BuildResultStatus} from '../workers/builder/builder-types.js';
import {Environment} from '../configuration/environment.js';
import {
AssessmentConfig,
+ AssessmentTimings,
AttemptDetails,
LlmContextFile,
RootPromptDefinition,
} from '../shared-interfaces.js';
+import {performance} from 'node:perf_hooks';
import {ProgressLogger} from '../progress/progress-logger.js';
-import {runBuild} from './build-worker.js';
+import {BuildType, runBuild} from './build-worker.js';
import {EvalID} from './executors/executor.js';
import {serveAndTestApp} from './serve-testing-worker.js';
import {runTest} from './test-worker.js';
@@ -53,7 +55,9 @@ export async function attemptBuildAndTest(
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
+ timings: AssessmentTimings,
) {
+ const initialBuildStart = performance.now();
const initialBuildResult = await runBuild(
evalID,
directory,
@@ -62,7 +66,9 @@ export async function attemptBuildAndTest(
abortSignal,
workerConcurrencyQueue,
progress,
+ BuildType.INITIAL_BUILD,
);
+ timings.buildDurationMs += performance.now() - initialBuildStart;
let repairAttempts = 0;
let maxRepairAttempts: number;
let maxTestRepairAttempts: number;
@@ -77,7 +83,7 @@ export async function attemptBuildAndTest(
const initialAttempt = {
outputFiles: initialResponse.files,
usage: {
- ...{inputTokens: 0, outputTokens: 0, totalTokens: 0},
+ ...{inputTokens: 0, outputTokens: 0, totalTokens: 0, thinkingTokens: 0},
...initialResponse.usage,
},
reasoning: initialResponse.reasoning,
@@ -99,6 +105,7 @@ export async function attemptBuildAndTest(
`Trying to repair app build (attempt #${repairAttempts + 1})`,
);
+ const repairStart = performance.now();
const attempt = await repairAndBuild(
evalID,
config.model,
@@ -119,6 +126,7 @@ export async function attemptBuildAndTest(
progress,
'build',
);
+ timings.repairDurationMs += performance.now() - repairStart;
attemptDetails.push(attempt);
lastAttempt = attempt;
@@ -199,6 +207,7 @@ export async function attemptBuildAndTest(
});
}
+ const repairStart = performance.now();
const attempt = await repairAndBuild(
evalID,
config.model,
@@ -223,6 +232,7 @@ export async function attemptBuildAndTest(
// further repairs and capture the failed build. This is useful insight
// as LLMs seem to regress when asked to repair violations.
if (hasBuildFailure) {
+ timings.repairDurationMs += performance.now() - repairStart;
break;
}
@@ -248,6 +258,7 @@ export async function attemptBuildAndTest(
workerConcurrencyQueue,
progress,
)) ?? undefined;
+ timings.repairDurationMs += performance.now() - repairStart;
if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {
progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);
diff --git a/runner/orchestration/build-worker.ts b/runner/orchestration/build-worker.ts
index 6e993409..87314258 100644
--- a/runner/orchestration/build-worker.ts
+++ b/runner/orchestration/build-worker.ts
@@ -2,9 +2,18 @@ import {BuildResult, BuildResultStatus} from '../workers/builder/builder-types.j
import {Environment} from '../configuration/environment.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {RootPromptDefinition} from '../shared-interfaces.js';
-import {EvalID, Executor} from './executors/executor.js';
+import {EvalID} from './executors/executor.js';
import PQueue from 'p-queue';
+export enum BuildType {
+ /** Initial build of an eval */
+ INITIAL_BUILD,
+ /** A build attempt as part of a repair. */
+ REPAIR_ATTEMPT_BUILD,
+  /** A build attempt as part of a test repair. */
+ TEST_ATTEMPT_REPAIR,
+}
+
/** Attempts to build the code. */
export async function runBuild(
evalID: EvalID,
@@ -14,8 +23,26 @@ export async function runBuild(
abortSignal: AbortSignal,
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
+ type: BuildType,
): Promise {
- progress.log(rootPromptDef, 'build', `Building the app`);
+ let suffix: string;
+ let label: string;
+ switch (type) {
+ case BuildType.INITIAL_BUILD:
+ suffix = '';
+ label = 'Initial build';
+ break;
+ case BuildType.REPAIR_ATTEMPT_BUILD:
+ suffix = ' (for a repair attempt)';
+ label = 'Repair build';
+ break;
+ case BuildType.TEST_ATTEMPT_REPAIR:
+ suffix = ' (for a test repair attempt)';
+ label = 'Test repair build';
+ break;
+ }
+
+ progress.log(rootPromptDef, 'build', `Building the app${suffix}`);
try {
const result = await env.executor.performBuild(
@@ -27,13 +54,13 @@ export async function runBuild(
progress,
);
if (result.status === BuildResultStatus.SUCCESS) {
- progress.log(rootPromptDef, 'success', 'Build is successful');
+ progress.log(rootPromptDef, 'success', `${label} is successful`);
} else {
- progress.log(rootPromptDef, 'error', 'Build has failed', result.message);
+ progress.log(rootPromptDef, 'error', `${label} has failed`, result.message);
}
return result;
} catch (err) {
- progress.log(rootPromptDef, 'error', `Error during build process`, err + '');
+ progress.log(rootPromptDef, 'error', `Error during ${label}`, err + '');
throw err;
}
}
diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts
index 912d3a28..fe19a94d 100644
--- a/runner/orchestration/codegen.ts
+++ b/runner/orchestration/codegen.ts
@@ -48,6 +48,7 @@ export async function generateCodeWithAI(
inputTokens: response.usage?.inputTokens ?? 0,
outputTokens: response.usage?.outputTokens ?? 0,
totalTokens: response.usage?.totalTokens ?? 0,
+ thinkingTokens: response.usage?.thinkingTokens ?? 0,
};
reasoning = response.reasoning;
toolLogs = response.toolLogs ?? [];
@@ -65,11 +66,11 @@ export async function generateCodeWithAI(
success = true;
} catch (error) {
- usage = {inputTokens: 0, outputTokens: 0, totalTokens: 0};
+ usage = {inputTokens: 0, outputTokens: 0, totalTokens: 0, thinkingTokens: 0};
success = false;
reasoning = '';
toolLogs = [];
- errors.push(error + '');
+ errors.push(`${error}${error instanceof Error ? `\nStack: ${error.stack}` : ''}`);
}
return {
@@ -128,6 +129,7 @@ export async function repairCodeWithAI(
);
if (response.success) {
+ env.augmentResponseFiles(response.outputFiles);
progress.log(
promptDef,
'codegen',
@@ -161,7 +163,20 @@ export function prepareContextFilesMessage(
}
export function createLlmResponseTokenUsageMessage(response: LlmResponse): string | null {
- return response.usage.inputTokens || response.usage.outputTokens || response.usage.totalTokens
- ? `(input tokens: ${response.usage.inputTokens}, output tokens: ${response.usage.outputTokens}, total tokens: ${response.usage.totalTokens})`
- : null;
+ const usage = response?.usage;
+ if (!usage) {
+ return null;
+ }
+
+  // Build the token detail string parts.
+ const input = usage.inputTokens !== undefined ? `input tokens: ${usage.inputTokens}` : '';
+ const output = usage.outputTokens !== undefined ? `output tokens: ${usage.outputTokens}` : '';
+ const thinking =
+ usage.thinkingTokens !== undefined ? `thinking tokens: ${usage.thinkingTokens}` : '';
+ const total = usage.totalTokens !== undefined ? `total tokens: ${usage.totalTokens}` : '';
+
+  // Filter out empty strings and join with a separator.
+ const parts = [input, output, thinking, total].filter(part => part !== '');
+
+ return parts.length > 0 ? `(${parts.join(', ')})` : null;
}
diff --git a/runner/orchestration/executors/executor.ts b/runner/orchestration/executors/executor.ts
index 7eaaa2d9..229bb61d 100644
--- a/runner/orchestration/executors/executor.ts
+++ b/runner/orchestration/executors/executor.ts
@@ -1,6 +1,7 @@
import PQueue from 'p-queue';
-import {ProgressLogger} from '../../progress/progress-logger.js';
-import {
+import z from 'zod';
+import type {ProgressLogger} from '../../progress/progress-logger.js';
+import type {
LlmContextFile,
LlmGenerateFilesRequest,
LlmResponse,
@@ -8,9 +9,13 @@ import {
RootPromptDefinition,
TestExecutionResult,
} from '../../shared-interfaces.js';
-import {BuildResult} from '../../workers/builder/builder-types.js';
-import z from 'zod';
-import {ServeTestingResult} from '../../workers/serve-testing/worker-types.js';
+import type {BuildResult} from '../../workers/builder/builder-types.js';
+import type {ServeTestingResult} from '../../workers/serve-testing/worker-types.js';
+import type {
+ ExecutorAutoRateResponse,
+ ExecutorCodeAutoRateRequest,
+ ExecutorVisualAutoRateRequest,
+} from '../../ratings/autoraters/auto-rate-shared.js';
export type EvalID = string & {__evalID: true};
@@ -18,7 +23,10 @@ export type EvalID = string & {__evalID: true};
export type WorkerQueueType = PQueue;
export const executorSchema = z.object({
- initializeEval: z.function(z.tuple([]), z.promise(z.custom())),
+ initializeEval: z.function(
+ z.tuple([z.custom()]),
+ z.promise(z.custom()),
+ ),
generateInitialFiles: z.function(
z.tuple([
z.custom().describe('ID of the eval'),
@@ -65,6 +73,9 @@ export const executorSchema = z.object({
z.string().describe('Path to the application directory'),
z.custom().describe('Root prompt definition'),
z.custom().describe('Progress logger'),
+ z
+ .custom()
+ .describe('Abort Signal to fire when the server should be canceled.'),
z
.function(
z.tuple([z.string().describe('URL of the running server')]),
@@ -121,6 +132,28 @@ export const executorSchema = z.object({
}),
),
),
+ autoRateCode: z
+ .function(
+ z.tuple([
+ z.custom().describe('Context for the automated code rating'),
+ z
+ .custom()
+ .describe('Abort Signal to fire when the request should be canceled.'),
+ ]),
+ z.promise(z.custom()),
+ )
+ .optional(),
+ autoRateVisuals: z
+ .function(
+ z.tuple([
+        z.custom().describe('Context for the automated visual rating'),
+ z
+ .custom()
+ .describe('Abort Signal to fire when the request should be canceled.'),
+ ]),
+ z.promise(z.custom()),
+ )
+ .optional(),
});
export type Executor = z.infer;
diff --git a/runner/orchestration/executors/local-executor.ts b/runner/orchestration/executors/local-executor.ts
index d64f018b..2b6bf814 100644
--- a/runner/orchestration/executors/local-executor.ts
+++ b/runner/orchestration/executors/local-executor.ts
@@ -1,5 +1,5 @@
import {ChildProcess, fork} from 'node:child_process';
-import path, {join} from 'node:path';
+import path from 'node:path';
import PQueue from 'p-queue';
import {LlmRunner, McpServerDetails} from '../../codegen/llm-runner.js';
import {getRunnerByName, RunnerName} from '../../codegen/runner-creation.js';
@@ -36,12 +36,15 @@ export class LocalExecutor implements Executor {
constructor(
public config: LocalExecutorConfig,
- runnerName: RunnerName = 'genkit',
+ runnerOrName: RunnerName | LlmRunner = 'noop-unimplemented',
) {
- this.llm = getRunnerByName(runnerName);
+ this.llm =
+ typeof runnerOrName === 'string'
+ ? getRunnerByName(runnerOrName)
+ : Promise.resolve(runnerOrName);
}
- async initializeEval(): Promise {
+ async initializeEval(_prompt: RootPromptDefinition): Promise {
return `${uniqueIDs++}` as EvalID;
}
@@ -178,6 +181,7 @@ export class LocalExecutor implements Executor {
appDirectoryPath: string,
rootPromptDef: RootPromptDefinition,
progress: ProgressLogger,
+ abortSignal: AbortSignal,
logicWhileServing: (serveUrl: string) => Promise,
): Promise {
// Serve testing is explicitly disabled.
@@ -190,6 +194,7 @@ export class LocalExecutor implements Executor {
rootPromptDef,
appDirectoryPath,
progress,
+ abortSignal,
logicWhileServing,
);
}
diff --git a/runner/orchestration/file-system.ts b/runner/orchestration/file-system.ts
index 94ca7896..eaad5173 100644
--- a/runner/orchestration/file-system.ts
+++ b/runner/orchestration/file-system.ts
@@ -1,5 +1,4 @@
-import {tmpdir} from 'os';
-import {LLM_OUTPUT_DIR} from '../configuration/constants.js';
+import {LLM_OUTPUT_DIR, WCS_BASE_TMP_DIR} from '../configuration/constants.js';
import {Environment} from '../configuration/environment.js';
import {
copyFolderExcept,
@@ -8,7 +7,7 @@ import {
safeWriteFile,
} from '../file-system-utils.js';
import {LlmContextFile, LlmResponseFile, RootPromptDefinition} from '../shared-interfaces.js';
-import {join} from 'path';
+import {join, resolve} from 'path';
import {existsSync} from 'fs';
import {mkdir, mkdtemp, readFile} from 'fs/promises';
import {globSync} from 'tinyglobby';
@@ -25,6 +24,7 @@ const PENDING_INSTALLS = new Map>();
* @param env Environment that is currently being run.
* @param rootPromptDef Definition of the root prompt.
* @param progress Logger to use to log out the current progress.
+ * @param tmpdirBasePath Base path for temporary directories (like `/tmp`).
* @param outputDirectory Custom output directory specified by the user.
* @returns Temporary directory in which to build and a function used to clean in up.
*/
@@ -32,6 +32,7 @@ export async function setupProjectStructure(
env: Environment,
rootPromptDef: RootPromptDefinition,
progress: ProgressLogger,
+ tmpdirBasePath: string = WCS_BASE_TMP_DIR,
outputDirectory?: string,
) {
let directory: string;
@@ -48,7 +49,8 @@ export async function setupProjectStructure(
cleanup = () => Promise.resolve();
} else {
// When outputting to the temporary directory, make sure that the directory is unique.
- directory = await mkdtemp(join(tmpdir(), `fw-${env.id}-build-${rootPromptDef.name}`));
+ await mkdir(tmpdirBasePath, {recursive: true});
+ directory = await mkdtemp(join(tmpdirBasePath, `fw-${env.id}-build-${rootPromptDef.name}`));
cleanup = async () => {
try {
@@ -60,7 +62,7 @@ export async function setupProjectStructure(
const directoriesToCopy: string[] = [];
if (env.executor instanceof LocalExecutor && env.executor.config.projectTemplate) {
- const projectTemplatePath = join(env.rootPath, env.executor.config.projectTemplate);
+ const projectTemplatePath = resolve(env.rootPath, env.executor.config.projectTemplate);
// Copy the template files first.
directoriesToCopy.push(projectTemplatePath);
@@ -80,7 +82,7 @@ export async function setupProjectStructure(
}
if (env.executor instanceof LocalExecutor && env.executor.config.sourceDirectory) {
- const sourceDirectory = join(env.rootPath, env.executor.config.sourceDirectory);
+ const sourceDirectory = resolve(env.rootPath, env.executor.config.sourceDirectory);
// Push this after the project so the environment's files that precedence.
directoriesToCopy.push(sourceDirectory);
@@ -174,14 +176,7 @@ export async function resolveContextFiles(
const paths = globSync(patterns, {
cwd: directory,
- ignore: [
- '**/node_modules/**',
- '**/README.md',
- '**/package-lock.json',
- '**/package.json',
- '**/angular.json',
- '**/.vinxi/**',
- ],
+ ignore: ['**/node_modules/**', '**/README.md', '**/package-lock.json', '**/.vinxi/**'],
});
return Promise.all(
diff --git a/runner/orchestration/generate-eval-task.ts b/runner/orchestration/generate-eval-task.ts
index 8033bd7f..83484db5 100644
--- a/runner/orchestration/generate-eval-task.ts
+++ b/runner/orchestration/generate-eval-task.ts
@@ -1,5 +1,4 @@
import PQueue from 'p-queue';
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {
AssessmentConfig,
@@ -12,11 +11,14 @@ import {EvalID} from './executors/executor.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js';
import {generateInitialFiles} from './generate-initial-files.js';
-import {generateUserJourneysForApp} from './user-journeys.js';
+import {generateUserJourneysForApp, UserJourneysResult} from './user-journeys.js';
import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
import {attemptBuildAndTest} from './build-serve-test-loop.js';
import {rateGeneratedCode} from '../ratings/rate-code.js';
import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
+import assert from 'node:assert';
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
+import {performance} from 'node:perf_hooks';
/**
* Creates and executes a task to generate or load code for a given prompt,
@@ -25,24 +27,14 @@ import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
* This function handles both online (AI-generated) and local (file-based) code retrieval.
* It manages build attempts and AI-driven repair cycles.
*
- * @param evalID ID of the evaluation task.
- * @param env Environment for this evaluation.
- * @param model Name of the LLM to use.
- * @param rootPromptDef Definition of the root prompt being processed.
- * @param localMode A boolean indicating whether to load code from local files instead of generating it.
- * @param skipScreenshots Whether to skip taking screenshot of a running application.
- * @param outputDirectory Directory in which to generate the output. Convenient for debugging.
- * @param abortSignal Abort signal for when the evaluation task should be aborted.
- * @param skipAxeTesting Whether or not to skip Axe testing of the app.
- * @param enableUserJourneyTesting Whether to enable user journey testing of generated apps.
- * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
* @returns A Promise that resolves to an AssessmentResult object containing all details of the task's execution.
*/
export async function startEvaluationTask(
config: AssessmentConfig,
evalID: EvalID,
env: Environment,
- ratingLlm: GenkitRunner,
+ autoraterLlm: AiSdkRunner | null,
+ cujGenerationLlm: AiSdkRunner | null,
rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
abortSignal: AbortSignal,
workerConcurrencyQueue: PQueue,
@@ -69,6 +61,7 @@ export async function startEvaluationTask(
// and for each sub-prompt, because the project will be augmented on each iteration.
const contextFiles = await resolveContextFiles(promptDef.contextFilePatterns, directory);
+ const generateStart = performance.now();
// Generate the initial set of files through the LLM.
const initialResponse = await generateInitialFiles(
config,
@@ -85,6 +78,7 @@ export async function startEvaluationTask(
abortSignal,
progress,
);
+ const generateDurationMs = performance.now() - generateStart;
const toolLogs = initialResponse.toolLogs ?? [];
@@ -128,26 +122,28 @@ export async function startEvaluationTask(
break;
}
- const userJourneys = config.enableUserJourneyTesting
- ? await generateUserJourneysForApp(
- ratingLlm,
- rootPromptDef.name,
- defsToExecute[0].prompt,
- initialResponse.files,
- abortSignal,
- )
- : undefined;
-
- // TODO: Only execute the serve command on the "final working attempt".
- // TODO: Incorporate usage.
- const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = userJourneys
- ? {
- userJourneys: userJourneys.result,
- appPrompt: defsToExecute[0].prompt,
- }
- : undefined;
+ let userJourneys: UserJourneysResult | undefined = undefined;
+ let userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = undefined;
+
+ if (config.enableUserJourneyTesting) {
+ assert(cujGenerationLlm, 'Expected a CUJ generation LLM to be available.');
+ userJourneys = await generateUserJourneysForApp(
+ cujGenerationLlm,
+ rootPromptDef.name,
+ defsToExecute[0].prompt,
+ initialResponse.files,
+ abortSignal,
+ );
+
+ // TODO: Incorporate usage.
+ userJourneyAgentTaskInput = {
+ userJourneys: userJourneys.result,
+ appPrompt: defsToExecute[0].prompt,
+ };
+ }
const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
+ const timings = {generateDurationMs, buildDurationMs: 0, repairDurationMs: 0};
// Try to build the files in the root prompt directory.
// This will also attempt to fix issues with the generated code.
@@ -164,6 +160,7 @@ export async function startEvaluationTask(
workerConcurrencyQueue,
progress,
userJourneyAgentTaskInput,
+ timings,
);
if (!attempt) {
@@ -172,7 +169,7 @@ export async function startEvaluationTask(
}
const score = await rateGeneratedCode(
- ratingLlm,
+ autoraterLlm,
env,
promptDef,
fullPromptText,
@@ -205,6 +202,7 @@ export async function startEvaluationTask(
toolLogs,
testResult: attempt.testResult ?? null,
testRepairAttempts: attempt.testRepairAttempts,
+ timings,
} satisfies AssessmentResult);
}
diff --git a/runner/orchestration/generate-initial-files.ts b/runner/orchestration/generate-initial-files.ts
index d6779543..6a0495dd 100644
--- a/runner/orchestration/generate-initial-files.ts
+++ b/runner/orchestration/generate-initial-files.ts
@@ -55,6 +55,8 @@ export async function generateInitialFiles(
usage: {
inputTokens: 0,
outputTokens: 0,
+ thinkingTokens: 0,
+ totalTokens: 0,
} satisfies Usage,
// TODO: We could also try save/restore reasoning locally.
reasoning: '',
@@ -73,6 +75,7 @@ export async function generateInitialFiles(
);
if (response.success) {
+ env.augmentResponseFiles(response.outputFiles);
progress.log(
promptDef,
'codegen',
@@ -88,7 +91,7 @@ export async function generateInitialFiles(
}
return {
- files: response.outputFiles!,
+ files: response.outputFiles,
usage: response.usage,
reasoning: response.reasoning,
toolLogs: response.toolLogs,
diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts
index da3b410b..7501281a 100644
--- a/runner/orchestration/generate-summary.ts
+++ b/runner/orchestration/generate-summary.ts
@@ -1,6 +1,7 @@
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
import {Environment} from '../configuration/environment.js';
import {redX} from '../reporting/format.js';
+import {chatWithReportAI} from '../reporting/report-ai-chat.js';
import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js';
@@ -9,16 +10,16 @@ import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interface
* and also some extra metadata about the run.
*/
export async function prepareSummary(
- genkit: GenkitRunner,
+ generateAiSummaryLlm: AiSdkRunner | null,
abortSignal: AbortSignal,
- model: string,
+ evalRunModel: string,
env: Environment,
assessments: AssessmentResult[],
completionStats: CompletionStats,
- opts: {skipAiSummary?: boolean},
): Promise {
let inputTokens = 0;
let outputTokens = 0;
+ let thinkingTokens = 0;
let totalTokens = 0;
assessments.forEach(result => {
@@ -26,41 +27,81 @@ export async function prepareSummary(
if (result.score.tokenUsage) {
inputTokens += result.score.tokenUsage.inputTokens;
outputTokens += result.score.tokenUsage.outputTokens;
- totalTokens += result.score.tokenUsage.totalTokens ?? 0;
+ totalTokens += result.score.tokenUsage.totalTokens;
+ thinkingTokens += result.score.tokenUsage.thinkingTokens;
}
// Incorporate usage numbers from all generate + build attempts.
result.attemptDetails.forEach(attempt => {
if (attempt.usage) {
- inputTokens += attempt.usage.inputTokens ?? 0;
- outputTokens += attempt.usage.outputTokens ?? 0;
- totalTokens += attempt.usage.totalTokens ?? 0;
+ inputTokens += attempt.usage.inputTokens;
+ outputTokens += attempt.usage.outputTokens;
+ totalTokens += attempt.usage.totalTokens;
+ thinkingTokens += attempt.usage.thinkingTokens;
}
});
});
let aiSummary: string | undefined = undefined;
- if (!opts.skipAiSummary) {
- console.log(`✨ Generating AI summary for evaluation run..`);
+ if (generateAiSummaryLlm) {
+ console.log(`✨ Generating AI summary for evaluation run...`);
try {
- const result = await summarizeReportWithAI(genkit, abortSignal, assessments);
+ const result = await summarizeReportWithAI(generateAiSummaryLlm, abortSignal, assessments);
inputTokens += result.usage.inputTokens;
outputTokens += result.usage.outputTokens;
+ thinkingTokens += result.usage.thinkingTokens;
totalTokens += result.usage.totalTokens;
aiSummary = result.responseHtml;
console.log(`✅ Generated AI summary.`);
} catch (e) {
console.log(`${redX()} Failed to generate AI summary, skipping summary.`);
- if ((e as Partial).stack) {
+
+ if (process.env.DEBUG === '1' && (e as Partial).stack) {
console.error((e as Error).stack);
}
}
}
+ const additionalAiAnalysis: {name: string; summary: string}[] = [];
+ if (generateAiSummaryLlm && env.analysisPrompts.length > 0) {
+ console.log(`✨ Generating additional AI analysis...`);
+
+ await Promise.all(
+ env.analysisPrompts.map(async config => {
+ try {
+ const result = await chatWithReportAI(
+ generateAiSummaryLlm,
+ config.prompt,
+ abortSignal,
+ assessments,
+ [],
+ config.model,
+ {
+ reportContextFilter: config.reportsFilter,
+ ratingContextFilter: config.ratingsFilter,
+ },
+ undefined,
+ );
+ inputTokens += result.usage.inputTokens;
+ outputTokens += result.usage.outputTokens;
+ thinkingTokens += result.usage.thinkingTokens;
+ totalTokens += result.usage.totalTokens;
+ additionalAiAnalysis.push({name: config.name, summary: result.responseHtml});
+ } catch (e) {
+ console.log(`${redX()} Failed custom analysis called "${config.name}".`);
+
+ if (process.env.DEBUG === '1' && (e as Partial).stack) {
+ console.error((e as Error).stack);
+ }
+ }
+ }),
+ );
+ }
+
const executorInfo = await env.executor.getExecutorInfo?.();
return {
- model,
+ model: evalRunModel,
environmentId: env.id,
displayName: env.displayName,
framework: {
@@ -74,15 +115,18 @@ export async function prepareSummary(
},
},
aiSummary,
+ additionalAiAnalysis,
completionStats: completionStats,
usage: {
inputTokens,
outputTokens,
+ thinkingTokens,
totalTokens,
},
runner: {
id: executorInfo.id,
displayName: executorInfo.displayName,
},
+ ratingHash: env.ratingHash,
} satisfies RunSummary;
}
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
index 45ec4ab5..a6278a02 100644
--- a/runner/orchestration/generate.ts
+++ b/runner/orchestration/generate.ts
@@ -3,7 +3,7 @@ import {existsSync, readdirSync} from 'fs';
import {availableParallelism} from 'os';
import PQueue from 'p-queue';
import {basename, join} from 'path';
-import {assertValidModelName} from '../codegen/llm-runner.js';
+import {assertValidModelName, LlmRunner} from '../codegen/llm-runner.js';
import {getRunnerByName} from '../codegen/runner-creation.js';
import {LLM_OUTPUT_DIR, REPORT_VERSION} from '../configuration/constants.js';
import {getEnvironmentByPath} from '../configuration/environment-resolution.js';
@@ -21,12 +21,13 @@ import {
} from '../shared-interfaces.js';
import {UserFacingError} from '../utils/errors.js';
import {executeCommand} from '../utils/exec.js';
-import {callWithTimeout} from '../utils/timeout.js';
+import {callWithTimeout, TimeoutError} from '../utils/timeout.js';
import {LocalExecutor} from './executors/local-executor.js';
import {startEvaluationTask} from './generate-eval-task.js';
import {prepareSummary} from './generate-summary.js';
import {getRunGroupId} from './grouping.js';
import {combineAbortSignals} from '../utils/abort-signal.js';
+import {RatingKind} from '../ratings/rating-types.js';
/**
* Orchestrates the entire assessment process for each prompt defined in the `prompts` array.
@@ -42,32 +43,71 @@ import {combineAbortSignals} from '../utils/abort-signal.js';
* each containing the prompt, generated code, and final validation status.
*/
export async function generateCodeAndAssess(options: AssessmentConfig): Promise {
- const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
+ const env =
+ options.environment instanceof Environment
+ ? options.environment
+ : await getEnvironmentByPath(options.environment.configPath, options.runner);
+
+ const extraCleanupFns: (() => Promise)[] = [];
const cleanup = async () => {
// Clean-up should never interrupt a potentially passing completion.
try {
- await env.executor.destroy();
+ await env.destroy();
} catch (e) {
- console.error(`Failed to destroy executor: ${e}`);
+ console.error(`Failed to destroy environment: ${e}`);
if (e instanceof Error) {
console.error(e.stack);
}
}
+
+ for (const cleanupFn of extraCleanupFns) {
+ try {
+ await cleanupFn();
+ } catch (e) {
+ console.error(`Failed cleanup: ${e}`);
+ if (e instanceof Error) {
+ console.error(e.stack);
+ }
+ }
+ }
};
// Ensure cleanup logic runs when the evaluation is aborted.
options.abortSignal?.addEventListener('abort', cleanup);
- await assertValidModelName(options.model, env.executor);
-
- const ratingLlm = await getRunnerByName('genkit');
const allTasksAbortCtrl = new AbortController();
try {
+ await assertValidModelName(options.model, env.executor);
+
const promptsToProcess = (
await getCandidateExecutablePrompts(env, options.localMode, options.promptFilter)
).slice(0, options.limit);
+ const hasLlmBasedRatings = promptsToProcess.some(p =>
+ p.kind === 'single'
+ ? // Check if some ratings are LLM based.
+ p.ratings.some(r => r.kind === RatingKind.LLM_BASED)
+ : // Check if some steps contain LLM based ratings.
+ p.steps.some(s => s.ratings.some(r => r.kind === RatingKind.LLM_BASED)),
+ );
+
+ // Only construct LLMs when necessary. This is helpful in cases where WCS is invoked
+    // as an auto-rater that doesn't have access to other LLMs.
+ const autoraterLlm = hasLlmBasedRatings ? await getRunnerByName('ai-sdk') : null;
+ const cujGenerationLlm = options.enableUserJourneyTesting
+ ? (autoraterLlm ?? (await getRunnerByName('ai-sdk')))
+ : null;
+ const generateAiSummaryLlm = !options.skipAiSummary
+ ? (autoraterLlm ?? cujGenerationLlm ?? (await getRunnerByName('ai-sdk')))
+ : null;
+
+ extraCleanupFns.push(async () => {
+ await autoraterLlm?.dispose();
+ await cujGenerationLlm?.dispose();
+ await generateAiSummaryLlm?.dispose();
+ });
+
const progress =
options.logging === 'dynamic' ? new DynamicProgressLogger() : new TextProgressLogger();
const appConcurrency =
@@ -117,56 +157,82 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
for (const rootPromptDef of promptsToProcess) {
allTasks.push(
appConcurrencyQueue.add(async () => {
- const evalID = await env.executor.initializeEval();
- let results: AssessmentResult[] | undefined;
-
- try {
- results = await callWithTimeout(
- `Evaluation of ${rootPromptDef.name}`,
- async timeoutAbortSignal =>
- startEvaluationTask(
- options,
- evalID,
- env,
- ratingLlm,
- rootPromptDef,
- combineAbortSignals(
- allTasksAbortCtrl.signal,
- timeoutAbortSignal,
- options.abortSignal,
+ const evaluate = async () => {
+ const evalID = await env.executor.initializeEval(rootPromptDef);
+ let results: AssessmentResult[] | undefined;
+
+ try {
+ results = await callWithTimeout(
+ `Evaluation of ${rootPromptDef.name}`,
+ async timeoutAbortSignal =>
+ startEvaluationTask(
+ options,
+ evalID,
+ env,
+ autoraterLlm,
+ cujGenerationLlm,
+ rootPromptDef,
+ combineAbortSignals(
+ allTasksAbortCtrl.signal,
+ timeoutAbortSignal,
+ options.abortSignal,
+ ),
+ workerConcurrencyQueue,
+ progress,
),
- workerConcurrencyQueue,
- progress,
- ),
- // 30min max per app evaluation. We just want to make sure it never gets stuck.
- // Note that this timeout is expected to never be hit as individual action timeouts
- // should fire first. E.g. local executor build or test timeouts.
- 30,
- );
- return results;
- } catch (e: unknown) {
- failedPrompts.push({
- promptName: rootPromptDef.name,
- error: `${e}`,
- stack: e instanceof Error ? e.stack : undefined,
- });
-
- let details = `Error: ${e}`;
- if (e instanceof Error && e.stack) {
- details += `\nStack: ${e.stack}`;
+            // A timeout is used to prevent stuck evaluations.
+ env.promptTimeoutMinutes ?? 10,
+ );
+ return results;
+ } finally {
+ // Gracefully finalize the eval. Errors in finalization should not propagate.
+ try {
+ await env.executor.finalizeEval(evalID);
+ } catch (e) {
+ progress.log(rootPromptDef, 'error', 'Failed to finalize eval', `${e}`);
+ }
}
+ };
+
+ // Retries + initial attempt.
+ const maxAttempts = (options.promptTimeoutRetries ?? 0) + 1;
+ let promptResults: AssessmentResult[] | null = null;
- progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details);
- return [] satisfies AssessmentResult[];
- } finally {
- // Gracefully finalize the eval. Errors in finalization should not propagate.
+ for (let attemptIdx = 0; attemptIdx < maxAttempts; attemptIdx++) {
try {
- await env.executor.finalizeEval(evalID);
- } catch (e) {
- progress.log(rootPromptDef, 'error', 'Failed to finalize eval', `${e}`);
+ promptResults = await evaluate();
+ break;
+ } catch (e: unknown) {
+ if (e instanceof TimeoutError && attemptIdx < maxAttempts - 1) {
+ continue;
+ }
+
+ failedPrompts.push({
+ promptName: rootPromptDef.name,
+ error: `${e}`,
+ stack: e instanceof Error ? e.stack : undefined,
+ });
+
+ let details = `Error: ${e}`;
+ if (e instanceof Error && e.stack) {
+ details += `\nStack: ${e.stack}`;
+ }
+
+ progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details);
+ promptResults = [];
+ break;
}
- progress.evalFinished(rootPromptDef, results || []);
}
+
+ if (promptResults === null) {
+ throw new Error(
+ `Unexpected code path. ` +
+ `There were ${maxAttempts} attempts for evaluating: ${rootPromptDef.name}`,
+ );
+ }
+
+ progress.evalFinished(rootPromptDef, promptResults);
+ return promptResults;
}),
);
}
@@ -187,7 +253,7 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
const timestamp = new Date();
const details = {
summary: await prepareSummary(
- ratingLlm,
+ generateAiSummaryLlm,
allTasksAbortCtrl.signal,
options.model,
env,
@@ -196,7 +262,6 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
allPromptsCount: promptsToProcess.length,
failedPrompts,
},
- options,
),
timestamp: timestamp.toISOString(),
reportName: options.reportName,
diff --git a/runner/orchestration/grouping.ts b/runner/orchestration/grouping.ts
index 9eb19aae..054c17c6 100644
--- a/runner/orchestration/grouping.ts
+++ b/runner/orchestration/grouping.ts
@@ -1,9 +1,8 @@
-import {createHash} from 'crypto';
-import type {LlmRunner} from '../codegen/llm-runner.js';
import type {Environment} from '../configuration/environment.js';
import {calculateBuildAndCheckStats} from '../ratings/stats.js';
import type {AssessmentResult, RunGroup, RunInfo} from '../shared-interfaces.js';
import {RunnerName} from '../codegen/runner-creation.js';
+import {getSha256Hash} from '../utils/hashing.js';
/** Generates a unique grouping ID for a run. */
export function getRunGroupId(
@@ -30,7 +29,7 @@ export function getRunGroupId(
`${options.labels?.sort().join('/')}/${options.model}/${options.runner}`;
// The group string above can get long. Hash it to something shorter and fixed length.
- return createHash('sha256').update(group).digest('hex');
+ return getSha256Hash(group);
}
/**
@@ -56,6 +55,7 @@ export function groupSimilarReports(inputRuns: RunInfo[]): RunGroup[] {
const groupResults: AssessmentResult[] = [];
const firstRun = groupRuns[0];
const labels = new Set();
+ const promptNames = new Set();
let totalForGroup = 0;
let maxForGroup = 0;
let appsCount = 0;
@@ -71,6 +71,7 @@ export function groupSimilarReports(inputRuns: RunInfo[]): RunGroup[] {
totalForRun += result.score.totalPoints;
maxForRun += result.score.maxOverallPoints;
groupResults.push(result);
+ promptNames.add(result.promptDef.name);
}
// `|| 0` in case there are no results, otherwise we'll get NaN.
@@ -91,6 +92,7 @@ export function groupSimilarReports(inputRuns: RunInfo[]): RunGroup[] {
maxOverallPoints: maxForGroup / groupRuns.length || 0,
appsCount,
labels: Array.from(labels),
+ promptNames: Array.from(promptNames),
environmentId: firstRun.details.summary.environmentId,
framework: firstRun.details.summary.framework,
model: firstRun.details.summary.model,
diff --git a/runner/orchestration/repair.ts b/runner/orchestration/repair.ts
index c7b52acd..152acc1b 100644
--- a/runner/orchestration/repair.ts
+++ b/runner/orchestration/repair.ts
@@ -7,7 +7,7 @@ import {
LlmResponseFile,
RootPromptDefinition,
} from '../shared-interfaces.js';
-import {runBuild} from './build-worker.js';
+import {BuildType, runBuild} from './build-worker.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {EvalID} from './executors/executor.js';
import {repairCodeWithAI} from './codegen.js';
@@ -69,6 +69,7 @@ export async function repairAndBuild(
abortSignal,
attempts,
progress,
+ repairType,
);
}
@@ -108,6 +109,7 @@ async function handleRepairResponse(
abortSignal: AbortSignal,
attempts: number,
progress: ProgressLogger,
+ repairType: 'build' | 'test',
): Promise {
if (!repairResponse.success) {
progress.log(
@@ -124,7 +126,7 @@ async function handleRepairResponse(
const newAttemptFiles = previousAttemptFiles.map(f => ({...f}));
mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
- writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
+ await writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
const buildResult = await runBuild(
evalID,
@@ -134,6 +136,7 @@ async function handleRepairResponse(
abortSignal,
workerConcurrencyQueue,
progress,
+ repairType === 'build' ? BuildType.REPAIR_ATTEMPT_BUILD : BuildType.TEST_ATTEMPT_REPAIR,
);
return {
diff --git a/runner/orchestration/serve-testing-worker.ts b/runner/orchestration/serve-testing-worker.ts
index 0e3b9354..c3d8d0a1 100644
--- a/runner/orchestration/serve-testing-worker.ts
+++ b/runner/orchestration/serve-testing-worker.ts
@@ -31,78 +31,74 @@ export async function serveAndTestApp(
progress.log(rootPromptDef, 'serve-testing', `Validating the running app`);
- try {
- const result = await env.executor.serveWebApplication(
- evalID,
- appDirectoryPath,
- rootPromptDef,
- progress,
- async serveUrl => {
- const serveParams: ServeTestingWorkerMessage = {
- serveUrl,
- appName: rootPromptDef.name,
- enableAutoCsp: !!config.enableAutoCsp,
- includeAxeTesting: config.skipAxeTesting === false,
- takeScreenshots: config.skipScreenshots === false,
- includeLighthouseData: config.skipLighthouse !== true,
- userJourneyAgentTaskInput,
- };
+ const result = await env.executor.serveWebApplication(
+ evalID,
+ appDirectoryPath,
+ rootPromptDef,
+ progress,
+ abortSignal,
+ async serveUrl => {
+ progress.log(rootPromptDef, 'serve-testing', `Validating the running app (URL: ${serveUrl})`);
+ const serveParams: ServeTestingWorkerMessage = {
+ serveUrl,
+ appName: rootPromptDef.name,
+ enableAutoCsp: !!config.enableAutoCsp,
+ includeAxeTesting: config.skipAxeTesting === false,
+ takeScreenshots: config.skipScreenshots === false,
+ includeLighthouseData: config.skipLighthouse !== true,
+ userJourneyAgentTaskInput,
+ };
- return await workerConcurrencyQueue.add(
- () =>
- new Promise((resolve, reject) => {
- const child: ChildProcess = fork(
- path.resolve(import.meta.dirname, '../workers/serve-testing/worker.js'),
- {signal: abortSignal},
- );
- child.send(serveParams);
+ return await workerConcurrencyQueue.add(
+ () =>
+ new Promise((resolve, reject) => {
+ const child: ChildProcess = fork(
+ path.resolve(import.meta.dirname, '../workers/serve-testing/worker.js'),
+ {signal: abortSignal},
+ );
+ child.send(serveParams);
- child.on('message', async (result: ServeTestingWorkerResponseMessage) => {
- if (result.type === 'result') {
- try {
- await killChildProcessWithSigterm(child);
- } catch (e) {
- progress.debugLog(`Error while killing serve testing worker: ${e}`);
- }
- resolve(result.payload);
- } else {
- progress.log(
- rootPromptDef,
- result.payload.state,
- result.payload.message,
- result.payload.details,
- );
- }
- });
- child.on('error', async err => {
+ child.on('message', async (result: ServeTestingWorkerResponseMessage) => {
+ if (result.type === 'result') {
try {
await killChildProcessWithSigterm(child);
} catch (e) {
progress.debugLog(`Error while killing serve testing worker: ${e}`);
}
- reject(err);
- });
- }),
- );
- },
- );
-
- // An executor might define `serveWebApplication` but conditionally decide
- // that no web application can be started/served.
- if (result === null) {
- return null;
- }
+ resolve(result.payload);
+ } else {
+ progress.log(
+ rootPromptDef,
+ result.payload.state,
+ result.payload.message,
+ result.payload.details,
+ );
+ }
+ });
+ child.on('error', async err => {
+ try {
+ await killChildProcessWithSigterm(child);
+ } catch (e) {
+ progress.debugLog(`Error while killing serve testing worker: ${e}`);
+ }
+ reject(err);
+ });
+ }),
+ );
+ },
+ );
- if (result.errorMessage === undefined) {
- progress.log(rootPromptDef, 'success', 'Validation of running app is successful');
- } else {
- progress.log(rootPromptDef, 'error', 'Validation of running app failed', result.errorMessage);
- }
+ // An executor might define `serveWebApplication` but conditionally decide
+ // that no web application can be started/served.
+ if (result === null) {
+ return null;
+ }
- return result;
- } catch (e) {
- progress.log(rootPromptDef, 'error', 'Error while trying to validate running app', `${e}`);
+ if (result.errorMessage === undefined) {
+ progress.log(rootPromptDef, 'success', 'Validation of running app is successful');
+ } else {
+ progress.log(rootPromptDef, 'error', 'Validation of running app failed', result.errorMessage);
}
- return null;
+ return result;
}
diff --git a/runner/orchestration/user-journeys.ts b/runner/orchestration/user-journeys.ts
index ef6165f7..c8e0dc8b 100644
--- a/runner/orchestration/user-journeys.ts
+++ b/runner/orchestration/user-journeys.ts
@@ -1,7 +1,7 @@
import {z} from 'zod';
import {LlmResponseFile, Usage} from '../shared-interfaces.js';
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {UserFacingError} from '../utils/errors.js';
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
// NOTE: When changing this, also change `browser-agent`'s prompt!
const USER_JOURNEY_SCHEMA = z.object({
@@ -25,7 +25,7 @@ export interface UserJourneysResult {
}
export async function generateUserJourneysForApp(
- llm: GenkitRunner,
+ llm: AiSdkRunner,
appName: string,
appPrompt: string,
appFiles: LlmResponseFile[],
@@ -131,6 +131,7 @@ Create a modern, single-page web application that allows users to find recipes b
inputTokens: result.usage?.inputTokens ?? 0,
outputTokens: result.usage?.outputTokens ?? 0,
totalTokens: result.usage?.totalTokens ?? 0,
+ thinkingTokens: result.usage?.thinkingTokens ?? 0,
},
result: result.output,
};
diff --git a/runner/ratings/autoraters/auto-rate-shared.ts b/runner/ratings/autoraters/auto-rate-shared.ts
index 0f3ef7fc..24a3f8c6 100644
--- a/runner/ratings/autoraters/auto-rate-shared.ts
+++ b/runner/ratings/autoraters/auto-rate-shared.ts
@@ -1,4 +1,7 @@
-import {Usage} from '../../shared-interfaces.js';
+import type {LlmContextFile, Usage} from '../../shared-interfaces.js';
+
+/** Minimum rating that the LLM can assign. */
+export const MIN_RATING = 1;
/** Maximum rating that the LLM can assign. */
export const MAX_RATING = 10;
@@ -13,8 +16,45 @@ export interface AutoRateResult {
};
}
-export function getCoefficient(rating: number): number {
- const percent = rating / MAX_RATING;
+/** Request for executor to auto-rate generated code. */
+export interface ExecutorCodeAutoRateRequest {
+ /** Prompt used for the rating. */
+ ratingPrompt: string;
+ /** Files that should be rated. */
+ files: LlmContextFile[];
+ /** Minimum score. */
+ minRating: number;
+  /** Maximum score. */
+ maxRating: number;
+}
+
+export interface ExecutorVisualAutoRateRequest {
+ /** Prompt used for the rating. */
+ ratingPrompt: string;
+ /** URL to the image to be rated. */
+ imageUrl: string;
+ /** base64 representation of the image. */
+ base64Image: string;
+ /** Minimum score. */
+ minRating: number;
+  /** Maximum score. */
+ maxRating: number;
+}
+
+/** Response from the executor to an automated rating request. */
+export interface ExecutorAutoRateResponse {
+ /** Score of the rating. */
+ rating: number;
+ /** Text summary of the result. */
+ summary: string;
+ /** Categories of the rating and related descriptions. */
+ categories: {name: string; message: string}[];
+ /** Usage information about the auto rate request. */
+ usage?: Usage;
+}
+
+export function getCoefficient(rating: number, maxRating: number): number {
+ const percent = rating / maxRating;
// More than 80% is a perfect score.
if (percent >= 0.8) {
diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts
index 3e016964..72deff06 100644
--- a/runner/ratings/autoraters/code-rater.ts
+++ b/runner/ratings/autoraters/code-rater.ts
@@ -2,16 +2,17 @@ import {readFileSync} from 'node:fs';
import {z} from 'zod';
import {prepareContextFilesMessage} from '../../orchestration/codegen.js';
import {Environment} from '../../configuration/environment.js';
+import {IndividualAssessmentState, LlmResponseFile, Usage} from '../../shared-interfaces.js';
import {
- IndividualAssessment,
- IndividualAssessmentState,
- LlmResponseFile,
- SkippedIndividualAssessment,
-} from '../../shared-interfaces.js';
-import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js';
-import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
+ AutoRateResult,
+ ExecutorAutoRateResponse,
+ getCoefficient,
+ MAX_RATING,
+ MIN_RATING,
+} from './auto-rate-shared.js';
import defaultCodeRaterPrompt from './code-rating-prompt.js';
import {RatingsResult} from '../rating-types.js';
+import {AiSdkRunner} from '../../codegen/ai-sdk/ai-sdk-runner.js';
/** Framework-specific hints for the rating prompt. */
const FW_HINTS: Record = {
@@ -38,7 +39,7 @@ const CACHED_RATING_PROMPTS: Record = {};
* @param ratingsResult Context containing results from previous ratings.
*/
export async function autoRateCode(
- llm: GenkitRunner,
+ llm: AiSdkRunner,
abortSignal: AbortSignal,
model: string,
environment: Environment,
@@ -46,13 +47,7 @@ export async function autoRateCode(
appPrompt: string,
ratingsResult: RatingsResult,
): Promise {
- const contextMessage = prepareContextFilesMessage(
- files.map(o => ({
- relativePath: o.filePath,
- content: o.code,
- })),
- );
-
+ const contextFiles = files.map(o => ({relativePath: o.filePath, content: o.code}));
let promptText: string;
if (environment.codeRatingPromptPath) {
@@ -80,31 +75,56 @@ export async function autoRateCode(
SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
}).result;
- const result = await llm.generateConstrained({
- abortSignal,
- messages: contextMessage ? [contextMessage] : [],
- model,
- prompt,
- skipMcp: true,
- schema: z.object({
- rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`),
- summary: z.string().describe('Summary of the overall code quality.'),
- categories: z.array(
- z.object({
- name: z.string().describe('Category name'),
- message: z.string().describe('Short description of the problem.'),
- }),
- ),
- }),
- });
+ let output: ExecutorAutoRateResponse;
+ let usage: Usage | null;
+
+ if (environment.executor.autoRateCode) {
+ output = await environment.executor.autoRateCode(
+ {
+ ratingPrompt: prompt,
+ files: contextFiles,
+ minRating: MIN_RATING,
+ maxRating: MAX_RATING,
+ },
+ abortSignal,
+ );
+ usage = output.usage || null;
+ } else {
+ // TODO(crisbeto): move this into the local executor once
+    // `Executor.autoRateCode` becomes a required method.
+ const contextMessage = prepareContextFilesMessage(contextFiles);
+ const result = await llm.generateConstrained({
+ abortSignal,
+ messages: contextMessage ? [contextMessage] : [],
+ model,
+ prompt,
+ skipMcp: true,
+ schema: z.object({
+ rating: z
+ .number()
+ .describe(`Rating from ${MIN_RATING}-${MAX_RATING}. Best is ${MAX_RATING}.`),
+ summary: z.string().describe('Summary of the overall code quality.'),
+ categories: z.array(
+ z.object({
+ name: z.string().describe('Category name'),
+ message: z.string().describe('Short description of the problem.'),
+ }),
+ ),
+ }),
+ });
+
+ output = result.output!;
+ usage = result.usage || null;
+ }
return {
- coefficient: getCoefficient(result.output!.rating),
+ coefficient: getCoefficient(output.rating, MAX_RATING),
usage: {
- inputTokens: result.usage?.inputTokens ?? 0,
- outputTokens: result.usage?.outputTokens ?? 0,
- totalTokens: result.usage?.totalTokens ?? 0,
+ inputTokens: usage?.inputTokens ?? 0,
+ outputTokens: usage?.outputTokens ?? 0,
+ totalTokens: usage?.totalTokens ?? 0,
+ thinkingTokens: usage?.thinkingTokens ?? 0,
},
- details: result.output!,
+ details: output,
};
}
diff --git a/runner/ratings/autoraters/rate-files.ts b/runner/ratings/autoraters/rate-files.ts
index e11724e7..f3dc1649 100644
--- a/runner/ratings/autoraters/rate-files.ts
+++ b/runner/ratings/autoraters/rate-files.ts
@@ -1,15 +1,10 @@
import {greenCheckmark} from '../../reporting/format.js';
-import {
- AutoraterRunInfo,
- IndividualAssessment,
- LlmResponseFile,
- SkippedIndividualAssessment,
-} from '../../shared-interfaces.js';
+import {AutoraterRunInfo, LlmResponseFile} from '../../shared-interfaces.js';
import {autoRateCode} from './code-rater.js';
import {autoRateAppearance} from './visuals-rater.js';
import {Environment} from '../../configuration/environment.js';
-import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
import {RatingsResult} from '../rating-types.js';
+import {AiSdkRunner} from '../../codegen/ai-sdk/ai-sdk-runner.js';
/**
* Automatically rates the code inside of a file.
@@ -22,7 +17,7 @@ import {RatingsResult} from '../rating-types.js';
* @param ratingsResult Context containing results from previous ratings.
*/
export async function autoRateFiles(
- llm: GenkitRunner,
+ llm: AiSdkRunner,
abortSignal: AbortSignal,
model: string,
environment: Environment,
diff --git a/runner/ratings/autoraters/visuals-rater.ts b/runner/ratings/autoraters/visuals-rater.ts
index e1b77d6e..c66ec3b0 100644
--- a/runner/ratings/autoraters/visuals-rater.ts
+++ b/runner/ratings/autoraters/visuals-rater.ts
@@ -1,10 +1,21 @@
import {z} from 'zod';
import {PromptDataMessage} from '../../codegen/llm-runner.js';
-import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js';
-import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
+import {
+ AutoRateResult,
+ ExecutorAutoRateResponse,
+ getCoefficient,
+ MAX_RATING,
+ MIN_RATING,
+} from './auto-rate-shared.js';
import defaultVisualRaterPrompt from './visual-rating-prompt.js';
import {Environment} from '../../configuration/environment.js';
import {screenshotUrlToPngBuffer} from '../../utils/screenshots.js';
+import {Usage} from '../../shared-interfaces.js';
+import {AiSdkRunner} from '../../codegen/ai-sdk/ai-sdk-runner.js';
+import {readFileSync} from 'fs';
+
+/** Cache for visual rating prompts that have been read from disk. */
+const CACHED_VISUAL_RATING_PROMPTS: Record = {};
/**
* Automatically rate the appearance of a screenshot using an LLM.
@@ -17,7 +28,7 @@ import {screenshotUrlToPngBuffer} from '../../utils/screenshots.js';
* @param label Label for the rating, used for logging.
*/
export async function autoRateAppearance(
- llm: GenkitRunner,
+ llm: AiSdkRunner,
abortSignal: AbortSignal,
model: string,
environment: Environment,
@@ -25,56 +36,85 @@ export async function autoRateAppearance(
screenshotPngUrl: string,
label: string,
): Promise {
- const prompt = environment.renderPrompt(defaultVisualRaterPrompt, null, {
+ let promptText: string;
+ if (environment.visualRatingPromptPath) {
+ CACHED_VISUAL_RATING_PROMPTS[environment.visualRatingPromptPath] ??= readFileSync(
+ environment.visualRatingPromptPath,
+ 'utf8',
+ );
+ promptText = CACHED_VISUAL_RATING_PROMPTS[environment.visualRatingPromptPath];
+ } else {
+ promptText = defaultVisualRaterPrompt;
+ }
+
+ const prompt = environment.renderPrompt(promptText, environment.visualRatingPromptPath, {
APP_PROMPT: appPrompt,
}).result;
- const messages: PromptDataMessage[] = [
- {
- role: 'user',
- content: [
- {
- media: {
- base64PngImage: (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64'),
- url: screenshotPngUrl,
- },
- },
- ],
- },
- ];
+ const base64Image = (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64');
- const result = await llm.generateConstrained({
- abortSignal,
- messages,
- prompt,
- model,
- skipMcp: true,
- timeout: {
- description: `Rating screenshot of ${label} using ${model}`,
- durationInMins: 2.5,
- },
- schema: z.object({
- rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`),
- summary: z
- .string()
- .describe('Summary of the overall app, talking about concrete features, super concise.'),
- categories: z.array(
- z.object({
- name: z.string().describe('Category name'),
- message: z.string().describe('Short description of what is missing.'),
- }),
- ),
- }),
- });
+ let output: ExecutorAutoRateResponse;
+ let usage: Usage | null;
+
+ if (environment.executor.autoRateVisuals) {
+ output = await environment.executor.autoRateVisuals(
+ {
+ ratingPrompt: prompt,
+ imageUrl: screenshotPngUrl,
+ base64Image,
+ minRating: MIN_RATING,
+ maxRating: MAX_RATING,
+ },
+ abortSignal,
+ );
+ usage = output.usage || null;
+ } else {
+ // TODO(crisbeto): move this into the local executor once
+ // `Executor.autoRateVisuals` becomes a required method.
+ const messages: PromptDataMessage[] = [
+ {
+ role: 'user',
+ content: [{media: {base64PngImage: base64Image, url: screenshotPngUrl}}],
+ },
+ ];
+
+ const result = await llm.generateConstrained({
+ abortSignal,
+ messages,
+ prompt,
+ model,
+ skipMcp: true,
+ timeout: {
+ description: `Rating screenshot of ${label} using ${model}`,
+ durationInMins: 2.5,
+ },
+ schema: z.object({
+ rating: z
+ .number()
+ .describe(`Rating from ${MIN_RATING}-${MAX_RATING}. Best is ${MAX_RATING}.`),
+ summary: z
+ .string()
+ .describe('Summary of the overall app, talking about concrete features, super concise.'),
+ categories: z.array(
+ z.object({
+ name: z.string().describe('Category name'),
+ message: z.string().describe('Short description of what is missing.'),
+ }),
+ ),
+ }),
+ });
- const output = result.output!;
+ output = result.output!;
+ usage = result.usage || null;
+ }
return {
- coefficient: getCoefficient(output.rating),
+ coefficient: getCoefficient(output.rating, MAX_RATING),
usage: {
- inputTokens: result.usage?.inputTokens ?? 0,
- outputTokens: result.usage?.outputTokens ?? 0,
- totalTokens: result.usage?.totalTokens ?? 0,
+ inputTokens: usage?.inputTokens ?? 0,
+ outputTokens: usage?.outputTokens ?? 0,
+ totalTokens: usage?.totalTokens ?? 0,
+ thinkingTokens: usage?.thinkingTokens ?? 0,
},
details: output,
};
diff --git a/runner/ratings/built-in-ratings/axe-rating.ts b/runner/ratings/built-in-ratings/axe-rating.ts
index dc74ed7e..f8b9f4a3 100644
--- a/runner/ratings/built-in-ratings/axe-rating.ts
+++ b/runner/ratings/built-in-ratings/axe-rating.ts
@@ -20,6 +20,7 @@ export const axeRating: PerBuildRating = {
name: 'Axe Accessibility Violations',
description: 'Checks for accessibility violations using the Axe-core engine.',
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['accessibility'],
id: 'axe-a11y',
scoreReduction: '10%',
rate: ({serveResult, axeRepairAttempts}) => {
diff --git a/runner/ratings/built-in-ratings/code-quality-rating.ts b/runner/ratings/built-in-ratings/code-quality-rating.ts
index 2077c3e4..7bca99a8 100644
--- a/runner/ratings/built-in-ratings/code-quality-rating.ts
+++ b/runner/ratings/built-in-ratings/code-quality-rating.ts
@@ -7,6 +7,7 @@ export const codeQualityRating: LLMBasedRating = {
name: 'Code Quality (LLM-rated)',
description: `Rates the app's source code via LLM`,
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['llm-judge', 'llm-rated-code-quality'],
id: 'common-autorater-code-quality',
scoreReduction: '30%',
rate: async ctx => {
diff --git a/runner/ratings/built-in-ratings/no-dangerously-set-inner-html-rating.ts b/runner/ratings/built-in-ratings/no-dangerously-set-inner-html-rating.ts
index cbc729d4..0f325e8c 100644
--- a/runner/ratings/built-in-ratings/no-dangerously-set-inner-html-rating.ts
+++ b/runner/ratings/built-in-ratings/no-dangerously-set-inner-html-rating.ts
@@ -16,6 +16,7 @@ export const NoDangerouslySetInnerHtmlRating: PerFileRating = {
name: RATING_NAME,
id: 'no-dangerously-set-inner-html',
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['security'],
scoreReduction: '50%',
description: RATING_DESCRIPTION,
filter: {
diff --git a/runner/ratings/built-in-ratings/no-inner-html-bindings-rating.ts b/runner/ratings/built-in-ratings/no-inner-html-bindings-rating.ts
index 36a8e05e..0e69be08 100644
--- a/runner/ratings/built-in-ratings/no-inner-html-bindings-rating.ts
+++ b/runner/ratings/built-in-ratings/no-inner-html-bindings-rating.ts
@@ -16,6 +16,7 @@ export const NoInnerHtmlBindingsRating: PerFileRating = {
name: RATING_NAME,
id: 'no-inner-html-bindings',
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['security'],
scoreReduction: '50%',
description: RATING_DESCRIPTION,
filter: {
diff --git a/runner/ratings/built-in-ratings/no-runtime-errors-rating.ts b/runner/ratings/built-in-ratings/no-runtime-errors-rating.ts
index 53068a6c..d18b3cc5 100644
--- a/runner/ratings/built-in-ratings/no-runtime-errors-rating.ts
+++ b/runner/ratings/built-in-ratings/no-runtime-errors-rating.ts
@@ -7,6 +7,7 @@ export const noRuntimeExceptionsRating: PerBuildRating = {
description: "Ensures the app doesn't have runtime exceptions.",
kind: RatingKind.PER_BUILD,
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['functionality', 'no-runtime-errors', 'running-app-checks'],
scoreReduction: '50%',
id: 'common-no-runtime-errors',
rate: ({buildResult, serveResult}) => ({
diff --git a/runner/ratings/built-in-ratings/safety-web-rating.ts b/runner/ratings/built-in-ratings/safety-web-rating.ts
index 106038be..c7af4c28 100644
--- a/runner/ratings/built-in-ratings/safety-web-rating.ts
+++ b/runner/ratings/built-in-ratings/safety-web-rating.ts
@@ -8,6 +8,7 @@ export const safetyWebRating: PerBuildRating = {
name: 'SafetyWeb Violations',
description: 'Checks for TrustedTypes and CSP incompatible coding patterns.',
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['security'],
id: 'safety-web',
scoreReduction: '50%',
rate: ({buildResult}) => {
diff --git a/runner/ratings/built-in-ratings/security-ratings.ts b/runner/ratings/built-in-ratings/security-ratings.ts
index f9eb26df..5be32e99 100644
--- a/runner/ratings/built-in-ratings/security-ratings.ts
+++ b/runner/ratings/built-in-ratings/security-ratings.ts
@@ -38,6 +38,7 @@ export const cspViolationsRating: PerBuildRating = {
description: 'Checks for Content Security Policy violations, excluding Trusted Types.',
id: 'csp-violations',
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['security'],
scoreReduction: '50%',
rate: ({serveResult}) => {
if (!serveResult?.cspViolations) {
@@ -78,6 +79,7 @@ export const trustedTypesViolationsRating: PerBuildRating = {
description: 'Checks for Trusted Types violations specifically.',
id: 'trusted-types-violations',
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['security'],
scoreReduction: '50%',
rate: ({serveResult}) => {
if (!serveResult?.cspViolations) {
diff --git a/runner/ratings/built-in-ratings/successful-build-rating.ts b/runner/ratings/built-in-ratings/successful-build-rating.ts
index ec879137..5159fc40 100644
--- a/runner/ratings/built-in-ratings/successful-build-rating.ts
+++ b/runner/ratings/built-in-ratings/successful-build-rating.ts
@@ -8,6 +8,7 @@ export const successfulBuildRating: PerBuildRating = {
id: 'common-successful-build',
kind: RatingKind.PER_BUILD,
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['functionality', 'successful-builds'],
scoreReduction: '50%',
// Reduce the amount of points in case we've built the code with a few repair attempts.
rate: ({buildResult, repairAttempts}) => ({
diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts
index 2941fd3c..24f757eb 100644
--- a/runner/ratings/built-in-ratings/successful-tests-rating.ts
+++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts
@@ -7,6 +7,7 @@ export const successfulTestsRating: PerBuildRating = {
id: 'common-successful-tests',
kind: RatingKind.PER_BUILD,
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['functionality', 'project-tests'],
scoreReduction: '30%',
// Reduce the amount of points in case we've had test repair attempts.
rate: ({testResult, testRepairAttempts}) => {
diff --git a/runner/ratings/built-in-ratings/sufficient-code-size-rating.ts b/runner/ratings/built-in-ratings/sufficient-code-size-rating.ts
index 859d0d60..5994c9c3 100644
--- a/runner/ratings/built-in-ratings/sufficient-code-size-rating.ts
+++ b/runner/ratings/built-in-ratings/sufficient-code-size-rating.ts
@@ -10,6 +10,7 @@ export const sufficientCodeSizeRating: PerFileRating = {
name: 'Sufficient Code Size (over 50b)',
description: 'Ensures the generated code is not trivially small (e.g. < 50b).',
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['sufficient-code-checks'],
id: 'common-generated-code-size',
scoreReduction: '30%',
kind: RatingKind.PER_FILE,
diff --git a/runner/ratings/built-in-ratings/sufficient-generated-files-rating.ts b/runner/ratings/built-in-ratings/sufficient-generated-files-rating.ts
index e946699d..ba3bc3f3 100644
--- a/runner/ratings/built-in-ratings/sufficient-generated-files-rating.ts
+++ b/runner/ratings/built-in-ratings/sufficient-generated-files-rating.ts
@@ -5,6 +5,7 @@ export const sufficientGeneratedFilesRating: PerBuildRating = {
name: 'Sufficient number of generated files',
description: 'Ensures that the LLM produced at least one file.',
category: RatingCategory.HIGH_IMPACT,
+ groupingLabels: ['sufficient-code-checks'],
id: 'common-generated-file-count',
scoreReduction: '100%',
kind: RatingKind.PER_BUILD,
diff --git a/runner/ratings/built-in-ratings/user-journeys-rating.ts b/runner/ratings/built-in-ratings/user-journeys-rating.ts
index 57f4bc96..50f78de4 100644
--- a/runner/ratings/built-in-ratings/user-journeys-rating.ts
+++ b/runner/ratings/built-in-ratings/user-journeys-rating.ts
@@ -7,12 +7,13 @@ export const userJourneysRating: PerBuildRating = {
description: 'Ensures that all User Journeys are working in the generated app',
kind: RatingKind.PER_BUILD,
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['functionality', 'running-app-checks', 'interaction-testing'],
scoreReduction: '30%',
rate: ({serveResult}) => {
if (serveResult === null || serveResult.userJourneyAgentOutput === null) {
return {
state: RatingState.SKIPPED,
- message: 'Was not enabled for this run',
+ message: 'Not enabled for this run.',
};
}
diff --git a/runner/ratings/built-in-ratings/valid-css-rating.ts b/runner/ratings/built-in-ratings/valid-css-rating.ts
index 59f4c605..f37e00b8 100644
--- a/runner/ratings/built-in-ratings/valid-css-rating.ts
+++ b/runner/ratings/built-in-ratings/valid-css-rating.ts
@@ -11,6 +11,7 @@ export const validCssRating: PerFileRating = {
name: 'Valid CSS',
description: 'Ensures that the generated CSS code is valid',
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['functionality', 'styling', 'css-validity'],
scoreReduction: '20%',
kind: RatingKind.PER_FILE,
id: 'common-valid-css',
diff --git a/runner/ratings/built-in-ratings/visual-appearance-rating.ts b/runner/ratings/built-in-ratings/visual-appearance-rating.ts
index d8fa6f38..77ce10e6 100644
--- a/runner/ratings/built-in-ratings/visual-appearance-rating.ts
+++ b/runner/ratings/built-in-ratings/visual-appearance-rating.ts
@@ -9,6 +9,7 @@ export const visualAppearanceRating: LLMBasedRating = {
name: 'UI & Visual appearance (LLM-Rated)',
description: 'Rates the app based on its visuals (UI visuals and feature completeness).',
category: RatingCategory.MEDIUM_IMPACT,
+ groupingLabels: ['llm-judge', 'llm-rated-visual-appearance', 'running-app-checks'],
scoreReduction: '30%',
id: 'common-autorater-visuals',
rate: async ctx => {
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
index f7b5bcd2..198aec9d 100644
--- a/runner/ratings/rate-code.ts
+++ b/runner/ratings/rate-code.ts
@@ -9,6 +9,7 @@ import {
PromptDefinition,
AssessmentCategory,
TestExecutionResult,
+ Usage,
} from '../shared-interfaces.js';
import {
RatingState,
@@ -18,17 +19,16 @@ import {
PerFileRatingContentType,
RatingKind,
RatingCategory,
- POINTS_FOR_CATEGORIES,
Rating,
- CATEGORY_NAMES,
RatingsResult,
} from './rating-types.js';
import {extractEmbeddedCodeFromTypeScript} from './embedded-languages.js';
import {Environment} from '../configuration/environment.js';
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {UserFacingError} from '../utils/errors.js';
import {ServeTestingResult} from '../workers/serve-testing/worker-types.js';
+import assert from 'assert';
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
interface FileOrEmbeddedSyntheticFile {
/**
@@ -45,7 +45,7 @@ interface FileOrEmbeddedSyntheticFile {
type CategorizedFiles = Record;
export async function rateGeneratedCode(
- llm: GenkitRunner,
+ autoraterLlm: AiSdkRunner | null,
environment: Environment,
currentPromptDef: PromptDefinition,
fullPromptText: string,
@@ -70,7 +70,8 @@ export async function rateGeneratedCode(
inputTokens: 0,
outputTokens: 0,
totalTokens: 0,
- };
+ thinkingTokens: 0,
+ } satisfies Usage;
progress.log(currentPromptDef, 'eval', 'Rating generated code');
@@ -79,10 +80,9 @@ export async function rateGeneratedCode(
RatingCategory.MEDIUM_IMPACT,
RatingCategory.LOW_IMPACT,
].map(category => ({
+ ...environment.ratingCategories[category],
id: category,
- name: CATEGORY_NAMES[category],
points: 0,
- maxPoints: POINTS_FOR_CATEGORIES[category],
assessments: [],
}));
@@ -107,12 +107,13 @@ export async function rateGeneratedCode(
categorizedFiles ??= splitFilesIntoCategories(outputFiles);
result = await runPerFileRating(currentPromptDef, current, categorizedFiles, ratingsResult);
} else if (current.kind === RatingKind.LLM_BASED) {
+ assert(autoraterLlm !== null, 'Expected an auto-rater LLM to be available.');
result = await runLlmBasedRating(
environment,
current,
fullPromptText,
currentPromptDef,
- llm,
+ autoraterLlm,
outputFiles,
buildResult,
serveTestingResult,
@@ -130,9 +131,10 @@ export async function rateGeneratedCode(
}
if (result.state === IndividualAssessmentState.EXECUTED && result.usage) {
- tokenUsage.inputTokens += result.usage.inputTokens;
- tokenUsage.outputTokens += result.usage.outputTokens;
+ tokenUsage.inputTokens += result.usage.inputTokens ?? 0;
+ tokenUsage.outputTokens += result.usage.outputTokens ?? 0;
tokenUsage.totalTokens += result.usage.totalTokens ?? 0;
+ tokenUsage.thinkingTokens += result.usage.thinkingTokens ?? 0;
}
const category = categories.find(c => c.id === result.category);
@@ -278,7 +280,7 @@ async function runLlmBasedRating(
rating: LLMBasedRating,
fullPromptText: string,
currentPromptDef: PromptDefinition,
- llm: GenkitRunner,
+ llm: AiSdkRunner,
outputFiles: LlmResponseFile[],
buildResult: BuildResult,
serveTestingResult: ServeTestingResult | null,
@@ -310,7 +312,11 @@ async function runLlmBasedRating(
let message = `${getMessage(result.coefficient)}\n${result.details.summary}`;
if (result.coefficient < 1) {
- message += ':\n' + result.details.categories.map(category => category.message).join('\n ');
+ message +=
+ ':\n' +
+ result.details.categories
+ .map(category => `${category.name}: ${category.message}`)
+ .join('\n ');
}
return getIndividualAssessment(rating, result.coefficient, message);
@@ -329,6 +335,7 @@ function getIndividualAssessment(
scoreReduction: rating.scoreReduction,
successPercentage: rateResult,
category: rating.category,
+ groupingLabels: rating.groupingLabels,
message,
};
}
@@ -340,6 +347,7 @@ function getSkippedAssessment(rating: Rating, message: string): SkippedIndividua
description: rating.description,
id: rating.id,
category: rating.category,
+ groupingLabels: rating.groupingLabels,
message,
};
}
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index 18a58085..76190c58 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -9,8 +9,8 @@ import type {
Usage,
} from '../shared-interfaces.js';
import {Environment} from '../configuration/environment.js';
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {ServeTestingResult} from '../workers/serve-testing/worker-types.js';
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
/** Possible types of ratings. */
export enum RatingKind {
@@ -32,20 +32,6 @@ export enum RatingCategory {
LOW_IMPACT = 'low-impact',
}
-/** Points correspond to each `RatingCategory`. */
-export const POINTS_FOR_CATEGORIES = {
- [RatingCategory.HIGH_IMPACT]: 60,
- [RatingCategory.MEDIUM_IMPACT]: 30,
- [RatingCategory.LOW_IMPACT]: 10,
-};
-
-/** Display names for each `RatingCategory`. */
-export const CATEGORY_NAMES = {
- [RatingCategory.HIGH_IMPACT]: 'High Impact',
- [RatingCategory.MEDIUM_IMPACT]: 'Medium Impact',
- [RatingCategory.LOW_IMPACT]: 'Low Impact',
-};
-
const ratingCommonContextFields = {
ratingsResult: z.record(z.custom()),
prompt: z.custom(),
@@ -57,6 +43,7 @@ const ratingSchemaCommonFields = {
name: z.string(),
description: z.string(),
id: z.string(),
+ groupingLabels: z.array(z.string()).optional(),
} as const;
const perBuildRatingSchema = z
@@ -125,6 +112,12 @@ export const ratingSchema = z.union([
llmBasedRatingSchema,
]);
+export const ratingOverrideSchema = z.object({
+ category: z.custom().optional(),
+ scoreReduction: z.custom<`${number}%`>().optional(),
+  groupingLabels: z.array(z.string()).optional(),
+});
+
/** Result of a per-build rating. */
export type PerBuildRatingResult =
| {
@@ -188,7 +181,7 @@ export interface LLMBasedRatingContext {
environment: Environment;
fullPromptText: string;
currentPromptDef: PromptDefinition;
- llm: GenkitRunner;
+ llm: AiSdkRunner;
model: string;
outputFiles: LlmResponseFile[];
buildResult: BuildResult;
diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts
index a97e927e..3f39261e 100644
--- a/runner/ratings/stats.ts
+++ b/runner/ratings/stats.ts
@@ -2,6 +2,7 @@ import {BuildErrorType, BuildResultStatus} from '../workers/builder/builder-type
import {UserFacingError} from '../utils/errors.js';
import {
AggregatedRunStats,
+ AggregatedTimings,
AssessmentResult,
RuntimeStats,
ScoreBucket,
@@ -15,6 +16,21 @@ export const BUCKET_CONFIG = [
{name: 'Poor', min: 0, max: 70, id: 'poor'},
];
+function calculateMean(values: number[]): number {
+ if (values.length === 0) return 0;
+ return values.reduce((sum, value) => sum + value, 0) / values.length;
+}
+
+function calculateMedian(values: number[]): number {
+ if (values.length === 0) return 0;
+ const sorted = [...values].sort((a, b) => a - b);
+ const middle = Math.floor(sorted.length / 2);
+ if (sorted.length % 2 === 0) {
+ return (sorted[middle - 1] + sorted[middle]) / 2;
+ }
+ return sorted[middle];
+}
+
/**
* Calculates build and check statistics from assessment results.
*
@@ -22,6 +38,10 @@ export const BUCKET_CONFIG = [
* @returns An object containing aggregated build and check statistics.
*/
export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): AggregatedRunStats {
+ const generateDurations: number[] = [];
+ const buildDurations: number[] = [];
+ const repairDurations: number[] = [];
+
let successfulInitialBuilds = 0;
let successfulBuildsAfterRepair = 0;
let failedBuilds = 0;
@@ -63,6 +83,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
}
}
+ if (result.timings) {
+ generateDurations.push(result.timings.generateDurationMs);
+ buildDurations.push(result.timings.buildDurationMs);
+ repairDurations.push(result.timings.repairDurationMs);
+ }
+
// Calculate test statistics
if (result.testResult) {
if (result.testResult.passed) {
@@ -158,6 +184,22 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
: undefined,
accessibility: accessibilityStats,
security: securityStats,
+ ...(generateDurations.length > 0 && {
+ timings: {
+ generate: {
+ mean: calculateMean(generateDurations),
+ median: calculateMedian(generateDurations),
+ },
+ build: {
+ mean: calculateMean(buildDurations),
+ median: calculateMedian(buildDurations),
+ },
+ repair: {
+ mean: calculateMean(repairDurations),
+ median: calculateMedian(repairDurations),
+ },
+ },
+ }),
};
}
diff --git a/runner/reporting/report-ai-chat.ts b/runner/reporting/report-ai-chat.ts
index 0e88504c..83766899 100644
--- a/runner/reporting/report-ai-chat.ts
+++ b/runner/reporting/report-ai-chat.ts
@@ -1,5 +1,4 @@
import {marked} from 'marked';
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {
AiChatMessage,
AssessmentResult,
@@ -12,27 +11,7 @@ import {
} from '../shared-interfaces.js';
import {BuildResultStatus} from '../workers/builder/builder-types.js';
import {BUCKET_CONFIG} from '../ratings/stats.js';
-import {POINTS_FOR_CATEGORIES} from '../ratings/rating-types.js';
-
-export const reportLlmEvalsToolContext = `## What is a report?
-A report consists of many apps that were LLM generated. You will have information
-about checks that failed for this LLM generated app.
-
-Note that there may be multiple attempts for an app. E.g. an initial build may fail and
-another attempt might have repaired the build failure. The last attempt reflects the final
-state of the app. E.g. whether it does build, or if there are runtime errors.
-
-## Scoring mechanism
-Apps are rated based on their scores in the following buckets:
-${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
-
-The overall score of an app is determined based on score reductions.
-There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
-Pillars are a split up of a 100% perfect score, allowing for individual ratings
-to be less impactful than others. The pillars are distributed as follows:
-${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
-Within pillars, the available score can be reduced by individual ratings.
-`;
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
const defaultAiChatPrompt = `Strictly follow the instructions here.
- You are an expert in LLM-based code generation evaluation and quality assessments.
@@ -48,7 +27,7 @@ const defaultAiChatPrompt = `Strictly follow the instructions here.
* Decide based on the question, whether you need to generate a larger response, or just a chat reply.`;
export async function chatWithReportAI(
- llm: GenkitRunner,
+ llm: AiSdkRunner,
message: string,
abortSignal: AbortSignal,
allAssessments: AssessmentResultFromReportServer[] | AssessmentResult[],
@@ -90,7 +69,7 @@ export async function chatWithReportAI(
${message}
\`\`\`
-${reportLlmEvalsToolContext}
+${getContextPrompt(assessmentsToProcess)}
### How many apps are there?
There are ${allAssessments.length} apps in this report.
@@ -108,7 +87,7 @@ ${serializeReportForPrompt(assessmentsToProcess, contextFilters)}
includeThoughts: false,
},
timeout: {
- description: `Generating summary for report`,
+ description: `Chatting with AI`,
durationInMins: 3,
},
abortSignal,
@@ -120,6 +99,7 @@ ${serializeReportForPrompt(assessmentsToProcess, contextFilters)}
inputTokens: result.usage?.inputTokens ?? 0,
outputTokens: result.usage?.outputTokens ?? 0,
totalTokens: result.usage?.totalTokens ?? 0,
+ thinkingTokens: result.usage?.thinkingTokens ?? 0,
},
};
}
@@ -192,3 +172,36 @@ function isAssessmentResultWithID(
): value is AssessmentResultFromReportServer {
return (value as Partial).id !== undefined;
}
+
+function getContextPrompt(assessments: AssessmentResultFromReportServer[] | AssessmentResult[]) {
+ let categoryCount = 0;
+ let pointsForCategories = {} as Record;
+
+ // Deduce the categories from the first result since they're the same for the entire run.
+ if (assessments.length) {
+ assessments[0].score.categories.forEach(category => {
+ categoryCount++;
+ pointsForCategories[category.id] = category.maxPoints;
+ });
+ }
+
+ return `## What is a report?
+A report consists of many apps that were LLM generated. You will have information
+about checks that failed for this LLM generated app.
+
+Note that there may be multiple attempts for an app. E.g. an initial build may fail and
+another attempt might have repaired the build failure. The last attempt reflects the final
+state of the app. E.g. whether it does build, or if there are runtime errors.
+
+## Scoring mechanism
+Apps are rated based on their scores in the following buckets:
+${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
+
+The overall score of an app is determined based on score reductions.
+There are ${categoryCount} pillars: ${Object.keys(pointsForCategories).join(', ')}
+Pillars are a split up of a 100% perfect score, allowing for individual ratings
+to be less impactful than others. The pillars are distributed as follows:
+${Object.entries(pointsForCategories).map(e => `* ${e[0]}: ${e[1]} points.`).join('\n')}
+Within pillars, the available score can be reduced by individual ratings.
+`;
+}
diff --git a/runner/reporting/report-ai-summary.ts b/runner/reporting/report-ai-summary.ts
index be4adf78..7b9c5681 100644
--- a/runner/reporting/report-ai-summary.ts
+++ b/runner/reporting/report-ai-summary.ts
@@ -1,17 +1,19 @@
-import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
-import {
- AssessmentResult,
- ReportContextFilter,
- RatingContextFilter,
- AiChatContextFilters,
-} from '../shared-interfaces.js';
+import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
+import {DEFAULT_SUMMARY_MODEL} from '../configuration/constants.js';
+import {AssessmentResult, ReportContextFilter, RatingContextFilter} from '../shared-interfaces.js';
import {chatWithReportAI} from './report-ai-chat.js';
export async function summarizeReportWithAI(
- llm: GenkitRunner,
+ llm: AiSdkRunner,
abortSignal: AbortSignal,
assessments: AssessmentResult[],
) {
+ const model = DEFAULT_SUMMARY_MODEL;
+
+ if (!llm.getSupportedModels().includes(model)) {
+ throw new Error(`Unable to generate AI summary due to unsupported model: ${model}`);
+ }
+
return chatWithReportAI(
llm,
`Strictly follow the instructions here.
@@ -31,7 +33,7 @@ Categorize the failures and provide a brief summary of the report. Keep it short
assessments,
[],
// For AI summaries we use lite model as it's faster and cheaper (+ reduces rate limiting)
- 'gemini-2.5-flash-lite',
+ model,
{
reportContextFilter: ReportContextFilter.NonPerfectReports,
ratingContextFilter: RatingContextFilter.NonPerfectRatings,
diff --git a/runner/reporting/report-local-disk.ts b/runner/reporting/report-local-disk.ts
index 1d3b8039..b695c364 100644
--- a/runner/reporting/report-local-disk.ts
+++ b/runner/reporting/report-local-disk.ts
@@ -37,6 +37,11 @@ export async function fetchReportsFromDisk(directory: string): Promise r.promptDef.name);
+
data.set(group.id, {group, run});
}),
);
diff --git a/runner/reporting/report-logging.ts b/runner/reporting/report-logging.ts
index 0e674556..8a938fa1 100644
--- a/runner/reporting/report-logging.ts
+++ b/runner/reporting/report-logging.ts
@@ -2,7 +2,7 @@ import {join} from 'path';
import chalk from 'chalk';
import boxen from 'boxen';
import {IndividualAssessmentState, RunInfo, ScoreBucket} from '../shared-interfaces.js';
-import {DEFAULT_AUTORATER_MODEL_NAME, REPORTS_ROOT_DIR} from '../configuration/constants.js';
+import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
import {calculateBuildAndCheckStats} from '../ratings/stats.js';
import {safeWriteFile} from '../file-system-utils.js';
import {BuildResultStatus} from '../workers/builder/builder-types.js';
@@ -17,7 +17,6 @@ import {
} from './format.js';
import {Environment} from '../configuration/environment.js';
import {groupSimilarReports} from '../orchestration/grouping.js';
-import {LocalExecutor} from '../orchestration/executors/local-executor.js';
/**
* Generates a structured report on fs, based on the assessment run information.
@@ -38,14 +37,19 @@ import {LocalExecutor} from '../orchestration/executors/local-executor.js';
*
* @param runInfo An object containing all details and results of the assessment run.
* @param id ID of the environment that was used for the eval.
+ * @param reportsRootDir Root directory where the reports are written to.
* @returns The original `runInfo` object, allowing for chaining.
*/
-export async function writeReportToDisk(runInfo: RunInfo, id: string): Promise {
+export async function writeReportToDisk(
+ runInfo: RunInfo,
+ id: string,
+ reportsRootDir: string,
+): Promise {
// Sanitize report name: allow only a-z, A-Z, 0-9, and hyphens. Replace others with a hyphen.
const sanitizedReportName = runInfo.details.reportName.replace(/[^a-zA-Z0-9-]/g, '-');
const {results} = runInfo;
- const reportBaseDir = join(REPORTS_ROOT_DIR, id, sanitizedReportName);
+ const reportBaseDir = join(reportsRootDir, id, sanitizedReportName);
// Write `summary.json` file, which contains **all** available info.
const summaryJsonPath = join(reportBaseDir, 'summary.json');
@@ -266,6 +270,7 @@ export function logReportToConsole(runInfo: RunInfo): void {
'Usage info:',
` - Input tokens: ${formatTokenCount(usage.inputTokens)}`,
` - Output tokens: ${formatTokenCount(usage.outputTokens)}`,
+ ` - Thinking tokens: ${formatTokenCount(usage.thinkingTokens)}`,
` - Total tokens: ${formatTokenCount(usage.totalTokens)}`,
].filter(line => line != null);
diff --git a/runner/run-cli.ts b/runner/run-cli.ts
index 878833be..393b33d1 100644
--- a/runner/run-cli.ts
+++ b/runner/run-cli.ts
@@ -6,7 +6,7 @@ import {BUILT_IN_ENVIRONMENTS, LLM_OUTPUT_DIR} from './configuration/constants.j
import {UserFacingError} from './utils/errors.js';
import {existsSync, rmSync} from 'fs';
import {readFile, readdir} from 'fs/promises';
-import {join} from 'path';
+import {join, resolve} from 'path';
import {glob} from 'tinyglobby';
import {LlmResponseFile} from './shared-interfaces.js';
import {setupProjectStructure, writeResponseFiles} from './orchestration/file-system.js';
@@ -26,6 +26,7 @@ export const RunModule = {
interface Options {
environment: string;
prompt: string;
+ reportsDirectory?: string;
}
function builder(argv: Argv): Argv {
@@ -41,6 +42,11 @@ function builder(argv: Argv): Argv {
default: '',
description: 'ID of the prompt within the environment that should be run',
})
+ .option('reports-directory', {
+ type: 'string',
+ description: 'Path from which to read local reports',
+ demandOption: false,
+ })
.version(false)
.help();
}
@@ -90,12 +96,13 @@ async function runApp(options: Options) {
try {
await writeResponseFiles(directory, files, environment, rootPromptDef.name);
-
+ const abortController = new AbortController();
await serveApp(
environment.executor.getServeCommand(),
rootPromptDef,
directory,
new NoopProgressLogger(),
+ abortController.signal,
async url => {
console.log();
console.log(formatTitleCard(`🎉 App is up and running at ${url}`));
@@ -126,9 +133,9 @@ async function resolveConfig(options: Options) {
const environment = await getEnvironmentByPath(
BUILT_IN_ENVIRONMENTS.get(options.environment) || options.environment,
- 'genkit',
+ 'ai-sdk',
);
- const environmentDir = join(LLM_OUTPUT_DIR, environment.id);
+ const environmentDir = resolve(options.reportsDirectory ?? LLM_OUTPUT_DIR, environment.id);
if (!existsSync(environmentDir)) {
throw new UserFacingError(
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index c1aa77f7..b7139ed6 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -5,12 +5,13 @@ import type {AutoRateResult} from './ratings/autoraters/auto-rate-shared.js';
import type {Rating, RatingCategory} from './ratings/rating-types.js';
import type {ServeTestingResult} from './workers/serve-testing/worker-types.js';
import type {RunnerName} from './codegen/runner-creation.js';
+import {Environment} from './configuration/environment.js';
/** Configuration options necessary for kicking off an assessment run. */
export interface AssessmentConfig {
model: string;
runner: RunnerName;
- environmentConfigPath: string;
+ environment: Environment | {configPath: string};
localMode: boolean;
limit: number;
concurrency: number | 'auto';
@@ -30,6 +31,7 @@ export interface AssessmentConfig {
skipLighthouse?: boolean;
maxTestRepairAttempts?: number;
maxBuildRepairAttempts?: number;
+ promptTimeoutRetries?: number;
abortSignal?: AbortSignal;
}
@@ -108,14 +110,21 @@ export interface Usage {
inputTokens: number;
/** Number of output tokens produced. */
outputTokens: number;
+ /**
+ * Thinking tokens.
+ *
+ * This could be `0` for models not using thinking, or model providers
+ * that include tokens directly in `outputTokens`.
+ */
+ thinkingTokens: number;
/**
* Number of total tokens involved.
*
* This number can be different from `input + output`. Presumably
* due to e.g. thinking process of models. See:
* https://ai.google.dev/gemini-api/docs/thinking.
- * */
- totalTokens?: number;
+ */
+ totalTokens: number;
}
/**
@@ -166,7 +175,7 @@ export interface LlmContextFile {
export interface AssessmentCategory {
/** Unique ID of the category. */
id: RatingCategory;
- /** Display name of the cateogry. */
+ /** Display name of the category. */
name: string;
/** Points that have been awarded to the category. */
points: number;
@@ -206,6 +215,8 @@ export interface IndividualAssessment {
message: string;
/** LLM usage for running the assessment. */
usage?: Usage;
+ /** Labels for this check. Useful for custom grouping of e.g. "best practice" checks. */
+ groupingLabels?: string[];
}
export interface SkippedIndividualAssessment {
@@ -220,6 +231,22 @@ export interface SkippedIndividualAssessment {
category: RatingCategory;
/** A message explaining why the check was skipped. */
message: string;
+ /** Labels for this check. Useful for custom grouping of e.g. "best practice" checks. */
+ groupingLabels?: string[];
+}
+
+/** Stores the duration in milliseconds for different phases of the evaluation. */
+export interface AssessmentTimings {
+ generateDurationMs: number;
+ buildDurationMs: number;
+ repairDurationMs: number;
+}
+
+/** Stores aggregated timing statistics. */
+export interface AggregatedTimings {
+ generate: {mean: number; median: number};
+ build: {mean: number; median: number};
+ repair: {mean: number; median: number};
}
/**
@@ -332,6 +359,9 @@ export interface AggregatedRunStats {
appsWithoutErrors: number;
};
security?: {appsWithErrors: number; appsWithoutErrors: number};
+
+ /** Timing statistics for the run. */
+ timings?: AggregatedTimings;
}
export interface CompletionStats {
@@ -425,11 +455,20 @@ export interface RunSummary {
completionStats?: CompletionStats;
/** AI summary (as HTML code) of all assessments in this run/report. */
aiSummary?: string;
+ /** Additional user-defined AI analysis. */
+ additionalAiAnalysis?: {name: string; summary: string}[];
/**
* Information about the runner that was used for the eval.
* Optional since some older reports might not have it.
*/
runner?: CodegenRunnerInfo;
+
+ /**
+ * Hash of the environment-level ratings. Can be used to
+ * validate that the ratings haven't changed between runs.
+ * This field is optional, because older reports might not have it.
+ */
+ ratingHash?: string;
}
/**
@@ -533,6 +572,8 @@ export interface AssessmentResult {
testResult: TestExecutionResult | null;
/** Number of repair attempts for tests. */
testRepairAttempts?: number;
+ /** Timings captured for the execution and repair stages. */
+ timings?: AssessmentTimings;
}
/**
@@ -615,6 +656,8 @@ export interface RunGroup {
};
/** Runner used to generate code for the runs in the group. */
runner?: CodegenRunnerInfo;
+ /** Names of prompts that were evaluated in this group. */
+ promptNames: string[];
}
/** Request information for a file generation. */
diff --git a/runner/utils/combine-reports.mts b/runner/utils/combine-reports.mts
new file mode 100644
index 00000000..e39ba062
--- /dev/null
+++ b/runner/utils/combine-reports.mts
@@ -0,0 +1,52 @@
+import assert from 'assert';
+import {RunGroup, RunInfo} from '../shared-interfaces.js';
+import {groupSimilarReports} from '../orchestration/grouping.js';
+
+/**
+ * Takes a list of individual WCS reports and combines
+ * them into a single WCS group with combined run.
+ */
+export function combineReports(
+ runs: RunInfo[],
+ groupId: string,
+ runId: string,
+): {
+ group: RunGroup;
+ runInfo: RunInfo;
+} {
+ assert.notEqual(runs.length, 0, 'Expected more than zero reports.');
+
+ const combinedRuns = groupSimilarReports(
+ runs.map(r => {
+ return {...r, group: groupId} satisfies RunInfo;
+ }),
+ );
+ assert.equal(combinedRuns.length, 1);
+
+ const combinedRun = combinedRuns[0];
+ const singleSampleRun = runs[0];
+ const runInfo: RunInfo = {
+ id: runId,
+ group: combinedRun.id,
+ results: runs.map(r => r.results).flat(),
+ version: singleSampleRun.version,
+ details: {
+ reportName: singleSampleRun.details.reportName,
+ summary: {
+ displayName: singleSampleRun.details.summary.displayName,
+ environmentId: singleSampleRun.details.summary.environmentId,
+ framework: singleSampleRun.details.summary.framework,
+ model: singleSampleRun.details.summary.model,
+ usage: singleSampleRun.details.summary.usage,
+ },
+ systemPromptGeneration: '',
+ systemPromptRepair: '',
+ timestamp: singleSampleRun.details.timestamp,
+ },
+ };
+
+ return {
+ group: combinedRun,
+ runInfo,
+ };
+}
diff --git a/runner/utils/extract-rubrics.ts b/runner/utils/extract-rubrics.ts
new file mode 100644
index 00000000..5cefc7f5
--- /dev/null
+++ b/runner/utils/extract-rubrics.ts
@@ -0,0 +1,48 @@
+import {AssessmentResult, IndividualAssessmentState} from '../shared-interfaces.js';
+
+export function extractRubrics(results: AssessmentResult[]): Record {
+ const rubricsBreakdown: Record = {};
+
+ for (const app of results) {
+ const rubricsAnalysis: Record = {};
+
+ for (const category of app.score.categories) {
+ for (const check of category.assessments) {
+ if (check.state === IndividualAssessmentState.SKIPPED) {
+ continue;
+ }
+
+ for (const label of check.groupingLabels ?? []) {
+ if (!rubricsAnalysis[label]) {
+ rubricsAnalysis[label] = {scores: []};
+ }
+
+ const checkWeightWithPillar =
+ category.maxPoints * (parseFloat(check.scoreReduction) / 100);
+
+ rubricsAnalysis[label]!.scores.push({
+ value: checkWeightWithPillar * check.successPercentage,
+ weight: checkWeightWithPillar,
+ });
+ }
+ }
+ }
+
+ for (const label in rubricsAnalysis) {
+ const scores = rubricsAnalysis[label]!.scores;
+ const numerator = scores.reduce((sum, score) => sum + score.value, 0);
+ const denominator = scores.reduce((sum, score) => sum + score.weight, 0);
+
+ rubricsBreakdown[label] ??= [];
+ rubricsBreakdown[label].push(numerator / denominator);
+ }
+ }
+
+ return Object.entries(rubricsBreakdown).reduce(
+ (rubricsResult, [label, scores]) => ({
+ ...rubricsResult,
+ [label]: scores.reduce((prev, cur) => prev + cur, 0) / scores.length,
+ }),
+ {} as Record,
+ );
+}
diff --git a/runner/utils/hashing.ts b/runner/utils/hashing.ts
new file mode 100644
index 00000000..90c28a42
--- /dev/null
+++ b/runner/utils/hashing.ts
@@ -0,0 +1,8 @@
+import {createHash} from 'node:crypto';
+
+/**
+ * Returns a sha-256 hash of a string.
+ */
+export function getSha256Hash(value: string): string {
+ return createHash('sha256').update(value).digest('hex');
+}
diff --git a/runner/workers/serve-testing/puppeteer.ts b/runner/workers/serve-testing/puppeteer.ts
index b4e028a1..1adb21d9 100644
--- a/runner/workers/serve-testing/puppeteer.ts
+++ b/runner/workers/serve-testing/puppeteer.ts
@@ -28,146 +28,143 @@ export async function runAppInPuppeteer(
let screenshotBase64Data: string | undefined;
let axeViolations: AxeResult[] | undefined;
let lighthouseResult: LighthouseResult | undefined;
+ let unexpectedErrorMessage: string | undefined;
- try {
- const browser = await puppeteer.launch({
- headless: true,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- ],
- });
- const page = await browser.newPage();
+ const browser = await puppeteer.launch({
+ headless: true,
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
+ });
+ const page = await browser.newPage();
- page.on('console', async message => {
- if (message.type() !== 'error') return;
+ page.on('console', async message => {
+ if (message.type() !== 'error') return;
- if (!message.text().includes('JSHandle@error')) {
- progressLog('error', `${message.type().substring(0, 3).toUpperCase()} ${message.text()}`);
+ if (!message.text().includes('JSHandle@error')) {
+ progressLog(
+ 'error',
+ `Runtime Error: ${message.type().substring(0, 3).toUpperCase()} ${message.text()}`,
+ );
+ return;
+ }
+ const messages = await Promise.all(
+ message.args().map(async arg => {
+ const [message, stack] = await Promise.all([
+ arg.getProperty('message'),
+ arg.getProperty('stack'),
+ ]);
+
+ let result = '';
+ if (message) {
+ result += message;
+ }
+ if (stack) {
+ result += (result.length ? '\n\n' : '') + stack;
+ }
+ return result;
+ }),
+ );
+ runtimeErrors.push(messages.filter(Boolean).join('\n'));
+ });
+
+ page.on('pageerror', error => {
+ const errorMessage = error instanceof Error ? error.message : `${error}`;
+ progressLog('error', 'Page error', errorMessage);
+ runtimeErrors.push(errorMessage);
+ });
+
+ await page.setViewport({width: 1280, height: 720});
+
+ // Set up auto-CSP handling if enabled for the environment.
+ if (enableAutoCsp) {
+ const autoCsp = new AutoCsp();
+ await autoCsp.connectToDevTools(page);
+ await page.setRequestInterception(true);
+ page.on('request', async request => {
+ if (request.isInterceptResolutionHandled()) {
return;
}
- const messages = await Promise.all(
- message.args().map(async arg => {
- const [message, stack] = await Promise.all([
- arg.getProperty('message'),
- arg.getProperty('stack'),
- ]);
-
- let result = '';
- if (message) {
- result += message;
- }
- if (stack) {
- result += (result.length ? '\n\n' : '') + stack;
- }
- return result;
- }),
- );
- runtimeErrors.push(messages.filter(Boolean).join('\n'));
- });
- page.on('pageerror', error => {
- progressLog('error', 'Page error', error.message);
- runtimeErrors.push(error.toString());
+ // Delegate CSP-related requests to the AutoCsp class
+ const handled = await autoCsp.handleRequest(request);
+ if (!handled) {
+ // Other requests (CSS, JS, images) pass through
+ await request.continue();
+ }
});
- await page.setViewport({width: 1280, height: 720});
-
- // Set up auto-CSP handling if enabled for the environment.
- if (enableAutoCsp) {
- const autoCsp = new AutoCsp();
- await autoCsp.connectToDevTools(page);
- await page.setRequestInterception(true);
- page.on('request', async request => {
- if (request.isInterceptResolutionHandled()) {
- return;
- }
+ await page.goto(hostUrl, {
+ waitUntil: 'networkidle0',
+ timeout: 30000,
+ });
- // Delegate CSP-related requests to the AutoCsp class
- const handled = await autoCsp.handleRequest(request);
- if (!handled) {
- // Other requests (CSS, JS, images) pass through
- await request.continue();
- }
- });
-
- await page.goto(hostUrl, {
- waitUntil: 'networkidle0',
- timeout: 30000,
- });
-
- // Now that the page is loaded, process the collected CSP reports.
- autoCsp.processViolations();
- cspViolations = autoCsp.violations;
- } else {
- // If CSP is not enabled, just navigate to the page directly.
- await page.goto(hostUrl, {
- waitUntil: 'networkidle0',
- timeout: 30000,
- });
- }
+ // Now that the page is loaded, process the collected CSP reports.
+ autoCsp.processViolations();
+ cspViolations = autoCsp.violations;
+ } else {
+ // If CSP is not enabled, just navigate to the page directly.
+ await page.goto(hostUrl, {
+ waitUntil: 'networkidle0',
+ timeout: 30000,
+ });
+ }
- // Perform Axe Testing
- if (includeAxeTesting) {
- try {
- progressLog('eval', `Running Axe accessibility test from ${hostUrl}`);
- const axeResults = await new AxePuppeteer(page).analyze();
- axeViolations = axeResults.violations;
- progressLog('success', `Axe accessibility test completed.`);
-
- if (axeViolations.length > 0) {
- progressLog('error', `Found ${axeViolations.length} Axe violations.`);
- } else {
- progressLog('success', `No Axe violations found.`);
- }
- } catch (axeError: any) {
- progressLog('error', 'Could not perform Axe accessibility test', axeError.message);
+ // Perform Axe Testing
+ if (includeAxeTesting) {
+ try {
+ progressLog('eval', `Running Axe accessibility test from ${hostUrl}`);
+ const axeResults = await new AxePuppeteer(page).analyze();
+ axeViolations = axeResults.violations;
+ progressLog('success', `Axe accessibility test completed.`);
+
+ if (axeViolations.length > 0) {
+ progressLog('error', `Found ${axeViolations.length} Axe violations.`);
+ } else {
+ progressLog('success', `No Axe violations found.`);
}
+ } catch (axeError: any) {
+ progressLog('error', 'Could not perform Axe accessibility test', axeError.message);
}
+ }
- if (takeScreenshots) {
- progressLog('eval', `Taking screenshot from ${hostUrl}`);
-
- screenshotBase64Data = await callWithTimeout(
- `Taking screenshot for ${appName}`,
- () =>
- page.screenshot({
- type: 'png',
- fullPage: true,
- encoding: 'base64',
- }),
- 1, // 1 minute
- );
- progressLog('success', 'Screenshot captured and encoded');
- }
-
- if (includeLighthouseData) {
- try {
- progressLog('eval', `Gathering Lighthouse data from ${hostUrl}`);
- lighthouseResult = await getLighthouseData(hostUrl, page);
+ if (takeScreenshots) {
+ progressLog('eval', `Taking screenshot from ${hostUrl}`);
- if (lighthouseResult) {
- progressLog('success', 'Lighthouse data has been collected');
- } else {
- progressLog('error', 'Lighthouse did not produce usable data');
- }
- } catch (lighthouseError: any) {
- progressLog('error', 'Could not gather Lighthouse data', lighthouseError.message);
- }
- }
+ screenshotBase64Data = await callWithTimeout(
+ `Taking screenshot for ${appName}`,
+ () =>
+ page.screenshot({
+ type: 'png',
+ fullPage: true,
+ encoding: 'base64',
+ }),
+ 1, // 1 minute
+ );
+ progressLog('success', 'Screenshot captured and encoded');
+ }
- await browser.close();
- } catch (screenshotError: any) {
- let details: string = screenshotError.message;
+ if (includeLighthouseData) {
+ try {
+ progressLog('eval', `Gathering Lighthouse data from ${hostUrl}`);
+ lighthouseResult = await getLighthouseData(hostUrl, page);
- if (screenshotError.stack) {
- details += '\n' + screenshotError.stack;
+ if (lighthouseResult) {
+ progressLog('success', 'Lighthouse data has been collected');
+ } else {
+ progressLog('error', 'Lighthouse did not produce usable data');
+ }
+ } catch (lighthouseError: any) {
+ progressLog('error', 'Could not gather Lighthouse data', lighthouseError.message);
}
-
- progressLog('error', 'Could not take screenshot', details);
}
- return {screenshotBase64Data, runtimeErrors, axeViolations, cspViolations, lighthouseResult};
+ await browser.close();
+
+ return {
+ screenshotBase64Data,
+ runtimeErrors,
+ axeViolations,
+ cspViolations,
+ lighthouseResult,
+ unexpectedErrorMessage,
+ };
}
diff --git a/runner/workers/serve-testing/serve-app.ts b/runner/workers/serve-testing/serve-app.ts
index 54500ca7..ff769944 100644
--- a/runner/workers/serve-testing/serve-app.ts
+++ b/runner/workers/serve-testing/serve-app.ts
@@ -9,12 +9,13 @@ export async function serveApp(
rootPromptDef: RootPromptDefinition,
appDirectoryPath: string,
progress: ProgressLogger,
+ abortSignal: AbortSignal,
logicWhileServing: (serveUrl: string) => Promise,
): Promise {
let serveProcess: ChildProcess | null = null;
try {
- serveProcess = exec(serveCommand, {cwd: appDirectoryPath});
+ serveProcess = exec(serveCommand, {cwd: appDirectoryPath, signal: abortSignal});
progress.log(
rootPromptDef,
'eval',
@@ -23,7 +24,9 @@ export async function serveApp(
);
const actualPort = await new Promise((resolvePort, rejectPort) => {
- const serveStartTimeout = 45000; // 45s for serve to start
+ // Timeout for server to start. CPU queueing might cause this to take longer.
+      // It's an upper safety boundary. The overall eval timeout is the main control.
+ const serveStartTimeout = 1000 * 60 * 5;
const timeoutId = setTimeout(() => {
rejectPort(
new Error(
diff --git a/scripts/npm-publish.ts b/scripts/npm-publish.ts
index 4e263153..06b2c345 100644
--- a/scripts/npm-publish.ts
+++ b/scripts/npm-publish.ts
@@ -1,8 +1,8 @@
-import { join } from 'path';
-import { spawn } from 'child_process';
-import { input, select } from '@inquirer/prompts';
-import { executeCommand } from '../runner/utils/exec.js';
-import { readFile, writeFile } from 'fs/promises';
+import {join} from 'path';
+import {spawn} from 'child_process';
+import {input, select} from '@inquirer/prompts';
+import {executeCommand} from '../runner/utils/exec.js';
+import {readFile, writeFile} from 'fs/promises';
const root = join(import.meta.dirname, '..');
const distDirectory = join(root, 'dist');
@@ -22,22 +22,17 @@ const registry = 'https://wombat-dressing-room.appspot.com';
const distTag = await select({
choices: [
- { name: 'Pre-release', value: 'next' },
- { name: 'Stable', value: 'latest' },
+ {name: 'Stable', value: 'latest'},
+ {name: 'Pre-release', value: 'next'},
],
message: 'Select a release channel',
});
// Build the project.
- await executeCommand(
- `pnpm release-build --version=${version}`,
- root,
- undefined,
- {
- forwardStdoutToParent: true,
- forwardStderrToParent: true,
- }
- );
+ await executeCommand(`pnpm release-build --version=${version}`, root, undefined, {
+ forwardStdoutToParent: true,
+ forwardStderrToParent: true,
+ });
// Log into our registry.
await spawnInteractive('npm', ['login', '--registry', registry]);
@@ -50,15 +45,12 @@ const registry = 'https://wombat-dressing-room.appspot.com';
{
forwardStderrToParent: true,
forwardStdoutToParent: true,
- }
+ },
);
// Write the package.json back to disk so the version is in sync.
packageJson.version = version;
- await writeFile(
- packageJsonPath,
- JSON.stringify(packageJson, undefined, 2) + '\n'
- );
+ await writeFile(packageJsonPath, JSON.stringify(packageJson, undefined, 2) + '\n');
console.log('Done! 🎉');
console.log('Remember to push the changed package.json!');
@@ -77,8 +69,6 @@ function spawnInteractive(command: string, args: string[]) {
stdio: 'inherit',
});
- childProcess.on('close', (status) =>
- status === 0 ? resolve() : reject(status)
- );
+ childProcess.on('close', status => (status === 0 ? resolve() : reject(status)));
});
}
diff --git a/scripts/release-build.ts b/scripts/release-build.ts
index d01e877f..18166b84 100644
--- a/scripts/release-build.ts
+++ b/scripts/release-build.ts
@@ -1,9 +1,9 @@
-import { join } from 'path';
-import { rm, cp, readFile, writeFile } from 'fs/promises';
+import {join} from 'path';
+import {rm, cp, readFile, writeFile} from 'fs/promises';
import yargs from 'yargs';
-import { hideBin } from 'yargs/helpers';
-import { globSync as glob } from 'tinyglobby';
-import { executeCommand } from '../runner/utils/exec.js';
+import {hideBin} from 'yargs/helpers';
+import {globSync as glob} from 'tinyglobby';
+import {executeCommand} from '../runner/utils/exec.js';
const root = join(import.meta.dirname, '..');
const runnerSource = join(root, 'runner');
@@ -28,7 +28,7 @@ const args = yargs(hideBin(process.argv))
console.log('Building release output...');
// Clear out the target directory.
- await rm(targetDirectory, { recursive: true, force: true });
+ await rm(targetDirectory, {recursive: true, force: true});
// Build the runner. This also creates `dist`.
await executeCommand('pnpm build-runner', runnerSource, undefined, {
@@ -38,7 +38,7 @@ const args = yargs(hideBin(process.argv))
// Generate the package.json.
await writeFile(
join(targetDirectory, 'package.json'),
- await getPackageJson(join(root, 'package.json'), args.version)
+ await getPackageJson(join(root, 'package.json'), args.version),
);
// Copy the readme and license.
@@ -50,18 +50,10 @@ const args = yargs(hideBin(process.argv))
glob('**/*', {
cwd: join(root, 'examples'),
dot: true,
- ignore: [
- '**/node_modules/**',
- '**/dist/**',
- '**/.vinxi/**',
- '**/.output/**',
- ],
- }).map((agentFile) =>
- cp(
- join(root, 'examples', agentFile),
- join(targetDirectory, 'examples', agentFile)
- )
- )
+ ignore: ['**/node_modules/**', '**/dist/**', '**/.vinxi/**', '**/.output/**'],
+ }).map(agentFile =>
+ cp(join(root, 'examples', agentFile), join(targetDirectory, 'examples', agentFile)),
+ ),
);
// The user journey testing requires various files to work.
@@ -71,12 +63,12 @@ const args = yargs(hideBin(process.argv))
cwd: join(root, browserAgentRelativePath),
dot: true,
ignore: ['*.ts', 'README.md'],
- }).map((agentFile) =>
+ }).map(agentFile =>
cp(
join(root, browserAgentRelativePath, agentFile),
- join(targetDirectory, browserAgentRelativePath, agentFile)
- )
- )
+ join(targetDirectory, browserAgentRelativePath, agentFile),
+ ),
+ ),
);
if (!args.runnerOnly) {
@@ -86,16 +78,13 @@ const args = yargs(hideBin(process.argv))
});
// Copy the report artifacts into the `dist`.
- await cp(reportAppDist, targetDirectory, { recursive: true });
+ await cp(reportAppDist, targetDirectory, {recursive: true});
}
console.log(`Release output has been built in ${targetDirectory}`);
})();
-async function getPackageJson(
- path: string,
- version: string | null
-): Promise<string> {
+async function getPackageJson(path: string, version: string | null): Promise<string> {
const content = await readFile(path, 'utf8');
const parsed = JSON.parse(content) as {
version: string;
@@ -106,9 +95,7 @@ async function getPackageJson(
if (version) {
if (version === parsed.version) {
- throw new Error(
- `Specified version is the same version as the current one.`
- );
+ throw new Error(`Specified version is the same version as the current one.`);
} else {
parsed.version = version;
}