WIP: Add pairwise evaluation runner

Signed-off-by: Oleg Ivaniv <me@olegivaniv.com>
2025-11-20 17:46:34 +00:00 · 2025-11-19 16:21:44 +01:00 · 2025-11-19 16:21:44 +01:00 · 9f6b3be010
commit 9f6b3be010
parent a53e1476d4
5 changed files with 337 additions and 4 deletions
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/pairwise-evaluator.test.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/pairwise-evaluator.test.ts
@ -0,0 +1,76 @@
+import { evaluateWorkflowPairwise } from '../chains/pairwise-evaluator';
+import * as baseEvaluator from '../chains/evaluators/base';
+
+// Mock the base evaluator module
+jest.mock('../chains/evaluators/base', () => ({
+	createEvaluatorChain: jest.fn(),
+	invokeEvaluatorChain: jest.fn(),
+}));
+
+describe('evaluateWorkflowPairwise', () => {
+	// Typed handle on the mocked chain invoker so each test can stub what the "LLM" returns.
+	const invokeChainMock = baseEvaluator.invokeEvaluatorChain as jest.Mock;
+
+	// Minimal stand-in for a chat model; the evaluator only forwards it to createEvaluatorChain.
+	const fakeLlm = {
+		bindTools: jest.fn(),
+		withStructuredOutput: jest.fn(),
+	};
+
+	// One "do" and one "don't" rule evaluated against an empty workflow.
+	const evaluationInput = {
+		evalCriteria: {
+			dos: ['Do this'],
+			donts: ["Don't do that"],
+		},
+		workflowJSON: {
+			nodes: [],
+			connections: {},
+			name: 'Test Workflow', // Added name to satisfy SimpleWorkflow type
+		} as any,
+	};
+
+	beforeEach(() => {
+		jest.clearAllMocks();
+	});
+
+	it('should return structured result from invokeEvaluatorChain', async () => {
+		// Both rules pass, so the expected score is 2 / 2.
+		const chainOutput = {
+			violations: [],
+			passes: [
+				{ rule: 'Do this', justification: 'Done' },
+				{ rule: "Don't do that", justification: 'Not done' },
+			],
+		};
+		invokeChainMock.mockResolvedValue(chainOutput);
+
+		const evaluation = await evaluateWorkflowPairwise(fakeLlm as any, evaluationInput);
+
+		expect(evaluation).toEqual({ ...chainOutput, score: 1 });
+
+		// The chain must be built from the LLM, a schema, the auditor system prompt and the human template.
+		expect(baseEvaluator.createEvaluatorChain).toHaveBeenCalledWith(
+			fakeLlm,
+			expect.anything(), // schema
+			expect.stringContaining('expert n8n workflow auditor'), // system prompt
+			expect.stringContaining('<task_context>'), // human template
+		);
+
+		// The mocked createEvaluatorChain returns undefined, so that is the chain handed to the invoker.
+		expect(invokeChainMock).toHaveBeenCalledWith(
+			undefined,
+			expect.objectContaining({
+				userPrompt: expect.stringContaining('- [DO] Do this'),
+				generatedWorkflow: evaluationInput.workflowJSON,
+			}),
+		);
+	});
+
+	it('should calculate score correctly with violations', async () => {
+		// One pass and one violation: the score is 1 / 2.
+		const chainOutput = {
+			violations: [{ rule: "Don't do that", justification: 'Did it' }],
+			passes: [{ rule: 'Do this', justification: 'Done' }],
+		};
+		invokeChainMock.mockResolvedValue(chainOutput);
+
+		const evaluation = await evaluateWorkflowPairwise(fakeLlm as any, evaluationInput);
+
+		expect(evaluation.score).toBe(0.5);
+	});
+});
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/pairwise-evaluator.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/pairwise-evaluator.ts
@ -0,0 +1,107 @@
+import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import { z } from 'zod';
+
+import { createEvaluatorChain, invokeEvaluatorChain } from './evaluators/base';
+import type { SimpleWorkflow } from '../../src/types/workflow';
+
+export interface PairwiseEvaluationInput {
+	evalCriteria: {
+		dos: string[];
+		donts: string[];
+	};
+	workflowJSON: SimpleWorkflow;
+}
+
+/**
+ * Builds the schema for one checklist verdict: the rule text and the justification given for it.
+ * A factory is used so each list below receives its own schema instance.
+ */
+const buildRuleVerdictSchema = () =>
+	z.object({
+		rule: z.string(),
+		justification: z.string(),
+	});
+
+/** Structured output passed to the evaluator chain: criteria split into violations and passes. */
+const pairwiseEvaluationLLMResultSchema = z.object({
+	violations: z.array(buildRuleVerdictSchema()).describe('List of criteria that were violated'),
+	passes: z.array(buildRuleVerdictSchema()).describe('List of criteria that were passed'),
+});
+
+export type PairwiseEvaluationResult = z.infer<typeof pairwiseEvaluationLLMResultSchema> & {
+	score: number;
+};
+
+const EVALUATOR_SYSTEM_PROMPT = `You are an expert n8n workflow auditor. Your task is to strictly evaluate a candidate workflow against a provided set of requirements.
+
+<role_definition>
+- You are objective, precise, and evidence-based.
+- You do not assume functionality that is not explicitly configured in the JSON.
+- You verify every claim against the actual node configurations, connections, and parameters.
+</role_definition>
+
+<constraints>
+- Judge ONLY against the provided evaluation criteria. Do not apply external "best practices" unless explicitly asked.
+- If a criterion is "not verifiable" from the JSON alone (e.g., requires runtime data), mark it as a violation and explain why.
+- For every pass or violation, you MUST cite the specific node name or parameter that serves as evidence.
+- Do not hallucinate nodes or parameters.
+</constraints>`;
+
+const humanTemplate = `
+<task_context>
+Analyze the following n8n workflow against the provided checklist of criteria.
+</task_context>
+
+<evaluation_criteria>
+{userPrompt}
+</evaluation_criteria>
+
+<workflow_candidate>
+{generatedWorkflow}
+</workflow_candidate>
+
+<instructions>
+1. Read the <evaluation_criteria> carefully.
+2. For each criterion:
+    - Search for evidence in the <workflow_candidate>.
+    - Determine if it passes or fails.
+    - Provide a clear 'justification' citing the evidence (e.g., "Node 'HTTP Request' has method set to 'GET'").
+3. Output the result as a structured JSON with 'violations' and 'passes'.
+</instructions>
+`;
+
+/**
+ * Creates the evaluator chain used for pairwise (do / don't) checks.
+ *
+ * @param llm - Chat model handed to `createEvaluatorChain`.
+ * @returns Whatever `createEvaluatorChain` produces for the pairwise schema and prompts.
+ */
+export function createPairwiseEvaluatorChain(llm: BaseChatModel) {
+	const pairwiseChain = createEvaluatorChain(
+		llm,
+		pairwiseEvaluationLLMResultSchema,
+		EVALUATOR_SYSTEM_PROMPT,
+		humanTemplate,
+	);
+
+	return pairwiseChain;
+}
+
+/**
+ * Evaluates a workflow against a list of "do" and "don't" criteria.
+ *
+ * The criteria are rendered as a tagged checklist, sent through the pairwise evaluator chain
+ * together with the workflow, and the returned passes/violations are turned into a score.
+ *
+ * @param llm - Chat model used to build the evaluator chain.
+ * @param input - Criteria to check and the workflow to check them against.
+ * @returns The chain's passes and violations plus `score`: passes divided by the number of
+ *          reported rules, or 0 when the chain reported no rules at all.
+ */
+export async function evaluateWorkflowPairwise(
+	llm: BaseChatModel,
+	input: PairwiseEvaluationInput,
+): Promise<PairwiseEvaluationResult> {
+	// One checklist line per criterion, all "do" rules first, then the "don't" rules.
+	const checklistLines: string[] = [];
+	for (const rule of input.evalCriteria.dos) {
+		checklistLines.push(`- [DO] ${rule}`);
+	}
+	for (const rule of input.evalCriteria.donts) {
+		checklistLines.push(`- [DONT] ${rule}`);
+	}
+	const checklist = checklistLines.join('\n');
+
+	const chain = createPairwiseEvaluatorChain(llm);
+	const verdict = await invokeEvaluatorChain(chain, {
+		userPrompt: checklist,
+		generatedWorkflow: input.workflowJSON,
+	});
+
+	const passedCount = verdict.passes.length;
+	const judgedCount = passedCount + verdict.violations.length;
+
+	return {
+		...verdict,
+		// Guard against division by zero when nothing was reported.
+		score: judgedCount > 0 ? passedCount / judgedCount : 0,
+	};
+}
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts
@ -1,10 +1,12 @@
 import { runCliEvaluation } from './cli/runner.js';
 import { runLangsmithEvaluation } from './langsmith/runner.js';
+import { runPairwiseLangsmithEvaluation } from './langsmith/pairwise-runner.js';
 import { loadTestCasesFromCsv } from './utils/csv-prompt-loader.js';

 // Re-export for external use if needed
 export { runCliEvaluation } from './cli/runner.js';
 export { runLangsmithEvaluation } from './langsmith/runner.js';
+export { runPairwiseLangsmithEvaluation } from './langsmith/pairwise-runner.js';
 export { runSingleTest } from './core/test-runner.js';
 export { setupTestEnvironment, createAgent } from './core/environment.js';

@ -14,6 +16,7 @@ export { setupTestEnvironment, createAgent } from './core/environment.js';
 */
 async function main(): Promise<void> {
 	const useLangsmith = process.env.USE_LANGSMITH_EVAL === 'true';
+	const usePairwiseEval = process.env.USE_PAIRWISE_EVAL === 'true';

 	// Parse command line arguments for single test case
 	const testCaseId = process.argv.includes('--test-case')
@ -23,7 +26,7 @@ async function main(): Promise<void> {
 	// Parse command line argument for CSV prompts file path
 	const promptsCsvPath = getFlagValue('--prompts-csv') ?? process.env.PROMPTS_CSV_FILE;

-	if (promptsCsvPath && useLangsmith) {
+	if (promptsCsvPath && (useLangsmith || usePairwiseEval)) {
 		console.warn('CSV-driven evaluations are only supported in CLI mode. Ignoring --prompts-csv.');
 	}

@ -33,7 +36,9 @@ async function main(): Promise<void> {
 		: 1;
 	const repetitions = Number.isNaN(repetitionsArg) ? 1 : repetitionsArg;

-	if (useLangsmith) {
+	if (usePairwiseEval) {
+		await runPairwiseLangsmithEvaluation(repetitions);
+	} else if (useLangsmith) {
 		await runLangsmithEvaluation(repetitions);
 	} else {
 		const csvTestCases = promptsCsvPath ? loadTestCasesFromCsv(promptsCsvPath) : undefined;
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/langsmith/pairwise-runner.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/langsmith/pairwise-runner.ts
@ -0,0 +1,144 @@
+import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import type { LangChainTracer } from '@langchain/core/tracers/tracer_langchain';
+import { evaluate } from 'langsmith/evaluation';
+import type { Run, Example } from 'langsmith/schemas';
+import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
+import pc from 'picocolors';
+
+import { evaluateWorkflowPairwise } from '../chains/pairwise-evaluator';
+import { setupTestEnvironment, createAgent } from '../core/environment';
+import { generateRunId, isWorkflowStateValues } from '../types/langsmith';
+import { consumeGenerator, formatHeader, getChatPayload } from '../utils/evaluation-helpers';
+import type { INodeTypeDescription } from 'n8n-workflow';
+
+interface PairwiseDatasetInput {
+	evals: {
+		dos: string[];
+		donts: string[];
+	};
+	prompt: string;
+}
+
+/**
+ * Builds the target function passed to LangSmith's `evaluate`: for each dataset example it
+ * creates a new agent, runs the example's prompt through it and returns the generated workflow
+ * together with the example's criteria and prompt.
+ *
+ * @param parsedNodeTypes - Node type descriptions forwarded to `createAgent`.
+ * @param llm - Chat model forwarded to `createAgent`.
+ * @param tracer - Optional tracer forwarded to `createAgent`.
+ * @returns An async function mapping a dataset input to `{ workflow, evalCriteria, prompt }`.
+ */
+function createPairwiseWorkflowGenerator(
+	parsedNodeTypes: INodeTypeDescription[],
+	llm: BaseChatModel,
+	tracer?: LangChainTracer,
+) {
+	// Same user id is used for the chat call and the state lookup that follows it.
+	const evalUserId = 'langsmith-pairwise-eval-user';
+
+	return async (inputs: PairwiseDatasetInput) => {
+		const runId = generateRunId();
+		const agent = createAgent(parsedNodeTypes, llm, tracer);
+
+		// Drain the chat stream for the dataset prompt before reading the resulting state.
+		const chatStream = agent.chat(getChatPayload(inputs.prompt, runId), evalUserId);
+		await consumeGenerator(chatStream);
+
+		const { values } = await agent.getState(runId, evalUserId);
+		if (!values || !isWorkflowStateValues(values)) {
+			throw new Error('Invalid workflow state');
+		}
+
+		return {
+			workflow: values.workflowJSON,
+			evalCriteria: inputs.evals,
+			prompt: inputs.prompt,
+		};
+	};
+}
+
+/**
+ * Renders one titled bullet list of verdicts, or an empty string when there are none.
+ */
+function formatVerdictSection(
+	heading: string,
+	entries: Array<{ rule: string; justification: string }>,
+): string {
+	if (entries.length === 0) {
+		return '';
+	}
+
+	const bullets = entries.map(({ rule, justification }) => `- ${rule}: ${justification}`);
+	return `${heading}:\n${bullets.join('\n')}`;
+}
+
+/**
+ * Builds the LangSmith evaluator that scores a finished run with `evaluateWorkflowPairwise`.
+ *
+ * @param llm - Chat model used for the pairwise evaluation.
+ * @returns An async evaluator. When the run outputs lack a workflow or criteria it returns a
+ *          single `pairwise_score` of 0; otherwise it returns the score (with a comment listing
+ *          violations and passes) plus the pass and failure counts.
+ */
+function createPairwiseLangsmithEvaluator(llm: BaseChatModel) {
+	return async (rootRun: Run, _example?: Example): Promise<LangsmithEvaluationResult[]> => {
+		const workflow = rootRun.outputs?.workflow;
+		const evalCriteria = rootRun.outputs?.evalCriteria;
+
+		if (!workflow || !evalCriteria) {
+			return [
+				{
+					key: 'pairwise_score',
+					score: 0,
+					comment: 'Missing workflow or evaluation criteria in outputs',
+				},
+			];
+		}
+
+		const verdict = await evaluateWorkflowPairwise(llm, {
+			workflowJSON: workflow,
+			evalCriteria,
+		});
+
+		// Violations are listed before passes; empty sections are dropped.
+		const sections = [
+			formatVerdictSection('Violations', verdict.violations),
+			formatVerdictSection('Passes', verdict.passes),
+		].filter((section) => section.length > 0);
+		const comment = sections.join('\n\n');
+
+		return [
+			{
+				key: 'pairwise_score',
+				score: verdict.score,
+				comment: comment || 'No comments',
+			},
+			{
+				key: 'pairwise_passed_count',
+				score: verdict.passes.length,
+			},
+			{
+				key: 'pairwise_failed_count',
+				score: verdict.violations.length,
+			},
+		];
+	};
+}
+
+/**
+ * Runs the pairwise evaluation against a LangSmith dataset.
+ *
+ * Requires `LANGSMITH_API_KEY`. The dataset name comes from `LANGSMITH_DATASET_NAME` and falls
+ * back to `workflow-builder-pairwise-prompts`. The process exits with code 1 when the API key
+ * is missing, the dataset cannot be read, or the evaluation throws.
+ *
+ * @param repetitions - Passed to LangSmith's `evaluate` as `numRepetitions`.
+ */
+export async function runPairwiseLangsmithEvaluation(repetitions: number = 1): Promise<void> {
+	console.log(formatHeader('AI Workflow Builder Pairwise Evaluation', 70));
+
+	if (!process.env.LANGSMITH_API_KEY) {
+		console.error(pc.red('✗ LANGSMITH_API_KEY environment variable not set'));
+		process.exit(1);
+	}
+
+	try {
+		const { parsedNodeTypes, llm, tracer, lsClient } = await setupTestEnvironment();
+
+		if (!lsClient) {
+			throw new Error('Langsmith client not initialized');
+		}
+
+		const datasetName = process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-pairwise-prompts';
+		console.log(pc.blue(`➔ Using dataset: ${datasetName}`));
+
+		// Verify the dataset can be read before starting any generation work.
+		try {
+			await lsClient.readDataset({ datasetName });
+		} catch (error) {
+			// The lookup can fail for reasons other than a missing dataset (for example auth
+			// or network errors), so log the underlying error instead of discarding it.
+			console.error(pc.red(`✗ Dataset "${datasetName}" not found or not accessible:`), error);
+			process.exit(1);
+		}
+
+		const generateWorkflow = createPairwiseWorkflowGenerator(parsedNodeTypes, llm, tracer);
+		const evaluator = createPairwiseLangsmithEvaluator(llm);
+
+		// `modelName` is not declared on the model type used here, so read it through a
+		// runtime check rather than suppressing the compiler.
+		const modelName =
+			'modelName' in llm && typeof llm.modelName === 'string' ? llm.modelName : 'unknown';
+
+		await evaluate(generateWorkflow, {
+			data: datasetName,
+			evaluators: [evaluator],
+			maxConcurrency: 5,
+			experimentPrefix: `pairwise-eval-${modelName}`,
+			numRepetitions: repetitions,
+		});
+
+		console.log(pc.green('✓ Pairwise evaluation completed'));
+	} catch (error) {
+		console.error(pc.red('✗ Pairwise evaluation failed:'), error);
+		process.exit(1);
+	}
+}
--- a/packages/@n8n/ai-workflow-builder.ee/package.json
+++ b/packages/@n8n/ai-workflow-builder.ee/package.json
@ -27,7 +27,8 @@
    "eval:csv": "tsx evaluations --prompts-csv",
    "eval:langsmith": "USE_LANGSMITH_EVAL=true tsx evaluations",
    "eval:generate": "GENERATE_TEST_CASES=true tsx evaluations",
-    "eval:categorize": "tsx scripts/categorize-prompts.ts"
+    "eval:categorize": "tsx scripts/categorize-prompts.ts",
+    "eval:pairwise": "USE_PAIRWISE_EVAL=true tsx evaluations"
  },
  "main": "dist/index.js",
  "module": "src/index.ts",
@ -68,4 +69,4 @@
    "madge": "^8.0.0",
    "p-limit": "^3.1.0"
  }
-}
+}