WIP: Add pairwise evaluation runner

Signed-off-by: Oleg Ivaniv <me@olegivaniv.com>
This commit is contained in:
Oleg Ivaniv 2025-11-19 16:21:44 +01:00
parent a53e1476d4
commit 9f6b3be010
No known key found for this signature in database
5 changed files with 337 additions and 4 deletions

View File

@ -0,0 +1,76 @@
import { evaluateWorkflowPairwise } from '../chains/pairwise-evaluator';
import * as baseEvaluator from '../chains/evaluators/base';
// Replace the base evaluator module with bare jest.fn() stubs so the tests can
// choose what invokeEvaluatorChain resolves to and inspect the arguments passed
// to both functions. createEvaluatorChain has no implementation, so it returns
// undefined.
jest.mock('../chains/evaluators/base', () => ({
createEvaluatorChain: jest.fn(),
invokeEvaluatorChain: jest.fn(),
}));
describe('evaluateWorkflowPairwise', () => {
	// Handles on the stubbed base-evaluator functions
	const createChainMock = baseEvaluator.createEvaluatorChain as jest.Mock;
	const invokeChainMock = baseEvaluator.invokeEvaluatorChain as jest.Mock;

	// Stand-in for the chat model; it is only forwarded to the mocked chain factory
	const llmStub = {
		bindTools: jest.fn(),
		withStructuredOutput: jest.fn(),
	};

	const doRule = 'Do this';
	const dontRule = "Don't do that";

	// Empty workflow; `name` is present to mirror the SimpleWorkflow shape
	const emptyWorkflow = {
		nodes: [],
		connections: {},
		name: 'Test Workflow',
	} as any;

	const input = {
		evalCriteria: { dos: [doRule], donts: [dontRule] },
		workflowJSON: emptyWorkflow,
	};

	beforeEach(() => {
		jest.clearAllMocks();
	});

	it('should return structured result from invokeEvaluatorChain', async () => {
		// Both criteria pass, so the expected score is 1
		const llmVerdict = {
			violations: [],
			passes: [
				{ rule: doRule, justification: 'Done' },
				{ rule: dontRule, justification: 'Not done' },
			],
		};
		invokeChainMock.mockResolvedValue(llmVerdict);

		const result = await evaluateWorkflowPairwise(llmStub as any, input);

		expect(result).toEqual({ ...llmVerdict, score: 1 });

		// Chain factory receives: model, schema, system prompt, human template
		expect(createChainMock).toHaveBeenCalledWith(
			llmStub,
			expect.anything(),
			expect.stringContaining('expert n8n workflow auditor'),
			expect.stringContaining('<task_context>'),
		);

		// The chain argument is undefined because the mocked factory returns nothing
		const expectedPayload = expect.objectContaining({
			userPrompt: expect.stringContaining('- [DO] Do this'),
			generatedWorkflow: input.workflowJSON,
		});
		expect(invokeChainMock).toHaveBeenCalledWith(undefined, expectedPayload);
	});

	it('should calculate score correctly with violations', async () => {
		// One pass and one violation -> 1 / 2
		invokeChainMock.mockResolvedValue({
			violations: [{ rule: dontRule, justification: 'Did it' }],
			passes: [{ rule: doRule, justification: 'Done' }],
		});

		const { score } = await evaluateWorkflowPairwise(llmStub as any, input);

		expect(score).toBe(0.5);
	});
});

View File

@ -0,0 +1,107 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { z } from 'zod';
import { createEvaluatorChain, invokeEvaluatorChain } from './evaluators/base';
import type { SimpleWorkflow } from '../../src/types/workflow';
/**
 * Input for `evaluateWorkflowPairwise`: the checklist to judge against and the
 * workflow being judged.
 */
export interface PairwiseEvaluationInput {
/** Checklist criteria, split by polarity. */
evalCriteria: {
/** Criteria rendered as `- [DO] ...` lines in the checklist. */
dos: string[];
/** Criteria rendered as `- [DONT] ...` lines in the checklist. */
donts: string[];
};
/** Workflow handed to the evaluator chain as `generatedWorkflow`. */
workflowJSON: SimpleWorkflow;
}
/**
 * Builds a described array schema of `{ rule, justification }` entries.
 * Each call creates a fresh schema object, so the two result fields do not
 * share an instance (same as the original inline definitions).
 */
const ruleVerdictList = (description: string) =>
	z.array(z.object({ rule: z.string(), justification: z.string() })).describe(description);

/** Structured output requested from the LLM: criteria split into violations and passes. */
const pairwiseEvaluationLLMResultSchema = z.object({
	violations: ruleVerdictList('List of criteria that were violated'),
	passes: ruleVerdictList('List of criteria that were passed'),
});
/** The LLM's `violations` / `passes` lists plus a derived score. */
export type PairwiseEvaluationResult = z.infer<typeof pairwiseEvaluationLLMResultSchema> & {
/** passes / (passes + violations); 0 when both lists are empty. */
score: number;
};
/**
 * System prompt for the pairwise evaluator chain. Frames the model as a strict
 * auditor that judges only against the supplied criteria, treats unverifiable
 * criteria as violations, and must cite a node name or parameter as evidence.
 */
const EVALUATOR_SYSTEM_PROMPT = `You are an expert n8n workflow auditor. Your task is to strictly evaluate a candidate workflow against a provided set of requirements.
<role_definition>
- You are objective, precise, and evidence-based.
- You do not assume functionality that is not explicitly configured in the JSON.
- You verify every claim against the actual node configurations, connections, and parameters.
</role_definition>
<constraints>
- Judge ONLY against the provided evaluation criteria. Do not apply external "best practices" unless explicitly asked.
- If a criterion is "not verifiable" from the JSON alone (e.g., requires runtime data), mark it as a violation and explain why.
- For every pass or violation, you MUST cite the specific node name or parameter that serves as evidence.
- Do not hallucinate nodes or parameters.
</constraints>`;
/**
 * Human-message template for the pairwise evaluator chain.
 * The `{userPrompt}` and `{generatedWorkflow}` placeholders match the keys
 * passed to `invokeEvaluatorChain` by `evaluateWorkflowPairwise`.
 * NOTE(review): substitution presumably happens inside the evaluator chain
 * helpers in `./evaluators/base` — confirm there.
 */
const humanTemplate = `
<task_context>
Analyze the following n8n workflow against the provided checklist of criteria.
</task_context>
<evaluation_criteria>
{userPrompt}
</evaluation_criteria>
<workflow_candidate>
{generatedWorkflow}
</workflow_candidate>
<instructions>
1. Read the <evaluation_criteria> carefully.
2. For each criterion:
- Search for evidence in the <workflow_candidate>.
- Determine if it passes or fails.
- Provide a clear 'justification' citing the evidence (e.g., "Node 'HTTP Request' has method set to 'GET'").
3. Output the result as a structured JSON with 'violations' and 'passes'.
</instructions>
`;
/**
 * Creates the evaluator chain for pairwise checks by handing the model, the
 * result schema, the system prompt and the human template to
 * `createEvaluatorChain`.
 *
 * @param llm - Chat model the chain will run against.
 * @returns Whatever `createEvaluatorChain` returns for these arguments.
 */
export function createPairwiseEvaluatorChain(llm: BaseChatModel) {
	const pairwiseChain = createEvaluatorChain(
		llm,
		pairwiseEvaluationLLMResultSchema,
		EVALUATOR_SYSTEM_PROMPT,
		humanTemplate,
	);
	return pairwiseChain;
}
/**
 * Judges a workflow against a DO / DONT checklist using the pairwise evaluator
 * chain and attaches a pass ratio to the LLM's verdict.
 *
 * @param llm - Chat model used to build the evaluator chain.
 * @param input - Criteria and the workflow to evaluate.
 * @returns The chain's `violations` and `passes` plus `score`, where `score`
 *   is passes / (passes + violations), or 0 when both lists are empty.
 */
export async function evaluateWorkflowPairwise(
	llm: BaseChatModel,
	input: PairwiseEvaluationInput,
): Promise<PairwiseEvaluationResult> {
	const { evalCriteria, workflowJSON } = input;

	// One checklist line per criterion: all DOs first, then all DONTs
	const tagged = (tag: 'DO' | 'DONT', rules: string[]) =>
		rules.map((rule) => `- [${tag}] ${rule}`);
	const checklist = [...tagged('DO', evalCriteria.dos), ...tagged('DONT', evalCriteria.donts)];
	const userPrompt = checklist.join('\n');

	const chain = createPairwiseEvaluatorChain(llm);
	const verdict = await invokeEvaluatorChain(chain, {
		userPrompt,
		generatedWorkflow: workflowJSON,
	});

	// Score is based on the rules the LLM actually reported
	const passedCount = verdict.passes.length;
	const judgedCount = passedCount + verdict.violations.length;
	const score = judgedCount === 0 ? 0 : passedCount / judgedCount;

	return { ...verdict, score };
}

View File

@ -1,10 +1,12 @@
import { runCliEvaluation } from './cli/runner.js';
import { runLangsmithEvaluation } from './langsmith/runner.js';
import { runPairwiseLangsmithEvaluation } from './langsmith/pairwise-runner.js';
import { loadTestCasesFromCsv } from './utils/csv-prompt-loader.js';
// Re-export for external use if needed
export { runCliEvaluation } from './cli/runner.js';
export { runLangsmithEvaluation } from './langsmith/runner.js';
export { runPairwiseLangsmithEvaluation } from './langsmith/pairwise-runner.js';
export { runSingleTest } from './core/test-runner.js';
export { setupTestEnvironment, createAgent } from './core/environment.js';
@ -14,6 +16,7 @@ export { setupTestEnvironment, createAgent } from './core/environment.js';
*/
async function main(): Promise<void> {
const useLangsmith = process.env.USE_LANGSMITH_EVAL === 'true';
const usePairwiseEval = process.env.USE_PAIRWISE_EVAL === 'true';
// Parse command line arguments for single test case
const testCaseId = process.argv.includes('--test-case')
@ -23,7 +26,7 @@ async function main(): Promise<void> {
// Parse command line argument for CSV prompts file path
const promptsCsvPath = getFlagValue('--prompts-csv') ?? process.env.PROMPTS_CSV_FILE;
if (promptsCsvPath && useLangsmith) {
if (promptsCsvPath && (useLangsmith || usePairwiseEval)) {
console.warn('CSV-driven evaluations are only supported in CLI mode. Ignoring --prompts-csv.');
}
@ -33,7 +36,9 @@ async function main(): Promise<void> {
: 1;
const repetitions = Number.isNaN(repetitionsArg) ? 1 : repetitionsArg;
if (useLangsmith) {
if (usePairwiseEval) {
await runPairwiseLangsmithEvaluation(repetitions);
} else if (useLangsmith) {
await runLangsmithEvaluation(repetitions);
} else {
const csvTestCases = promptsCsvPath ? loadTestCasesFromCsv(promptsCsvPath) : undefined;

View File

@ -0,0 +1,144 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { LangChainTracer } from '@langchain/core/tracers/tracer_langchain';
import { evaluate } from 'langsmith/evaluation';
import type { Run, Example } from 'langsmith/schemas';
import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
import pc from 'picocolors';
import { evaluateWorkflowPairwise } from '../chains/pairwise-evaluator';
import { setupTestEnvironment, createAgent } from '../core/environment';
import { generateRunId, isWorkflowStateValues } from '../types/langsmith';
import { consumeGenerator, formatHeader, getChatPayload } from '../utils/evaluation-helpers';
import type { INodeTypeDescription } from 'n8n-workflow';
/** Shape of one dataset example's inputs as read by the workflow generator. */
interface PairwiseDatasetInput {
/** Evaluation criteria; forwarded unchanged as `evalCriteria` in the run outputs. */
evals: {
dos: string[];
donts: string[];
};
/** User prompt sent to the agent to generate the workflow. */
prompt: string;
}
/**
 * Builds the evaluation target: an async function that generates a workflow
 * for one dataset example and returns it with that example's criteria.
 *
 * @param parsedNodeTypes - Node type descriptions passed to `createAgent`.
 * @param llm - Chat model passed to `createAgent`.
 * @param tracer - Optional tracer passed to `createAgent`.
 * @returns Function mapping dataset inputs to `{ workflow, evalCriteria, prompt }`;
 *   it throws `Error('Invalid workflow state')` when the agent state has no
 *   values or they fail `isWorkflowStateValues`.
 */
function createPairwiseWorkflowGenerator(
	parsedNodeTypes: INodeTypeDescription[],
	llm: BaseChatModel,
	tracer?: LangChainTracer,
) {
	// Same user id for the chat call and the state lookup
	const evalUserId = 'langsmith-pairwise-eval-user';

	return async (inputs: PairwiseDatasetInput) => {
		const runId = generateRunId();

		// A new agent is created for every example
		const agent = createAgent(parsedNodeTypes, llm, tracer);

		// Drive the agent with the dataset prompt until its generator is exhausted
		const { prompt, evals } = inputs;
		const chatStream = agent.chat(getChatPayload(prompt, runId), evalUserId);
		await consumeGenerator(chatStream);

		// Read back the workflow the agent produced for this run
		const { values } = await agent.getState(runId, evalUserId);
		const hasValidState = Boolean(values) && isWorkflowStateValues(values);
		if (!hasValidState) {
			throw new Error('Invalid workflow state');
		}

		return {
			workflow: values.workflowJSON,
			evalCriteria: evals,
			prompt,
		};
	};
}
/**
 * Builds the LangSmith evaluator for pairwise runs.
 *
 * The returned function reads `workflow` and `evalCriteria` from the run
 * outputs, scores them with `evaluateWorkflowPairwise`, and reports three
 * results: `pairwise_score` (with a comment listing violations and passes),
 * `pairwise_passed_count` and `pairwise_failed_count`. When either output is
 * missing it reports a single `pairwise_score` of 0 instead.
 *
 * @param llm - Chat model used for the evaluation.
 */
function createPairwiseLangsmithEvaluator(llm: BaseChatModel) {
	// Renders "<heading>:" followed by one "- rule: justification" line per
	// entry, or '' when there are no entries.
	const renderSection = (
		heading: string,
		entries: Array<{ rule: string; justification: string }>,
	): string => {
		if (entries.length === 0) return '';
		const bullets = entries.map(({ rule, justification }) => `- ${rule}: ${justification}`);
		return `${heading}:\n${bullets.join('\n')}`;
	};

	return async (rootRun: Run, _example?: Example): Promise<LangsmithEvaluationResult[]> => {
		const workflow = rootRun.outputs?.workflow;
		const evalCriteria = rootRun.outputs?.evalCriteria;

		// Nothing to evaluate: report a zero score rather than throwing
		if (!workflow || !evalCriteria) {
			const missingOutputs: LangsmithEvaluationResult = {
				key: 'pairwise_score',
				score: 0,
				comment: 'Missing workflow or evaluation criteria in outputs',
			};
			return [missingOutputs];
		}

		const { violations, passes, score } = await evaluateWorkflowPairwise(llm, {
			workflowJSON: workflow,
			evalCriteria,
		});

		// Violations come first, then passes; empty sections are left out
		const sections = [
			renderSection('Violations', violations),
			renderSection('Passes', passes),
		].filter((section) => section !== '');
		const comment = sections.length > 0 ? sections.join('\n\n') : 'No comments';

		return [
			{ key: 'pairwise_score', score, comment },
			{ key: 'pairwise_passed_count', score: passes.length },
			{ key: 'pairwise_failed_count', score: violations.length },
		];
	};
}
/**
 * Runs the pairwise evaluation against a LangSmith dataset.
 *
 * Requires `LANGSMITH_API_KEY`. The dataset name comes from
 * `LANGSMITH_DATASET_NAME`, falling back to
 * `workflow-builder-pairwise-prompts`. Each example is turned into a workflow
 * by the pairwise generator and scored by the pairwise evaluator.
 *
 * The process exits with code 1 when the API key is missing, the dataset
 * cannot be read, or the evaluation throws.
 *
 * @param repetitions - Passed to `evaluate` as `numRepetitions`.
 */
export async function runPairwiseLangsmithEvaluation(repetitions: number = 1): Promise<void> {
	console.log(formatHeader('AI Workflow Builder Pairwise Evaluation', 70));

	if (!process.env.LANGSMITH_API_KEY) {
		console.error(pc.red('✗ LANGSMITH_API_KEY environment variable not set'));
		process.exit(1);
	}

	try {
		const { parsedNodeTypes, llm, tracer, lsClient } = await setupTestEnvironment();
		if (!lsClient) {
			throw new Error('Langsmith client not initialized');
		}

		const datasetName = process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-pairwise-prompts';
		console.log(pc.blue(`➔ Using dataset: ${datasetName}`));

		// Verify the dataset can be read before starting the experiment
		try {
			await lsClient.readDataset({ datasetName });
		} catch (error) {
			// Log the underlying error as well: readDataset can fail for reasons
			// other than a missing dataset, and the cause would otherwise be lost.
			console.error(pc.red(`✗ Dataset "${datasetName}" not found`), error);
			process.exit(1);
		}

		const generateWorkflow = createPairwiseWorkflowGenerator(parsedNodeTypes, llm, tracer);
		const evaluator = createPairwiseLangsmithEvaluator(llm);

		// `modelName` is not guaranteed by the static type of `llm`, so it is
		// read through a runtime check instead of suppressing the type error.
		const modelName =
			'modelName' in llm && typeof llm.modelName === 'string' ? llm.modelName : 'unknown';

		await evaluate(generateWorkflow, {
			data: datasetName,
			evaluators: [evaluator],
			maxConcurrency: 5,
			experimentPrefix: `pairwise-eval-${modelName}`,
			numRepetitions: repetitions,
		});

		console.log(pc.green('✓ Pairwise evaluation completed'));
	} catch (error) {
		console.error(pc.red('✗ Pairwise evaluation failed:'), error);
		process.exit(1);
	}
}

View File

@ -27,7 +27,8 @@
"eval:csv": "tsx evaluations --prompts-csv",
"eval:langsmith": "USE_LANGSMITH_EVAL=true tsx evaluations",
"eval:generate": "GENERATE_TEST_CASES=true tsx evaluations",
"eval:categorize": "tsx scripts/categorize-prompts.ts"
"eval:categorize": "tsx scripts/categorize-prompts.ts",
"eval:pairwise": "USE_PAIRWISE_EVAL=true tsx evaluations"
},
"main": "dist/index.js",
"module": "src/index.ts",
@ -68,4 +69,4 @@
"madge": "^8.0.0",
"p-limit": "^3.1.0"
}
}
}