mirror of
https://github.com/n8n-io/n8n.git
synced 2025-11-20 17:46:34 +00:00
WIP: Add pairwise evaluation runner
Signed-off-by: Oleg Ivaniv <me@olegivaniv.com>
This commit is contained in:
parent
a53e1476d4
commit
9f6b3be010
@ -0,0 +1,76 @@
|
||||
import { evaluateWorkflowPairwise } from '../chains/pairwise-evaluator';
|
||||
import * as baseEvaluator from '../chains/evaluators/base';
|
||||
|
||||
// Swap the shared evaluator helpers for Jest stubs so no real chain is built or invoked.
jest.mock('../chains/evaluators/base', () => {
	return {
		createEvaluatorChain: jest.fn(),
		invokeEvaluatorChain: jest.fn(),
	};
});
|
||||
|
||||
describe('evaluateWorkflowPairwise', () => {
	// The stubbed helpers from the mocked base module, typed as jest.Mock for configuration.
	const createChainMock = baseEvaluator.createEvaluatorChain as jest.Mock;
	const invokeChainMock = baseEvaluator.invokeEvaluatorChain as jest.Mock;

	// Bare stand-in for a chat model; it is only forwarded, never called, because the chain is mocked.
	const fakeLlm = {
		bindTools: jest.fn(),
		withStructuredOutput: jest.fn(),
	};

	const evaluationInput = {
		evalCriteria: {
			dos: ['Do this'],
			donts: ["Don't do that"],
		},
		// Minimal workflow fixture (cast to any, so it is not type-checked against SimpleWorkflow).
		workflowJSON: {
			nodes: [],
			connections: {},
			name: 'Test Workflow',
		} as any,
	};

	beforeEach(() => {
		jest.clearAllMocks();
	});

	it('should return structured result from invokeEvaluatorChain', async () => {
		const llmVerdict = {
			violations: [],
			passes: [
				{ rule: 'Do this', justification: 'Done' },
				{ rule: "Don't do that", justification: 'Not done' },
			],
		};
		invokeChainMock.mockResolvedValue(llmVerdict);

		const evaluation = await evaluateWorkflowPairwise(fakeLlm as any, evaluationInput);

		// All rules passed, so the LLM verdict is returned with a perfect score attached.
		expect(evaluation).toEqual({ ...llmVerdict, score: 1 });

		// Arguments: llm, result schema, system prompt, human template.
		expect(createChainMock).toHaveBeenCalledWith(
			fakeLlm,
			expect.anything(),
			expect.stringContaining('expert n8n workflow auditor'),
			expect.stringContaining('<task_context>'),
		);

		// The chain argument is undefined because the createEvaluatorChain stub returns nothing.
		expect(invokeChainMock).toHaveBeenCalledWith(
			undefined,
			expect.objectContaining({
				userPrompt: expect.stringContaining('- [DO] Do this'),
				generatedWorkflow: evaluationInput.workflowJSON,
			}),
		);
	});

	it('should calculate score correctly with violations', async () => {
		const llmVerdict = {
			violations: [{ rule: "Don't do that", justification: 'Did it' }],
			passes: [{ rule: 'Do this', justification: 'Done' }],
		};
		invokeChainMock.mockResolvedValue(llmVerdict);

		const evaluation = await evaluateWorkflowPairwise(fakeLlm as any, evaluationInput);

		// One pass out of two judged rules.
		expect(evaluation.score).toBe(0.5);
	});
});
|
||||
@ -0,0 +1,107 @@
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './evaluators/base';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
|
||||
/**
 * Input to a pairwise evaluation: the checklist to judge against and the
 * workflow being judged.
 */
export interface PairwiseEvaluationInput {
	/** Checklist of requirements, split into things to do and things to avoid. */
	evalCriteria: {
		/** Rules the workflow should satisfy. */
		dos: string[];
		/** Rules the workflow should not break. */
		donts: string[];
	};
	/** The candidate workflow to evaluate. */
	workflowJSON: SimpleWorkflow;
}
|
||||
|
||||
/**
 * Structured output requested from the LLM: every judged criterion ends up in
 * either `violations` or `passes`, each with a justification.
 */
const pairwiseEvaluationLLMResultSchema = (() => {
	// A factory (rather than one shared instance) keeps each array's element schema
	// a distinct object, exactly as in two separate inline definitions.
	const ruleVerdictList = () =>
		z.array(
			z.object({
				rule: z.string(),
				justification: z.string(),
			}),
		);

	return z.object({
		violations: ruleVerdictList().describe('List of criteria that were violated'),
		passes: ruleVerdictList().describe('List of criteria that were passed'),
	});
})();
|
||||
|
||||
/** The raw structured verdict as produced by the LLM. */
type PairwiseEvaluationLLMResult = z.infer<typeof pairwiseEvaluationLLMResultSchema>;

/** The LLM verdict plus a numeric `score` computed by the evaluator. */
export type PairwiseEvaluationResult = PairwiseEvaluationLLMResult & {
	score: number;
};
|
||||
|
||||
/**
 * System prompt handed to createEvaluatorChain for the pairwise evaluator.
 * It frames the model as a strict auditor that judges only against the supplied
 * criteria, treats unverifiable criteria as violations, and must cite evidence.
 * NOTE(review): this is runtime text sent to the model — edit wording deliberately.
 */
const EVALUATOR_SYSTEM_PROMPT = `You are an expert n8n workflow auditor. Your task is to strictly evaluate a candidate workflow against a provided set of requirements.
|
||||
|
||||
<role_definition>
|
||||
- You are objective, precise, and evidence-based.
|
||||
- You do not assume functionality that is not explicitly configured in the JSON.
|
||||
- You verify every claim against the actual node configurations, connections, and parameters.
|
||||
</role_definition>
|
||||
|
||||
<constraints>
|
||||
- Judge ONLY against the provided evaluation criteria. Do not apply external "best practices" unless explicitly asked.
|
||||
- If a criterion is "not verifiable" from the JSON alone (e.g., requires runtime data), mark it as a violation and explain why.
|
||||
- For every pass or violation, you MUST cite the specific node name or parameter that serves as evidence.
|
||||
- Do not hallucinate nodes or parameters.
|
||||
</constraints>`;
|
||||
|
||||
/**
 * Human-message template handed to createEvaluatorChain for the pairwise evaluator.
 * Contains the `{userPrompt}` and `{generatedWorkflow}` placeholders, whose names match
 * the keys evaluateWorkflowPairwise passes to invokeEvaluatorChain.
 * NOTE(review): presumably substituted by the chain's prompt template — confirm in ./evaluators/base.
 */
const humanTemplate = `
|
||||
<task_context>
|
||||
Analyze the following n8n workflow against the provided checklist of criteria.
|
||||
</task_context>
|
||||
|
||||
<evaluation_criteria>
|
||||
{userPrompt}
|
||||
</evaluation_criteria>
|
||||
|
||||
<workflow_candidate>
|
||||
{generatedWorkflow}
|
||||
</workflow_candidate>
|
||||
|
||||
<instructions>
|
||||
1. Read the <evaluation_criteria> carefully.
|
||||
2. For each criterion:
|
||||
- Search for evidence in the <workflow_candidate>.
|
||||
- Determine if it passes or fails.
|
||||
- Provide a clear 'justification' citing the evidence (e.g., "Node 'HTTP Request' has method set to 'GET'").
|
||||
3. Output the result as a structured JSON with 'violations' and 'passes'.
|
||||
</instructions>
|
||||
`;
|
||||
|
||||
/**
 * Builds the evaluator chain used for pairwise (checklist-based) evaluation.
 *
 * @param llm - Chat model forwarded to `createEvaluatorChain`.
 * @returns The chain produced by `createEvaluatorChain` for the pairwise result
 * schema, system prompt and human template.
 */
export function createPairwiseEvaluatorChain(llm: BaseChatModel) {
	const pairwiseChain = createEvaluatorChain(
		llm,
		pairwiseEvaluationLLMResultSchema,
		EVALUATOR_SYSTEM_PROMPT,
		humanTemplate,
	);

	return pairwiseChain;
}
|
||||
|
||||
/**
 * Evaluates a workflow against a do/don't checklist using the pairwise evaluator chain.
 *
 * @param llm - Chat model used to build the evaluator chain.
 * @param input - The checklist criteria and the workflow to judge.
 * @returns The chain's `violations` and `passes`, plus `score`: the share of
 * returned entries that passed, or 0 when the chain returned no entries at all.
 */
export async function evaluateWorkflowPairwise(
	llm: BaseChatModel,
	input: PairwiseEvaluationInput,
): Promise<PairwiseEvaluationResult> {
	const { evalCriteria, workflowJSON } = input;

	// Render the criteria as one checklist line per rule, dos first, then donts.
	const doLines = evalCriteria.dos.map((rule) => `- [DO] ${rule}`);
	const dontLines = evalCriteria.donts.map((rule) => `- [DONT] ${rule}`);
	const checklist = doLines.concat(dontLines).join('\n');

	const chain = createPairwiseEvaluatorChain(llm);
	const verdict = await invokeEvaluatorChain(chain, {
		userPrompt: checklist,
		generatedWorkflow: workflowJSON,
	});

	// The score is based on what the chain reported, not on the size of the input checklist.
	const passedCount = verdict.passes.length;
	const judgedCount = passedCount + verdict.violations.length;
	const score = judgedCount === 0 ? 0 : passedCount / judgedCount;

	return { ...verdict, score };
}
|
||||
@ -1,10 +1,12 @@
|
||||
import { runCliEvaluation } from './cli/runner.js';
|
||||
import { runLangsmithEvaluation } from './langsmith/runner.js';
|
||||
import { runPairwiseLangsmithEvaluation } from './langsmith/pairwise-runner.js';
|
||||
import { loadTestCasesFromCsv } from './utils/csv-prompt-loader.js';
|
||||
|
||||
// Re-export for external use if needed
|
||||
export { runCliEvaluation } from './cli/runner.js';
|
||||
export { runLangsmithEvaluation } from './langsmith/runner.js';
|
||||
export { runPairwiseLangsmithEvaluation } from './langsmith/pairwise-runner.js';
|
||||
export { runSingleTest } from './core/test-runner.js';
|
||||
export { setupTestEnvironment, createAgent } from './core/environment.js';
|
||||
|
||||
@ -14,6 +16,7 @@ export { setupTestEnvironment, createAgent } from './core/environment.js';
|
||||
*/
|
||||
async function main(): Promise<void> {
|
||||
const useLangsmith = process.env.USE_LANGSMITH_EVAL === 'true';
|
||||
const usePairwiseEval = process.env.USE_PAIRWISE_EVAL === 'true';
|
||||
|
||||
// Parse command line arguments for single test case
|
||||
const testCaseId = process.argv.includes('--test-case')
|
||||
@ -23,7 +26,7 @@ async function main(): Promise<void> {
|
||||
// Parse command line argument for CSV prompts file path
|
||||
const promptsCsvPath = getFlagValue('--prompts-csv') ?? process.env.PROMPTS_CSV_FILE;
|
||||
|
||||
if (promptsCsvPath && useLangsmith) {
|
||||
if (promptsCsvPath && (useLangsmith || usePairwiseEval)) {
|
||||
console.warn('CSV-driven evaluations are only supported in CLI mode. Ignoring --prompts-csv.');
|
||||
}
|
||||
|
||||
@ -33,7 +36,9 @@ async function main(): Promise<void> {
|
||||
: 1;
|
||||
const repetitions = Number.isNaN(repetitionsArg) ? 1 : repetitionsArg;
|
||||
|
||||
if (useLangsmith) {
|
||||
if (usePairwiseEval) {
|
||||
await runPairwiseLangsmithEvaluation(repetitions);
|
||||
} else if (useLangsmith) {
|
||||
await runLangsmithEvaluation(repetitions);
|
||||
} else {
|
||||
const csvTestCases = promptsCsvPath ? loadTestCasesFromCsv(promptsCsvPath) : undefined;
|
||||
|
||||
@ -0,0 +1,144 @@
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import type { LangChainTracer } from '@langchain/core/tracers/tracer_langchain';
|
||||
import { evaluate } from 'langsmith/evaluation';
|
||||
import type { Run, Example } from 'langsmith/schemas';
|
||||
import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import { evaluateWorkflowPairwise } from '../chains/pairwise-evaluator';
|
||||
import { setupTestEnvironment, createAgent } from '../core/environment';
|
||||
import { generateRunId, isWorkflowStateValues } from '../types/langsmith';
|
||||
import { consumeGenerator, formatHeader, getChatPayload } from '../utils/evaluation-helpers';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
/**
 * Inputs received by the pairwise workflow generator for a single run.
 * NOTE(review): presumably mirrors the `inputs` of each LangSmith dataset example — confirm against the dataset.
 */
interface PairwiseDatasetInput {
	/** Checklist the generated workflow is judged against. */
	evals: {
		dos: string[];
		donts: string[];
	};
	/** User prompt sent to the workflow builder agent. */
	prompt: string;
}
|
||||
|
||||
/**
 * Creates the target function for the pairwise evaluation: given dataset inputs,
 * it runs the workflow builder agent on the prompt and returns the generated workflow
 * together with the criteria and prompt it came from.
 *
 * @param parsedNodeTypes - Node type descriptions forwarded to `createAgent`.
 * @param llm - Chat model forwarded to `createAgent`.
 * @param tracer - Optional tracer forwarded to `createAgent`.
 * @returns An async function mapping dataset inputs to `{ workflow, evalCriteria, prompt }`.
 * It throws when the agent state is missing or fails `isWorkflowStateValues`.
 */
function createPairwiseWorkflowGenerator(
	parsedNodeTypes: INodeTypeDescription[],
	llm: BaseChatModel,
	tracer?: LangChainTracer,
) {
	const evalUserId = 'langsmith-pairwise-eval-user';

	return async (inputs: PairwiseDatasetInput) => {
		const runId = generateRunId();

		// A fresh agent per invocation.
		const agent = createAgent(parsedNodeTypes, llm, tracer);

		// Drive the chat to completion using the prompt from the dataset.
		const chatStream = agent.chat(getChatPayload(inputs.prompt, runId), evalUserId);
		await consumeGenerator(chatStream);

		// Read back the generated workflow from the agent state.
		const { values } = await agent.getState(runId, evalUserId);
		if (!values || !isWorkflowStateValues(values)) {
			throw new Error('Invalid workflow state');
		}

		return {
			workflow: values.workflowJSON,
			evalCriteria: inputs.evals,
			prompt: inputs.prompt,
		};
	};
}
|
||||
|
||||
/**
 * Creates a LangSmith evaluator that scores a run's generated workflow against
 * the criteria carried in the run outputs.
 *
 * @param llm - Chat model forwarded to `evaluateWorkflowPairwise`.
 * @returns An evaluator yielding `pairwise_score` (with a comment listing violations
 * and passes), `pairwise_passed_count` and `pairwise_failed_count`. When the run
 * outputs lack a workflow or criteria, it yields only `pairwise_score` with score 0.
 */
function createPairwiseLangsmithEvaluator(llm: BaseChatModel) {
	// Renders "<heading>:" followed by one "- rule: justification" bullet per entry; empty for no entries.
	const formatSection = (
		heading: string,
		entries: Array<{ rule: string; justification: string }>,
	): string => {
		if (entries.length === 0) {
			return '';
		}
		const bullets = entries.map((entry) => `- ${entry.rule}: ${entry.justification}`);
		return `${heading}:\n${bullets.join('\n')}`;
	};

	return async (rootRun: Run, _example?: Example): Promise<LangsmithEvaluationResult[]> => {
		const outputs = rootRun.outputs;

		if (!(outputs && outputs.workflow && outputs.evalCriteria)) {
			return [
				{
					key: 'pairwise_score',
					score: 0,
					comment: 'Missing workflow or evaluation criteria in outputs',
				},
			];
		}

		const result = await evaluateWorkflowPairwise(llm, {
			workflowJSON: outputs.workflow,
			evalCriteria: outputs.evalCriteria,
		});

		// Violations come first, then passes; empty sections are dropped.
		const sections = [
			formatSection('Violations', result.violations),
			formatSection('Passes', result.passes),
		].filter((section) => section.length > 0);
		const comment = sections.length > 0 ? sections.join('\n\n') : 'No comments';

		return [
			{ key: 'pairwise_score', score: result.score, comment },
			{ key: 'pairwise_passed_count', score: result.passes.length },
			{ key: 'pairwise_failed_count', score: result.violations.length },
		];
	};
}
|
||||
|
||||
/**
 * Runs the pairwise evaluation as a LangSmith experiment.
 *
 * Reads `LANGSMITH_API_KEY` (required) and `LANGSMITH_DATASET_NAME` (optional,
 * defaults to `workflow-builder-pairwise-prompts`) from the environment, checks that
 * the dataset can be read, then calls `evaluate` with the workflow generator as
 * target and the pairwise evaluator.
 *
 * The process exits with code 1 when the API key is missing, the dataset cannot
 * be read, or any other step throws.
 *
 * @param repetitions - Passed to `evaluate` as `numRepetitions`.
 */
export async function runPairwiseLangsmithEvaluation(repetitions: number = 1): Promise<void> {
	console.log(formatHeader('AI Workflow Builder Pairwise Evaluation', 70));

	if (!process.env.LANGSMITH_API_KEY) {
		console.error(pc.red('✗ LANGSMITH_API_KEY environment variable not set'));
		process.exit(1);
	}

	try {
		const { parsedNodeTypes, llm, tracer, lsClient } = await setupTestEnvironment();

		if (!lsClient) {
			throw new Error('Langsmith client not initialized');
		}

		const datasetName = process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-pairwise-prompts';
		console.log(pc.blue(`➔ Using dataset: ${datasetName}`));

		// Verify the dataset is readable before starting the experiment.
		try {
			await lsClient.readDataset({ datasetName });
		} catch (error) {
			// readDataset can fail for reasons other than a missing dataset (auth, network),
			// so surface the underlying error instead of only claiming "not found".
			console.error(
				pc.red(`✗ Dataset "${datasetName}" not found or could not be read:`),
				error,
			);
			process.exit(1);
		}

		const generateWorkflow = createPairwiseWorkflowGenerator(parsedNodeTypes, llm, tracer);
		const evaluator = createPairwiseLangsmithEvaluator(llm);

		// `modelName` is not part of the base chat model type, so narrow at runtime
		// instead of suppressing the type error.
		const modelName =
			'modelName' in llm && typeof llm.modelName === 'string' ? llm.modelName : 'unknown';

		await evaluate(generateWorkflow, {
			data: datasetName,
			evaluators: [evaluator],
			maxConcurrency: 5,
			experimentPrefix: `pairwise-eval-${modelName}`,
			numRepetitions: repetitions,
		});

		console.log(pc.green('✓ Pairwise evaluation completed'));
	} catch (error) {
		console.error(pc.red('✗ Pairwise evaluation failed:'), error);
		process.exit(1);
	}
}
|
||||
@ -27,7 +27,8 @@
|
||||
"eval:csv": "tsx evaluations --prompts-csv",
|
||||
"eval:langsmith": "USE_LANGSMITH_EVAL=true tsx evaluations",
|
||||
"eval:generate": "GENERATE_TEST_CASES=true tsx evaluations",
|
||||
"eval:categorize": "tsx scripts/categorize-prompts.ts"
|
||||
"eval:categorize": "tsx scripts/categorize-prompts.ts",
|
||||
"eval:pairwise": "USE_PAIRWISE_EVAL=true tsx evaluations"
|
||||
},
|
||||
"main": "dist/index.js",
|
||||
"module": "src/index.ts",
|
||||
@ -68,4 +69,4 @@
|
||||
"madge": "^8.0.0",
|
||||
"p-limit": "^3.1.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user