diff --git a/.changeset/tender-ducks-wave.md b/.changeset/tender-ducks-wave.md new file mode 100644 index 00000000..071ced21 --- /dev/null +++ b/.changeset/tender-ducks-wave.md @@ -0,0 +1,6 @@ +--- +"evalite-tests": patch +"evalite": patch +--- + +feat: allow scorer functions to return arrays of scores (#304) diff --git a/packages/evalite-tests/tests/custom-scorer.test.ts b/packages/evalite-tests/tests/custom-scorer.test.ts index 8756495d..3560d952 100644 --- a/packages/evalite-tests/tests/custom-scorer.test.ts +++ b/packages/evalite-tests/tests/custom-scorer.test.ts @@ -16,6 +16,32 @@ it("Should let users create custom scorers", async () => { expect(evals.Index![0]?.results[0]?.scores[0]?.score).toBe(1); }); +it("Should let users return an array of scores from custom scorers", async () => { + await using fixture = await loadFixture("custom-scorer-array"); + + await fixture.run({ + mode: "run-once-and-exit", + }); + + const evals = await getEvalsAsRecordViaStorage(fixture.storage); + const scores = evals.Index![0]?.results[0]?.scores; + + expect(scores).toHaveLength(4); + expect(scores![0]?.name).toBe("Multiple Criteria"); + expect(scores![0]?.score).toBe(1); + expect((scores![0]?.metadata as any)?.criterion).toBe("Is Same"); + + expect(scores![1]?.name).toBe("Multiple Criteria"); + expect(scores![1]?.score).toBe(1); + expect((scores![1]?.metadata as any)?.criterion).toBe("Length is 6"); + + expect(scores![2]?.name).toBe("Inline Scorer 1"); + expect(scores![2]?.score).toBe(1); + + expect(scores![3]?.name).toBe("Inline Scorer 2"); + expect(scores![3]?.score).toBe(0.5); +}); + it("Should fail if the custom scorer does not return a number", async () => { const scorer = createScorer({ name: "Is Same", diff --git a/packages/evalite-tests/tests/fixtures/custom-scorer-array/index.eval.ts b/packages/evalite-tests/tests/fixtures/custom-scorer-array/index.eval.ts new file mode 100644 index 00000000..97672d2e --- /dev/null +++ b/packages/evalite-tests/tests/fixtures/custom-scorer-array/index.eval.ts @@ -0,0 +1,49 @@ +import { createScorer, evalite } from "evalite"; +import { setTimeout } from "node:timers/promises"; + +evalite("Index", { + data: () => { + return [ + { + input: "abc", + expected: "abcdef", + }, + ]; + }, + task: async (input) => { + // To test whether duration is calculated properly + await setTimeout(10); + return input + "def"; + }, + scorers: [ + createScorer({ + name: "Multiple Criteria", + scorer: ({ output, expected }) => { + return [ + { + score: output === expected ? 1 : 0, + metadata: { criterion: "Is Same" }, + }, + { + score: output.length === 6 ? 1 : 0, + metadata: { criterion: "Length is 6" }, + }, + ]; + }, + }), + ({ output, expected }) => { + return [ + { + name: "Inline Scorer 1", + score: output === expected ? 1 : 0, + description: "Inline Same", + }, + { + name: "Inline Scorer 2", + score: 0.5, + description: "Inline Half", + }, + ]; + }, + ], +}); diff --git a/packages/evalite/src/create-scorer.ts b/packages/evalite/src/create-scorer.ts index dc95721d..82698685 100644 --- a/packages/evalite/src/create-scorer.ts +++ b/packages/evalite/src/create-scorer.ts @@ -4,28 +4,31 @@ export const createScorer = ( opts: Evalite.ScorerOpts ): Evalite.Scorer => { return async (input: Evalite.ScoreInput) => { - const score = await opts.scorer(input); + const scores = await opts.scorer(input); + const scoresArray = Array.isArray(scores) ? scores : [scores]; - if (typeof score === "object") { - if (typeof score.score !== "number") { - throw new Error(`The scorer '${opts.name}' must return a number.`); + return scoresArray.map((score) => { + if (typeof score === "object" && score !== null) { + if (typeof score.score !== "number") { + throw new Error(`The scorer '${opts.name}' must return a number.`); + } + + return { + score: score.score, + metadata: score.metadata, + description: opts.description, + name: opts.name, + }; } + if (typeof score !== "number") { + throw new Error(`The scorer '${opts.name}' must return a number.`); + } return { - score: score.score, - metadata: score.metadata, description: opts.description, name: opts.name, + score, }; - } - - if (typeof score !== "number") { - throw new Error(`The scorer '${opts.name}' must return a number.`); - } - return { - description: opts.description, - name: opts.name, - score, - }; + }); }; }; diff --git a/packages/evalite/src/evalite.ts b/packages/evalite/src/evalite.ts index e35e1ce1..b7704ac0 100644 --- a/packages/evalite/src/evalite.ts +++ b/packages/evalite/src/evalite.ts @@ -86,23 +86,25 @@ const runTask = async ( const output = await executeTask(opts.task, opts.input, opts.variant); const duration = Math.round(performance.now() - start); - const scores = await Promise.all( - (opts.scorers || []).map(async (scorerOrOpts) => { - if (typeof scorerOrOpts === "function") { - return scorerOrOpts({ - input: opts.input, - output, - expected: opts.expected, - }); - } else { - return createScorer(scorerOrOpts)({ - input: opts.input, - output, - expected: opts.expected, - }); - } - }) - ); + const scores = ( + await Promise.all( + (opts.scorers || []).map(async (scorerOrOpts) => { + if (typeof scorerOrOpts === "function") { + return scorerOrOpts({ + input: opts.input, + output, + expected: opts.expected, + }); + } else { + return createScorer(scorerOrOpts)({ + input: opts.input, + output, + expected: opts.expected, + }); + } + }) + ) + ).flat(); const columns = (await opts.columns?.({ diff --git a/packages/evalite/src/types.ts b/packages/evalite/src/types.ts index 1ef89f68..5628e0b7 100644 --- a/packages/evalite/src/types.ts +++ b/packages/evalite/src/types.ts @@ -195,7 +195,7 @@ export declare namespace Evalite { export type Scorer = ( opts: ScoreInput - ) => MaybePromise; + ) => MaybePromise; export type DataShape = { input: TInput; @@ -245,7 +245,11 @@ export declare namespace Evalite { description?: string; scorer: ( input: Evalite.ScoreInput - ) => Evalite.MaybePromise; + ) => Evalite.MaybePromise< + | number + | Evalite.UserProvidedScoreWithMetadata + | Array + >; }; export interface Trace {