Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/tender-ducks-wave.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"evalite-tests": patch
"evalite": patch
---

feat: allow scorer functions to return arrays of scores (#304)
26 changes: 26 additions & 0 deletions packages/evalite-tests/tests/custom-scorer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,32 @@ it("Should let users create custom scorers", async () => {
expect(evals.Index![0]?.results[0]?.scores[0]?.score).toBe(1);
});

it("Should let users return an array of scores from custom scorers", async () => {
  await using fixture = await loadFixture("custom-scorer-array");

  await fixture.run({ mode: "run-once-and-exit" });

  const evals = await getEvalsAsRecordViaStorage(fixture.storage);
  const scores = evals.Index![0]?.results[0]?.scores;

  // Expected stored scores, in order. Only the first two are checked for a
  // `criterion` entry in their metadata.
  const expectations: Array<{
    name: string;
    score: number;
    criterion?: string;
  }> = [
    { name: "Multiple Criteria", score: 1, criterion: "Is Same" },
    { name: "Multiple Criteria", score: 1, criterion: "Length is 6" },
    { name: "Inline Scorer 1", score: 1 },
    { name: "Inline Scorer 2", score: 0.5 },
  ];

  // Checked first so a missing `scores` fails here rather than on an index read.
  expect(scores).toHaveLength(4);

  expectations.forEach((wanted, position) => {
    const actual = scores![position];

    expect(actual?.name).toBe(wanted.name);
    expect(actual?.score).toBe(wanted.score);

    if (wanted.criterion !== undefined) {
      // The static type of `metadata` is not visible in this test, so the
      // loose cast is kept to read `criterion` off it.
      expect((actual?.metadata as any)?.criterion).toBe(wanted.criterion);
    }
  });
});

it("Should fail if the custom scorer does not return a number", async () => {
const scorer = createScorer<string, string, never>({
name: "Is Same",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { createScorer, evalite } from "evalite";
import { setTimeout } from "node:timers/promises";

// Fixture eval with a single case: the task appends "def" to the input "abc",
// so the output equals the expected value "abcdef". Both scorers below return
// an array of scores instead of a single score.
evalite("Index", {
  data: () => [{ input: "abc", expected: "abcdef" }],
  task: async (input) => {
    // To test whether duration is calculated properly
    await setTimeout(10);
    return `${input}def`;
  },
  scorers: [
    // createScorer form: each entry supplies only a score and metadata.
    createScorer({
      name: "Multiple Criteria",
      scorer: ({ output, expected }) => [
        {
          score: output === expected ? 1 : 0,
          metadata: { criterion: "Is Same" },
        },
        {
          score: output.length === 6 ? 1 : 0,
          metadata: { criterion: "Length is 6" },
        },
      ],
    }),
    // Inline form: each entry supplies its own name and description.
    ({ output, expected }) => [
      {
        name: "Inline Scorer 1",
        score: output === expected ? 1 : 0,
        description: "Inline Same",
      },
      {
        name: "Inline Scorer 2",
        score: 0.5,
        description: "Inline Half",
      },
    ],
  ],
});
35 changes: 19 additions & 16 deletions packages/evalite/src/create-scorer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,31 @@ export const createScorer = <TInput, TOutput, TExpected = TOutput>(
opts: Evalite.ScorerOpts<TInput, TOutput, TExpected>
): Evalite.Scorer<TInput, TOutput, TExpected> => {
return async (input: Evalite.ScoreInput<TInput, TOutput, TExpected>) => {
const score = await opts.scorer(input);
const scores = await opts.scorer(input);
const scoresArray = Array.isArray(scores) ? scores : [scores];

if (typeof score === "object") {
if (typeof score.score !== "number") {
throw new Error(`The scorer '${opts.name}' must return a number.`);
return scoresArray.map((score) => {
if (typeof score === "object" && score !== null) {
if (typeof score.score !== "number") {
throw new Error(`The scorer '${opts.name}' must return a number.`);
}

return {
score: score.score,
metadata: score.metadata,
description: opts.description,
name: opts.name,
};
}

if (typeof score !== "number") {
throw new Error(`The scorer '${opts.name}' must return a number.`);
}
return {
score: score.score,
metadata: score.metadata,
description: opts.description,
name: opts.name,
score,
};
}

if (typeof score !== "number") {
throw new Error(`The scorer '${opts.name}' must return a number.`);
}
return {
description: opts.description,
name: opts.name,
score,
};
});
};
};
36 changes: 19 additions & 17 deletions packages/evalite/src/evalite.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,23 +86,25 @@ const runTask = async <TInput, TOutput, TExpected, TVariant = undefined>(
const output = await executeTask(opts.task, opts.input, opts.variant);
const duration = Math.round(performance.now() - start);

const scores = await Promise.all(
(opts.scorers || []).map(async (scorerOrOpts) => {
if (typeof scorerOrOpts === "function") {
return scorerOrOpts({
input: opts.input,
output,
expected: opts.expected,
});
} else {
return createScorer(scorerOrOpts)({
input: opts.input,
output,
expected: opts.expected,
});
}
})
);
const scores = (
await Promise.all(
(opts.scorers || []).map(async (scorerOrOpts) => {
if (typeof scorerOrOpts === "function") {
return scorerOrOpts({
input: opts.input,
output,
expected: opts.expected,
});
} else {
return createScorer(scorerOrOpts)({
input: opts.input,
output,
expected: opts.expected,
});
}
})
)
).flat();

const columns =
(await opts.columns?.({
Expand Down
8 changes: 6 additions & 2 deletions packages/evalite/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ export declare namespace Evalite {

export type Scorer<TInput, TOutput, TExpected> = (
opts: ScoreInput<TInput, TOutput, TExpected>
) => MaybePromise<Score>;
) => MaybePromise<Score | Score[]>;

export type DataShape<TInput, TExpected> = {
input: TInput;
Expand Down Expand Up @@ -245,7 +245,11 @@ export declare namespace Evalite {
description?: string;
scorer: (
input: Evalite.ScoreInput<TInput, TOutput, TExpected>
) => Evalite.MaybePromise<number | Evalite.UserProvidedScoreWithMetadata>;
) => Evalite.MaybePromise<
| number
| Evalite.UserProvidedScoreWithMetadata
| Array<number | Evalite.UserProvidedScoreWithMetadata>
>;
};

export interface Trace {
Expand Down