diff --git a/e2e/scenarios/openai-instrumentation/assertions.ts b/e2e/scenarios/openai-instrumentation/assertions.ts index 8534428c3..df03b9d48 100644 --- a/e2e/scenarios/openai-instrumentation/assertions.ts +++ b/e2e/scenarios/openai-instrumentation/assertions.ts @@ -315,9 +315,16 @@ function summarizeResponsesOutput(output: Json): Json { return null; } - return output.map((item) => { + // Deduplicate identical items — the Responses API occasionally returns + // duplicate output entries (e.g., two identical "message" items when + // streaming), which would cause non-deterministic snapshot failures. + const seen = new Set(); + const result: Json[] = []; + + for (const item of output) { if (!isRecord(item as Json)) { - return null; + result.push(null); + continue; } const content = Array.isArray(item.content) ? item.content : []; @@ -330,14 +337,22 @@ function summarizeResponsesOutput(output: Json): Json { isRecord(entry as Json) ? jsonKeysFromText(entry.text) : [], ); - return { + const summarized = { content_types: contentTypes, json_keys: [...new Set(jsonKeys)].sort(), role: item.role ?? null, status: item.status ?? null, type: item.type ?? null, } satisfies Json; - }) satisfies Json; + + const key = JSON.stringify(summarized); + if (!seen.has(key)) { + seen.add(key); + result.push(summarized); + } + } + + return result; } function summarizeOutput(name: string, output: Json): Json { diff --git a/e2e/scenarios/openai-instrumentation/scenario.test.ts b/e2e/scenarios/openai-instrumentation/scenario.test.ts index c6ded0d15..0b23a2f35 100644 --- a/e2e/scenarios/openai-instrumentation/scenario.test.ts +++ b/e2e/scenarios/openai-instrumentation/scenario.test.ts @@ -9,7 +9,7 @@ import { defineOpenAIInstrumentationAssertions } from "./assertions"; const scenarioDir = await prepareScenarioDir({ scenarioDir: resolveScenarioDir(import.meta.url), }); -const TIMEOUT_MS = 60_000; +const TIMEOUT_MS = 120_000; const openaiScenarios = await Promise.all( [ { diff --git a/js/src/wrappers/ai-sdk/ai-sdk.test.ts b/js/src/wrappers/ai-sdk/ai-sdk.test.ts index c6cc49fd5..3c59692ae 100644 --- a/js/src/wrappers/ai-sdk/ai-sdk.test.ts +++ b/js/src/wrappers/ai-sdk/ai-sdk.test.ts @@ -1537,122 +1537,130 @@ describe("ai sdk client unit tests", TEST_SUITE_OPTIONS, () => { // Once processInputAttachments is made async and properly handles the Promise, // we should verify that the schema is serialized correctly in the logs. - test("ai sdk multi-round tool use with metrics", async () => { - expect(await backgroundLogger.drain()).toHaveLength(0); + test( + "ai sdk multi-round tool use with metrics", + { timeout: 60000 }, + async () => { + expect(await backgroundLogger.drain()).toHaveLength(0); - const getStorePriceTool = ai.tool({ - description: "Get the price of an item from a specific store", - inputSchema: z.object({ - store: z.string().describe("The store name (e.g., 'StoreA', 'StoreB')"), - item: z.string().describe("The item to get the price for"), - }), - execute: async (args: { store: string; item: string }) => { - const prices: Record> = { - StoreA: { laptop: 999, mouse: 25, keyboard: 75 }, - StoreB: { laptop: 1099, mouse: 20, keyboard: 80 }, - }; - const price = prices[args.store]?.[args.item] ?? 0; - return JSON.stringify({ store: args.store, item: args.item, price }); - }, - }); + const getStorePriceTool = ai.tool({ + description: "Get the price of an item from a specific store", + inputSchema: z.object({ + store: z + .string() + .describe("The store name (e.g., 'StoreA', 'StoreB')"), + item: z.string().describe("The item to get the price for"), + }), + execute: async (args: { store: string; item: string }) => { + const prices: Record> = { + StoreA: { laptop: 999, mouse: 25, keyboard: 75 }, + StoreB: { laptop: 1099, mouse: 20, keyboard: 80 }, + }; + const price = prices[args.store]?.[args.item] ?? 0; + return JSON.stringify({ store: args.store, item: args.item, price }); + }, + }); - const applyDiscountTool = ai.tool({ - description: "Apply a discount code to a total amount", - inputSchema: z.object({ - total: z.number().describe("The total amount before discount"), - discountCode: z.string().describe("The discount code to apply"), - }), - execute: async (args: { total: number; discountCode: string }) => { - const discounts: Record = { - SAVE10: 0.1, - SAVE20: 0.2, - }; - const discountRate = discounts[args.discountCode] ?? 0; - const finalTotal = args.total - args.total * discountRate; - return JSON.stringify({ - originalTotal: args.total, - discountCode: args.discountCode, - finalTotal, - }); - }, - }); + const applyDiscountTool = ai.tool({ + description: "Apply a discount code to a total amount", + inputSchema: z.object({ + total: z.number().describe("The total amount before discount"), + discountCode: z.string().describe("The discount code to apply"), + }), + execute: async (args: { total: number; discountCode: string }) => { + const discounts: Record = { + SAVE10: 0.1, + SAVE20: 0.2, + }; + const discountRate = discounts[args.discountCode] ?? 0; + const finalTotal = args.total - args.total * discountRate; + return JSON.stringify({ + originalTotal: args.total, + discountCode: args.discountCode, + finalTotal, + }); + }, + }); - const model = openai(TEST_MODEL); - const start = getCurrentUnixTimestamp(); + const model = openai(TEST_MODEL); + const start = getCurrentUnixTimestamp(); - const result = await wrappedAI.generateText({ - model, - system: - "You are a shopping assistant. When asked about prices, always get the price from each store mentioned using get_store_price, then apply any discount codes using apply_discount. Use the tools provided.", - tools: { - get_store_price: getStorePriceTool, - apply_discount: applyDiscountTool, - }, - toolChoice: "required", - prompt: - "I want to buy a laptop. Get the price from StoreA and StoreB, then apply the discount code SAVE20 to whichever is cheaper.", - stopWhen: ai.stepCountIs(6), - }); + const result = await wrappedAI.generateText({ + model, + system: + "You are a shopping assistant. When asked about prices, always get the price from each store mentioned using get_store_price, then apply any discount codes using apply_discount. Use the tools provided.", + tools: { + get_store_price: getStorePriceTool, + apply_discount: applyDiscountTool, + }, + toolChoice: "required", + prompt: + "I want to buy a laptop. Get the price from StoreA and StoreB, then apply the discount code SAVE20 to whichever is cheaper.", + stopWhen: ai.stepCountIs(6), + }); - const end = getCurrentUnixTimestamp(); - assert.ok(result); + const end = getCurrentUnixTimestamp(); + assert.ok(result); - const spans = await backgroundLogger.drain(); + const spans = await backgroundLogger.drain(); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const llmSpans = spans.filter( - (s: any) => - s.span_attributes?.type === "llm" && - s.span_attributes?.name === "doGenerate", - ); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const toolSpans = spans.filter( // eslint-disable-next-line @typescript-eslint/no-explicit-any - (s: any) => s.span_attributes?.type === "tool", - ); - - // Should have multiple doGenerate spans - one per LLM round/step - // This allows visualizing the LLM ↔ tool roundtrips - expect(llmSpans.length).toBeGreaterThanOrEqual(2); - - // Should have tool spans for get_store_price calls (at least 2 for StoreA and StoreB) - expect(toolSpans.length).toBeGreaterThanOrEqual(2); - - // Verify each doGenerate span has its own metrics - for (const llmSpan of llmSpans) { + const llmSpans = spans.filter( + (s: any) => + s.span_attributes?.type === "llm" && + s.span_attributes?.name === "doGenerate", + ); // eslint-disable-next-line @typescript-eslint/no-explicit-any - const span = llmSpan as any; - expect(span.metrics).toBeDefined(); - expect(span.metrics.start).toBeDefined(); - expect(span.metrics.end).toBeDefined(); - expect(start).toBeLessThanOrEqual(span.metrics.start); - expect(span.metrics.end).toBeLessThanOrEqual(end); - - // Token metrics structure varies by AI SDK version - // v5: metrics.tokens, prompt_tokens, completion_tokens are defined - // v6: metrics structure may differ - see v5-specific tests for strict assertions - } + const toolSpans = spans.filter( + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (s: any) => s.span_attributes?.type === "tool", + ); - // Verify tool spans have the expected structure - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const storePriceSpans = toolSpans.filter( - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (s: any) => s.span_attributes?.name === "get_store_price", - ); - expect(storePriceSpans.length).toBeGreaterThanOrEqual(2); + // Should have multiple doGenerate spans - one per LLM round/step + // This allows visualizing the LLM ↔ tool roundtrips + expect(llmSpans.length).toBeGreaterThanOrEqual(2); + + // Should have tool spans for get_store_price calls (at least 2 for StoreA and StoreB) + expect(toolSpans.length).toBeGreaterThanOrEqual(2); + + // Verify each doGenerate span has its own metrics + for (const llmSpan of llmSpans) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const span = llmSpan as any; + expect(span.metrics).toBeDefined(); + expect(span.metrics.start).toBeDefined(); + expect(span.metrics.end).toBeDefined(); + expect(start).toBeLessThanOrEqual(span.metrics.start); + expect(span.metrics.end).toBeLessThanOrEqual(end); + + // Token metrics structure varies by AI SDK version + // v5: metrics.tokens, prompt_tokens, completion_tokens are defined + // v6: metrics structure may differ - see v5-specific tests for strict assertions + } - // Verify tool spans have input/output - for (const toolSpan of storePriceSpans) { + // Verify tool spans have the expected structure // eslint-disable-next-line @typescript-eslint/no-explicit-any - const span = toolSpan as any; - expect(span.input).toBeDefined(); - expect(span.output).toBeDefined(); - - const inputData = Array.isArray(span.input) ? span.input[0] : span.input; - expect(inputData.store).toMatch(/^Store[AB]$/); - expect(inputData.item).toBe("laptop"); - } - }); + const storePriceSpans = toolSpans.filter( + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (s: any) => s.span_attributes?.name === "get_store_price", + ); + expect(storePriceSpans.length).toBeGreaterThanOrEqual(2); + + // Verify tool spans have input/output + for (const toolSpan of storePriceSpans) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const span = toolSpan as any; + expect(span.input).toBeDefined(); + expect(span.output).toBeDefined(); + + const inputData = Array.isArray(span.input) + ? span.input[0] + : span.input; + expect(inputData.store).toMatch(/^Store[AB]$/); + expect(inputData.item).toBe("laptop"); + } + }, + ); test("ai sdk multi-round tool use span hierarchy", async () => { expect(await backgroundLogger.drain()).toHaveLength(0); diff --git a/js/src/wrappers/oai.test.ts b/js/src/wrappers/oai.test.ts index ea2261fc9..e41128d8e 100644 --- a/js/src/wrappers/oai.test.ts +++ b/js/src/wrappers/oai.test.ts @@ -23,7 +23,7 @@ import { parseMetricsFromUsage } from "./oai_responses"; // use the cheapest model for tests const TEST_MODEL = "gpt-4o-mini"; -const TEST_SUITE_OPTIONS = { timeout: 10000, retry: 3 }; +const TEST_SUITE_OPTIONS = { timeout: 30000, retry: 3 }; try { configureNode(); @@ -74,49 +74,53 @@ describe("openai client unit tests", TEST_SUITE_OPTIONS, () => { _exportsForTestingOnly.clearTestBackgroundLogger(); }); - test("openai.chat.completions.streaming", async (context) => { - assert.lengthOf(await backgroundLogger.drain(), 0); + test( + "openai.chat.completions.streaming", + { timeout: 30000 }, + async (context) => { + assert.lengthOf(await backgroundLogger.drain(), 0); - for (const includeUsage of [false, true]) { - const start = getCurrentUnixTimestamp(); - const stream = await client.chat.completions.create({ - messages: [{ role: "user", content: "1+1" }], - model: TEST_MODEL, - stream: true, - stream_options: { - include_usage: includeUsage, - }, - }); - - let ttft = -1.0; - for await (const event of stream) { - if (ttft < 0) { - ttft = getCurrentUnixTimestamp() - start; + for (const includeUsage of [false, true]) { + const start = getCurrentUnixTimestamp(); + const stream = await client.chat.completions.create({ + messages: [{ role: "user", content: "1+1" }], + model: TEST_MODEL, + stream: true, + stream_options: { + include_usage: includeUsage, + }, + }); + + let ttft = -1.0; + for await (const event of stream) { + if (ttft < 0) { + ttft = getCurrentUnixTimestamp() - start; + } + assert.ok(event); } - assert.ok(event); - } - const end = getCurrentUnixTimestamp(); + const end = getCurrentUnixTimestamp(); - const spans = await backgroundLogger.drain(); - assert.lengthOf(spans, 1); - // eslint-disable-next-line @typescript-eslint/consistent-type-assertions, @typescript-eslint/no-explicit-any - const span = spans[0] as any; - assert.equal(span.span_attributes.name, "Chat Completion"); - assert.equal(span.span_attributes.type, "llm"); - const m = span.metrics; - assert.isTrue(start <= m.start && m.start < m.end && m.end <= end); - assert.isTrue(ttft >= m.time_to_first_token); - if (includeUsage) { - assert.isTrue(m.tokens > 0); - assert.isTrue(m.prompt_tokens > 0); - assert.isTrue(m.time_to_first_token > 0); - assert.isTrue(m.prompt_cached_tokens >= 0); - assert.isTrue(m.completion_reasoning_tokens >= 0); - } else { - assert.isTrue(m.tokens === undefined); + const spans = await backgroundLogger.drain(); + assert.lengthOf(spans, 1); + // eslint-disable-next-line @typescript-eslint/consistent-type-assertions, @typescript-eslint/no-explicit-any + const span = spans[0] as any; + assert.equal(span.span_attributes.name, "Chat Completion"); + assert.equal(span.span_attributes.type, "llm"); + const m = span.metrics; + assert.isTrue(start <= m.start && m.start < m.end && m.end <= end); + assert.isTrue(ttft >= m.time_to_first_token); + if (includeUsage) { + assert.isTrue(m.tokens > 0); + assert.isTrue(m.prompt_tokens > 0); + assert.isTrue(m.time_to_first_token > 0); + assert.isTrue(m.prompt_cached_tokens >= 0); + assert.isTrue(m.completion_reasoning_tokens >= 0); + } else { + assert.isTrue(m.tokens === undefined); + } } - } - }); + }, + ); test("openai.chat.completions", async (context) => { assert.lengthOf(await backgroundLogger.drain(), 0); @@ -320,7 +324,7 @@ describe("openai client unit tests", TEST_SUITE_OPTIONS, () => { expect(start <= m.start && m.start < m.end && m.end <= end).toBe(true); }); - test("openai.chat.completions.tools", async () => { + test("openai.chat.completions.tools", { timeout: 30000 }, async () => { expect(await backgroundLogger.drain()).toHaveLength(0); // Define tools that can be called in parallel @@ -734,7 +738,7 @@ describe("openai client unit tests", TEST_SUITE_OPTIONS, () => { assert.isTrue(m.completion_reasoning_tokens >= 0); }); - test("openai.responses.compact", async (context) => { + test("openai.responses.compact", { timeout: 30000 }, async (context) => { if (!oai.responses || typeof oai.responses.compact !== "function") { context.skip(); }