diff --git a/.agents/skills/sdk-integrations/SKILL.md b/.agents/skills/sdk-integrations/SKILL.md index 07bdd99e..7f796a05 100644 --- a/.agents/skills/sdk-integrations/SKILL.md +++ b/.agents/skills/sdk-integrations/SKILL.md @@ -28,6 +28,7 @@ Always read: - `py/src/braintrust/integrations/base.py` - `py/src/braintrust/integrations/versioning.py` - `py/src/braintrust/integrations/__init__.py` +- `py/src/braintrust/integrations/utils.py` - `py/noxfile.py` Read these when working on an existing integration: @@ -43,8 +44,9 @@ Read these when relevant: - `py/src/braintrust/auto.py` for `auto_instrument()` changes - `py/src/braintrust/conftest.py` for VCR behavior - `py/src/braintrust/integrations/auto_test_scripts/` for subprocess auto-instrument coverage -- `py/src/braintrust/integrations/adk/test_adk.py` and `py/src/braintrust/integrations/anthropic/test_anthropic.py` for test layout patterns -- `py/src/braintrust/integrations/adk/tracing.py` and `py/src/braintrust/integrations/google_genai/tracing.py` when handling multimodal content, binary inputs, or generated media +- `py/src/braintrust/integrations/test_utils.py` when touching shared attachment materialization or multimodal payload shaping +- `py/src/braintrust/integrations/adk/test_adk.py`, `py/src/braintrust/integrations/anthropic/test_anthropic.py`, and `py/src/braintrust/integrations/google_genai/test_google_genai.py` for attachment-focused test layout patterns +- `py/src/braintrust/integrations/adk/tracing.py`, `py/src/braintrust/integrations/anthropic/tracing.py`, and `py/src/braintrust/integrations/google_genai/tracing.py` when handling multimodal content, binary inputs, generated media, or attachment materialization behavior Do not forget `auto.py` and `auto_test_scripts/`. Import-order and subprocess regressions often only show up there. 
@@ -54,8 +56,8 @@ Start from the nearest current integration: - ADK: direct method patching, `target_module`, `CompositeFunctionWrapperPatcher`, manual `wrap_*()` helpers, context propagation, inline data to `Attachment` - Agno: multi-target patching, several related patchers, version-conditional fallbacks with `superseded_by` -- Anthropic: compact constructor patching and a small public surface -- Google GenAI: multimodal tracing, generated media, output-side `Attachment` handling +- Anthropic: compact constructor patching, a small public surface, and multimodal request blocks that distinguish image vs document attachment payloads +- Google GenAI: multimodal tracing, generated media, output-side `Attachment` handling, and nested attachment materialization while preserving non-attachment values Choose the reference based on the hardest part of the task: @@ -188,10 +190,16 @@ def _process_result(result: Any, start: float) -> tuple[dict[str, Any], dict[str Treat binary payloads as attachments, not logged bytes: -- convert raw `bytes` to `braintrust.logger.Attachment` +- prefer the shared `_materialize_attachment(...)` helper in `py/src/braintrust/integrations/utils.py` over provider-local base64 or file-decoding code +- convert provider-owned raw `bytes`, base64 payloads, data URLs, file inputs, and generated media into `braintrust.logger.Attachment` objects when Braintrust should upload the content - preserve normal remote URLs as strings -- keep useful metadata such as MIME type, size, or provider ids next to the attachment -- follow existing repo content shapes for multimodal payloads +- use the repo's existing multimodal payload shapes after materialization: + - images -> `{"image_url": {"url": attachment}}` + - non-image media/documents/files -> `{"file": {"file_data": attachment, "filename": resolved.filename}}` +- do not force non-image payloads through `image_url` shims +- if attachment materialization fails, keep the original value instead of dropping it or 
replacing it with `None` +- preserve non-attachment values while walking nested payloads unless you are intentionally normalizing them for readability +- keep useful metadata such as MIME type, size, safety data, filenames, or provider ids next to the attachment ## Patcher Rules @@ -251,7 +259,7 @@ Cover the surfaces that changed: - idempotence - failure and error logging - patcher resolution and duplicate detection when relevant -- attachment conversion for binary inputs or generated media +- attachment conversion for binary inputs or generated media, including assertions that images land under `image_url.url`, non-image payloads land under `file.file_data`, and traced payloads contain `Attachment` objects rather than raw bytes or base64 blobs - span structure, especially `input`, `output`, `metadata`, and `metrics` For streaming changes, verify both: @@ -259,6 +267,13 @@ For streaming changes, verify both: - the provider still returns the expected iterator or async iterator - the final logged span contains the aggregated `output` and stream-specific `metrics` +Also verify, when relevant: + +- the `input` contains the expected model/messages/prompt/config fields +- the `output` contains normalized provider results rather than opaque SDK instances +- the `metadata` contains finish reasons, ids, or annotations in the expected place +- binary payloads are represented as `Attachment` objects where applicable, while remote URLs and non-attachment values remain unchanged and unmaterialized file inputs are preserved rather than dropped + Keep VCR cassettes in `py/src/braintrust/integrations/<provider>/cassettes/`. Re-record only when behavior intentionally changes. When the provider returns binary HTTP responses or generated media, sanitize cassettes as needed so fixtures do not store raw file bytes. 
@@ -296,3 +311,4 @@ Avoid these failures: - re-recording cassettes when behavior did not intentionally change - adding a custom `_instrument_*` helper where `_instrument_integration()` already fits - forgetting `target_module` for deep or optional patch targets +- forcing non-image attachments through `image_url` shims, dropping unrecognized file inputs, or re-serializing non-attachment values while materializing payloads diff --git a/py/src/braintrust/integrations/adk/test_adk.py b/py/src/braintrust/integrations/adk/test_adk.py index 4e424a4c..3e8bbcb5 100644 --- a/py/src/braintrust/integrations/adk/test_adk.py +++ b/py/src/braintrust/integrations/adk/test_adk.py @@ -314,8 +314,9 @@ async def generate_content_async(self, llm_request: LlmRequest, stream: bool = F assert len(new_message["parts"]) == 2 document_part = new_message["parts"][0] - assert "image_url" in document_part - attachment = document_part["image_url"]["url"] + assert "file" in document_part + assert document_part["file"]["filename"] == "file.pdf" + attachment = document_part["file"]["file_data"] assert isinstance(attachment, Attachment) assert attachment.reference["content_type"] == "application/pdf" assert attachment.reference["filename"] == "file.pdf" @@ -329,8 +330,8 @@ async def generate_content_async(self, llm_request: LlmRequest, stream: bool = F llm_span = next(row for row in spans if row["span_attributes"]["type"] == "llm") llm_contents = llm_span["input"]["contents"] llm_document_part = llm_contents[0]["parts"][0] - assert isinstance(llm_document_part["image_url"]["url"], Attachment) - assert llm_document_part["image_url"]["url"].reference["content_type"] == "application/pdf" + assert isinstance(llm_document_part["file"]["file_data"], Attachment) + assert llm_document_part["file"]["file_data"].reference["content_type"] == "application/pdf" @pytest.mark.vcr @@ -592,7 +593,7 @@ def __init__(self, inline_data=None, text=None): assert isinstance(attachment, Attachment), "Should be an 
Attachment object" assert attachment.reference["type"] == "braintrust_attachment" assert attachment.reference["content_type"] == "image/png" - assert attachment.reference["filename"] == "file.png" + assert attachment.reference["filename"] == "image.png" assert "key" in attachment.reference # Test serializing a Part with text @@ -740,7 +741,7 @@ async def test_adk_binary_data_attachment_conversion(memory_logger): assert "filename" in ref, "Attachment reference should have a filename" assert "content_type" in ref, "Attachment reference should have a content_type" assert ref["content_type"] == "image/png", "Content type should be image/png" - assert ref["filename"] == "file.png", "Filename should be file.png" + assert ref["filename"] == "image.png", "Filename should be image.png" # Second part should be the text text_part = new_message["parts"][1] diff --git a/py/src/braintrust/integrations/adk/tracing.py b/py/src/braintrust/integrations/adk/tracing.py index d25c68fd..e0c8bc37 100644 --- a/py/src/braintrust/integrations/adk/tracing.py +++ b/py/src/braintrust/integrations/adk/tracing.py @@ -9,7 +9,7 @@ from typing import Any, cast from braintrust.bt_json import bt_safe_deep_copy -from braintrust.integrations.utils import _attachment_from_bytes, _image_url_payload +from braintrust.integrations.utils import _materialize_attachment from braintrust.logger import start_span from braintrust.span_types import SpanTypeAttribute @@ -58,12 +58,10 @@ def _serialize_part(part: Any) -> Any: data = inline_data.data mime_type = inline_data.mime_type - # Convert bytes to Attachment if isinstance(data, bytes): - attachment = _attachment_from_bytes(data, mime_type) - - # Return in image_url format - SDK will replace with AttachmentReference - return _image_url_payload(attachment) + resolved_attachment = _materialize_attachment(data, mime_type=mime_type) + if resolved_attachment is not None: + return resolved_attachment.multimodal_part_payload # Handle Part objects with file_data (file 
references) if hasattr(part, "file_data") and part.file_data: diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py index 386440d5..109350d0 100644 --- a/py/src/braintrust/integrations/anthropic/test_anthropic.py +++ b/py/src/braintrust/integrations/anthropic/test_anthropic.py @@ -124,9 +124,10 @@ def test_get_input_from_kwargs_converts_multimodal_base64_blocks_to_attachments( assert document_block["type"] == "document" assert document_block["source"] == {"type": "base64", "media_type": "application/pdf"} - assert isinstance(document_block["image_url"]["url"], Attachment) - assert document_block["image_url"]["url"].reference["content_type"] == "application/pdf" - assert document_block["image_url"]["url"].reference["filename"] == "document.pdf" + assert document_block["file"]["filename"] == "document.pdf" + assert isinstance(document_block["file"]["file_data"], Attachment) + assert document_block["file"]["file_data"].reference["content_type"] == "application/pdf" + assert document_block["file"]["file_data"].reference["filename"] == "document.pdf" serialized = str(processed_input) assert PNG_BASE64 not in serialized @@ -365,9 +366,10 @@ def test_anthropic_messages_create_with_document_attachment_input(memory_logger) assert document_block["type"] == "document" assert document_block["source"] == {"type": "base64", "media_type": "application/pdf"} - assert isinstance(document_block["image_url"]["url"], Attachment) - assert document_block["image_url"]["url"].reference["content_type"] == "application/pdf" - assert document_block["image_url"]["url"].reference["filename"] == "document.pdf" + assert document_block["file"]["filename"] == "document.pdf" + assert isinstance(document_block["file"]["file_data"], Attachment) + assert document_block["file"]["file_data"].reference["content_type"] == "application/pdf" + assert document_block["file"]["file_data"].reference["filename"] == "document.pdf" 
assert PDF_BASE64 not in str(span["input"]) diff --git a/py/src/braintrust/integrations/anthropic/tracing.py b/py/src/braintrust/integrations/anthropic/tracing.py index 74a14688..b9f9a096 100644 --- a/py/src/braintrust/integrations/anthropic/tracing.py +++ b/py/src/braintrust/integrations/anthropic/tracing.py @@ -4,11 +4,7 @@ from braintrust.bt_json import bt_safe_deep_copy from braintrust.integrations.anthropic._utils import Wrapper, extract_anthropic_usage -from braintrust.integrations.utils import ( - _attachment_filename_for_mime_type, - _attachment_from_base64_data, - _image_url_payload, -) +from braintrust.integrations.utils import _materialize_attachment from braintrust.logger import log_exc_info_to_span, start_span @@ -415,11 +411,6 @@ def _start_batch_results_span(args, kwargs): return start_span(name="anthropic.messages.batches.results", type="task", metadata=metadata, input=_input) -def _attachment_filename_for_media_type(media_type: str, block_type: str) -> str: - prefix = "image" if block_type == "image" else "document" - return _attachment_filename_for_mime_type(media_type, prefix=prefix) - - def _convert_base64_source_to_attachment(block_type, source): if not isinstance(source, dict): return None @@ -431,10 +422,10 @@ def _convert_base64_source_to_attachment(block_type, source): if not isinstance(media_type, str) or not isinstance(data, str): return None - return _attachment_from_base64_data( + return _materialize_attachment( data, - media_type, - filename=_attachment_filename_for_media_type(media_type, block_type), + mime_type=media_type, + prefix="image" if block_type == "image" else "document", ) @@ -447,11 +438,11 @@ def _process_input_attachments(value): source = value.get("source") if block_type in {"image", "document"} and isinstance(source, dict): - attachment = _convert_base64_source_to_attachment(block_type, source) - if attachment is not None: + resolved_attachment = _convert_base64_source_to_attachment(block_type, source) + if 
resolved_attachment is not None: processed = {k: _process_input_attachments(v) for k, v in value.items() if k != "source"} processed["source"] = {k: _process_input_attachments(v) for k, v in source.items() if k != "data"} - processed.update(_image_url_payload(attachment)) + processed.update(resolved_attachment.multimodal_part_payload) return processed return {k: _process_input_attachments(v) for k, v in value.items()} diff --git a/py/src/braintrust/integrations/google_genai/test_google_genai.py b/py/src/braintrust/integrations/google_genai/test_google_genai.py index f82bb1c6..c8821aed 100644 --- a/py/src/braintrust/integrations/google_genai/test_google_genai.py +++ b/py/src/braintrust/integrations/google_genai/test_google_genai.py @@ -143,10 +143,16 @@ def _assert_timing_metrics_are_valid(metrics, start=None, end=None): def _assert_attachment_part(part, *, content_type, filename): - assert "image_url" in part - assert "url" in part["image_url"] + if content_type.startswith("image/"): + assert "image_url" in part + assert "url" in part["image_url"] + attachment = part["image_url"]["url"] + else: + assert "file" in part + assert "file_data" in part["file"] + assert part["file"]["filename"] == filename + attachment = part["file"]["file_data"] - attachment = part["image_url"]["url"] assert isinstance(attachment, Attachment) assert attachment.reference["type"] == "braintrust_attachment" assert attachment.reference["content_type"] == content_type @@ -1030,6 +1036,43 @@ class TestModel(BaseModel): assert copied["context_file"] is attachment +def test_interaction_materialization_only_converts_multimodal_payloads(): + """Interaction helpers should only materialize attachments, not re-serialize values.""" + from datetime import datetime + from enum import Enum + + from braintrust.integrations.google_genai.tracing import _materialize_interaction_value + from pydantic import BaseModel + + class Mode(Enum): + CHAT = "chat" + + class InteractionPayload(BaseModel): + created_at: 
datetime + mode: Mode + media: dict[str, object] + + created_at = datetime(2024, 1, 2, 3, 4, 5) + materialized = _materialize_interaction_value( + InteractionPayload( + created_at=created_at, + mode=Mode.CHAT, + media={ + "type": "image", + "data": TINY_PNG_BASE64, + "mime_type": "image/png", + "caption": None, + }, + ) + ) + + assert materialized["created_at"] == created_at + assert materialized["mode"] is Mode.CHAT + assert materialized["media"]["caption"] is None + assert isinstance(materialized["media"]["data"], Attachment) + assert materialized["media"]["image_url"]["url"] is materialized["media"]["data"] + + GROUNDING_MODEL = "gemini-2.0-flash-001" diff --git a/py/src/braintrust/integrations/google_genai/tracing.py b/py/src/braintrust/integrations/google_genai/tracing.py index f56d31c3..ffa3ac4b 100644 --- a/py/src/braintrust/integrations/google_genai/tracing.py +++ b/py/src/braintrust/integrations/google_genai/tracing.py @@ -5,18 +5,11 @@ import logging import time from collections.abc import Awaitable, Callable, Iterable -from datetime import date, datetime -from enum import Enum from typing import TYPE_CHECKING, Any from braintrust.bt_json import bt_safe_deep_copy -from braintrust.integrations.utils import ( - _attachment_filename_for_mime_type, - _attachment_from_base64_data, - _attachment_from_bytes, - _image_url_payload, -) -from braintrust.logger import Attachment, start_span +from braintrust.integrations.utils import _materialize_attachment +from braintrust.logger import start_span from braintrust.span_types import SpanTypeAttribute from braintrust.util import clean_nones @@ -62,7 +55,7 @@ class _ActiveInteractionToolSpan: # --------------------------------------------------------------------------- -# Serialization helpers +# Interaction payload helpers # --------------------------------------------------------------------------- @@ -118,11 +111,9 @@ def _serialize_content_item(item: Any) -> Any: # Ensure data is bytes if isinstance(data, bytes): - 
attachment = _attachment_from_bytes(data, mime_type) - - # Return the attachment object in image_url format - # The SDK's _extract_attachments will replace it with its reference when logging - return _image_url_payload(attachment) + resolved_attachment = _materialize_attachment(data, mime_type=mime_type, prefix="file") + if resolved_attachment is not None: + return resolved_attachment.multimodal_part_payload # Try to use built-in serialization if available if hasattr(item, "model_dump"): @@ -155,36 +146,43 @@ def _serialize_tools(api_client: Any, input: Any | None) -> Any | None: return None -def _serialize_interaction_content_dict(value: dict[str, Any]) -> dict[str, Any]: - serialized = {key: _serialize_interaction_value(val) for key, val in value.items() if val is not None} +def _materialize_interaction_content_dict(value: dict[str, Any]) -> dict[str, Any]: + materialized = {key: _materialize_interaction_value(val) for key, val in value.items()} - content_type = serialized.get("type") - data = serialized.get("data") - mime_type = serialized.get("mime_type") - if content_type in _MEDIA_CONTENT_TYPES and isinstance(data, str) and isinstance(mime_type, str): - attachment = _attachment_from_base64_data(data, mime_type, label=content_type) - if attachment is not None: - serialized["data"] = attachment + content_type = materialized.get("type") + data = materialized.get("data") + mime_type = materialized.get("mime_type") + if content_type in _MEDIA_CONTENT_TYPES and isinstance(mime_type, str): + resolved_attachment = _materialize_attachment(data, mime_type=mime_type, label=content_type) + if resolved_attachment is not None: + materialized["data"] = resolved_attachment.attachment + materialized.update(resolved_attachment.multimodal_part_payload) - return serialized + return materialized -def _serialize_interaction_value(value: Any) -> Any: - if value is None or isinstance(value, (str, int, float, bool, Attachment)): +def _materialize_interaction_value(value: Any) -> Any: 
+ if value is None or isinstance(value, (str, int, float, bool)): return value - if isinstance(value, Enum): - return value.value - if isinstance(value, (date, datetime)): - return value.isoformat() if isinstance(value, (list, tuple)): - return [_serialize_interaction_value(item) for item in value] + return [_materialize_interaction_value(item) for item in value] if isinstance(value, dict): - return _serialize_interaction_content_dict(value) + return _materialize_interaction_content_dict(value) + if dataclasses.is_dataclass(value) and not isinstance(value, type): + return { + field.name: _materialize_interaction_value(getattr(value, field.name)) + for field in dataclasses.fields(value) + } if hasattr(value, "model_dump"): try: - return _serialize_interaction_value(value.model_dump(exclude_none=True)) + return _materialize_interaction_value(value.model_dump(exclude_none=True)) + except TypeError: + return _materialize_interaction_value(value.model_dump()) + if hasattr(value, "dict") and not isinstance(value, type): + try: + return _materialize_interaction_value(value.dict(exclude_none=True)) except TypeError: - return _serialize_interaction_value(value.model_dump()) + return _materialize_interaction_value(value.dict()) return value @@ -229,18 +227,18 @@ def _prepare_interaction_create_traced_call( { "model": kwargs.get("model"), "agent": kwargs.get("agent"), - "input": _serialize_interaction_value(kwargs.get("input")), + "input": _materialize_interaction_value(kwargs.get("input")), "background": kwargs.get("background"), - "generation_config": _serialize_interaction_value(kwargs.get("generation_config")), + "generation_config": _materialize_interaction_value(kwargs.get("generation_config")), "previous_interaction_id": kwargs.get("previous_interaction_id"), - "response_format": _serialize_interaction_value(kwargs.get("response_format")), + "response_format": _materialize_interaction_value(kwargs.get("response_format")), "response_mime_type": 
kwargs.get("response_mime_type"), - "response_modalities": _serialize_interaction_value(kwargs.get("response_modalities")), + "response_modalities": _materialize_interaction_value(kwargs.get("response_modalities")), "store": kwargs.get("store"), "stream": kwargs.get("stream"), "system_instruction": kwargs.get("system_instruction"), - "tools": _serialize_interaction_value(kwargs.get("tools")), - "agent_config": _serialize_interaction_value(kwargs.get("agent_config")), + "tools": _materialize_interaction_value(kwargs.get("tools")), + "agent_config": _materialize_interaction_value(kwargs.get("agent_config")), } ) metadata = clean_nones( @@ -386,12 +384,13 @@ def _extract_generate_images_output(response: Any) -> dict[str, Any]: # Convert image bytes to an Attachment so the SDK uploads them to # object storage and the Braintrust UI can render the image. if isinstance(image_bytes, bytes) and mime_type: - attachment = _attachment_from_bytes( + resolved_attachment = _materialize_attachment( image_bytes, - mime_type, - filename=_attachment_filename_for_mime_type(mime_type, prefix=f"generated_image_{i}"), + mime_type=mime_type, + prefix=f"generated_image_{i}", ) - image_entry.update(_image_url_payload(attachment)) + if resolved_attachment is not None: + image_entry.update(resolved_attachment.multimodal_part_payload) serialized_images.append(image_entry) @@ -454,7 +453,7 @@ def _extract_interaction_text(outputs: list[dict[str, Any]]) -> str | None: def _serialize_interaction_outputs(response: "Interaction") -> list[dict[str, Any]]: - outputs = _serialize_interaction_value(getattr(response, "outputs", None)) + outputs = _materialize_interaction_value(getattr(response, "outputs", None)) return outputs if isinstance(outputs, list) else ([] if outputs is None else [outputs]) @@ -474,7 +473,7 @@ def _extract_interaction_output( def _extract_interaction_metadata(response: "Interaction") -> dict[str, Any]: usage = getattr(response, "usage", None) - usage_serialized = 
_serialize_interaction_value(usage) + usage_serialized = _materialize_interaction_value(usage) usage_by_modality = None if isinstance(usage_serialized, dict): usage_by_modality = clean_nones( @@ -492,7 +491,7 @@ def _extract_interaction_metadata(response: "Interaction") -> dict[str, Any]: "previous_interaction_id": getattr(response, "previous_interaction_id", None), "role": getattr(response, "role", None), "response_mime_type": getattr(response, "response_mime_type", None), - "response_modalities": _serialize_interaction_value(getattr(response, "response_modalities", None)), + "response_modalities": _materialize_interaction_value(getattr(response, "response_modalities", None)), "usage_by_modality": usage_by_modality, } ) @@ -576,7 +575,7 @@ def _interaction_process_result( def _generic_process_result(result: Any, start: float) -> tuple[Any, dict[str, Any]]: - return _serialize_interaction_value(result), _extract_generic_timing_metrics(start) + return _materialize_interaction_value(result), _extract_generic_timing_metrics(start) # --------------------------------------------------------------------------- @@ -722,10 +721,10 @@ def _reconstruct_interaction_outputs_from_events(events: list[Any]) -> list[dict continue if event_type == "content.start": - outputs_by_index[index] = _serialize_interaction_value(getattr(event, "content", None)) or {} + outputs_by_index[index] = _materialize_interaction_value(getattr(event, "content", None)) or {} elif event_type == "content.delta": item = outputs_by_index.setdefault(index, {}) - delta = _serialize_interaction_value(getattr(event, "delta", None)) or {} + delta = _materialize_interaction_value(getattr(event, "delta", None)) or {} if isinstance(delta, dict): outputs_by_index[index] = _merge_interaction_content_delta(item, delta) @@ -829,7 +828,7 @@ def _activate_interaction_tool_span( def _serialize_interaction_items(value: Any) -> list[dict[str, Any]]: - serialized = _serialize_interaction_value(value) + serialized = 
_materialize_interaction_value(value) if serialized is None: return [] items = serialized if isinstance(serialized, list) else [serialized] @@ -965,8 +964,8 @@ def _aggregate_interaction_events( None, ) if error_event is not None: - metadata["stream_error"] = _serialize_interaction_value(error_event.error) - return {"events": _serialize_interaction_value(events)}, clean_nones(metrics), metadata + metadata["stream_error"] = _materialize_interaction_value(error_event.error) + return {"events": _materialize_interaction_value(events)}, clean_nones(metrics), metadata final_outputs_list = _serialize_interaction_outputs(final_interaction) diff --git a/py/src/braintrust/integrations/mistral/test_mistral.py b/py/src/braintrust/integrations/mistral/test_mistral.py index ab1cfdab..3ea76973 100644 --- a/py/src/braintrust/integrations/mistral/test_mistral.py +++ b/py/src/braintrust/integrations/mistral/test_mistral.py @@ -1,3 +1,4 @@ +import base64 import importlib import inspect import os @@ -6,12 +7,13 @@ from pathlib import Path import pytest -from braintrust import logger +from braintrust import Attachment, logger from braintrust.integrations.mistral import MistralIntegration, wrap_mistral from braintrust.integrations.mistral.tracing import ( _aggregate_completion_events, _chat_complete_async_wrapper, _chat_complete_wrapper, + sanitize_mistral_logged_value, ) from braintrust.test_helpers import init_test_logger from braintrust.wrappers.test_utils import assert_metrics_are_valid, verify_autoinstrument_script @@ -604,6 +606,38 @@ async def fail(*args, **kwargs): assert "async boom" in span["error"] +def test_sanitize_mistral_logged_value_converts_image_url_data_uri_to_attachment(): + sanitized = sanitize_mistral_logged_value( + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,aGVsbG8="}, + } + ) + + assert isinstance(sanitized["image_url"]["url"], Attachment) + assert sanitized["image_url"]["url"].reference["content_type"] == "image/png" + + +def 
test_sanitize_mistral_logged_value_converts_large_base64_input_audio_to_attachment(): + sanitized = sanitize_mistral_logged_value( + { + "type": "input_audio", + "input_audio": base64.b64encode(b"hello" * 16).decode("ascii"), + } + ) + + assert isinstance(sanitized["input_audio"], Attachment) + assert sanitized["input_audio"].reference["filename"] == "input_audio.bin" + + +def test_sanitize_mistral_logged_value_leaves_non_base64_input_audio_unchanged(): + original = {"type": "input_audio", "input_audio": "not base64"} + + sanitized = sanitize_mistral_logged_value(original) + + assert sanitized == original + + def test_aggregate_completion_events_merges_tool_calls_and_content(): events = [ models.CompletionEvent( diff --git a/py/src/braintrust/integrations/mistral/tracing.py b/py/src/braintrust/integrations/mistral/tracing.py index 789f6439..c9584d9a 100644 --- a/py/src/braintrust/integrations/mistral/tracing.py +++ b/py/src/braintrust/integrations/mistral/tracing.py @@ -1,7 +1,5 @@ """Mistral-specific tracing helpers.""" -import base64 -import binascii import logging import re import time @@ -11,13 +9,13 @@ from braintrust.bt_json import bt_safe_deep_copy from braintrust.integrations.utils import ( _camel_to_snake, - _convert_data_url_to_attachment, _is_supported_metric_value, _log_and_end_span, _log_error_and_end_span, + _materialize_attachment, _merge_timing_and_usage_metrics, ) -from braintrust.logger import Attachment, start_span +from braintrust.logger import start_span from braintrust.span_types import SpanTypeAttribute @@ -83,17 +81,30 @@ def _is_unset(value: Any) -> bool: return value.__class__.__name__ == "Unset" -def _convert_input_audio_to_attachment(value: str) -> Attachment | str: +def _normalize_base64_payload(value: str) -> str | None: normalized = value.strip().replace("\n", "") - if len(normalized) < 64 or len(normalized) % 4 != 0 or not _BASE64_RE.fullmatch(normalized): - return value + if len(normalized) >= 64 and len(normalized) % 4 == 0 and 
_BASE64_RE.fullmatch(normalized) is not None: + return normalized + return None - try: - binary_data = base64.b64decode(normalized, validate=True) - except (binascii.Error, ValueError): + +def _convert_input_audio_to_attachment(value: str) -> Any: + normalized = _normalize_base64_payload(value) + if normalized is None: return value - return Attachment(data=binary_data, filename="input_audio.bin", content_type="application/octet-stream") + return ( + resolved.attachment + if ( + resolved := _materialize_attachment( + normalized, + mime_type="application/octet-stream", + filename="input_audio.bin", + ) + ) + is not None + else value + ) def _normalize_special_payloads(value: Any) -> Any: @@ -106,14 +117,18 @@ def _normalize_special_payloads(value: Any) -> Any: if isinstance(image_url, str): return { **value, - "image_url": _convert_data_url_to_attachment(image_url), + "image_url": resolved.attachment + if (resolved := _materialize_attachment(image_url)) is not None + else image_url, } if isinstance(image_url, dict) and isinstance(image_url.get("url"), str): return { **value, "image_url": { **image_url, - "url": _convert_data_url_to_attachment(image_url["url"]), + "url": resolved.attachment + if (resolved := _materialize_attachment(image_url["url"])) is not None + else image_url["url"], }, } diff --git a/py/src/braintrust/integrations/openai/test_openai.py b/py/src/braintrust/integrations/openai/test_openai.py index c2609775..b41931ce 100644 --- a/py/src/braintrust/integrations/openai/test_openai.py +++ b/py/src/braintrust/integrations/openai/test_openai.py @@ -10,7 +10,11 @@ import pytest from braintrust import Attachment, logger, wrap_openai from braintrust.integrations.openai import OpenAIIntegration -from braintrust.integrations.openai.tracing import RAW_RESPONSE_HEADER, ChatCompletionWrapper +from braintrust.integrations.openai.tracing import ( + RAW_RESPONSE_HEADER, + ChatCompletionWrapper, + _materialize_logged_file_input, +) from braintrust.test_helpers import 
assert_dict_matches, init_test_logger from braintrust.wrappers.test_utils import assert_metrics_are_valid, verify_autoinstrument_script from openai import AsyncOpenAI @@ -1772,6 +1776,16 @@ def test_openai_images_generate(memory_logger): assert span["metrics"]["duration"] >= 0 +def test_materialize_logged_file_input_preserves_unrecognized_values(): + file_id = "file-123" + values = [file_id, NOT_GIVEN] + + materialized = _materialize_logged_file_input(values) + + assert materialized[0] == file_id + assert materialized[1] is NOT_GIVEN + + @pytest.mark.vcr def test_openai_images_edit(memory_logger): assert not memory_logger.pop() diff --git a/py/src/braintrust/integrations/openai/tracing.py b/py/src/braintrust/integrations/openai/tracing.py index 016678c2..00e5dd1f 100644 --- a/py/src/braintrust/integrations/openai/tracing.py +++ b/py/src/braintrust/integrations/openai/tracing.py @@ -7,13 +7,10 @@ from typing import Any from braintrust.integrations.utils import ( - _attachment_filename_for_mime_type, - _attachment_from_base64_data, - _attachment_from_file_input, - _convert_data_url_to_attachment, - _image_url_payload, + _materialize_attachment, _parse_openai_usage_metrics, _prettify_response_params, + _ResolvedAttachment, _timing_metrics, _try_to_dict, ) @@ -104,47 +101,54 @@ def _raw_response_requested(kwargs: dict[str, Any]) -> bool: return False +def _materialize_logged_file_input(value: Any) -> Any: + if isinstance(value, list): + return [_materialize_logged_file_input(item) for item in value] + + resolved = _materialize_attachment(value) + return resolved.attachment if resolved is not None else value + + def _process_attachments_in_input(input_data: Any) -> Any: """Process input to convert data URL images and base64 documents to Attachment objects.""" if isinstance(input_data, list): return [_process_attachments_in_input(item) for item in input_data] if isinstance(input_data, dict): - # Check for OpenAI's image_url format with data URLs if ( 
input_data.get("type") == "image_url" and isinstance(input_data.get("image_url"), dict) and isinstance(input_data["image_url"].get("url"), str) ): - processed_url = _convert_data_url_to_attachment(input_data["image_url"]["url"]) + url = input_data["image_url"]["url"] + resolved = _materialize_attachment(url) return { **input_data, "image_url": { **input_data["image_url"], - "url": processed_url, + "url": resolved.attachment if resolved is not None else url, }, } - # Check for OpenAI's file format with data URL (e.g., PDFs) if ( input_data.get("type") == "file" and isinstance(input_data.get("file"), dict) and isinstance(input_data["file"].get("file_data"), str) ): + file_data = input_data["file"]["file_data"] file_filename = input_data["file"].get("filename") - processed_file_data = _convert_data_url_to_attachment( - input_data["file"]["file_data"], + resolved = _materialize_attachment( + file_data, filename=file_filename if isinstance(file_filename, str) else None, ) return { **input_data, "file": { **input_data["file"], - "file_data": processed_file_data, + "file_data": resolved.attachment if resolved is not None else file_data, }, } - # Recursively process nested objects return {key: _process_attachments_in_input(value) for key, value in input_data.items()} return input_data @@ -883,20 +887,20 @@ def _image_attachment_from_base64( *, output_format: Any, index: int, -) -> tuple[Any | None, int | None, str | None]: +) -> tuple[_ResolvedAttachment | None, int | None]: if not isinstance(data, str): - return None, None, None + return None, None extension = output_format if isinstance(output_format, str) and output_format else "png" mime_type = extension if "/" in extension else f"image/{extension}" - attachment = _attachment_from_base64_data( + resolved_attachment = _materialize_attachment( data, - mime_type, - filename=_attachment_filename_for_mime_type(mime_type, prefix=f"generated_image_{index}"), + mime_type=mime_type, + prefix=f"generated_image_{index}", ) - if 
attachment is None: - return None, None, None - return attachment, len(attachment.data), mime_type + if resolved_attachment is None: + return None, None + return resolved_attachment, len(resolved_attachment.attachment.data) def _extract_images_output(response: dict[str, Any]) -> dict[str, Any]: @@ -915,18 +919,18 @@ def _extract_images_output(response: dict[str, Any]) -> dict[str, Any]: ) if isinstance(image_dict.get("url"), str): - image_entry.update(_image_url_payload(image_dict["url"])) + image_entry["image_url"] = {"url": image_dict["url"]} b64_json = image_dict.get("b64_json") - attachment, image_size_bytes, mime_type = _image_attachment_from_base64( + resolved_attachment, image_size_bytes = _image_attachment_from_base64( b64_json, output_format=output_format, index=index, ) - if attachment is not None: - image_entry.update(_image_url_payload(attachment)) + if resolved_attachment is not None: + image_entry.update(resolved_attachment.multimodal_part_payload) image_entry["image_size_bytes"] = image_size_bytes - image_entry["mime_type"] = mime_type + image_entry["mime_type"] = resolved_attachment.mime_type elif isinstance(b64_json, str): image_entry["b64_json_present"] = True @@ -1067,8 +1071,8 @@ def _parse_params(cls, params: dict[str, Any]) -> dict[str, Any]: input_data = clean_nones( { "prompt": prompt, - "image": _attachment_from_file_input(image), - "mask": _attachment_from_file_input(mask), + "image": _materialize_logged_file_input(image), + "mask": _materialize_logged_file_input(mask), } ) diff --git a/py/src/braintrust/integrations/pydantic_ai/tracing.py b/py/src/braintrust/integrations/pydantic_ai/tracing.py index ddf9706d..547cd857 100644 --- a/py/src/braintrust/integrations/pydantic_ai/tracing.py +++ b/py/src/braintrust/integrations/pydantic_ai/tracing.py @@ -7,7 +7,7 @@ from typing import Any from braintrust.bt_json import bt_safe_deep_copy -from braintrust.integrations.utils import _attachment_from_bytes +from braintrust.integrations.utils import 
_materialize_attachment from braintrust.logger import start_span from braintrust.span_types import SpanTypeAttribute from wrapt import wrap_function_wrapper @@ -945,8 +945,13 @@ def _serialize_content_part(part: Any) -> Any: data = part.data media_type = part.media_type - attachment = _attachment_from_bytes(data, media_type) - return {"type": "binary", "attachment": attachment, "media_type": media_type} + resolved_attachment = _materialize_attachment(data, mime_type=media_type) + if resolved_attachment is not None: + return { + "type": "binary", + "attachment": resolved_attachment.attachment, + "media_type": resolved_attachment.mime_type, + } if hasattr(part, "content"): content = part.content diff --git a/py/src/braintrust/integrations/test_utils.py b/py/src/braintrust/integrations/test_utils.py index 5d76ea2d..ac61d7bd 100644 --- a/py/src/braintrust/integrations/test_utils.py +++ b/py/src/braintrust/integrations/test_utils.py @@ -4,18 +4,15 @@ from braintrust import Attachment from braintrust.integrations.utils import ( _attachment_filename_for_mime_type, - _attachment_from_base64_data, - _attachment_from_bytes, - _attachment_from_file_input, _camel_to_snake, - _convert_data_url_to_attachment, - _image_url_payload, _is_supported_metric_value, _log_and_end_span, _log_error_and_end_span, + _materialize_attachment, _merge_timing_and_usage_metrics, _parse_openai_usage_metrics, _prettify_response_params, + _ResolvedAttachment, _serialize_response_format, _timing_metrics, _try_to_dict, @@ -154,56 +151,91 @@ def test_prettify_response_params_filters_not_given_without_mutating_input(): assert "optional" in original -def test_attachment_filename_for_mime_type_handles_suffixes_and_prefixes(): - assert _attachment_filename_for_mime_type("image/png", prefix="image") == "image.png" - assert _attachment_filename_for_mime_type("application/pdf", prefix="document") == "document.pdf" - assert _attachment_filename_for_mime_type("image/svg+xml", prefix="file") == "file.svg" +def 
test_attachment_filename_for_mime_type_prefers_known_extensions(): + assert _attachment_filename_for_mime_type("image/svg+xml", prefix="image") == "image.svg" + assert ( + _attachment_filename_for_mime_type( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ) + == "file.xlsx" + ) + + +def test_materialize_attachment_from_bytes_uses_default_filename(): + resolved = _materialize_attachment(b"hello", mime_type="image/png") + + assert isinstance(resolved, _ResolvedAttachment) + assert isinstance(resolved.attachment, Attachment) + assert resolved.mime_type == "image/png" + assert resolved.filename == "image.png" + assert resolved.attachment.reference["content_type"] == "image/png" + assert resolved.attachment.reference["filename"] == "image.png" + + +def test_materialize_attachment_from_bytes_accepts_custom_prefix(): + resolved = _materialize_attachment(b"hello", mime_type="application/pdf", prefix="document") + assert isinstance(resolved, _ResolvedAttachment) + assert resolved.mime_type == "application/pdf" + assert resolved.filename == "document.pdf" -def test_attachment_from_bytes_uses_default_filename(): - attachment = _attachment_from_bytes(b"hello", "image/png") - assert isinstance(attachment, Attachment) - assert attachment.reference["content_type"] == "image/png" - assert attachment.reference["filename"] == "file.png" +def test_materialize_attachment_prefix_handles_mime_suffixes(): + resolved = _materialize_attachment(b"", mime_type="image/svg+xml", prefix="image") + assert isinstance(resolved, _ResolvedAttachment) + assert resolved.mime_type == "image/svg+xml" + assert resolved.filename == "image.svg" -def test_attachment_from_base64_data_accepts_data_urls_and_custom_filenames(): - attachment = _attachment_from_base64_data( + +def test_materialize_attachment_from_base64_accepts_data_urls_and_custom_filenames(): + resolved = _materialize_attachment( "data:image/png;base64,aGVsbG8=", - "image/png", + mime_type="image/png", 
filename="generated_image_0.png", ) - assert isinstance(attachment, Attachment) - assert attachment.reference["content_type"] == "image/png" - assert attachment.reference["filename"] == "generated_image_0.png" + assert isinstance(resolved, _ResolvedAttachment) + assert isinstance(resolved.attachment, Attachment) + assert resolved.attachment.reference["content_type"] == "image/png" + assert resolved.attachment.reference["filename"] == "generated_image_0.png" -def test_attachment_from_base64_data_returns_none_for_invalid_payloads(): - assert _attachment_from_base64_data("aGVsbG8=!", "image/png") is None +def test_materialize_attachment_returns_none_for_invalid_base64_payloads(): + assert _materialize_attachment("aGVsbG8=!", mime_type="image/png") is None -def test_convert_data_url_to_attachment_converts_valid_base64(): +def test_materialize_attachment_converts_valid_data_url(): data_url = "data:image/png;base64,aGVsbG8=" - attachment = _convert_data_url_to_attachment(data_url) + resolved = _materialize_attachment(data_url, label="image") + + assert isinstance(resolved, _ResolvedAttachment) + assert isinstance(resolved.attachment, Attachment) + assert resolved.attachment.reference["content_type"] == "image/png" + assert resolved.attachment.reference["filename"] == "image.png" + - assert isinstance(attachment, Attachment) - assert attachment.reference["content_type"] == "image/png" - assert attachment.reference["filename"] == "image.png" +def test_resolved_attachment_multimodal_part_payload_uses_image_url_for_images(): + resolved = _materialize_attachment(b"hello", mime_type="image/png") + assert resolved is not None + assert resolved.multimodal_part_payload == {"image_url": {"url": resolved.attachment}} -def test_image_url_payload_wraps_attachment_and_string_urls(): - attachment = _attachment_from_bytes(b"hello", "image/png") - assert _image_url_payload(attachment) == {"image_url": {"url": attachment}} - assert _image_url_payload("https://example.com/image.png") == { 
- "image_url": {"url": "https://example.com/image.png"} +def test_resolved_attachment_multimodal_part_payload_uses_file_parts_for_non_images(): + resolved = _materialize_attachment(b"hello", mime_type="application/pdf", filename="document.pdf") + + assert resolved is not None + assert resolved.multimodal_part_payload == { + "file": { + "file_data": resolved.attachment, + "filename": "document.pdf", + } } -def test_attachment_from_file_input_handles_common_input_shapes(tmp_path): +def test_materialize_attachment_handles_common_input_shapes(tmp_path): file_path = tmp_path / "example.png" file_path.write_bytes(b"abc") @@ -222,52 +254,51 @@ def tell(self): def seek(self, position): self._position = position - path_attachment = _attachment_from_file_input(file_path) - bytes_attachment = _attachment_from_file_input(b"abc", filename="example.png") - tuple_attachment = _attachment_from_file_input((str(file_path), b"abc", "image/png")) - file_attachment = _attachment_from_file_input(FileLike()) + path_attachment = _materialize_attachment(file_path) + bytes_attachment = _materialize_attachment(b"abc", filename="example.png") + tuple_attachment = _materialize_attachment((str(file_path), b"abc", "image/png")) + file_attachment = _materialize_attachment(FileLike()) - for attachment in (path_attachment, bytes_attachment, tuple_attachment, file_attachment): - assert isinstance(attachment, Attachment) - assert attachment.reference["filename"] == "example.png" - assert attachment.reference["content_type"] == "image/png" + for resolved in (path_attachment, bytes_attachment, tuple_attachment, file_attachment): + assert isinstance(resolved, _ResolvedAttachment) + assert resolved.attachment.reference["filename"] == "example.png" + assert resolved.attachment.reference["content_type"] == "image/png" -def test_attachment_from_file_input_preserves_file_position(tmp_path): +def test_materialize_attachment_preserves_file_position(tmp_path): file_path = tmp_path / "example.png" 
file_path.write_bytes(b"abc") with file_path.open("rb") as file_obj: assert file_obj.tell() == 0 - attachment = _attachment_from_file_input(file_obj) - assert isinstance(attachment, Attachment) + resolved = _materialize_attachment(file_obj) + assert isinstance(resolved, _ResolvedAttachment) assert file_obj.tell() == 0 -def test_convert_data_url_to_attachment_preserves_invalid_base64(): - data_url = "data:image/png;base64,aGVsbG8=!" - - converted = _convert_data_url_to_attachment(data_url) +def test_materialize_attachment_preserves_invalid_base64_strings_without_mime_type(): + assert _materialize_attachment("data:image/png;base64,aGVsbG8=!") is None - assert converted == data_url +def test_materialize_attachment_uses_file_prefix_for_non_image_mime_types(): + resolved = _materialize_attachment("data:application/pdf;base64,aGVsbG8=") -def test_convert_data_url_to_attachment_uses_file_prefix_for_non_image_mime_types(): - data_url = "data:application/pdf;base64,aGVsbG8=" + assert isinstance(resolved, _ResolvedAttachment) + assert resolved.attachment.reference["content_type"] == "application/pdf" + assert resolved.attachment.reference["filename"] == "file.pdf" - attachment = _convert_data_url_to_attachment(data_url) - assert isinstance(attachment, Attachment) - assert attachment.reference["content_type"] == "application/pdf" - assert attachment.reference["filename"] == "file.pdf" +def test_materialize_attachment_preserves_existing_attachment_filename_over_prefix(): + attachment = Attachment(data=b"hello", filename="existing.pdf", content_type="application/pdf") + resolved = _materialize_attachment(attachment, prefix="document") -def test_convert_data_url_to_attachment_preserves_non_data_urls(): - value = "https://example.com/image.png" + assert isinstance(resolved, _ResolvedAttachment) + assert resolved.attachment.reference["filename"] == "existing.pdf" - converted = _convert_data_url_to_attachment(value) - assert converted == value +def 
test_materialize_attachment_returns_none_for_non_data_url_strings(): + assert _materialize_attachment("https://example.com/image.png") is None def test_serialize_response_format_with_pydantic_basemodel_subclass(): diff --git a/py/src/braintrust/integrations/utils.py b/py/src/braintrust/integrations/utils.py index ca9d1741..fae511d9 100644 --- a/py/src/braintrust/integrations/utils.py +++ b/py/src/braintrust/integrations/utils.py @@ -17,6 +17,7 @@ import time import warnings from collections.abc import Callable, Mapping +from dataclasses import dataclass from numbers import Real from typing import Any @@ -26,6 +27,13 @@ _DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$") +# Keep these overrides narrow and deterministic across platforms. Python's +# mimetypes registry varies by OS (notably on Windows), which can otherwise +# produce verbose vendor-subtype suffixes instead of common file extensions. +_KNOWN_ATTACHMENT_EXTENSIONS = { + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx", +} + def _try_to_dict(obj: Any) -> dict[str, Any] | Any: """Best-effort conversion of an SDK response object to a plain dict. 
@@ -106,112 +114,201 @@ def _attachment_filename_for_mime_type(mime_type: str, *, prefix: str = "file") - ``image/png`` with prefix ``image`` -> ``image.png`` - ``application/pdf`` with prefix ``document`` -> ``document.pdf`` - ``image/svg+xml`` with prefix ``file`` -> ``file.svg`` + - ``application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`` + with prefix ``file`` -> ``file.xlsx`` """ - extension = mime_type.split("/", 1)[1] if "/" in mime_type else "bin" - extension = extension.split("+", 1)[0] + extension = _KNOWN_ATTACHMENT_EXTENSIONS.get(mime_type) + if extension is None: + guessed_extension = mimetypes.guess_extension(mime_type) + if guessed_extension: + extension = guessed_extension.lstrip(".") + else: + extension = mime_type.split("/", 1)[1] if "/" in mime_type else "bin" + extension = extension.split("+", 1)[0] return f"{prefix}.{extension}" -def _attachment_from_bytes( +@dataclass(frozen=True) +class _ResolvedAttachment: + attachment: Attachment + + @property + def mime_type(self) -> str: + return self.attachment.reference.get("content_type") or "application/octet-stream" + + @property + def filename(self) -> str: + return self.attachment.reference.get("filename") or "file" + + @property + def is_image(self) -> bool: + return self.mime_type.startswith("image/") + + @property + def multimodal_part_payload(self) -> dict[str, Any]: + if self.is_image: + return {"image_url": {"url": self.attachment}} + return {"file": {"file_data": self.attachment, "filename": self.filename}} + + +def _label_for_mime_type(mime_type: str, label: str | None) -> str: + if label is not None: + return label + return "image" if mime_type.startswith("image/") else "file" + + +def _default_attachment_filename( + mime_type: str, + *, + filename: str | None = None, + label: str | None = None, + prefix: str | None = None, +) -> str: + return filename or _attachment_filename_for_mime_type( + mime_type, + prefix=prefix or _label_for_mime_type(mime_type, label), + ) + + +def 
_resolved_attachment_from_bytes( data: bytes | bytearray, mime_type: str, *, filename: str | None = None, - label: str = "file", -) -> Attachment: - """Build an :class:`Attachment` from provider-owned binary data.""" - resolved_filename = filename or _attachment_filename_for_mime_type(mime_type, prefix=label) - return Attachment( - data=data if isinstance(data, bytes) else bytes(data), filename=resolved_filename, content_type=mime_type + label: str | None = None, + prefix: str | None = None, +) -> _ResolvedAttachment: + resolved_filename = _default_attachment_filename(mime_type, filename=filename, label=label, prefix=prefix) + attachment = Attachment( + data=data if isinstance(data, bytes) else bytes(data), + filename=resolved_filename, + content_type=mime_type, ) + return _ResolvedAttachment(attachment=attachment) -def _attachment_from_base64_data( +def _resolved_attachment_from_base64( data: str, mime_type: str, *, filename: str | None = None, - label: str = "file", -) -> Attachment | None: - """Decode base64 or data-URL content into an :class:`Attachment`.""" - raw_data = data - if raw_data.startswith("data:"): - _, _, encoded = raw_data.partition(",") - raw_data = encoded + label: str | None = None, + prefix: str | None = None, +) -> _ResolvedAttachment | None: + raw_data = data.partition(",")[2] if data.startswith("data:") else data try: decoded = base64.b64decode(raw_data, validate=True) except (binascii.Error, ValueError): return None - return _attachment_from_bytes(decoded, mime_type, filename=filename, label=label) - - -def _image_url_payload(url: Attachment | str) -> dict[str, Any]: - """Return the common Braintrust multimodal image/file payload shape.""" - return {"image_url": {"url": url}} + return _resolved_attachment_from_bytes(decoded, mime_type, filename=filename, label=label, prefix=prefix) -def _attachment_from_file_input( +def _materialize_attachment( value: Any, *, + mime_type: str | None = None, filename: str | None = None, - content_type: str 
| None = None, - label: str = "file", -) -> Any: - """Convert common provider file-input shapes into :class:`Attachment` objects. - - This is for traced input logging only; callers should pass the original - file objects/paths to the provider API unchanged. + label: str | None = None, + prefix: str | None = None, +) -> _ResolvedAttachment | None: + """Resolve common attachment inputs into a concrete attachment object. + + Supports existing :class:`Attachment` objects, bytes-like data, raw base64 + strings, data URLs, filesystem paths, file-like objects, and common + ``(filename, value, content_type)`` tuple inputs. """ - if isinstance(value, list): - return [ - _attachment_from_file_input(item, filename=filename, content_type=content_type, label=label) - for item in value - ] + if value is None: + return None - if isinstance(value, Attachment) or value is None: - return value + if isinstance(value, Attachment): + ref_ct = value.reference.get("content_type") + ref_fn = value.reference.get("filename") + resolved_mime_type = mime_type or ref_ct or "application/octet-stream" + resolved_filename = ( + filename + or ref_fn + or _default_attachment_filename( + resolved_mime_type, + label=label, + prefix=prefix, + ) + ) + if ref_ct != resolved_mime_type or ref_fn != resolved_filename: + attachment = Attachment( + data=value.data, + filename=resolved_filename, + content_type=resolved_mime_type, + ) + return _ResolvedAttachment(attachment=attachment) + return _ResolvedAttachment(attachment=value) if isinstance(value, tuple): tuple_filename = value[0] if value and isinstance(value[0], (str, os.PathLike)) else None tuple_value = value[1] if len(value) > 1 else None tuple_content_type = value[2] if len(value) > 2 and isinstance(value[2], str) else None - return _attachment_from_file_input( + return _materialize_attachment( tuple_value, + mime_type=mime_type or tuple_content_type, filename=filename or (os.path.basename(os.fspath(tuple_filename)) if tuple_filename is not None 
else None), - content_type=content_type or tuple_content_type, label=label, + prefix=prefix, + ) + + if isinstance(value, (bytes, bytearray)): + resolved_mime_type = ( + mime_type + or (mimetypes.guess_type(filename)[0] if filename is not None else None) + or "application/octet-stream" + ) + return _resolved_attachment_from_bytes( + value, resolved_mime_type, filename=filename, label=label, prefix=prefix ) if isinstance(value, (str, os.PathLike)): - path = os.fspath(value) + path_or_data = os.fspath(value) + data_url_match = _DATA_URL_RE.match(path_or_data) if isinstance(value, str) else None + if data_url_match: + data_url_mime_type, _ = data_url_match.groups() + return _resolved_attachment_from_base64( + path_or_data, + mime_type or data_url_mime_type, + filename=filename, + label=label, + prefix=prefix, + ) + try: - with open(path, "rb") as file_obj: + with open(path_or_data, "rb") as file_obj: data = file_obj.read() except OSError: + if isinstance(value, str) and mime_type is not None: + return _resolved_attachment_from_base64( + value, + mime_type, + filename=filename, + label=label, + prefix=prefix, + ) return None - resolved_filename = filename or os.path.basename(path) - resolved_content_type = ( - content_type or mimetypes.guess_type(resolved_filename)[0] or "application/octet-stream" - ) - return _attachment_from_bytes(data, resolved_content_type, filename=resolved_filename, label=label) - if isinstance(value, (bytes, bytearray)): - resolved_filename = filename - resolved_content_type = ( - content_type - or (mimetypes.guess_type(resolved_filename)[0] if resolved_filename is not None else None) - or "application/octet-stream" + resolved_filename = filename or os.path.basename(path_or_data) + resolved_mime_type = mime_type or mimetypes.guess_type(resolved_filename)[0] or "application/octet-stream" + return _resolved_attachment_from_bytes( + data, + resolved_mime_type, + filename=resolved_filename, + label=label, + prefix=prefix, ) - return 
_attachment_from_bytes(value, resolved_content_type, filename=resolved_filename, label=label) read = getattr(value, "read", None) if callable(read): file_name_attr = getattr(value, "name", None) resolved_filename = filename or (os.path.basename(file_name_attr) if isinstance(file_name_attr, str) else None) - resolved_content_type = ( - content_type + resolved_mime_type = ( + mime_type or (mimetypes.guess_type(resolved_filename)[0] if resolved_filename is not None else None) or "application/octet-stream" ) @@ -220,7 +317,7 @@ def _attachment_from_file_input( try: position = value.tell() except Exception: - position = None + pass try: data = value.read() @@ -234,32 +331,18 @@ def _attachment_from_file_input( if isinstance(data, str): data = data.encode() if isinstance(data, (bytes, bytearray)): - return _attachment_from_bytes(data, resolved_content_type, filename=resolved_filename, label=label) + return _resolved_attachment_from_bytes( + data, + resolved_mime_type, + filename=resolved_filename, + label=label, + prefix=prefix, + ) return None return None -def _convert_data_url_to_attachment(data_url: str, filename: str | None = None) -> Attachment | str: - """Convert a ``data:;base64,…`` URL into an :class:`Attachment`. - - Returns the original *data_url* string unchanged when it does not match - the expected format or cannot be decoded. - """ - match = _DATA_URL_RE.match(data_url) - if not match: - return data_url - - mime_type, _base64_data = match.groups() - attachment = _attachment_from_base64_data( - data_url, - mime_type, - filename=filename, - label="image" if mime_type.startswith("image/") else "file", - ) - return attachment or data_url - - def _is_not_given(value: object) -> bool: """Return ``True`` when *value* is a provider ``NOT_GIVEN`` sentinel.