From f268f244fd6cae6e241920cf8c78adeba36aebfa Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Sat, 14 Feb 2026 20:48:46 -0800 Subject: [PATCH 1/3] Predicate agent --- CHANGELOG.md | 77 +++++ predicate/__init__.py | 9 + predicate/agents/__init__.py | 24 ++ predicate/agents/browser_agent.py | 340 +++++++++++++++++++++ tests/unit/test_predicate_browser_agent.py | 164 ++++++++++ 5 files changed, 614 insertions(+) create mode 100644 predicate/agents/__init__.py create mode 100644 predicate/agents/browser_agent.py create mode 100644 tests/unit/test_predicate_browser_agent.py diff --git a/CHANGELOG.md b/CHANGELOG.md index bf45ddd..0cc54cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,83 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### 2026-02-15 + +#### PredicateBrowserAgent (snapshot-first, verification-first) + +`PredicateBrowserAgent` is a new high-level agent wrapper that gives you a **browser-use-like** `step()` / `run()` surface, but keeps Predicate’s core philosophy: + +- **Snapshot-first perception** (structured DOM snapshot is the default) +- **Verification-first control plane** (you can gate progress with deterministic checks) +- Optional **vision fallback** (bounded) when snapshots aren’t sufficient + +It’s built on top of `AgentRuntime` + `RuntimeAgent`. + +##### Quickstart (single step) + +```python +from predicate import AgentRuntime, PredicateBrowserAgent, PredicateBrowserAgentConfig, RuntimeStep +from predicate.llm_provider import OpenAIProvider # or AnthropicProvider / DeepInfraProvider / LocalLLMProvider + +runtime = AgentRuntime(backend=...) # PlaywrightBackend, CDPBackendV0, etc. +llm = OpenAIProvider(model="gpt-4o-mini") + +agent = PredicateBrowserAgent( + runtime=runtime, + executor=llm, + config=PredicateBrowserAgentConfig( + # Token control: include last N step summaries in the prompt (0 disables history). 
+ history_last_n=2, + ), +) + +outcome = await agent.step( + task_goal="Find pricing and verify checkout button exists", + step=RuntimeStep(goal="Open pricing page"), +) +``` + +##### Customize the compact prompt (advanced) + +If you want to change the “compact prompt” the executor sees (e.g. fewer fields / different layout), you can override it: + +```python +from predicate import PredicateBrowserAgentConfig + +def compact_prompt_builder(task_goal, step_goal, dom_context, snapshot, history_summary): + system = "You are a web automation agent. Return ONLY one action: CLICK(id) | TYPE(id, \"text\") | PRESS(\"key\") | FINISH()" + user = f"TASK: {task_goal}\nSTEP: {step_goal}\n\nRECENT:\n{history_summary}\n\nELEMENTS:\n{dom_context}\n\nReturn the single best action:" + return system, user + +config = PredicateBrowserAgentConfig(compact_prompt_builder=compact_prompt_builder) +``` + +##### CAPTCHA handling (interface-only; no solver shipped) + +If you set `captcha.policy="callback"`, you must provide a handler. The SDK does **not** include a public CAPTCHA solver. + +```python +from predicate import CaptchaConfig, HumanHandoffSolver, PredicateBrowserAgentConfig + +config = PredicateBrowserAgentConfig( + captcha=CaptchaConfig( + policy="callback", + # Manual solve in the live session; SDK waits until it clears: + handler=HumanHandoffSolver(timeout_ms=10 * 60_000, poll_ms=1_000), + ) +) +``` + +##### LLM providers (cloud or local) + +`PredicateBrowserAgent` works with any `LLMProvider` implementation. 
For a local HF Transformers model: + +```python +from predicate.llm_provider import LocalLLMProvider + +llm = LocalLLMProvider(model_name="Qwen/Qwen2.5-3B-Instruct", device="auto", load_in_4bit=True) +``` + ### 2026-02-13 #### Expanded deterministic verifications (adaptive resnapshotting) diff --git a/predicate/__init__.py b/predicate/__init__.py index 71a55de..ef0f60c 100644 --- a/predicate/__init__.py +++ b/predicate/__init__.py @@ -33,6 +33,15 @@ from .agent_config import AgentConfig from .agent_runtime import AgentRuntime, AssertionHandle +# Snapshot-first browser agent (new high-level surface) +from .agents import ( + CaptchaConfig, + PermissionRecoveryConfig, + PredicateBrowserAgent, + PredicateBrowserAgentConfig, + VisionFallbackConfig, +) + # Backend-agnostic actions (aliased to avoid conflict with existing actions) # Browser backends (for browser-use integration) from .backends import ( diff --git a/predicate/agents/__init__.py b/predicate/agents/__init__.py new file mode 100644 index 0000000..513ab6e --- /dev/null +++ b/predicate/agents/__init__.py @@ -0,0 +1,24 @@ +""" +Agent-level orchestration helpers (snapshot-first, verification-first). 
from __future__ import annotations

# NOTE(review): `importlib` and the `captcha_strategies` names below are not
# referenced by any code in this module (the solver names only appear inside an
# error message). They are kept in case other modules import them from here;
# confirm before removing.
import importlib
from collections import deque
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any, Literal

from ..agent_runtime import AgentRuntime
from ..captcha import CaptchaHandler, CaptchaOptions
from ..captcha_strategies import ExternalSolver, HumanHandoffSolver, VisionSolver
from ..llm_interaction_handler import LLMInteractionHandler
from ..llm_provider import LLMProvider
from ..models import Snapshot, StepHookContext
from ..permissions import PermissionPolicy
from ..runtime_agent import RuntimeAgent, RuntimeStep


@dataclass(frozen=True)
class PermissionRecoveryConfig:
    """
    Configuration for a bounded "permission recovery" policy.

    Note: startup permissions are applied at browser/context creation time via
    `PermissionPolicy`. Recovery is intentionally best-effort and may require a
    browser/context-level integration (outside AgentRuntime) depending on backend.
    """

    enabled: bool = True
    max_restarts: int = 1
    auto_grant: list[str] = field(default_factory=list)
    geolocation: dict | None = None
    origin: str | None = None


@dataclass(frozen=True)
class VisionFallbackConfig:
    """
    Controls if/when the agent may use a vision executor as a bounded fallback.

    Within this module only `enabled` and `max_vision_calls` are read (by
    `PredicateBrowserAgent.step`); the `trigger_*` flags are carried on the
    config but not consulted here.
    """

    enabled: bool = False
    # The run-level cap is only enforced when this is > 0 (and `enabled` is True).
    max_vision_calls: int = 0
    trigger_requires_vision: bool = True
    trigger_repeated_noop: bool = True
    trigger_canvas_or_low_actionables: bool = True


@dataclass(frozen=True)
class CaptchaConfig:
    """
    SDK-level CAPTCHA configuration, mapped onto `AgentRuntime.set_captcha_options()`.
    """

    policy: Literal["abort", "callback"] = "abort"
    # Interface-only: the SDK does not ship a captcha solver. Users provide a handler/callback.
    handler: CaptchaHandler | None = None
    # When unset (or 0), `apply_captcha_config_to_runtime` falls back to 120_000 ms.
    timeout_ms: int | None = None
    # When unset (or 0), `apply_captcha_config_to_runtime` falls back to 1_000 ms.
    poll_ms: int | None = None
    min_confidence: float = 0.7


@dataclass(frozen=True)
class PredicateBrowserAgentConfig:
    """
    High-level agent configuration.

    This is intentionally small and focused on:
    - operational knobs (vision/captcha/permissions)
    - token controls (history_last_n)
    - prompt customization hooks (compact_prompt_builder)
    """

    # Permissions
    permission_startup: PermissionPolicy | None = None
    permission_recovery: PermissionRecoveryConfig | None = None

    # Vision fallback
    vision: VisionFallbackConfig = VisionFallbackConfig()

    # CAPTCHA handling
    captcha: CaptchaConfig = CaptchaConfig()

    # Prompt / token controls
    history_last_n: int = 0  # 0 disables LLM-facing step history (lowest token usage)

    # Compact prompt customization
    # Signature: builder(task_goal, step_goal, dom_context, snapshot, history_summary) -> (system, user)
    compact_prompt_builder: Callable[
        [str, str, str, Snapshot, str], tuple[str, str]
    ] | None = None

    # Optional last-mile truncation of dom_context to control tokens
    compact_prompt_postprocessor: Callable[[str], str] | None = None


def _history_summary(items: list[str]) -> str:
    """Render step summaries as a "- item" bullet list, skipping empty entries."""
    if not items:
        return ""
    return "\n".join(f"- {s}" for s in items if s)


def apply_captcha_config_to_runtime(
    *,
    runtime: AgentRuntime,
    captcha: CaptchaConfig,
    reset_session: Callable[[], Any] | None = None,
) -> None:
    """
    Map `CaptchaConfig` onto `AgentRuntime.set_captcha_options`.

    This mirrors WebBench semantics:
    - abort: fail fast
    - callback: invoke handler and wait/retry per resolution

    Args:
        runtime: Runtime whose captcha options are set.
        captcha: Policy, handler and timing configuration.
        reset_session: Optional callable forwarded to `CaptchaOptions`.

    Raises:
        ValueError: If the policy is not "abort"/"callback", or if the policy is
            "callback" and no handler was provided.
    """

    policy = (captcha.policy or "abort").strip().lower()
    if policy not in {"abort", "callback"}:
        raise ValueError("captcha.policy must be 'abort' or 'callback'")

    min_confidence = float(captcha.min_confidence)

    if policy == "abort":
        runtime.set_captcha_options(
            CaptchaOptions(policy="abort", min_confidence=min_confidence)
        )
        return

    # Validate the handler before building any options so a misconfiguration
    # fails with the most specific message.
    handler = captcha.handler
    if handler is None:
        raise ValueError(
            'captcha.handler is required when captcha.policy="callback". '
            "Use HumanHandoffSolver(...) for manual solve, or ExternalSolver(...) to integrate your system."
        )

    runtime.set_captcha_options(
        CaptchaOptions(
            policy="callback",
            handler=handler,
            timeout_ms=int(captcha.timeout_ms or 120_000),
            poll_ms=int(captcha.poll_ms or 1_000),
            min_confidence=min_confidence,
            reset_session=reset_session,  # used if handler returns retry_new_session
        )
    )


class _RuntimeAgentWithPromptOverrides(RuntimeAgent):
    """`RuntimeAgent` whose structured action proposal honours prompt overrides and step history."""

    def __init__(
        self,
        *,
        runtime: AgentRuntime,
        executor: LLMProvider,
        vision_executor: LLMProvider | None,
        vision_verifier: LLMProvider | None,
        compact_prompt_builder: Callable[
            [str, str, str, Snapshot, str], tuple[str, str]
        ] | None,
        compact_prompt_postprocessor: Callable[[str], str] | None,
        history_summary_provider: Callable[[], str],
    ) -> None:
        super().__init__(
            runtime=runtime,
            executor=executor,
            vision_executor=vision_executor,
            vision_verifier=vision_verifier,
        )
        self._structured_llm = LLMInteractionHandler(executor)
        self._compact_prompt_builder = compact_prompt_builder
        self._compact_prompt_postprocessor = compact_prompt_postprocessor
        self._history_summary_provider = history_summary_provider

    def _propose_structured_action(
        self, *, task_goal: str, step: RuntimeStep, snap: Snapshot
    ) -> str:
        """Build the prompt (custom builder or SDK default) and return the extracted action string."""
        dom_context = self._structured_llm.build_context(snap, step.goal)
        if self._compact_prompt_postprocessor is not None:
            dom_context = self._compact_prompt_postprocessor(dom_context)

        history_summary = self._history_summary_provider() or ""

        if self._compact_prompt_builder is not None:
            system_prompt, user_prompt = self._compact_prompt_builder(
                task_goal,
                step.goal,
                dom_context,
                snap,
                history_summary,
            )
            resp = self.executor.generate(system_prompt, user_prompt, temperature=0.0)
            return self._structured_llm.extract_action(resp.content)

        # Default: reuse SDK's standard system prompt template by calling query_llm,
        # but include a small history block inside the goal string.
        combined_goal = task_goal
        if history_summary:
            combined_goal = f"{task_goal}\n\nRECENT STEPS:\n{history_summary}"
        combined_goal = f"{combined_goal}\n\nSTEP: {step.goal}"
        resp = self._structured_llm.query_llm(dom_context, combined_goal)
        return self._structured_llm.extract_action(resp.content)


@dataclass
class StepOutcome:
    """Result of one `PredicateBrowserAgent.step` call."""

    step_goal: str
    ok: bool
    # Heuristic flag; see `PredicateBrowserAgent.step` for how it is derived.
    used_vision: bool = False


class PredicateBrowserAgent:
    """
    Snapshot-first, verification-first browser agent.

    This is a thin user-facing wrapper over `RuntimeAgent` with:
    - a browser-use-like `run()` loop over `step()`
    - bounded prompt-history injection (history_last_n)
    - bounded vision fallback budgeting (max_vision_calls)
    - CAPTCHA configuration mapping to AgentRuntime
    """

    def __init__(
        self,
        *,
        runtime: AgentRuntime,
        executor: LLMProvider,
        vision_executor: LLMProvider | None = None,
        vision_verifier: LLMProvider | None = None,
        config: PredicateBrowserAgentConfig = PredicateBrowserAgentConfig(),
    ) -> None:
        self.runtime = runtime
        self.executor = executor
        self.vision_executor = vision_executor
        self.vision_verifier = vision_verifier
        self.config = config

        # LLM-facing step history summaries (bounded)
        self._history: deque[str] = deque(maxlen=max(0, int(config.history_last_n)))

        # Vision budgeting
        self._vision_calls_used = 0

        # Apply CAPTCHA settings immediately (if enabled by config)
        if self.config.captcha is not None:
            apply_captcha_config_to_runtime(runtime=self.runtime, captcha=self.config.captcha)

        self._runner = _RuntimeAgentWithPromptOverrides(
            runtime=self.runtime,
            executor=self.executor,
            vision_executor=self.vision_executor,
            vision_verifier=self.vision_verifier,
            compact_prompt_builder=self.config.compact_prompt_builder,
            compact_prompt_postprocessor=self.config.compact_prompt_postprocessor,
            history_summary_provider=self._get_history_summary,
        )

    def _get_history_summary(self) -> str:
        """Return the bulleted recent-step summary, or "" when history is disabled."""
        if int(self.config.history_last_n) <= 0:
            return ""
        return _history_summary(list(self._history))

    def _record_step_history(self, *, step_goal: str, ok: bool) -> None:
        """Append "<goal> -> ok|fail" to the bounded history (no-op when history is disabled)."""
        if int(self.config.history_last_n) <= 0:
            return
        self._history.append(f"{step_goal} -> {'ok' if ok else 'fail'}")

    def _vision_budget_exhausted(self) -> bool:
        """Return True when the cap is active (enabled, limit > 0) and already used up."""
        vision = self.config.vision
        if not vision.enabled:
            return False
        limit = int(vision.max_vision_calls)
        return limit > 0 and self._vision_calls_used >= limit

    @staticmethod
    def _without_vision(step: RuntimeStep) -> RuntimeStep:
        """Copy `step` with the vision executor disabled for this attempt."""
        # NOTE(review): only the fields listed here are carried over; any other
        # RuntimeStep field would fall back to its default — confirm the list
        # stays in sync with RuntimeStep.
        return RuntimeStep(
            goal=step.goal,
            intent=step.intent,
            verifications=list(step.verifications),
            snapshot_limit_base=step.snapshot_limit_base,
            snapshot_limit_step=step.snapshot_limit_step,
            snapshot_limit_max=step.snapshot_limit_max,
            max_snapshot_attempts=step.max_snapshot_attempts,
            min_confidence=step.min_confidence,
            min_actionables=step.min_actionables,
            vision_executor_enabled=False,
            max_vision_executor_attempts=0,
        )

    async def step(
        self,
        *,
        task_goal: str,
        step: RuntimeStep,
        on_step_start: Callable[[StepHookContext], Any] | None = None,
        on_step_end: Callable[[StepHookContext], Any] | None = None,
    ) -> StepOutcome:
        """
        Run a single step through the underlying runner and record it in history.

        Returns:
            A `StepOutcome` (not a bare bool); check `.ok` for success.
        """
        # Enforce run-level max vision calls (coarse budget).
        if self._vision_budget_exhausted():
            step = self._without_vision(step)

        ok = bool(
            await self._runner.run_step(
                task_goal=task_goal,
                step=step,
                on_step_start=on_step_start,
                on_step_end=on_step_end,
            )
        )

        # Conservative heuristic until RuntimeAgent exposes a structured outcome:
        # if the step had vision enabled and still failed, assume the vision
        # executor was spent (when one that supports vision is configured).
        # A step that succeeds is never counted here, even if vision was used.
        used_vision = False
        if not ok and bool(getattr(step, "vision_executor_enabled", False)):
            used_vision = bool(self.vision_executor and self.vision_executor.supports_vision())
            if used_vision:
                self._vision_calls_used += 1

        self._record_step_history(step_goal=step.goal, ok=ok)
        return StepOutcome(step_goal=step.goal, ok=ok, used_vision=used_vision)

    async def run(
        self,
        *,
        task_goal: str,
        steps: list[RuntimeStep],
        on_step_start: Callable[[StepHookContext], Any] | None = None,
        on_step_end: Callable[[StepHookContext], Any] | None = None,
        stop_on_failure: bool = True,
    ) -> bool:
        """
        Run `steps` in order.

        Returns:
            True only if every executed step succeeded. With
            `stop_on_failure=True` the loop returns False at the first failing
            step; with `stop_on_failure=False` all steps are attempted and the
            result is False if any of them failed.
        """
        all_ok = True
        for step in steps:
            out = await self.step(
                task_goal=task_goal,
                step=step,
                on_step_start=on_step_start,
                on_step_end=on_step_end,
            )
            if not out.ok:
                if stop_on_failure:
                    return False
                all_ok = False
        return all_ok
None: + self.events: list[dict] = [] + + def emit(self, event_type: str, data: dict, step_id: str | None = None) -> None: + self.events.append({"type": event_type, "data": data, "step_id": step_id}) + + +class MockBackend: + def __init__(self) -> None: + self.mouse_clicks: list[tuple[float, float]] = [] + + async def refresh_page_info(self): + return None + + async def eval(self, expression: str): + # default: no canvas + if "querySelectorAll('canvas')" in expression: + return 0 + return None + + async def call(self, function_declaration: str, args=None): + _ = function_declaration, args + return None + + async def get_layout_metrics(self): + return None + + async def screenshot_png(self) -> bytes: + return b"png" + + async def screenshot_jpeg(self, quality: int | None = None) -> bytes: + _ = quality + return b"jpeg" + + async def mouse_move(self, x: float, y: float) -> None: + _ = x, y + return None + + async def mouse_click(self, x: float, y: float, button="left", click_count=1) -> None: + _ = button, click_count + self.mouse_clicks.append((float(x), float(y))) + + async def wheel(self, delta_y: float, x=None, y=None) -> None: + _ = delta_y, x, y + return None + + async def type_text(self, text: str) -> None: + _ = text + return None + + async def wait_ready_state(self, state="interactive", timeout_ms=15000) -> None: + _ = state, timeout_ms + return None + + +class ProviderStub(LLMProvider): + def __init__(self, *, model: str = "stub", responses: list[str] | None = None): + super().__init__(model) + self._responses = responses or [] + self.calls: list[dict] = [] + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + self.calls.append({"system": system_prompt, "user": user_prompt, "kwargs": kwargs}) + content = self._responses.pop(0) if self._responses else "FINISH()" + return LLMResponse(content=content, model_name=self.model_name) + + def supports_json_mode(self) -> bool: + return True + + @property + def model_name(self) -> 
str: + return self._model_name + + +def make_snapshot(*, url: str, elements: list[Element], confidence: float | None = None) -> Snapshot: + diagnostics = SnapshotDiagnostics(confidence=confidence) if confidence is not None else None + return Snapshot( + status="success", + url=url, + elements=elements, + viewport=Viewport(width=1280, height=720), + diagnostics=diagnostics, + ) + + +def make_clickable_element(element_id: int) -> Element: + return Element( + id=element_id, + role="button", + text="OK", + importance=100, + bbox=BBox(x=10, y=20, width=100, height=40), + visual_cues=VisualCues(is_primary=True, is_clickable=True, background_color_name=None), + in_viewport=True, + is_occluded=False, + ) + + +def test_predicate_browser_agent_allows_compact_prompt_builder_override() -> None: + async def _run() -> None: + backend = MockBackend() + tracer = MockTracer() + runtime = AgentRuntime(backend=backend, tracer=tracer) + + s0 = make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)]) + s1 = make_snapshot(url="https://example.com/done", elements=[make_clickable_element(1)]) + + async def fake_snapshot(**_kwargs): + runtime.last_snapshot = snaps.pop(0) + return runtime.last_snapshot + + snaps = [s0, s1] + runtime.snapshot = AsyncMock(side_effect=fake_snapshot) # type: ignore[method-assign] + + step = RuntimeStep(goal="Click OK", verifications=[]) + executor = ProviderStub(responses=["CLICK(1)"]) + + def builder( + task_goal: str, + step_goal: str, + dom_context: str, + snap: Snapshot, + history: str, + ): + _ = task_goal, step_goal, dom_context, snap, history + return ("SYSTEM_CUSTOM", "USER_CUSTOM") + + agent = PredicateBrowserAgent( + runtime=runtime, + executor=executor, + config=PredicateBrowserAgentConfig(compact_prompt_builder=builder), + ) + + out = await agent.step(task_goal="test", step=step) + assert out.ok is True + assert executor.calls + assert "SYSTEM_CUSTOM" in executor.calls[0]["system"] + assert executor.calls[0]["user"] == 
"USER_CUSTOM" + + asyncio.run(_run()) + From 8263270f07e79303854ee9d232b18dc6eb9aad63 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Sat, 14 Feb 2026 21:19:36 -0800 Subject: [PATCH 2/3] agent examples --- examples/agent/README.md | 5 + .../predicate_browser_agent_custom_prompt.py | 117 +++++++++++++++ .../agent/predicate_browser_agent_minimal.py | 101 +++++++++++++ predicate/runtime_agent.py | 131 ++++++++++++++++- tests/unit/test_runtime_agent_act_once.py | 133 ++++++++++++++++++ 5 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 examples/agent/README.md create mode 100644 examples/agent/predicate_browser_agent_custom_prompt.py create mode 100644 examples/agent/predicate_browser_agent_minimal.py create mode 100644 tests/unit/test_runtime_agent_act_once.py diff --git a/examples/agent/README.md b/examples/agent/README.md new file mode 100644 index 0000000..a363d52 --- /dev/null +++ b/examples/agent/README.md @@ -0,0 +1,5 @@ +Predicate agent examples. + +- `predicate_browser_agent_minimal.py`: minimal `PredicateBrowserAgent` usage. +- `predicate_browser_agent_custom_prompt.py`: customize the compact prompt builder. + diff --git a/examples/agent/predicate_browser_agent_custom_prompt.py b/examples/agent/predicate_browser_agent_custom_prompt.py new file mode 100644 index 0000000..06c96c2 --- /dev/null +++ b/examples/agent/predicate_browser_agent_custom_prompt.py @@ -0,0 +1,117 @@ +""" +Example: PredicateBrowserAgent with compact prompt customization. + +This shows how to override the compact prompt used for action proposal. 
+ +Usage: + python examples/agent/predicate_browser_agent_custom_prompt.py +""" + +import asyncio +import os + +from predicate import AsyncSentienceBrowser, PredicateBrowserAgent, PredicateBrowserAgentConfig +from predicate.agent_runtime import AgentRuntime +from predicate.llm_provider import LLMProvider, LLMResponse +from predicate.models import Snapshot +from predicate.runtime_agent import RuntimeStep +from predicate.tracing import JsonlTraceSink, Tracer + + +class RecordingProvider(LLMProvider): + """ + Example provider that records the prompts it receives. + + Swap this for OpenAIProvider / AnthropicProvider / DeepInfraProvider / LocalLLMProvider in real usage. + """ + + def __init__(self, action: str = "FINISH()"): + super().__init__(model="recording-provider") + self._action = action + self.last_system: str | None = None + self.last_user: str | None = None + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + _ = kwargs + self.last_system = system_prompt + self.last_user = user_prompt + return LLMResponse(content=self._action, model_name=self.model_name) + + def supports_json_mode(self) -> bool: + return False + + @property + def model_name(self) -> str: + return "recording-provider" + + +def compact_prompt_builder( + task_goal: str, + step_goal: str, + dom_context: str, + snap: Snapshot, + history_summary: str, +) -> tuple[str, str]: + _ = snap + system = ( + "You are a web automation executor.\n" + "Return ONLY ONE action in this format:\n" + "- CLICK(id)\n" + '- TYPE(id, "text")\n' + "- PRESS('key')\n" + "- FINISH()\n" + "No prose." + ) + # Optional: aggressively control token usage by truncating DOM context. 
+ dom_context = dom_context[:4000] + user = ( + f"TASK GOAL:\n{task_goal}\n\n" + + (f"RECENT STEPS:\n{history_summary}\n\n" if history_summary else "") + + f"STEP GOAL:\n{step_goal}\n\n" + f"DOM CONTEXT:\n{dom_context}\n" + ) + return system, user + + +async def main() -> None: + run_id = "predicate-browser-agent-custom-prompt" + tracer = Tracer(run_id=run_id, sink=JsonlTraceSink(f"traces/{run_id}.jsonl")) + + api_key = os.environ.get("PREDICATE_API_KEY") or os.environ.get("SENTIENCE_API_KEY") + + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + page = await browser.new_page() + await page.goto("https://example.com") + await page.wait_for_load_state("networkidle") + + runtime = await AgentRuntime.from_sentience_browser( + browser=browser, page=page, tracer=tracer + ) + + executor = RecordingProvider(action="FINISH()") + + agent = PredicateBrowserAgent( + runtime=runtime, + executor=executor, + config=PredicateBrowserAgentConfig( + history_last_n=2, + compact_prompt_builder=compact_prompt_builder, + ), + ) + + out = await agent.step( + task_goal="Open example.com", + step=RuntimeStep(goal="Take no action; just finish"), + ) + print(f"step ok: {out.ok}") + print("--- prompt preview (system) ---") + print((executor.last_system or "")[:300]) + print("--- prompt preview (user) ---") + print((executor.last_user or "")[:300]) + + tracer.close() + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/agent/predicate_browser_agent_minimal.py b/examples/agent/predicate_browser_agent_minimal.py new file mode 100644 index 0000000..25d3ae5 --- /dev/null +++ b/examples/agent/predicate_browser_agent_minimal.py @@ -0,0 +1,101 @@ +""" +Example: PredicateBrowserAgent minimal demo. + +PredicateBrowserAgent is a higher-level, browser-use-like wrapper over: +AgentRuntime + RuntimeAgent (snapshot-first action proposal + execution + verification). 
+ +Usage: + python examples/agent/predicate_browser_agent_minimal.py +""" + +import asyncio +import os + +from predicate import AsyncSentienceBrowser, PredicateBrowserAgent, PredicateBrowserAgentConfig +from predicate.agent_runtime import AgentRuntime +from predicate.llm_provider import LLMProvider, LLMResponse +from predicate.runtime_agent import RuntimeStep, StepVerification +from predicate.tracing import JsonlTraceSink, Tracer +from predicate.verification import exists, url_contains + + +class FixedActionProvider(LLMProvider): + """Tiny in-process provider for examples/tests.""" + + def __init__(self, action: str): + super().__init__(model="fixed-action") + self._action = action + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + _ = system_prompt, user_prompt, kwargs + return LLMResponse(content=self._action, model_name=self.model_name) + + def supports_json_mode(self) -> bool: + return False + + @property + def model_name(self) -> str: + return "fixed-action" + + +async def main() -> None: + run_id = "predicate-browser-agent-minimal" + tracer = Tracer(run_id=run_id, sink=JsonlTraceSink(f"traces/{run_id}.jsonl")) + + api_key = os.environ.get("PREDICATE_API_KEY") or os.environ.get("SENTIENCE_API_KEY") + + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + page = await browser.new_page() + await page.goto("https://example.com") + await page.wait_for_load_state("networkidle") + + runtime = await AgentRuntime.from_sentience_browser( + browser=browser, page=page, tracer=tracer + ) + + # For a "real" run, swap this for OpenAIProvider / AnthropicProvider / DeepInfraProvider / LocalLLMProvider. + executor = FixedActionProvider("FINISH()") + + agent = PredicateBrowserAgent( + runtime=runtime, + executor=executor, + config=PredicateBrowserAgentConfig( + # Keep a tiny, bounded LLM-facing step history (0 disables history entirely). 
+ history_last_n=2, + ), + ) + + steps = [ + RuntimeStep( + goal="Verify Example Domain is loaded", + verifications=[ + StepVerification( + predicate=url_contains("example.com"), + label="url_contains_example", + required=True, + eventually=True, + timeout_s=5.0, + ), + StepVerification( + predicate=exists("role=heading"), + label="has_heading", + required=True, + eventually=True, + timeout_s=5.0, + ), + ], + max_snapshot_attempts=2, + snapshot_limit_base=60, + ) + ] + + ok = await agent.run(task_goal="Open example.com and verify", steps=steps) + print(f"run ok: {ok}") + + tracer.close() + print(f"trace written to traces/{run_id}.jsonl") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/predicate/runtime_agent.py b/predicate/runtime_agent.py index b6c4193..f919a67 100644 --- a/predicate/runtime_agent.py +++ b/predicate/runtime_agent.py @@ -21,7 +21,7 @@ from .llm_interaction_handler import LLMInteractionHandler from .llm_provider import LLMProvider from .models import BBox, Snapshot, StepHookContext -from .verification import AssertContext, AssertOutcome, Predicate +from .verification import Predicate @dataclass(frozen=True) @@ -55,6 +55,13 @@ class RuntimeStep: max_vision_executor_attempts: int = 1 +@dataclass(frozen=True) +class ActOnceResult: + action: str + snap: Snapshot + used_vision: bool + + class RuntimeAgent: """ A thin orchestration layer over AgentRuntime: @@ -164,6 +171,128 @@ async def run_step( ), ) + async def act_once( + self, + *, + task_goal: str, + step: RuntimeStep, + allow_vision_fallback: bool = True, + history_summary: str = "", + compact_prompt_builder: Callable[ + [str, str, str, Snapshot, str], tuple[str, str] + ] + | None = None, + dom_context_postprocessor: Callable[[str], str] | None = None, + ) -> str: + """ + Execute exactly one action for a step without owning step lifecycle. + + This helper is designed for orchestration layers (e.g. 
async def act_once_result(
    self,
    *,
    task_goal: str,
    step: RuntimeStep,
    allow_vision_fallback: bool = True,
    history_summary: str = "",
    compact_prompt_builder: Callable[
        [str, str, str, Snapshot, str], tuple[str, str]
    ]
    | None = None,
    dom_context_postprocessor: Callable[[str], str] | None = None,
) -> ActOnceResult:
    """
    Propose and execute exactly one action, reporting how it was produced.

    A snapshot is taken first. The action then comes either from the vision
    executor (screenshot-based) or from the structured executor (DOM-context
    based); it is executed against that same snapshot.

    Args:
        task_goal: Overall task description placed in the prompt.
        step: Step whose `goal` drives snapshotting and prompting.
        allow_vision_fallback: When False, the vision path is never attempted.
        history_summary: Optional recent-steps text. Passed to
            `compact_prompt_builder` when one is given, otherwise appended
            to the goal under a "RECENT STEPS" heading when non-empty.
        compact_prompt_builder: Optional callable invoked as
            `(task_goal, step.goal, dom_context, snap, history_summary)`
            returning `(system_prompt, user_prompt)` for `self.executor`.
        dom_context_postprocessor: Optional callable applied to the DOM
            context string before any prompt is built.

    Returns:
        ActOnceResult with the executed action, the proposal snapshot, and
        whether the vision executor was used.
    """
    proposal_snap = await self._snapshot_with_ramp(step=step)

    # Vision path. The short-circuit check is awaited first (and only when the
    # caller allows vision); the executor's capabilities are inspected after.
    wants_vision = allow_vision_fallback and await self._should_short_circuit_to_vision(
        step=step, snap=proposal_snap
    )
    vision = self.vision_executor
    if wants_vision and vision and vision.supports_vision():
        page_url = await self._get_url_for_prompt()
        screenshot_b64 = await self._screenshot_base64_png()
        vision_system, vision_user = self._vision_executor_prompts(
            task_goal=task_goal,
            step=step,
            url=page_url,
            snap=proposal_snap,
        )
        vision_resp = vision.generate_with_image(
            vision_system,
            vision_user,
            screenshot_b64,
            temperature=0.0,
        )
        chosen = self._extract_action_from_text(vision_resp.content)
        await self._execute_action(action=chosen, snap=proposal_snap)
        return ActOnceResult(action=chosen, snap=proposal_snap, used_vision=True)

    # Structured path: build (and optionally post-process) the DOM context.
    structured = self._structured_llm
    context = structured.build_context(proposal_snap, step.goal)
    if dom_context_postprocessor is not None:
        context = dom_context_postprocessor(context)

    if compact_prompt_builder is None:
        # Default prompt: fold history and the step goal into one goal string.
        combined_goal = task_goal
        if history_summary:
            combined_goal = f"{task_goal}\n\nRECENT STEPS:\n{history_summary}"
        combined_goal = f"{combined_goal}\n\nSTEP: {step.goal}"
        llm_resp = structured.query_llm(context, combined_goal)
    else:
        # Caller-supplied prompt layout, sent straight to the executor.
        custom_system, custom_user = compact_prompt_builder(
            task_goal, step.goal, context, proposal_snap, history_summary or ""
        )
        llm_resp = self.executor.generate(custom_system, custom_user, temperature=0.0)

    chosen = structured.extract_action(llm_resp.content)
    await self._execute_action(action=chosen, snap=proposal_snap)
    return ActOnceResult(action=chosen, snap=proposal_snap, used_vision=False)
class _BackendStub:
    """
    In-memory stand-in for a browser backend.

    Records mouse clicks and typed text so tests can assert on them; every
    other call returns a fixed value.
    """

    def __init__(self) -> None:
        self._url = "https://example.com/"
        self.mouse_clicks: list[tuple[float, float]] = []
        self.typed: list[str] = []

    async def get_url(self) -> str:
        """Return the fixed page URL."""
        return self._url

    async def eval(self, expression: str) -> Any:
        """Answer the canvas-count probe with 0; any other script yields None."""
        # Kept deliberately simple: only canvas detection needs a real answer.
        is_canvas_probe = "querySelectorAll('canvas')" in expression
        return 0 if is_canvas_probe else None

    async def screenshot_png(self) -> bytes:
        """Return placeholder screenshot bytes."""
        return b"png"

    async def wait_ready_state(self, state: str = "interactive", timeout_ms: int = 15000) -> None:
        """Return immediately; both arguments are unused."""
        del state, timeout_ms

    async def mouse_click(self, x: float, y: float, button: str = "left", click_count: int = 1) -> None:
        """Record the click position as a pair of floats."""
        del button, click_count
        point = (float(x), float(y))
        self.mouse_clicks.append(point)

    async def mouse_move(self, x: float, y: float) -> None:
        """Ignore pointer moves."""
        del x, y

    async def type_text(self, text: str) -> None:
        """Record the typed text as a string."""
        self.typed.append(str(text))
def test_runtime_agent_act_once_does_not_require_step_lifecycle() -> None:
    """
    `act_once_with_snapshot` works on a runtime stub that has no step lifecycle.

    The stub runtime offers only snapshot / get_url / record_action, so the
    call can only succeed if no begin-step / end-step hooks are required.
    """

    async def _scenario() -> None:
        browser_backend = _BackendStub()
        stub_runtime = _RuntimeStub(backend=browser_backend)
        agent = RuntimeAgent(
            runtime=stub_runtime,
            executor=_ProviderStub(response="CLICK(1)"),
        )

        performed, proposal_snap = await agent.act_once_with_snapshot(
            task_goal="Do a thing",
            step=RuntimeStep(goal="Click the button"),
            allow_vision_fallback=False,
        )

        # The proposed action is a click, taken against the stub's snapshot...
        assert performed.strip().upper().startswith("CLICK(")
        assert proposal_snap is stub_runtime.last_snapshot
        # ...and it was both recorded and actually dispatched to the backend.
        assert stub_runtime.recorded_actions == ["CLICK(1)"]
        assert len(browser_backend.mouse_clicks) == 1

    asyncio.run(_scenario())
examples/agent/predicate_browser_agent_video_recording_playwright.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cc54cd..d2f1afc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,6 +84,28 @@ from predicate.llm_provider import LocalLLMProvider llm = LocalLLMProvider(model_name="Qwen/Qwen2.5-3B-Instruct", device="auto", load_in_4bit=True) ``` +##### Opt-in token usage accounting (best-effort) + +If you want to measure token spend, you can enable best-effort accounting (depends on provider reporting `prompt_tokens` / `completion_tokens` / `total_tokens` in `LLMResponse`): + +```python +from predicate import PredicateBrowserAgentConfig + +config = PredicateBrowserAgentConfig(token_usage_enabled=True) + +# Later: +usage = agent.get_token_usage() +agent.reset_token_usage() +``` + +##### RuntimeAgent: act once without step lifecycle (orchestrators) + +`RuntimeAgent` now exposes `act_once(...)` helpers that execute exactly one action **without** calling `runtime.begin_step()` / `runtime.emit_step_end()`. This is intended for external orchestrators (e.g. WebBench) that already own step lifecycle and just want the SDK’s snapshot-first propose+execute block. + +- `await agent.act_once(...) -> str` +- `await agent.act_once_with_snapshot(...) -> (action, snap)` +- `await agent.act_once_result(...) -> { action, snap, used_vision }` + ### 2026-02-13 #### Expanded deterministic verifications (adaptive resnapshotting) diff --git a/examples/agent/README.md b/examples/agent/README.md index a363d52..c737716 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -2,4 +2,5 @@ Predicate agent examples. - `predicate_browser_agent_minimal.py`: minimal `PredicateBrowserAgent` usage. - `predicate_browser_agent_custom_prompt.py`: customize the compact prompt builder. +- `predicate_browser_agent_video_recording_playwright.py`: enable Playwright video recording via context options (recommended). 
async def main() -> None:
    """
    Run one `PredicateBrowserAgent` step while Playwright records a video.

    The video is a Playwright context feature and is flushed when the context
    is closed. The browser and the context are therefore each closed in a
    `finally` that starts immediately after the resource is created, so a
    failure while creating the page or wrapping it still closes both.
    """
    api_key = os.environ.get("PREDICATE_API_KEY") or os.environ.get("SENTIENCE_API_KEY")

    recordings_dir = Path("recordings")
    recordings_dir.mkdir(parents=True, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                record_video_dir=str(recordings_dir),
                record_video_size={"width": 1280, "height": 720},
            )
            try:
                page = await context.new_page()

                # Wrap existing Playwright page.
                sentience_browser = await AsyncSentienceBrowser.from_page(
                    page, api_key=api_key
                )

                await page.goto("https://example.com")
                await page.wait_for_load_state("networkidle")

                runtime = await AgentRuntime.from_sentience_browser(
                    browser=sentience_browser, page=page, tracer=None
                )

                agent = PredicateBrowserAgent(
                    runtime=runtime,
                    executor=FixedActionProvider("FINISH()"),
                    config=PredicateBrowserAgentConfig(history_last_n=0),
                )

                out = await agent.step(
                    task_goal="Open example.com",
                    step=RuntimeStep(goal="Finish immediately"),
                )
                print(f"step ok: {out.ok}")
                print(f"videos will be saved under: {recordings_dir.resolve()}")
            finally:
                # Close the Playwright context to flush the video.
                await context.close()
        finally:
            # Runs even if closing the context raised.
            await browser.close()
@dataclass
class TokenUsageTotals:
    """
    Running totals of LLM token usage over one or more provider responses.
    """

    calls: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

    def add(self, resp: "LLMResponse") -> None:
        """
        Fold one response into the totals.

        Counts that are not reported as integers are treated as 0, negative
        counts are clamped to 0, and a missing `total_tokens` falls back to
        the sum of the (already clamped) prompt and completion counts.
        """

        def reported(value: Any) -> bool:
            # bool is a subclass of int; a stray True/False is not a count.
            return isinstance(value, int) and not isinstance(value, bool)

        raw_prompt = resp.prompt_tokens
        raw_completion = resp.completion_tokens
        raw_total = resp.total_tokens

        prompt = max(0, int(raw_prompt)) if reported(raw_prompt) else 0
        completion = max(0, int(raw_completion)) if reported(raw_completion) else 0
        # Derive the fallback from the clamped parts so the stored total can
        # never be smaller than the stored prompt + completion counts.
        total = max(0, int(raw_total)) if reported(raw_total) else prompt + completion

        self.calls += 1
        self.prompt_tokens += prompt
        self.completion_tokens += completion
        self.total_tokens += total


class _TokenUsageCollector:
    """
    Aggregates token usage per provider role and per model name.
    """

    def __init__(self) -> None:
        self._by_role: dict[str, TokenUsageTotals] = {}
        self._by_model: dict[str, TokenUsageTotals] = {}

    def record(self, *, role: str, resp: "LLMResponse") -> None:
        """Add *resp* to the totals for *role* and for its model name."""
        self._by_role.setdefault(role, TokenUsageTotals()).add(resp)
        # Missing or blank model names are grouped under "unknown".
        m = str(resp.model_name or "").strip() or "unknown"
        self._by_model.setdefault(m, TokenUsageTotals()).add(resp)

    def reset(self) -> None:
        """Forget everything recorded so far."""
        self._by_role.clear()
        self._by_model.clear()

    def summary(self) -> dict[str, Any]:
        """
        Return totals as plain dicts.

        Returns:
            A dict with keys "total", "by_role" and "by_model". Each leaf is a
            dict with "calls", "prompt_tokens", "completion_tokens" and
            "total_tokens".
        """

        def _as_dict(t: TokenUsageTotals) -> dict[str, int]:
            return {
                "calls": t.calls,
                "prompt_tokens": t.prompt_tokens,
                "completion_tokens": t.completion_tokens,
                "total_tokens": t.total_tokens,
            }

        # Every response is recorded once per role and once per model, so the
        # grand total is taken from one of the two views only.
        total = TokenUsageTotals()
        for t in self._by_role.values():
            total.calls += t.calls
            total.prompt_tokens += t.prompt_tokens
            total.completion_tokens += t.completion_tokens
            total.total_tokens += t.total_tokens

        return {
            "total": _as_dict(total),
            "by_role": {k: _as_dict(v) for k, v in self._by_role.items()},
            "by_model": {k: _as_dict(v) for k, v in self._by_model.items()},
        }
def get_token_usage(self) -> dict[str, Any]:
    """
    Return a best-effort token usage report.

    With accounting enabled (`PredicateBrowserAgentConfig.token_usage_enabled=True`)
    this is the collector's summary with `"enabled": True` added. Otherwise
    it is a small dict carrying `"enabled": False` and a `"reason"` string.
    """
    collector = self._token_usage
    if collector is not None:
        report = collector.summary()
        report["enabled"] = True
        return report
    return {"enabled": False, "reason": "token_usage_enabled is False"}


def reset_token_usage(self) -> None:
    """
    Clear all recorded token usage; does nothing when accounting is disabled.
    """
    collector = self._token_usage
    if collector is not None:
        collector.reset()
def test_predicate_browser_agent_token_usage_is_opt_in_and_best_effort() -> None:
    """
    With `token_usage_enabled=True`, one step is counted and a reset zeroes it.
    """

    async def _scenario() -> None:
        runtime = AgentRuntime(backend=MockBackend(), tracer=MockTracer())

        # Every snapshot request returns the same single-element page.
        s0 = make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)])

        async def fake_snapshot(**_kwargs):
            runtime.last_snapshot = s0
            return runtime.last_snapshot

        runtime.snapshot = AsyncMock(side_effect=fake_snapshot)  # type: ignore[method-assign]

        agent = PredicateBrowserAgent(
            runtime=runtime,
            executor=TokenProviderStub(response="FINISH()"),
            config=PredicateBrowserAgentConfig(token_usage_enabled=True),
        )

        result = await agent.step(
            task_goal="test",
            step=RuntimeStep(goal="No-op", verifications=[]),
        )
        assert result.ok is True

        # The stub reports 18 total tokens per call under the "executor" role.
        after_step = agent.get_token_usage()
        assert after_step["enabled"] is True
        assert after_step["total"]["total_tokens"] >= 18
        assert after_step["by_role"]["executor"]["calls"] >= 1

        agent.reset_token_usage()
        after_reset = agent.get_token_usage()
        assert after_reset["total"]["total_tokens"] == 0

    asyncio.run(_scenario())