Providers¶

Bases: LLM

OpenAI LLM provider.

Implements the LLM interface for OpenAI's GPT models, including support for structured outputs via the responses.parse API.

The API key is read from the OPENAI_API_KEY environment variable unless one is passed explicitly via the `api_key` argument.

Attributes:

Name	Type	Description
`client`		The async OpenAI client instance.

Example

llm = OpenAI(model="gpt-4o", input_cost=2.5, output_cost=10.0)
response = await llm.get_response("Hello, GPT!")

Source code in majordomo_llm/providers/openai.py

class OpenAI(LLM):
    """OpenAI LLM provider.

    Implements the LLM interface for OpenAI's GPT models, including
    support for structured outputs via the responses.parse API.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Attributes:
        client: The async OpenAI client instance.

    Example:
        >>> llm = OpenAI(
        ...     model="gpt-4o",
        ...     input_cost=2.5,
        ...     output_cost=10.0,
        ... )
        >>> response = await llm.get_response("Hello, GPT!")
    """

    def __init__(
        self,
        model: str,
        input_cost: float,
        output_cost: float,
        supports_temperature_top_p: bool = True,
        use_web_search: bool = False,
        *,
        cached_input_cost: float | None = None,
        cache_write_cost: float | None = None,
        api_key: str | None = None,
        api_key_alias: str | None = None,
        base_url: str | None = None,
        default_headers: dict[str, str] | None = None,
    ) -> None:
        """Set up the provider state and the async OpenAI client.

        Args:
            model: GPT model identifier (e.g., "gpt-4o", "gpt-5").
            input_cost: USD per million input tokens.
            output_cost: USD per million output tokens.
            supports_temperature_top_p: Whether the model accepts
                ``temperature`` / ``top_p``.
            use_web_search: Turn on the Responses API ``web_search_preview``
                tool.
            cached_input_cost: USD per million cache-read tokens. OpenAI
                counts cached tokens inside the input-token total, so this
                rate re-prices that subset below ``input_cost``.
            cache_write_cost: Not used by OpenAI (there is no separate
                cache-write rate); accepted so every provider shares one
                factory signature.
            api_key: Explicit API key; when omitted the ``OPENAI_API_KEY``
                environment variable is used.
            api_key_alias: Optional human-readable label for the key.
            base_url: Optional alternate base URL (e.g. a proxy).
            default_headers: Optional headers attached to every request.

        Raises:
            ConfigurationError: If no API key is provided and env var is not set.
        """
        key = resolve_api_key(api_key, "OPENAI_API_KEY", "OpenAI")

        # Everything the shared LLM base class needs, gathered in one place.
        base_settings: dict[str, Any] = {
            "provider": "openai",
            "model": model,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "cached_input_cost": cached_input_cost,
            "cache_write_cost": cache_write_cost,
            "supports_temperature_top_p": supports_temperature_top_p,
            "use_web_search": use_web_search,
            "api_key": key,
            "api_key_alias": api_key_alias,
            "base_url": base_url,
            "default_headers": default_headers,
        }
        super().__init__(**base_settings)

        # The client reads base_url/default_headers back from the instance,
        # i.e. whatever values the base class stored for them.
        self.client = openai.AsyncOpenAI(
            api_key=key,
            base_url=self.base_url,
            default_headers=self.default_headers,
        )

    def _web_search_kwargs(self) -> dict[str, Any]:
        """Return ``tools=`` kwarg for the Responses API when web search is on.

        OpenAI bills the web_search_preview tool's tokens through normal output
        tokens, so no separate ``tool_use_cost`` is added.
        """
        if not self.use_web_search:
            return {}
        return {"tools": [{"type": "web_search_preview"}]}

    @retry_provider_call
    async def _get_response_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Fetch a plain text completion from OpenAI.

        This is the ``retry_provider_call``-decorated entry point; the request
        itself is performed by ``_get_response``, to which every argument is
        forwarded unchanged.
        """
        result = await self._get_response(
            user_prompt,
            system_prompt,
            temperature,
            top_p,
            extra_headers=extra_headers,
        )
        return result

    async def _get_response(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Send one non-streaming Responses API request and package the result.

        Args:
            user_prompt: Text sent as the request ``input``.
            system_prompt: Text sent as the request ``instructions``.
            temperature: Sampling temperature; only forwarded when
                ``self.supports_temperature_top_p`` is true.
            top_p: Nucleus-sampling value; only forwarded when
                ``self.supports_temperature_top_p`` is true.
            extra_headers: Optional per-request headers passed to the client.

        Returns:
            An ``LLMResponse`` holding the output text, token counts, the
            costs computed by ``_calculate_costs`` and the wall-clock time.

        Raises:
            ProviderError: If the client raises ``openai.APIError``.
        """
        start_time = time.time()

        request_kwargs: dict[str, Any] = {
            "model": self.model,
            "instructions": system_prompt,
            "input": user_prompt,
            "extra_headers": extra_headers,
        }
        # Sampling parameters are only sent to models that accept them.
        if self.supports_temperature_top_p:
            request_kwargs["temperature"] = temperature
            request_kwargs["top_p"] = top_p
        web_search_kwargs = self._web_search_kwargs()

        try:
            response = await self.client.responses.create(
                **request_kwargs,
                **web_search_kwargs,
            )
        except openai.APIError as e:
            raise ProviderError(
                f"OpenAI API error: {e}",
                provider="openai",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time
        usage = response.usage
        assert usage is not None
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # The token-details object (or its cached count) may be absent or
        # None; count that as zero cached tokens rather than failing, the same
        # way _get_json_schema_response reads it.
        token_details = getattr(usage, "input_tokens_details", None)
        cached_tokens = getattr(token_details, "cached_tokens", 0) or 0
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens
        )

        return LLMResponse(
            content=response.output_text,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            deprecation_warning=self.deprecation_warning,
        )

    async def _get_response_stream_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMStreamResponse:
        """Get a streaming text response from OpenAI.

        Args:
            user_prompt: Text sent as the request ``input``.
            system_prompt: Text sent as the request ``instructions``.
            temperature: Sampling temperature; only forwarded when
                ``self.supports_temperature_top_p`` is true.
            top_p: Nucleus-sampling value; only forwarded when
                ``self.supports_temperature_top_p`` is true.
            extra_headers: Optional per-request headers passed to the client.

        Returns:
            An ``LLMStreamResponse`` wrapping an async generator of text
            deltas plus a ``_StreamState`` whose token counters are filled in
            when the ``response.completed`` event arrives.

        Raises:
            ProviderError: If the client raises ``openai.APIError`` while
                opening the stream or while it is being consumed.
        """
        state = _StreamState()

        request_kwargs: dict[str, Any] = {
            "model": self.model,
            "instructions": system_prompt,
            "input": user_prompt,
            "stream": True,
            "extra_headers": extra_headers,
        }
        # Sampling parameters are only sent to models that accept them.
        if self.supports_temperature_top_p:
            request_kwargs["temperature"] = temperature
            request_kwargs["top_p"] = top_p
        web_search_kwargs = self._web_search_kwargs()

        try:
            response = await self.client.responses.create(
                **request_kwargs,
                **web_search_kwargs,
            )
        except openai.APIError as e:
            raise ProviderError(
                f"OpenAI API error: {e}",
                provider="openai",
                original_error=e,
            ) from e

        async def generator() -> AsyncIterator[str]:
            try:
                async for event in response:
                    if event.type == "response.output_text.delta":
                        yield event.delta
                    elif event.type == "response.completed":
                        usage = event.response.usage
                        assert usage is not None
                        state.input_tokens = usage.input_tokens
                        state.output_tokens = usage.output_tokens
                        # The token-details object (or its cached count) may
                        # be absent or None; record zero instead of failing
                        # after the text has already been streamed.
                        token_details = getattr(usage, "input_tokens_details", None)
                        state.cached_tokens = getattr(token_details, "cached_tokens", 0) or 0
            except openai.APIError as e:
                raise ProviderError(
                    f"OpenAI API error: {e}",
                    provider="openai",
                    original_error=e,
                ) from e

        return LLMStreamResponse(stream=generator(), state=state, llm=self)

    async def _get_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, object],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Request schema-constrained JSON through OpenAI structured outputs.

        The caller's schema is passed through ``_enforce_openai_strict_schema``
        and sent as a strict ``json_schema`` text format. The returned text is
        run through ``canonicalize_json_schema_output`` together with the
        caller's original (un-tightened) schema.

        Raises:
            ProviderError: If the client raises ``openai.APIError``.
        """
        start_time = time.time()

        format_spec: dict[str, object] = {
            "type": "json_schema",
            "name": schema_name,
            "schema": _enforce_openai_strict_schema(response_schema),
            "strict": True,
        }
        # The description is optional and only sent when the caller gave one.
        if schema_description is not None:
            format_spec["description"] = schema_description
        text_config: Any = {"format": format_spec}

        call_kwargs: dict[str, Any] = {
            "model": self.model,
            "instructions": system_prompt,
            "input": user_prompt,
            "text": text_config,
            "extra_headers": extra_headers,
        }
        if self.supports_temperature_top_p:
            call_kwargs["temperature"] = temperature
            call_kwargs["top_p"] = top_p
        web_search_kwargs = self._web_search_kwargs()

        try:
            response = await self.client.responses.create(
                **call_kwargs,
                **web_search_kwargs,
            )
        except openai.APIError as e:
            raise ProviderError(
                f"OpenAI API error: {e}",
                provider="openai",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time
        assert response.usage is not None
        usage = response.usage
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # A missing or None details object / cached count is counted as zero.
        token_details = getattr(usage, "input_tokens_details", None)
        cached_tokens = getattr(token_details, "cached_tokens", 0) or 0
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens
        )

        return LLMResponse(
            content=canonicalize_json_schema_output(response.output_text, response_schema),
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
        )

`__init__` ¶

__init__(model, input_cost, output_cost, supports_temperature_top_p=True, use_web_search=False, *, cached_input_cost=None, cache_write_cost=None, api_key=None, api_key_alias=None, base_url=None, default_headers=None)

Initialize the OpenAI provider.

Parameters:

Name	Type	Description	Default
`model`	`str`	The GPT model identifier (e.g., "gpt-4o", "gpt-5").	required
`input_cost`	`float`	Cost per million input tokens in USD.	required
`output_cost`	`float`	Cost per million output tokens in USD.	required
`supports_temperature_top_p`	`bool`	Whether temperature/top_p are supported.	`True`
`use_web_search`	`bool`	Enable Responses API `web_search_preview` tool.	`False`
`cached_input_cost`	`float \| None`	Cost per million cache-read tokens in USD. OpenAI reports cached tokens as a subset of input tokens, so this re-prices them below `input_cost`.	`None`
`cache_write_cost`	`float \| None`	Unused by OpenAI (no distinct cache-write rate); accepted for a uniform factory signature.	`None`
`api_key`	`str \| None`	Optional API key. Defaults to `OPENAI_API_KEY` env var.	`None`
`api_key_alias`	`str \| None`	Optional human-readable name for the API key.	`None`
`base_url`	`str \| None`	Optional custom base URL for routing through a proxy.	`None`
`default_headers`	`dict[str, str] \| None`	Optional headers sent with every request.	`None`

Raises:

Type	Description
`ConfigurationError`	If no API key is provided and env var is not set.

Source code in majordomo_llm/providers/openai.py

def __init__(
    self,
    model: str,
    input_cost: float,
    output_cost: float,
    supports_temperature_top_p: bool = True,
    use_web_search: bool = False,
    *,
    cached_input_cost: float | None = None,
    cache_write_cost: float | None = None,
    api_key: str | None = None,
    api_key_alias: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
) -> None:
    """Set up the provider state and the async OpenAI client.

    Args:
        model: GPT model identifier (e.g., "gpt-4o", "gpt-5").
        input_cost: USD per million input tokens.
        output_cost: USD per million output tokens.
        supports_temperature_top_p: Whether the model accepts
            ``temperature`` / ``top_p``.
        use_web_search: Turn on the Responses API ``web_search_preview``
            tool.
        cached_input_cost: USD per million cache-read tokens. OpenAI counts
            cached tokens inside the input-token total, so this rate
            re-prices that subset below ``input_cost``.
        cache_write_cost: Not used by OpenAI (there is no separate
            cache-write rate); accepted so every provider shares one
            factory signature.
        api_key: Explicit API key; when omitted the ``OPENAI_API_KEY``
            environment variable is used.
        api_key_alias: Optional human-readable label for the key.
        base_url: Optional alternate base URL (e.g. a proxy).
        default_headers: Optional headers attached to every request.

    Raises:
        ConfigurationError: If no API key is provided and env var is not set.
    """
    key = resolve_api_key(api_key, "OPENAI_API_KEY", "OpenAI")

    # Everything the shared LLM base class needs, gathered in one place.
    base_settings: dict[str, Any] = {
        "provider": "openai",
        "model": model,
        "input_cost": input_cost,
        "output_cost": output_cost,
        "cached_input_cost": cached_input_cost,
        "cache_write_cost": cache_write_cost,
        "supports_temperature_top_p": supports_temperature_top_p,
        "use_web_search": use_web_search,
        "api_key": key,
        "api_key_alias": api_key_alias,
        "base_url": base_url,
        "default_headers": default_headers,
    }
    super().__init__(**base_settings)

    # The client reads base_url/default_headers back from the instance,
    # i.e. whatever values the base class stored for them.
    self.client = openai.AsyncOpenAI(
        api_key=key,
        base_url=self.base_url,
        default_headers=self.default_headers,
    )

Bases: LLM

Anthropic (Claude) LLM provider.

Implements the LLM interface for Anthropic's Claude models, including support for tool calling for structured outputs and optional web search.

The API key is read from the ANTHROPIC_API_KEY environment variable unless one is passed explicitly via the `api_key` argument.

Attributes:

Name	Type	Description
`client`		The async Anthropic client instance.

Example

llm = Anthropic(model="claude-sonnet-4-20250514", input_cost=3.0, output_cost=15.0)
response = await llm.get_response("Hello, Claude!")

Source code in majordomo_llm/providers/anthropic.py

class Anthropic(LLM):
    """Anthropic (Claude) LLM provider.

    Implements the LLM interface for Anthropic's Claude models, including
    support for tool calling for structured outputs and optional web search.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable.

    Attributes:
        client: The async Anthropic client instance.

    Example:
        >>> llm = Anthropic(
        ...     model="claude-sonnet-4-20250514",
        ...     input_cost=3.0,
        ...     output_cost=15.0,
        ... )
        >>> response = await llm.get_response("Hello, Claude!")
    """

    #: Anthropic reports cache-read/cache-write tokens separately from
    #: ``input_tokens``, so cache cost is added on top of the uncached input.
    _cache_accounting = "additive"

    def __init__(
        self,
        model: str,
        input_cost: float,
        output_cost: float,
        supports_temperature_top_p: bool = True,
        use_web_search: bool = False,
        supports_structured_outputs: bool = False,
        reasoning_effort: str | None = None,
        thinking: str | None = None,
        *,
        cached_input_cost: float | None = None,
        cache_write_cost: float | None = None,
        use_prompt_caching: bool = True,
        api_key: str | None = None,
        api_key_alias: str | None = None,
        base_url: str | None = None,
        default_headers: dict[str, str] | None = None,
    ) -> None:
        """Validate the configuration and create the async Anthropic client.

        Args:
            model: Claude model identifier (e.g., "claude-sonnet-4-20250514").
            input_cost: USD per million input tokens.
            output_cost: USD per million output tokens.
            supports_temperature_top_p: Whether the model accepts
                ``temperature`` / ``top_p``.
            use_web_search: Enable web search (requires
                claude-sonnet-4-5-20250929).
            supports_structured_outputs: Whether the model offers native
                structured outputs (constrained decoding through
                ``output_config.format``). When False, structured JSON
                requests fall back to forced tool calling. Defaults to False.
            reasoning_effort: Optional ``output_config.effort`` level sent
                with every request — one of ``low``, ``medium``, ``high``,
                ``xhigh``, ``max``. It governs thinking depth and overall
                token spend on the 4.7+/5 generation. ``None`` (default)
                sends no effort, so the API default (``high``) applies. To
                expose several effort profiles, register the same SKU under
                multiple YAML keys (via the ``model`` override).
            thinking: Optional ``thinking.type`` mode sent with every
                request — ``adaptive`` (Claude decides how much to think; the
                on-mode for the 4.6+/5 generation) or ``disabled``. ``None``
                (default) omits the field, so the model runs without
                thinking. Effort only meaningfully modulates depth when
                thinking is on, so pair the two. Note: ``disabled`` is
                rejected on Fable 5 (thinking is always on), and with
                thinking on the fixed ``max_tokens`` (1024 for plain
                responses, 4096/8192 for structured) covers thinking +
                answer — raise it via a dedicated config entry if answers
                truncate.
            cached_input_cost: USD per million cache-read tokens
                (``cache_read_input_tokens``), billed on top of uncached
                input.
            cache_write_cost: USD per million cache-creation tokens
                (``cache_creation_input_tokens``), billed on top of uncached
                input.
            use_prompt_caching: When ``True`` (default), the system prompt is
                sent with an ephemeral ``cache_control`` breakpoint so
                Anthropic caches it. Set ``False`` to disable prompt caching
                (e.g. for short, non-reused system prompts where the
                cache-write premium is wasted).
            api_key: Explicit API key; when omitted the ``ANTHROPIC_API_KEY``
                environment variable is used.
            api_key_alias: Optional human-readable label for the key.
            base_url: Optional alternate base URL (e.g. a proxy).
            default_headers: Optional headers attached to every request.

        Raises:
            ConfigurationError: If no API key is provided and env var is not set.
            ValueError: If ``reasoning_effort`` or ``thinking`` is invalid.
        """
        # Reject bad configuration before resolving the key or touching state.
        effort_ok = reasoning_effort is None or reasoning_effort in _EFFORT_LEVELS
        if not effort_ok:
            allowed_efforts = ", ".join(sorted(_EFFORT_LEVELS))
            raise ValueError(
                f"Invalid Anthropic reasoning_effort '{reasoning_effort}'. Valid: {allowed_efforts}"
            )
        thinking_ok = thinking is None or thinking in _THINKING_MODES
        if not thinking_ok:
            allowed_modes = ", ".join(sorted(_THINKING_MODES))
            raise ValueError(f"Invalid Anthropic thinking mode '{thinking}'. Valid: {allowed_modes}")

        key = resolve_api_key(api_key, "ANTHROPIC_API_KEY", "Anthropic")

        # Anthropic-only settings are stored before the base initialiser runs.
        self.supports_structured_outputs = supports_structured_outputs
        self.reasoning_effort = reasoning_effort
        self.thinking = thinking

        base_settings: dict[str, Any] = {
            "provider": "anthropic",
            "model": model,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "cached_input_cost": cached_input_cost,
            "cache_write_cost": cache_write_cost,
            "use_prompt_caching": use_prompt_caching,
            "supports_temperature_top_p": supports_temperature_top_p,
            "use_web_search": use_web_search,
            "api_key": key,
            "api_key_alias": api_key_alias,
            "base_url": base_url,
            "default_headers": default_headers,
        }
        super().__init__(**base_settings)

        # The client reads base_url/default_headers back from the instance,
        # i.e. whatever values the base class stored for them.
        self.client = anthropic.AsyncAnthropic(
            api_key=key,
            base_url=self.base_url,
            default_headers=self.default_headers,
        )

    def _config_create_kwargs(self, fmt: dict[str, Any] | None = None) -> dict[str, Any]:
        """Build config-derived ``messages.create`` kwargs, for splatting.

        Combines the optional structured-output ``format`` with the configured
        ``reasoning_effort`` (both under ``output_config``) and the configured
        ``thinking`` mode. Returns ``{}`` (a no-op splat) when none are present,
        so callers can uniformly write ``**self._config_create_kwargs()`` without
        a conditional.
        """
        kwargs: dict[str, Any] = {}
        output_config: dict[str, Any] = {}
        if fmt is not None:
            output_config["format"] = fmt
        if self.reasoning_effort is not None:
            output_config["effort"] = self.reasoning_effort
        if output_config:
            kwargs["output_config"] = output_config
        if self.thinking is not None:
            kwargs["thinking"] = {"type": self.thinking}
        return kwargs

    # Anthropic bills server-side web search at $10 per 1,000 requests.
    _WEB_SEARCH_COST_PER_REQUEST = 0.01

    def _compute_web_search_cost(self, response: Any) -> float:
        """Return the per-call web-search fee charged by Anthropic.

        Reads ``response.usage.server_tool_use.web_search_requests`` which is
        populated only when the web_search tool was actually invoked.
        """
        server_tool_use = getattr(response.usage, "server_tool_use", None)
        if server_tool_use is None:
            return 0.0
        requests = getattr(server_tool_use, "web_search_requests", 0) or 0
        return requests * self._WEB_SEARCH_COST_PER_REQUEST

    @retry_provider_call
    async def _get_response_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Fetch a plain text completion from Anthropic.

        A default system prompt is substituted when none is given. The web
        search tool is attached when ``use_web_search`` is on, and its
        per-request fee is added to ``total_cost`` and reported separately as
        ``tool_use_cost``. All ``text`` content blocks of the reply are joined
        with newlines to form the response content.

        Raises:
            ProviderError: If the client raises ``anthropic.APIError``.
        """
        if system_prompt is None:
            system_prompt = "You are a helpful assistant"
        start_time = time.time()

        messages = _anthropic_user_message(user_prompt)
        system_message = _anthropic_system_prompt(system_prompt, self.use_prompt_caching)

        tools: list[Any] = (
            [WebSearchTool20250305Param(type="web_search_20250305", name="web_search")]
            if self.use_web_search
            else []
        )

        # Sampling parameters are only sent to models that accept them.
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "top_p": top_p}

        try:
            response_message = await self.client.messages.create(
                model=self.model,
                max_tokens=1024,
                system=system_message,
                messages=messages,
                **sampling,
                tools=tools,
                tool_choice=ToolChoiceAutoParam(type="auto"),
                **self._config_create_kwargs(),
                extra_headers=extra_headers,
            )
        except anthropic.APIError as e:
            raise ProviderError(
                f"Anthropic API error: {e}",
                provider="anthropic",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time
        text_parts = [block.text for block in response_message.content if block.type == "text"]

        usage = response_message.usage
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # The cache counters may be None; count that as zero.
        cached_tokens = usage.cache_read_input_tokens or 0
        cache_creation_tokens = usage.cache_creation_input_tokens or 0
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens, cache_creation_tokens
        )
        tool_use_cost = self._compute_web_search_cost(response_message)
        total_cost += tool_use_cost

        return LLMResponse(
            content="\n".join(text_parts),
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            cache_creation_tokens=cache_creation_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            tool_use_cost=tool_use_cost,
            deprecation_warning=self.deprecation_warning,
        )

    async def _get_response_stream_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMStreamResponse:
        """Open a streaming text completion from Anthropic.

        A default system prompt is substituted when none is given. The
        returned ``LLMStreamResponse`` wraps an async generator that yields
        text deltas and, as a side effect, records token usage on the shared
        ``_StreamState``: input and cache counters from ``message_start``,
        output tokens from ``message_delta``.

        Raises:
            ProviderError: If the client raises ``anthropic.APIError`` while
                opening the stream or while it is being consumed.
        """
        if system_prompt is None:
            system_prompt = "You are a helpful assistant"

        state = _StreamState()
        messages = _anthropic_user_message(user_prompt)
        system_message = _anthropic_system_prompt(system_prompt, self.use_prompt_caching)

        # Sampling parameters are only sent to models that accept them.
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "top_p": top_p}

        try:
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=1024,
                system=system_message,
                messages=messages,
                **sampling,
                stream=True,
                **self._config_create_kwargs(),
                extra_headers=extra_headers,
            )
        except anthropic.APIError as e:
            raise ProviderError(
                f"Anthropic API error: {e}",
                provider="anthropic",
                original_error=e,
            ) from e

        async def generator() -> AsyncIterator[str]:
            try:
                async for event in response:
                    kind = event.type
                    if kind == "content_block_delta":
                        # Only text deltas are surfaced to the caller.
                        if event.delta.type == "text_delta":
                            yield event.delta.text
                    elif kind == "message_start":
                        start_usage = event.message.usage
                        state.input_tokens = start_usage.input_tokens
                        state.cached_tokens = start_usage.cache_read_input_tokens or 0
                        state.cache_creation_tokens = (
                            start_usage.cache_creation_input_tokens or 0
                        )
                    elif kind == "message_delta":
                        state.output_tokens = event.usage.output_tokens
            except anthropic.APIError as e:
                raise ProviderError(
                    f"Anthropic API error: {e}",
                    provider="anthropic",
                    original_error=e,
                ) from e

        return LLMStreamResponse(stream=generator(), state=state, llm=self)

    async def _get_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Produce schema-conforming JSON from Anthropic.

        Three routes, chosen in this order:

        1. Web search enabled — always the forced-tool web-search helper,
           because structured outputs and the ``web_search`` tool cannot be
           combined.
        2. Model supports native structured outputs — constrained decoding
           via ``output_config.format``, where the model physically cannot
           emit malformed or missing-key output.
        3. Otherwise — forced tool calling.

        Every path validates against the caller's original schema and rejects
        an empty/all-null result via :func:`canonicalize_json_schema_output`.
        """
        if not self.use_web_search:
            if self.supports_structured_outputs:
                # The native path takes no schema name/description.
                return await self._native_json_schema_response(
                    user_prompt=user_prompt,
                    response_schema=response_schema,
                    system_prompt=system_prompt,
                    temperature=temperature,
                    top_p=top_p,
                    extra_headers=extra_headers,
                )
            return await self._forced_tool_json_schema_response(
                user_prompt=user_prompt,
                response_schema=response_schema,
                system_prompt=system_prompt,
                schema_name=schema_name,
                schema_description=schema_description,
                temperature=temperature,
                top_p=top_p,
                extra_headers=extra_headers,
            )

        # Web-search route: temperature/top_p are not forwarded to the helper.
        response, execution_time = await self._json_schema_response_with_web_search_helper(
            user_prompt=user_prompt,
            response_schema=response_schema,
            system_prompt=system_prompt,
            schema_name=schema_name,
            schema_description=schema_description,
            extra_headers=extra_headers,
        )
        tool_payload = _extract_tool_use_content(response.content, schema_name)
        return self._finalize_json_schema_response(
            tool_payload, response, execution_time, response_schema
        )

    async def _native_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None,
        temperature: float,
        top_p: float,
        extra_headers: dict[str, str] | None,
    ) -> LLMResponse:
        """Request schema-constrained JSON through ``output_config.format``.

        The schema placed on the wire is the caller's schema run through
        :func:`enforce_strict_object_schema` and then
        :func:`strip_unsupported_schema_constraints`, because the constrained
        decoder wants strict object schemas and rejects some validation
        keywords (numeric/string/array bounds, ``pattern``, ``format``). The
        untouched ``response_schema`` is what gets handed to
        :meth:`_finalize_json_schema_response`, so the stripped constraints
        are still checked after the fact.

        Args:
            user_prompt: The user message to send.
            response_schema: The caller's original JSON schema.
            system_prompt: Optional system prompt; a generic assistant prompt
                is substituted when ``None``.
            temperature: Sampling temperature (sent only when the model
                supports temperature/top_p).
            top_p: Nucleus sampling value (sent only when the model supports
                temperature/top_p).
            extra_headers: Optional per-request headers.

        Returns:
            The finalized ``LLMResponse``.

        Raises:
            ProviderError: If the Anthropic SDK raises ``anthropic.APIError``.
            ResponseParsingError: If the response stopped with a refusal.
        """
        wire_schema = strip_unsupported_schema_constraints(
            enforce_strict_object_schema(response_schema)
        )
        format_kwargs = self._config_create_kwargs(
            fmt={"type": "json_schema", "schema": wire_schema}
        )

        effective_system = (
            "You are a helpful assistant." if system_prompt is None else system_prompt
        )
        messages = _anthropic_user_message(user_prompt)
        system_message = _anthropic_system_prompt(effective_system, self.use_prompt_caching)

        start_time = time.time()
        # Models without temperature/top_p support get a larger output budget
        # and no sampling parameters.
        if self.supports_temperature_top_p:
            sampling_kwargs: dict[str, Any] = {
                "max_tokens": 4096,
                "temperature": temperature,
                "top_p": top_p,
            }
        else:
            sampling_kwargs = {"max_tokens": 8192}

        try:
            response = await self.client.messages.create(
                model=self.model,
                system=system_message,
                messages=messages,
                **sampling_kwargs,
                **format_kwargs,
                extra_headers=extra_headers,
            )
        except anthropic.APIError as e:
            raise ProviderError(
                f"Anthropic API error: {e}",
                provider="anthropic",
                original_error=e,
            ) from e

        elapsed = time.time() - start_time
        if response.stop_reason == "refusal":
            raise ResponseParsingError(
                "Anthropic refused the structured-output request.",
                raw_content=str(response.content),
            )
        return self._finalize_json_schema_response(
            _extract_structured_text(response.content),
            response,
            elapsed,
            response_schema,
        )

    async def _forced_tool_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None,
        schema_name: str,
        schema_description: str | None,
        temperature: float,
        top_p: float,
        extra_headers: dict[str, str] | None,
    ) -> LLMResponse:
        """Obtain structured JSON by forcing a single tool call.

        Used for models that lack native structured outputs. The schema is
        sent as the tool's ``input_schema`` in strict form (full ``required``)
        so an empty ``{}`` fails validation loudly, while an all-null result
        is caught by the emptiness check in
        :func:`canonicalize_json_schema_output`. Both surface as
        :class:`EmptyStructuredResponseError`, which
        :func:`~majordomo_llm.retry.retry_provider_call` re-samples before it
        propagates.

        Args:
            user_prompt: The user message to send.
            response_schema: The caller's original JSON schema.
            system_prompt: Optional system prompt; the tool instruction is
                appended to it (or to a generic prompt when ``None``).
            schema_name: Name of the forced tool.
            schema_description: Optional tool description; a generic one is
                generated when falsy.
            temperature: Sampling temperature (sent only when supported).
            top_p: Nucleus sampling value (sent only when supported).
            extra_headers: Optional per-request headers.

        Returns:
            The finalized ``LLMResponse``.

        Raises:
            ProviderError: If the Anthropic SDK raises ``anthropic.APIError``.
        """
        directive = f"Use the {schema_name} tool to provide your answer."
        full_system_prompt = (
            f"You are a helpful assistant. {directive}"
            if system_prompt is None
            else f"{system_prompt}\n\n{directive}"
        )

        messages = _anthropic_user_message(user_prompt)
        system_message = _anthropic_system_prompt(full_system_prompt, self.use_prompt_caching)
        answer_tool = ToolParam(
            name=schema_name,
            description=schema_description
            or f"Provide a structured response using the {schema_name} JSON schema",
            input_schema=enforce_strict_object_schema(response_schema),
        )

        start_time = time.time()
        # Models without temperature/top_p support get a larger output budget
        # and no sampling parameters.
        if self.supports_temperature_top_p:
            sampling_kwargs: dict[str, Any] = {
                "max_tokens": 4096,
                "temperature": temperature,
                "top_p": top_p,
            }
        else:
            sampling_kwargs = {"max_tokens": 8192}

        try:
            response = await self.client.messages.create(
                model=self.model,
                system=system_message,
                messages=messages,
                tools=[answer_tool],
                # Force the model to answer through the schema tool.
                tool_choice=ToolChoiceToolParam(type="tool", name=schema_name),
                **sampling_kwargs,
                **self._config_create_kwargs(),
                extra_headers=extra_headers,
            )
        except anthropic.APIError as e:
            raise ProviderError(
                f"Anthropic API error: {e}",
                provider="anthropic",
                original_error=e,
            ) from e

        elapsed = time.time() - start_time
        return self._finalize_json_schema_response(
            _extract_tool_use_content(response.content, schema_name),
            response,
            elapsed,
            response_schema,
        )

    def _finalize_json_schema_response(
        self,
        content: Any,
        response: Any,
        execution_time: float,
        response_schema: dict[str, Any],
    ) -> LLMResponse:
        """Turn a raw provider response into a costed, validated ``LLMResponse``.

        Args:
            content: The structured payload extracted from the response.
            response: The provider response; its ``usage`` counters are read.
            execution_time: Wall-clock duration of the request in seconds.
            response_schema: The caller's original schema, passed to
                :func:`canonicalize_json_schema_output` together with
                ``content``.

        Returns:
            An ``LLMResponse`` carrying the canonicalized content, token
            counts, costs (including any web-search fee) and timing.
        """
        usage = response.usage
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # The cache counters may be None; report them as zero.
        cached_tokens = usage.cache_read_input_tokens or 0
        cache_creation_tokens = usage.cache_creation_input_tokens or 0

        input_cost, output_cost, token_total = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens, cache_creation_tokens
        )
        tool_use_cost = self._compute_web_search_cost(response)
        total_cost = token_total + tool_use_cost

        validated_content = canonicalize_json_schema_output(content, response_schema)
        return LLMResponse(
            content=validated_content,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            cache_creation_tokens=cache_creation_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            tool_use_cost=tool_use_cost,
        )

    async def _json_schema_response_with_web_search_helper(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        extra_headers: dict[str, str] | None = None,
    ) -> tuple[Any, float]:
        """Run the web-search conversation and return the structured reply.

        The model is offered two tools (the structured-response tool and
        ``web_search``) with automatic tool choice, for at most three turns.
        If a turn calls the structured-response tool, that response is
        returned immediately. If a turn calls ``web_search``, the assistant
        content and a "continue" user message are appended and another turn
        is attempted. In every other case (or once the turns are used up) one
        final request is made with only the structured-response tool, forced.

        Args:
            user_prompt: The user message to send.
            response_schema: The caller's JSON schema, sent in strict form as
                the tool's ``input_schema``.
            system_prompt: Optional system prompt; the tool instruction is
                appended to it (or to a generic prompt when ``None``).
            schema_name: Name of the structured-response tool.
            schema_description: Optional tool description; a generic one is
                generated when falsy.
            extra_headers: Optional per-request headers.

        Returns:
            A ``(response, elapsed_seconds)`` tuple.

        Raises:
            ProviderError: If the Anthropic SDK raises ``anthropic.APIError``.
        """
        structured_response_tool = ToolParam(
            name=schema_name,
            description=schema_description
            or f"Provide a structured response using the {schema_name} JSON schema",
            input_schema=enforce_strict_object_schema(response_schema),
        )
        search_tool = WebSearchTool20250305Param(
            name="web_search",
            type="web_search_20250305",
        )
        offered_tools: list[Any] = [structured_response_tool, search_tool]

        directive = f"Use the {schema_name} tool to provide your answer."
        system_prompt = (
            f"You are a helpful assistant. {directive}"
            if system_prompt is None
            else f"{system_prompt}\n\n{directive}"
        )

        messages = _anthropic_user_message(user_prompt)
        system_message = _anthropic_system_prompt(system_prompt, self.use_prompt_caching)

        start_time = time.time()
        conversation = messages.copy()

        try:
            # Up to three auto-tool turns; each one may trigger a web search.
            for turn in range(1, 4):
                response = await self.client.messages.create(
                    model=self.model,
                    max_tokens=8192,
                    system=system_message,
                    messages=conversation,
                    tools=offered_tools,
                    tool_choice=ToolChoiceAutoParam(type="auto"),
                    **self._config_create_kwargs(),
                    extra_headers=extra_headers,
                )

                if response.stop_reason != "tool_use":
                    break

                called_tools = {
                    block.name for block in response.content if block.type == "tool_use"
                }
                if schema_name in called_tools:
                    # The model already produced the structured answer.
                    return response, time.time() - start_time
                if "web_search" not in called_tools:
                    break

                logger.info("Web search initiated (turn %d)", turn)
                conversation.append({"role": "assistant", "content": response.content})
                conversation.append({
                    "role": "user",
                    "content": (
                        "Continue with your analysis. Use the structured response "
                        "tool when ready to generate the final output."
                    ),
                })

            # No structured answer yet: force the structured-response tool.
            final_response = await self.client.messages.create(
                model=self.model,
                max_tokens=4096,
                system=_anthropic_system_prompt(system_prompt, self.use_prompt_caching),
                messages=conversation,
                tools=[structured_response_tool],
                tool_choice=ToolChoiceToolParam(type="tool", name=schema_name),
                **self._config_create_kwargs(),
                extra_headers=extra_headers,
            )
        except anthropic.APIError as e:
            raise ProviderError(
                f"Anthropic API error: {e}",
                provider="anthropic",
                original_error=e,
            ) from e

        return final_response, time.time() - start_time

`__init__` ¶

__init__(model, input_cost, output_cost, supports_temperature_top_p=True, use_web_search=False, supports_structured_outputs=False, reasoning_effort=None, thinking=None, *, cached_input_cost=None, cache_write_cost=None, use_prompt_caching=True, api_key=None, api_key_alias=None, base_url=None, default_headers=None)

Initialize the Anthropic provider.

Parameters:

Name	Type	Description	Default
`model`	`str`	The Claude model identifier (e.g., "claude-sonnet-4-20250514").	required
`input_cost`	`float`	Cost per million input tokens in USD.	required
`output_cost`	`float`	Cost per million output tokens in USD.	required
`supports_temperature_top_p`	`bool`	Whether temperature/top_p are supported.	`True`
`use_web_search`	`bool`	Enable web search (requires claude-sonnet-4-5-20250929).	`False`
`supports_structured_outputs`	`bool`	Whether the model supports native structured outputs (constrained decoding via `output_config.format`). When False, structured JSON requests fall back to forced tool calling. Defaults to False.	`False`
`reasoning_effort`	`str \| None`	Optional `output_config.effort` level applied to every request — one of `low`, `medium`, `high`, `xhigh`, `max`. Controls thinking depth and overall token spend on the 4.7+/5 generation. `None` (default) sends no effort, so the API default (`high`) applies. Register the same SKU under multiple YAML keys (via the `model` override) to expose distinct effort profiles.	`None`
`thinking`	`str \| None`	Optional `thinking.type` mode applied to every request — `adaptive` (Claude decides how much to think; the on-mode for the 4.6+/5 generation) or `disabled`. `None` (default) omits the field, so the model runs without thinking. Effort only meaningfully modulates depth when thinking is on, so pair the two. Note: `disabled` is rejected on Fable 5 (thinking is always on), and with thinking on the fixed `max_tokens` (1024 for plain responses, 4096/8192 for structured) covers thinking + answer — raise it via a dedicated config entry if answers truncate.	`None`
`cached_input_cost`	`float \| None`	Cost per million cache-read tokens in USD (`cache_read_input_tokens`), billed on top of uncached input.	`None`
`cache_write_cost`	`float \| None`	Cost per million cache-creation tokens in USD (`cache_creation_input_tokens`), billed on top of uncached input.	`None`
`use_prompt_caching`	`bool`	When `True` (default), the system prompt is sent with an ephemeral `cache_control` breakpoint so Anthropic caches it. Set `False` to disable prompt caching (e.g. for short, non-reused system prompts where the cache-write premium is wasted).	`True`
`api_key`	`str \| None`	Optional API key. Defaults to `ANTHROPIC_API_KEY` env var.	`None`
`api_key_alias`	`str \| None`	Optional human-readable name for the API key.	`None`
`base_url`	`str \| None`	Optional custom base URL for routing through a proxy.	`None`
`default_headers`	`dict[str, str] \| None`	Optional headers sent with every request.	`None`

Raises:

Type	Description
`ConfigurationError`	If no API key is provided and env var is not set.
`ValueError`	If `reasoning_effort` or `thinking` is invalid.

Source code in majordomo_llm/providers/anthropic.py

def __init__(
    self,
    model: str,
    input_cost: float,
    output_cost: float,
    supports_temperature_top_p: bool = True,
    use_web_search: bool = False,
    supports_structured_outputs: bool = False,
    reasoning_effort: str | None = None,
    thinking: str | None = None,
    *,
    cached_input_cost: float | None = None,
    cache_write_cost: float | None = None,
    use_prompt_caching: bool = True,
    api_key: str | None = None,
    api_key_alias: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
) -> None:
    """Set up the Anthropic provider and its async client.

    Args:
        model: Claude model identifier (e.g., "claude-sonnet-4-20250514").
        input_cost: USD per million input tokens.
        output_cost: USD per million output tokens.
        supports_temperature_top_p: Whether temperature/top_p are supported.
        use_web_search: Enable web search (requires claude-sonnet-4-5-20250929).
        supports_structured_outputs: Whether the model offers native
            structured outputs (constrained decoding through
            ``output_config.format``). If False, structured JSON requests use
            forced tool calling instead. Defaults to False.
        reasoning_effort: Optional ``output_config.effort`` level sent with
            every request: ``low``, ``medium``, ``high``, ``xhigh`` or
            ``max``. It governs thinking depth and overall token spend on the
            4.7+/5 generation. With ``None`` (default) no effort is sent and
            the API default (``high``) applies. To expose several effort
            profiles, register the same SKU under multiple YAML keys (via the
            ``model`` override).
        thinking: Optional ``thinking.type`` mode sent with every request:
            ``adaptive`` (Claude decides how much to think; the on-mode for
            the 4.6+/5 generation) or ``disabled``. With ``None`` (default)
            the field is omitted and the model runs without thinking. Effort
            only meaningfully modulates depth when thinking is on, so pair
            the two. Note: ``disabled`` is rejected on Fable 5 (thinking is
            always on), and with thinking on the fixed ``max_tokens`` (1024
            for plain responses, 4096/8192 for structured) covers thinking +
            answer; raise it via a dedicated config entry if answers truncate.
        cached_input_cost: USD per million cache-read tokens
            (``cache_read_input_tokens``), billed on top of uncached input.
        cache_write_cost: USD per million cache-creation tokens
            (``cache_creation_input_tokens``), billed on top of uncached input.
        use_prompt_caching: If ``True`` (default), the system prompt carries
            an ephemeral ``cache_control`` breakpoint so Anthropic caches it.
            Pass ``False`` to turn prompt caching off (e.g. for short,
            non-reused system prompts where the cache-write premium is wasted).
        api_key: Optional API key; falls back to the ``ANTHROPIC_API_KEY``
            env var.
        api_key_alias: Optional human-readable name for the API key.
        base_url: Optional custom base URL for routing through a proxy.
        default_headers: Optional headers sent with every request.

    Raises:
        ConfigurationError: If no API key is provided and env var is not set.
        ValueError: If ``reasoning_effort`` or ``thinking`` is invalid.
    """
    # Validate the enumerated options before touching credentials.
    if not (reasoning_effort is None or reasoning_effort in _EFFORT_LEVELS):
        choices = ", ".join(sorted(_EFFORT_LEVELS))
        raise ValueError(
            f"Invalid Anthropic reasoning_effort '{reasoning_effort}'. Valid: {choices}"
        )
    if not (thinking is None or thinking in _THINKING_MODES):
        choices = ", ".join(sorted(_THINKING_MODES))
        raise ValueError(f"Invalid Anthropic thinking mode '{thinking}'. Valid: {choices}")

    resolved_api_key = resolve_api_key(api_key, "ANTHROPIC_API_KEY", "Anthropic")

    # Provider-specific settings are stored before the base initializer runs.
    self.supports_structured_outputs = supports_structured_outputs
    self.reasoning_effort = reasoning_effort
    self.thinking = thinking

    super().__init__(
        provider="anthropic",
        model=model,
        api_key=resolved_api_key,
        api_key_alias=api_key_alias,
        base_url=base_url,
        default_headers=default_headers,
        input_cost=input_cost,
        output_cost=output_cost,
        cached_input_cost=cached_input_cost,
        cache_write_cost=cache_write_cost,
        use_prompt_caching=use_prompt_caching,
        supports_temperature_top_p=supports_temperature_top_p,
        use_web_search=use_web_search,
    )

    # The client is built from the attributes as stored on the instance.
    self.client = anthropic.AsyncAnthropic(
        api_key=resolved_api_key,
        base_url=self.base_url,
        default_headers=self.default_headers,
    )

Bases: LLM

Google Gemini LLM provider.

Implements the LLM interface for Google's Gemini models, including support for structured outputs via response schemas.

The API key is read from the GEMINI_API_KEY environment variable.

Attributes:

Name	Type	Description
`client`		The Google GenAI client instance.

Example

>>> llm = Gemini(
...     model="gemini-2.5-flash",
...     input_cost=0.30,
...     output_cost=2.50,
... )
>>> response = await llm.get_response("Hello, Gemini!")

Source code in majordomo_llm/providers/gemini.py

class Gemini(LLM):
    """Google Gemini LLM provider.

    Implements the LLM interface for Google's Gemini models, including
    support for structured outputs via response schemas.

    The API key is read from the ``GEMINI_API_KEY`` environment variable.

    Attributes:
        client: The Google GenAI client instance.

    Example:
        >>> llm = Gemini(
        ...     model="gemini-2.5-flash",
        ...     input_cost=0.30,
        ...     output_cost=2.50,
        ... )
        >>> response = await llm.get_response("Hello, Gemini!")
    """

    # Gemini bills grounded queries at $35 per 1,000 requests.
    _GROUNDED_QUERY_COST = 0.035

    def __init__(
        self,
        model: str,
        input_cost: float,
        output_cost: float,
        *,
        supports_temperature_top_p: bool = True,
        use_web_search: bool = False,
        cached_input_cost: float | None = None,
        cache_write_cost: float | None = None,
        api_key: str | None = None,
        api_key_alias: str | None = None,
        base_url: str | None = None,
        default_headers: dict[str, str] | None = None,
    ) -> None:
        """Initialize the Gemini provider.

        Args:
            model: The Gemini model identifier (e.g., "gemini-2.5-flash").
            input_cost: Cost per million input tokens in USD.
            output_cost: Cost per million output tokens in USD.
            supports_temperature_top_p: Whether the model supports
                temperature/top_p. When False, the sampling parameters are
                left out of every request config.
            use_web_search: Enable the Google Search grounding tool.
            cached_input_cost: Cost per million cached-content (cache-read) tokens
                in USD. Gemini reports cached content as a subset of prompt
                tokens, so this re-prices them below ``input_cost``.
            cache_write_cost: Unused by Gemini (implicit caching has no per-token
                write fee); accepted for a uniform factory signature.
            api_key: Optional API key. Defaults to ``GEMINI_API_KEY`` env var.
            api_key_alias: Optional human-readable name for the API key.
            base_url: Optional custom base URL for routing through a proxy.
            default_headers: Optional headers sent with every request.

        Raises:
            ConfigurationError: If no API key is provided and env var is not set.
        """
        resolved_api_key = resolve_api_key(api_key, "GEMINI_API_KEY", "Gemini")
        super().__init__(
            provider="gemini",
            model=model,
            input_cost=input_cost,
            output_cost=output_cost,
            cached_input_cost=cached_input_cost,
            cache_write_cost=cache_write_cost,
            # Forward the caller's value; it used to be hard-coded to True,
            # which silently discarded the argument.
            supports_temperature_top_p=supports_temperature_top_p,
            use_web_search=use_web_search,
            api_key=resolved_api_key,
            api_key_alias=api_key_alias,
            base_url=base_url,
            default_headers=default_headers,
        )
        # Only build HttpOptions when there is something to override.
        http_options = None
        if self.base_url or self.default_headers:
            http_options = types.HttpOptions(
                base_url=self.base_url,
                headers=self.default_headers,
            )
        self.client = genai.Client(api_key=resolved_api_key, http_options=http_options)

    def _build_config_kwargs(
        self,
        system_prompt: str | None,
        temperature: float,
        top_p: float,
        extra_headers: dict[str, str] | None,
    ) -> dict[str, Any]:
        """Assemble the ``GenerateContentConfig`` kwargs shared by all request kinds.

        Args:
            system_prompt: Optional system instruction.
            temperature: Sampling temperature; included only when the model
                supports temperature/top_p.
            top_p: Nucleus sampling value; included only when the model
                supports temperature/top_p.
            extra_headers: Optional per-request headers, sent via
                ``http_options`` when non-empty.

        Returns:
            A dict of config kwargs, with the Google Search tool attached
            when web search is enabled.
        """
        config_kwargs: dict[str, Any] = {"system_instruction": system_prompt}
        if self.supports_temperature_top_p:
            config_kwargs["temperature"] = temperature
            config_kwargs["top_p"] = top_p
        if extra_headers:
            config_kwargs["http_options"] = types.HttpOptions(headers=extra_headers)
        self._apply_web_search(config_kwargs)
        return config_kwargs

    def _apply_web_search(self, config_kwargs: dict[str, Any]) -> None:
        """Attach the Google Search tool to a request config when enabled.

        Mutates ``config_kwargs`` in place; does nothing when
        ``use_web_search`` is off.
        """
        if not self.use_web_search:
            return
        config_kwargs["tools"] = [types.Tool(google_search=types.GoogleSearch())]

    def _supports_search_with_structured_output(self) -> bool:
        """Whether this model can combine a grounding tool with a response schema.

        Grounded structured outputs are a Gemini 3 series preview feature; 2.5
        and earlier reject a request that sets both a grounding tool and a
        response schema.
        """
        return self.model == "gemini-3" or self.model.startswith(("gemini-3.", "gemini-3-"))

    def _compute_web_search_cost(self, response: Any) -> float:
        """Return the per-call grounded-query fee charged by Gemini.

        Counts response candidates that carry ``grounding_metadata`` — the
        only signal the API surfaces when a grounded query was actually
        performed.
        """
        candidates = getattr(response, "candidates", None) or []
        grounded = sum(
            1 for c in candidates if getattr(c, "grounding_metadata", None) is not None
        )
        return grounded * self._GROUNDED_QUERY_COST

    @retry_provider_call
    async def _get_response_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Get a plain text response from Gemini."""
        return await self._get_response(
            user_prompt, system_prompt, temperature, top_p, extra_headers=extra_headers
        )

    async def _get_response(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Send one non-streaming request and build the costed ``LLMResponse``.

        Raises:
            ProviderError: If the GenAI SDK raises ``genai_errors.APIError``.
        """
        start_time = time.time()
        config_kwargs = self._build_config_kwargs(
            system_prompt, temperature, top_p, extra_headers
        )
        try:
            response = await self.client.aio.models.generate_content(
                model=self.model,
                config=types.GenerateContentConfig(**config_kwargs),
                contents=user_prompt,
            )
        except genai_errors.APIError as e:
            raise ProviderError(
                f"Gemini API error: {e}",
                provider="gemini",
                original_error=e,
            ) from e
        execution_time = time.time() - start_time

        input_tokens, output_tokens, cached_tokens = _gemini_token_counts(response)
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens
        )
        tool_use_cost = self._compute_web_search_cost(response)
        total_cost += tool_use_cost

        return LLMResponse(
            content=response.text or "",
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            tool_use_cost=tool_use_cost,
            deprecation_warning=self.deprecation_warning,
        )

    async def _get_response_stream_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMStreamResponse:
        """Get a streaming text response from Gemini.

        Token counts are written to the shared stream state as chunks that
        carry ``usage_metadata`` arrive.

        Raises:
            ProviderError: If the GenAI SDK raises ``genai_errors.APIError``
                when opening the stream or while it is being consumed.
        """
        state = _StreamState()
        config_kwargs = self._build_config_kwargs(
            system_prompt, temperature, top_p, extra_headers
        )

        try:
            response = await self.client.aio.models.generate_content_stream(
                model=self.model,
                config=types.GenerateContentConfig(**config_kwargs),
                contents=user_prompt,
            )
        except genai_errors.APIError as e:
            raise ProviderError(
                f"Gemini API error: {e}",
                provider="gemini",
                original_error=e,
            ) from e

        async def generator() -> AsyncIterator[str]:
            try:
                async for chunk in response:
                    if chunk.text:
                        yield chunk.text
                    if chunk.usage_metadata:
                        # Later chunks overwrite earlier counts.
                        state.input_tokens = chunk.usage_metadata.prompt_token_count or 0
                        state.output_tokens = chunk.usage_metadata.candidates_token_count or 0
                        state.cached_tokens = (
                            chunk.usage_metadata.cached_content_token_count or 0
                        )
            except genai_errors.APIError as e:
                raise ProviderError(
                    f"Gemini API error: {e}",
                    provider="gemini",
                    original_error=e,
                ) from e

        return LLMStreamResponse(stream=generator(), state=state, llm=self)

    async def _get_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Gemini-specific implementation using response schema for structured outputs.

        ``schema_name`` and ``schema_description`` are accepted but not used
        by this implementation.

        Raises:
            ConfigurationError: If web search is enabled on a model that
                cannot combine grounding with a response schema.
            ProviderError: If the GenAI SDK raises ``genai_errors.APIError``.
        """
        if self.use_web_search and not self._supports_search_with_structured_output():
            raise ConfigurationError(
                f"Gemini model '{self.model}' does not support combining grounded "
                "web search with response_schema in the same request. Only Gemini 3 "
                "series models support grounded structured outputs. Use a separate "
                "Gemini instance with use_web_search=False for structured calls."
            )
        config_kwargs = self._build_config_kwargs(
            system_prompt, temperature, top_p, extra_headers
        )
        config_kwargs["response_schema"] = _gemini_schema(response_schema)
        config_kwargs["response_mime_type"] = "application/json"

        start_time = time.time()
        try:
            response = await self.client.aio.models.generate_content(
                model=self.model,
                config=types.GenerateContentConfig(**config_kwargs),
                contents=user_prompt,
            )
        except genai_errors.APIError as e:
            raise ProviderError(
                f"Gemini API error: {e}",
                provider="gemini",
                original_error=e,
            ) from e
        execution_time = time.time() - start_time

        input_tokens, output_tokens, cached_tokens = _gemini_token_counts(response)
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens
        )
        tool_use_cost = self._compute_web_search_cost(response)
        total_cost += tool_use_cost

        return LLMResponse(
            content=canonicalize_json_schema_output(response.text or "", response_schema),
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            tool_use_cost=tool_use_cost,
        )

`__init__` ¶

__init__(model, input_cost, output_cost, *, supports_temperature_top_p=True, use_web_search=False, cached_input_cost=None, cache_write_cost=None, api_key=None, api_key_alias=None, base_url=None, default_headers=None)

Initialize the Gemini provider.

Parameters:

Name	Type	Description	Default
`model`	`str`	The Gemini model identifier (e.g., "gemini-2.5-flash").	required
`input_cost`	`float`	Cost per million input tokens in USD.	required
`output_cost`	`float`	Cost per million output tokens in USD.	required
`supports_temperature_top_p`	`bool`	Whether the model supports temperature/top_p.	`True`
`use_web_search`	`bool`	Enable the Google Search grounding tool.	`False`
`cached_input_cost`	`float \| None`	Cost per million cached-content (cache-read) tokens in USD. Gemini reports cached content as a subset of prompt tokens, so this re-prices them below `input_cost`.	`None`
`cache_write_cost`	`float \| None`	Unused by Gemini (implicit caching has no per-token write fee); accepted for a uniform factory signature.	`None`
`api_key`	`str \| None`	Optional API key. Defaults to `GEMINI_API_KEY` env var.	`None`
`api_key_alias`	`str \| None`	Optional human-readable name for the API key.	`None`
`base_url`	`str \| None`	Optional custom base URL for routing through a proxy.	`None`
`default_headers`	`dict[str, str] \| None`	Optional headers sent with every request.	`None`

Raises:

Type	Description
`ConfigurationError`	If no API key is provided and env var is not set.

Source code in majordomo_llm/providers/gemini.py

def __init__(
    self,
    model: str,
    input_cost: float,
    output_cost: float,
    *,
    supports_temperature_top_p: bool = True,
    use_web_search: bool = False,
    cached_input_cost: float | None = None,
    cache_write_cost: float | None = None,
    api_key: str | None = None,
    api_key_alias: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
) -> None:
    """Initialize the Gemini provider.

    Args:
        model: The Gemini model identifier (e.g., "gemini-2.5-flash").
        input_cost: Cost per million input tokens in USD.
        output_cost: Cost per million output tokens in USD.
        supports_temperature_top_p: Whether the model supports temperature/top_p.
            Forwarded unchanged to the base class.
        use_web_search: Enable the Google Search grounding tool.
        cached_input_cost: Cost per million cached-content (cache-read) tokens
            in USD. Gemini reports cached content as a subset of prompt
            tokens, so this re-prices them below ``input_cost``.
        cache_write_cost: Unused by Gemini (implicit caching has no per-token
            write fee); accepted for a uniform factory signature.
        api_key: Optional API key. Defaults to ``GEMINI_API_KEY`` env var.
        api_key_alias: Optional human-readable name for the API key.
        base_url: Optional custom base URL for routing through a proxy.
        default_headers: Optional headers sent with every request.

    Raises:
        ConfigurationError: If no API key is provided and env var is not set.
    """
    resolved_api_key = resolve_api_key(api_key, "GEMINI_API_KEY", "Gemini")
    super().__init__(
        provider="gemini",
        model=model,
        input_cost=input_cost,
        output_cost=output_cost,
        cached_input_cost=cached_input_cost,
        cache_write_cost=cache_write_cost,
        # Forward the caller's flag rather than a fixed ``True`` so that
        # ``supports_temperature_top_p=False`` is actually honoured.
        supports_temperature_top_p=supports_temperature_top_p,
        use_web_search=use_web_search,
        api_key=resolved_api_key,
        api_key_alias=api_key_alias,
        base_url=base_url,
        default_headers=default_headers,
    )
    # Only build HttpOptions when there is something to override; otherwise
    # the client is created with ``http_options=None``.
    http_options = None
    if self.base_url or self.default_headers:
        http_options = types.HttpOptions(
            base_url=self.base_url,
            headers=self.default_headers,
        )
    self.client = genai.Client(api_key=resolved_api_key, http_options=http_options)

Bases: LLM

DeepSeek LLM provider.

Implements the LLM interface for DeepSeek's models using the OpenAI-compatible API. Supports both DeepSeek-V3 (chat) and DeepSeek-R1 (reasoner) models.

The API key is read from the DEEPSEEK_API_KEY environment variable.

Attributes:

Name	Type	Description
`client`		The async OpenAI client instance configured for DeepSeek.

Example

>>> llm = DeepSeek(
...     model="deepseek-chat",
...     input_cost=0.28,
...     output_cost=0.42,
... )
>>> response = await llm.get_response("Hello, DeepSeek!")

Source code in majordomo_llm/providers/deepseek.py

class DeepSeek(LLM):
    """DeepSeek LLM provider.

    Talks to DeepSeek through its OpenAI-compatible API using an
    ``openai.AsyncOpenAI`` client. Supports both DeepSeek-V3 (chat) and
    DeepSeek-R1 (reasoner) models.

    When no key is passed explicitly, the API key is read from the
    ``DEEPSEEK_API_KEY`` environment variable.

    Attributes:
        client: The async OpenAI client instance configured for DeepSeek.
        reasoning_effort: Optional reasoning effort sent with every request.
        thinking: Optional thinking mode sent with every request.

    Example:
        >>> llm = DeepSeek(
        ...     model="deepseek-chat",
        ...     input_cost=0.28,
        ...     output_cost=0.42,
        ... )
        >>> response = await llm.get_response("Hello, DeepSeek!")
    """

    DEEPSEEK_BASE_URL = "https://api.deepseek.com"
    REASONING_EFFORTS = frozenset({"minimal", "low", "medium", "high"})
    THINKING_MODES = frozenset({"enabled", "disabled"})

    def __init__(
        self,
        model: str,
        input_cost: float,
        output_cost: float,
        supports_temperature_top_p: bool = True,
        *,
        cached_input_cost: float | None = None,
        cache_write_cost: float | None = None,
        api_key: str | None = None,
        api_key_alias: str | None = None,
        base_url: str | None = None,
        default_headers: dict[str, str] | None = None,
        reasoning_effort: str | None = None,
        thinking: str | None = None,
    ) -> None:
        """Set up the DeepSeek provider and its async client.

        Args:
            model: The DeepSeek model identifier (e.g., "deepseek-chat", "deepseek-reasoner").
            input_cost: Cost per million input tokens in USD.
            output_cost: Cost per million output tokens in USD.
            supports_temperature_top_p: Whether temperature/top_p are supported.
            cached_input_cost: Cost per million cache-hit (cache-read) tokens in
                USD; a subset of input tokens re-priced below ``input_cost``.
            cache_write_cost: Unused by DeepSeek; accepted for a uniform factory
                signature.
            api_key: Optional API key. Defaults to ``DEEPSEEK_API_KEY`` env var.
            api_key_alias: Optional human-readable name for the API key.
            base_url: Optional custom base URL. Overrides DEEPSEEK_BASE_URL when set.
            default_headers: Optional headers sent with every request.
            reasoning_effort: Optional reasoning effort; must be one of
                ``REASONING_EFFORTS`` when given.
            thinking: Optional thinking mode; must be one of ``THINKING_MODES``
                ("enabled" or "disabled") when given.

        Raises:
            ConfigurationError: If no API key is provided and env var is not set.
            ValueError: If reasoning_effort or thinking is invalid.
        """
        # Reject bad option values before touching the API key or the network.
        if not (reasoning_effort is None or reasoning_effort in self.REASONING_EFFORTS):
            allowed = ", ".join(sorted(self.REASONING_EFFORTS))
            raise ValueError(
                f"Invalid DeepSeek reasoning_effort '{reasoning_effort}'. Valid: {allowed}"
            )
        if not (thinking is None or thinking in self.THINKING_MODES):
            allowed = ", ".join(sorted(self.THINKING_MODES))
            raise ValueError(f"Invalid DeepSeek thinking mode '{thinking}'. Valid: {allowed}")

        resolved_api_key = resolve_api_key(api_key, "DEEPSEEK_API_KEY", "DeepSeek")

        # A custom base URL means traffic goes through a proxy (e.g. Majordomo
        # Steward). Tag it with ``x-majordomo-provider: deepseek`` so the
        # gateway can tell DeepSeek apart from vanilla OpenAI, which uses the
        # same wire shape. Caller-supplied headers are unpacked last and so
        # win on key collision.
        if base_url is not None:
            default_headers = {
                "x-majordomo-provider": "deepseek",
                **(default_headers or {}),
            }

        super().__init__(
            provider="deepseek",
            model=model,
            input_cost=input_cost,
            output_cost=output_cost,
            cached_input_cost=cached_input_cost,
            cache_write_cost=cache_write_cost,
            supports_temperature_top_p=supports_temperature_top_p,
            api_key=resolved_api_key,
            api_key_alias=api_key_alias,
            base_url=base_url,
            default_headers=default_headers,
        )
        self.client = openai.AsyncOpenAI(
            api_key=resolved_api_key,
            base_url=self.base_url or self.DEEPSEEK_BASE_URL,
            default_headers=self.default_headers,
        )
        self.reasoning_effort = reasoning_effort
        self.thinking = thinking

    def _deepseek_request_kwargs(self) -> dict[str, Any]:
        """Return the DeepSeek-only keyword arguments for a chat request.

        Only options that were configured on the instance are included, so
        the result is empty when neither option is set.
        """
        options: dict[str, Any] = {}
        effort = self.reasoning_effort
        if effort is not None:
            options["reasoning_effort"] = effort
        mode = self.thinking
        if mode is not None:
            # ``thinking`` is passed through ``extra_body`` rather than as a
            # named argument of the client call.
            options["extra_body"] = {"thinking": {"type": mode}}
        return options

    @retry_provider_call
    async def _get_response_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Fetch a plain text response from DeepSeek via ``_get_response``."""
        return await self._get_response(
            user_prompt,
            system_prompt,
            temperature,
            top_p,
            extra_headers=extra_headers,
        )

    async def _get_response(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Send one chat completion request and wrap the result.

        Args:
            user_prompt: Content of the user message.
            system_prompt: Optional system message; omitted when empty or None.
            temperature: Sampling temperature, sent only when
                ``supports_temperature_top_p`` is true.
            top_p: Nucleus sampling value, sent under the same condition.
            extra_headers: Optional per-request headers.

        Returns:
            An ``LLMResponse`` with the first choice's text, token counts,
            costs and elapsed time.

        Raises:
            ProviderError: If the client raises ``openai.APIError``.
        """
        messages: list[Any] = [{"role": "user", "content": user_prompt}]
        if system_prompt:
            messages.insert(0, {"role": "system", "content": system_prompt})

        start_time = time.time()
        request_kwargs = self._deepseek_request_kwargs()
        # Sampling parameters are only sent to models that accept them.
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "top_p": top_p}
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                extra_headers=extra_headers,
                **sampling,
                **request_kwargs,
            )
        except openai.APIError as e:
            raise ProviderError(
                f"DeepSeek API error: {e}",
                provider="deepseek",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time
        usage = response.usage
        assert usage is not None
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        # ``prompt_tokens_details`` may be missing or carry a null count;
        # both cases count as zero cached tokens.
        details = getattr(usage, "prompt_tokens_details", None)
        cached_tokens = getattr(details, "cached_tokens", 0) or 0
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens
        )

        return LLMResponse(
            content=response.choices[0].message.content or "",
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            deprecation_warning=self.deprecation_warning,
        )

    async def _get_response_stream_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMStreamResponse:
        """Open a streaming chat completion and expose it as text chunks.

        Usage is requested through ``stream_options`` and copied into the
        shared stream state when a chunk carrying usage arrives.

        Raises:
            ProviderError: If the client raises ``openai.APIError`` when the
                request is made or while the stream is being consumed.
        """
        messages: list[Any] = [{"role": "user", "content": user_prompt}]
        if system_prompt:
            messages.insert(0, {"role": "system", "content": system_prompt})

        state = _StreamState()
        request_kwargs = self._deepseek_request_kwargs()
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "top_p": top_p}

        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                stream=True,
                stream_options={"include_usage": True},
                extra_headers=extra_headers,
                **sampling,
                **request_kwargs,
            )
        except openai.APIError as e:
            raise ProviderError(
                f"DeepSeek API error: {e}",
                provider="deepseek",
                original_error=e,
            ) from e

        async def text_chunks() -> AsyncIterator[str]:
            try:
                async for chunk in response:
                    choices = chunk.choices
                    if choices:
                        piece = choices[0].delta.content
                        if piece:
                            yield piece
                    usage = chunk.usage
                    if usage:
                        state.input_tokens = usage.prompt_tokens
                        state.output_tokens = usage.completion_tokens
                        details = getattr(usage, "prompt_tokens_details", None)
                        state.cached_tokens = getattr(details, "cached_tokens", 0) or 0
            except openai.APIError as e:
                raise ProviderError(
                    f"DeepSeek API error: {e}",
                    provider="deepseek",
                    original_error=e,
                ) from e

        return LLMStreamResponse(stream=text_chunks(), state=state, llm=self)

    async def _get_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Request schema-shaped JSON from DeepSeek using json_object mode.

        DeepSeek's API supports only ``response_format={'type': 'json_object'}``
        (see https://api-docs.deepseek.com/guides/json_mode); ``json_schema`` is
        rejected with ``"This response_format type is unavailable now"``. The
        schema is therefore folded into the system prompt via
        ``build_schema_prompt`` so the model knows the expected shape, while
        ``json_object`` mode constrains the output to valid JSON. The returned
        text is passed through ``canonicalize_json_schema_output`` so callers
        receive a deterministic, byte-comparable string.

        ``schema_name`` and ``schema_description`` are accepted but not used
        by this implementation.

        Raises:
            ProviderError: If the client raises ``openai.APIError``.
        """
        messages: list[Any] = [
            {
                "role": "system",
                "content": build_schema_prompt(response_schema, system_prompt),
            },
            {"role": "user", "content": user_prompt},
        ]

        json_mode: Any = {"type": "json_object"}

        start_time = time.time()
        request_kwargs = self._deepseek_request_kwargs()
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "top_p": top_p}
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                response_format=json_mode,
                extra_headers=extra_headers,
                **sampling,
                **request_kwargs,
            )
        except openai.APIError as e:
            raise ProviderError(
                f"DeepSeek API error: {e}",
                provider="deepseek",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time

        usage = response.usage
        assert usage is not None
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        details = getattr(usage, "prompt_tokens_details", None)
        cached_tokens = getattr(details, "cached_tokens", 0) or 0
        input_cost, output_cost, total_cost = self._calculate_costs(
            input_tokens, output_tokens, cached_tokens
        )

        raw_text = response.choices[0].message.content or ""
        return LLMResponse(
            content=canonicalize_json_schema_output(raw_text, response_schema),
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=cached_tokens,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
        )

__init__ ¶

__init__(model, input_cost, output_cost, supports_temperature_top_p=True, *, cached_input_cost=None, cache_write_cost=None, api_key=None, api_key_alias=None, base_url=None, default_headers=None, reasoning_effort=None, thinking=None)

Initialize the DeepSeek provider.

Parameters:

Name	Type	Description	Default
`model`	`str`	The DeepSeek model identifier (e.g., "deepseek-chat", "deepseek-reasoner").	required
`input_cost`	`float`	Cost per million input tokens in USD.	required
`output_cost`	`float`	Cost per million output tokens in USD.	required
`supports_temperature_top_p`	`bool`	Whether temperature/top_p are supported.	`True`
`cached_input_cost`	`float \| None`	Cost per million cache-hit (cache-read) tokens in USD; a subset of input tokens re-priced below `input_cost`.	`None`
`cache_write_cost`	`float \| None`	Unused by DeepSeek; accepted for a uniform factory signature.	`None`
`api_key`	`str \| None`	Optional API key. Defaults to `DEEPSEEK_API_KEY` env var.	`None`
`api_key_alias`	`str \| None`	Optional human-readable name for the API key.	`None`
`base_url`	`str \| None`	Optional custom base URL. Overrides DEEPSEEK_BASE_URL when set.	`None`
`default_headers`	`dict[str, str] \| None`	Optional headers sent with every request.	`None`
`reasoning_effort`	`str \| None`	Optional reasoning effort for supported DeepSeek models.	`None`
`thinking`	`str \| None`	Optional thinking mode ("enabled" or "disabled") for supported models.	`None`

Raises:

Type	Description
`ConfigurationError`	If no API key is provided and env var is not set.
`ValueError`	If reasoning_effort or thinking is invalid.

Source code in majordomo_llm/providers/deepseek.py

def __init__(
    self,
    model: str,
    input_cost: float,
    output_cost: float,
    supports_temperature_top_p: bool = True,
    *,
    cached_input_cost: float | None = None,
    cache_write_cost: float | None = None,
    api_key: str | None = None,
    api_key_alias: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
    reasoning_effort: str | None = None,
    thinking: str | None = None,
) -> None:
    """Set up the DeepSeek provider and its async client.

    Args:
        model: The DeepSeek model identifier (e.g., "deepseek-chat", "deepseek-reasoner").
        input_cost: Cost per million input tokens in USD.
        output_cost: Cost per million output tokens in USD.
        supports_temperature_top_p: Whether temperature/top_p are supported.
        cached_input_cost: Cost per million cache-hit (cache-read) tokens in
            USD; a subset of input tokens re-priced below ``input_cost``.
        cache_write_cost: Unused by DeepSeek; accepted for a uniform factory
            signature.
        api_key: Optional API key. Defaults to ``DEEPSEEK_API_KEY`` env var.
        api_key_alias: Optional human-readable name for the API key.
        base_url: Optional custom base URL. Overrides DEEPSEEK_BASE_URL when set.
        default_headers: Optional headers sent with every request.
        reasoning_effort: Optional reasoning effort; must be one of
            ``REASONING_EFFORTS`` when given.
        thinking: Optional thinking mode; must be one of ``THINKING_MODES``
            ("enabled" or "disabled") when given.

    Raises:
        ConfigurationError: If no API key is provided and env var is not set.
        ValueError: If reasoning_effort or thinking is invalid.
    """
    # Reject bad option values before touching the API key or the network.
    if not (reasoning_effort is None or reasoning_effort in self.REASONING_EFFORTS):
        allowed = ", ".join(sorted(self.REASONING_EFFORTS))
        raise ValueError(
            f"Invalid DeepSeek reasoning_effort '{reasoning_effort}'. Valid: {allowed}"
        )
    if not (thinking is None or thinking in self.THINKING_MODES):
        allowed = ", ".join(sorted(self.THINKING_MODES))
        raise ValueError(f"Invalid DeepSeek thinking mode '{thinking}'. Valid: {allowed}")

    resolved_api_key = resolve_api_key(api_key, "DEEPSEEK_API_KEY", "DeepSeek")

    # A custom base URL means traffic goes through a proxy (e.g. Majordomo
    # Steward). Tag it with ``x-majordomo-provider: deepseek`` so the gateway
    # can tell DeepSeek apart from vanilla OpenAI, which uses the same wire
    # shape. Caller-supplied headers are unpacked last and so win on key
    # collision.
    if base_url is not None:
        default_headers = {
            "x-majordomo-provider": "deepseek",
            **(default_headers or {}),
        }

    super().__init__(
        provider="deepseek",
        model=model,
        input_cost=input_cost,
        output_cost=output_cost,
        cached_input_cost=cached_input_cost,
        cache_write_cost=cache_write_cost,
        supports_temperature_top_p=supports_temperature_top_p,
        api_key=resolved_api_key,
        api_key_alias=api_key_alias,
        base_url=base_url,
        default_headers=default_headers,
    )
    self.client = openai.AsyncOpenAI(
        api_key=resolved_api_key,
        base_url=self.base_url or self.DEEPSEEK_BASE_URL,
        default_headers=self.default_headers,
    )
    self.reasoning_effort = reasoning_effort
    self.thinking = thinking

Bases: LLM

Cohere LLM provider.

Implements the LLM interface for Cohere's models using the V2 API. Supports Command A, Command R+, Command R, and Command R7B models.

The API key is read from the CO_API_KEY environment variable.

Attributes:

Name	Type	Description
`client`		The async Cohere client instance.

Example

>>> llm = Cohere(
...     model="command-a-03-2025",
...     input_cost=2.50,
...     output_cost=10.00,
... )
>>> response = await llm.get_response("Hello, Cohere!")

Source code in majordomo_llm/providers/cohere.py

class Cohere(LLM):
    """Cohere LLM provider.

    Implements the LLM interface for Cohere's models using the V2 API.
    Supports Command A, Command R+, Command R, and Command R7B models.

    When no key is passed explicitly, the API key is read from the
    ``CO_API_KEY`` environment variable.

    Attributes:
        client: The async Cohere client instance.

    Example:
        >>> llm = Cohere(
        ...     model="command-a-03-2025",
        ...     input_cost=2.50,
        ...     output_cost=10.00,
        ... )
        >>> response = await llm.get_response("Hello, Cohere!")
    """

    def __init__(
        self,
        model: str,
        input_cost: float,
        output_cost: float,
        supports_temperature_top_p: bool = True,
        *,
        cached_input_cost: float | None = None,
        cache_write_cost: float | None = None,
        api_key: str | None = None,
        api_key_alias: str | None = None,
        base_url: str | None = None,
        default_headers: dict[str, str] | None = None,
    ) -> None:
        """Initialize the Cohere provider.

        Args:
            model: The Cohere model identifier (e.g., "command-a-03-2025").
            input_cost: Cost per million input tokens in USD.
            output_cost: Cost per million output tokens in USD.
            supports_temperature_top_p: Whether temperature/top_p are supported.
            cached_input_cost: Unused by Cohere (no prompt caching); accepted for
                a uniform factory signature.
            cache_write_cost: Unused by Cohere; accepted for a uniform factory
                signature.
            api_key: Optional API key. Defaults to ``CO_API_KEY`` env var.
            api_key_alias: Optional human-readable name for the API key.
            base_url: Optional custom base URL for routing through a proxy.
            default_headers: Optional headers sent with every request. They are
                not given to the client constructor; they are merged into each
                call's ``request_options`` instead.

        Raises:
            ConfigurationError: If no API key is provided and env var is not set.
        """
        resolved_api_key = resolve_api_key(api_key, "CO_API_KEY", "Cohere")
        super().__init__(
            provider="cohere",
            model=model,
            input_cost=input_cost,
            output_cost=output_cost,
            cached_input_cost=cached_input_cost,
            cache_write_cost=cache_write_cost,
            supports_temperature_top_p=supports_temperature_top_p,
            api_key=resolved_api_key,
            api_key_alias=api_key_alias,
            base_url=base_url,
            default_headers=default_headers,
        )
        self.client = cohere.AsyncClientV2(
            api_key=resolved_api_key,
            base_url=self.base_url,
        )

    def _cohere_request_options(
        self, extra_headers: dict[str, str] | None
    ) -> RequestOptions | None:
        """Build request_options with merged default + extra headers.

        Per-request ``extra_headers`` override instance-level defaults on key
        collision. Returns ``None`` when there are no headers to send.
        """
        merged = dict(self.default_headers or {})
        if extra_headers:
            merged.update(extra_headers)
        if not merged:
            return None
        return RequestOptions(additional_headers=merged)

    @retry_provider_call
    async def _get_response_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Get a plain text response from Cohere."""
        return await self._get_response(
            user_prompt, system_prompt, temperature, top_p, extra_headers=extra_headers
        )

    async def _get_response(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Send one chat request to Cohere and wrap the result.

        Args:
            user_prompt: Content of the user message.
            system_prompt: Optional system message; omitted when empty or None.
            temperature: Sampling temperature, sent only when
                ``supports_temperature_top_p`` is true.
            top_p: Nucleus sampling value, sent as Cohere's ``p`` under the
                same condition.
            extra_headers: Optional per-request headers.

        Returns:
            An ``LLMResponse`` with text, token counts, costs and elapsed time.
            ``cached_tokens`` is always 0.

        Raises:
            ProviderError: If the client raises a Cohere ``ApiError``.
        """
        messages: list[Any] = []
        if system_prompt:
            messages.append(SystemChatMessageV2(content=system_prompt))
        messages.append(UserChatMessageV2(content=user_prompt))

        request_options = self._cohere_request_options(extra_headers)
        # Sampling parameters are only sent to models that accept them.
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "p": top_p}

        start_time = time.time()
        try:
            response = await self.client.chat(
                model=self.model,
                messages=messages,
                request_options=request_options,
                **sampling,
            )
        except cohere.core.api_error.ApiError as e:
            raise ProviderError(
                f"Cohere API error: {e}",
                provider="cohere",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time
        input_tokens, output_tokens = _cohere_token_counts(response)
        input_cost, output_cost, total_cost = self._calculate_costs(input_tokens, output_tokens)

        return LLMResponse(
            content=_cohere_text_content(response),
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=0,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
            deprecation_warning=self.deprecation_warning,
        )

    async def _get_response_stream_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMStreamResponse:
        """Get a streaming text response from Cohere.

        ``content-delta`` events are yielded as text; the ``message-end``
        event's usage is copied into the shared stream state.

        Raises:
            ProviderError: If the client raises a Cohere ``ApiError`` when the
                stream is created or while it is being consumed.
        """
        messages: list[Any] = []
        if system_prompt:
            messages.append(SystemChatMessageV2(content=system_prompt))
        messages.append(UserChatMessageV2(content=user_prompt))

        state = _StreamState()
        request_options = self._cohere_request_options(extra_headers)
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "p": top_p}

        try:
            # Not awaited: the returned object is consumed with ``async for``.
            response = self.client.chat_stream(
                model=self.model,
                messages=messages,
                request_options=request_options,
                **sampling,
            )
        except cohere.core.api_error.ApiError as e:
            raise ProviderError(
                f"Cohere API error: {e}",
                provider="cohere",
                original_error=e,
            ) from e

        async def generator() -> AsyncIterator[str]:
            try:
                async for event in response:
                    event_any: Any = event
                    if event_any.type == "content-delta":
                        yield event_any.delta.message.content.text or ""
                    elif event_any.type == "message-end":
                        state.input_tokens, state.output_tokens = _cohere_usage_tokens(
                            event_any.delta.usage
                        )
            except cohere.core.api_error.ApiError as e:
                raise ProviderError(
                    f"Cohere API error: {e}",
                    provider="cohere",
                    original_error=e,
                ) from e

        return LLMStreamResponse(stream=generator(), state=state, llm=self)

    async def _get_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Cohere-specific implementation using native JSON schema response format.

        The schema is passed through ``inline_schema_refs`` and
        ``_strip_cohere_unsupported_constraints`` before being sent as the
        response format. The returned text is canonicalized against the
        caller's original ``response_schema``.

        ``schema_name`` and ``schema_description`` are accepted but not used
        by this implementation.

        Raises:
            ProviderError: If the client raises a Cohere ``ApiError``.
        """
        schema = _strip_cohere_unsupported_constraints(inline_schema_refs(response_schema))
        messages: list[Any] = []
        # Truthiness check, matching the plain-text and streaming paths, so an
        # empty system prompt does not become an empty system message.
        if system_prompt:
            messages.append(SystemChatMessageV2(content=system_prompt))
        messages.append(UserChatMessageV2(content=user_prompt))

        request_options = self._cohere_request_options(extra_headers)
        sampling: dict[str, Any] = {}
        if self.supports_temperature_top_p:
            sampling = {"temperature": temperature, "p": top_p}

        start_time = time.time()
        try:
            response = await self.client.chat(
                model=self.model,
                messages=messages,
                response_format=JsonObjectResponseFormatV2(json_schema=schema),
                request_options=request_options,
                **sampling,
            )
        except cohere.core.api_error.ApiError as e:
            raise ProviderError(
                f"Cohere API error: {e}",
                provider="cohere",
                original_error=e,
            ) from e

        execution_time = time.time() - start_time

        input_tokens, output_tokens = _cohere_token_counts(response)
        input_cost, output_cost, total_cost = self._calculate_costs(input_tokens, output_tokens)

        return LLMResponse(
            content=canonicalize_json_schema_output(
                _cohere_text_content(response),
                response_schema,
            ),
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cached_tokens=0,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost,
            response_time=execution_time,
        )

__init__ ¶

__init__(model, input_cost, output_cost, supports_temperature_top_p=True, *, cached_input_cost=None, cache_write_cost=None, api_key=None, api_key_alias=None, base_url=None, default_headers=None)

Initialize the Cohere provider.

Parameters:

Name	Type	Description	Default
`model`	`str`	The Cohere model identifier (e.g., "command-a-03-2025").	required
`input_cost`	`float`	Cost per million input tokens in USD.	required
`output_cost`	`float`	Cost per million output tokens in USD.	required
`supports_temperature_top_p`	`bool`	Whether temperature/top_p are supported.	`True`
`cached_input_cost`	`float \| None`	Unused by Cohere (no prompt caching); accepted for a uniform factory signature.	`None`
`cache_write_cost`	`float \| None`	Unused by Cohere; accepted for a uniform factory signature.	`None`
`api_key`	`str \| None`	Optional API key. Defaults to `CO_API_KEY` env var.	`None`
`api_key_alias`	`str \| None`	Optional human-readable name for the API key.	`None`
`base_url`	`str \| None`	Optional custom base URL for routing through a proxy.	`None`
`default_headers`	`dict[str, str] \| None`	Optional headers sent with every request.	`None`

Raises:

Type	Description
`ConfigurationError`	If no API key is provided and env var is not set.

Source code in majordomo_llm/providers/cohere.py

def __init__(
    self,
    model: str,
    input_cost: float,
    output_cost: float,
    supports_temperature_top_p: bool = True,
    *,
    cached_input_cost: float | None = None,
    cache_write_cost: float | None = None,
    api_key: str | None = None,
    api_key_alias: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
) -> None:
    """Set up a Cohere-backed LLM provider and its async client.

    The API key is resolved first, the shared base-class state is
    initialised next, and the Cohere ``AsyncClientV2`` is created last.

    Args:
        model: Cohere model identifier (e.g., "command-a-03-2025").
        input_cost: USD cost per million input tokens.
        output_cost: USD cost per million output tokens.
        supports_temperature_top_p: Whether temperature/top_p are supported.
        cached_input_cost: Not used by Cohere (no prompt caching). Accepted
            so every provider shares one factory signature; forwarded to
            the base class unchanged.
        cache_write_cost: Not used by Cohere. Accepted so every provider
            shares one factory signature; forwarded to the base class
            unchanged.
        api_key: Optional API key. Falls back to the ``CO_API_KEY``
            environment variable.
        api_key_alias: Optional human-readable name for the API key.
        base_url: Optional custom base URL for routing through a proxy.
        default_headers: Optional headers sent with every request.

    Raises:
        ConfigurationError: If no API key is provided and the
            ``CO_API_KEY`` environment variable is not set.
    """
    # Resolve the key before touching any other state so a missing key
    # fails fast, ahead of base-class and client construction.
    key = resolve_api_key(api_key, "CO_API_KEY", "Cohere")

    # Everything the shared base initializer needs, gathered in one place.
    base_settings = {
        "provider": "cohere",
        "model": model,
        "input_cost": input_cost,
        "output_cost": output_cost,
        "cached_input_cost": cached_input_cost,
        "cache_write_cost": cache_write_cost,
        "supports_temperature_top_p": supports_temperature_top_p,
        "api_key": key,
        "api_key_alias": api_key_alias,
        "base_url": base_url,
        "default_headers": default_headers,
    }
    super().__init__(**base_settings)

    # NOTE(review): the client reads ``self.base_url`` rather than the raw
    # ``base_url`` argument -- presumably the base initializer stores it;
    # confirm against the LLM base class.
    self.client = cohere.AsyncClientV2(api_key=key, base_url=self.base_url)

Providers¶

__init__ ¶

__init__ ¶

__init__ ¶

__init__ ¶

__init__ ¶

init ¶

init ¶

init ¶

init ¶

init ¶