Skip to content

Factory & Cascade

Create an LLM instance for the specified provider and model.

This is the primary factory function for creating LLM instances. It handles provider-specific initialization and configuration lookup.

Parameters:

Name Type Description Default
provider str

The LLM provider name. One of: "openai", "anthropic", "gemini", "deepseek", "cohere", "bedrock", "bedrock_mantle", "fireworks", "together".

required
model str

The model identifier (e.g., "gpt-4o", "claude-sonnet-4-20250514").

required
api_key str | None

Optional API key. If not provided, the provider will fall back to its respective environment variable.

None
base_url str | None

Optional custom base URL for routing through a proxy.

None
default_headers dict[str, str] | None

Optional headers sent with every request.

None
region str | None

AWS region for the Bedrock provider (e.g., "us-east-1"). Ignored by other providers. Defaults to AWS_REGION / AWS_DEFAULT_REGION env vars when not specified.

None
use_web_search bool

Enable the provider's server-side web search tool. Validated against the model's supports_web_search flag in llm_config.yaml. Silently ignored for providers that do not implement web search (cohere, deepseek, fireworks, together, bedrock_mantle).

False

Returns:

Type Description
LLM

An LLM instance configured for the specified provider and model.

Raises:

Type Description
ConfigurationError

If the provider or model is not recognized, or if use_web_search is set on a model whose config does not declare supports_web_search: true.

Example

>>> llm = get_llm_instance("anthropic", "claude-sonnet-4-20250514")
>>> response = await llm.get_response("Hello!")

Source code in majordomo_llm/factory.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
def get_llm_instance(
    provider: str,
    model: str,
    *,
    api_key: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
    region: str | None = None,
    use_web_search: bool = False,
) -> LLM:
    """Create an LLM instance for the specified provider and model.

    This is the primary factory function for creating LLM instances. It handles
    provider-specific initialization and configuration lookup.

    If ``model`` is not configured for the provider but is listed in the
    deprecated-model table, its replacement is used instead: a warning is
    logged and the returned instance carries ``deprecation_warning`` and
    ``requested_model`` attributes describing the substitution.

    Args:
        provider: The LLM provider name. One of: "openai", "anthropic",
            "gemini", "deepseek", "cohere", "bedrock", "bedrock_mantle",
            "fireworks", "together". The provider must also have an entry in
            ``LLM_CONFIG``.
        model: The model identifier (e.g., "gpt-4o", "claude-sonnet-4-20250514").
            This is the key under the provider's ``models`` mapping; the entry
            may redirect to a different upstream model ID through its own
            ``model`` attribute.
        api_key: Optional API key. If not provided, the provider will fall back
            to its respective environment variable.
        base_url: Optional custom base URL for routing through a proxy.
        default_headers: Optional headers sent with every request.
        region: AWS region for the Bedrock provider (e.g., "us-east-1").
            Only forwarded to "bedrock" and "bedrock_mantle"; ignored by other
            providers. Defaults to ``AWS_REGION`` / ``AWS_DEFAULT_REGION`` env
            vars when not specified.
        use_web_search: Enable the provider's server-side web search tool.
            Validated against the model's ``supports_web_search`` flag in
            ``llm_config.yaml``. Silently ignored for providers that do not
            implement web search (cohere, deepseek, fireworks, together,
            bedrock_mantle).

    Returns:
        An LLM instance configured for the specified provider and model.

    Raises:
        ConfigurationError: If the provider or model is not recognized, or if
            ``use_web_search`` is set on a model whose config does not declare
            ``supports_web_search: true``.

    Example:
        >>> llm = get_llm_instance("anthropic", "claude-sonnet-4-20250514")
        >>> response = await llm.get_response("Hello!")
    """
    llm_config_entry = LLM_CONFIG.get(provider)
    if llm_config_entry is None:
        available = ", ".join(LLM_CONFIG.keys())
        raise ConfigurationError(f"Unknown LLM provider '{provider}'. Available: {available}")

    llm_models = llm_config_entry["models"]
    model_attributes = llm_models.get(model)

    # Check if the requested model is deprecated and resolve to its replacement.
    deprecation_warning = None
    requested_model = None
    if model_attributes is None:
        replacement = _DEPRECATED_MODELS.get(provider, {}).get(model)
        if replacement is not None:
            deprecation_warning = (
                f"Model '{model}' for provider '{provider}' is deprecated. "
                f"Automatically replaced with '{replacement}'."
            )
            logger.warning(deprecation_warning)
            # Remember what the caller asked for, then continue with the
            # replacement as if it had been requested directly.
            requested_model = model
            model = replacement
            model_attributes = llm_models.get(model)

    # Still unresolved: either the model was never known, or a deprecated
    # model points at a replacement that is itself missing from the config.
    if model_attributes is None:
        available = ", ".join(llm_models.keys())
        raise ConfigurationError(
            f"Unknown model '{model}' for provider '{provider}'. Available: {available}"
        )

    # Single source of truth for the providers whose classes accept
    # ``use_web_search``. It drives both the capability validation below and
    # the keyword forwarding further down, so the two cannot drift apart.
    web_search_providers = ("openai", "anthropic", "gemini", "bedrock")
    provider_has_web_search = provider in web_search_providers
    if (
        use_web_search
        and provider_has_web_search
        and not model_attributes.get("supports_web_search", False)
    ):
        raise ConfigurationError(
            f"Model '{model}' for provider '{provider}' does not support web search."
        )

    provider_classes: dict[str, type[LLM]] = {
        "openai": OpenAI,
        "anthropic": Anthropic,
        "gemini": Gemini,
        "deepseek": DeepSeek,
        "cohere": Cohere,
        "bedrock": Bedrock,
        "bedrock_mantle": BedrockMantle,
        "fireworks": Fireworks,
        "together": Together,
    }
    cls = provider_classes.get(provider)
    if cls is None:
        # Reached only when the provider is present in LLM_CONFIG but has no
        # implementation class registered above.
        raise ConfigurationError(f"Unknown LLM provider '{provider}'")

    # Extra constructor keywords that only some provider classes receive.
    provider_kwargs: dict[str, Any] = {}
    if provider in ("deepseek", "fireworks", "together"):
        provider_kwargs = {
            "reasoning_effort": model_attributes.get("reasoning_effort"),
            "thinking": model_attributes.get("thinking"),
        }
    elif provider in ("bedrock", "bedrock_mantle"):
        provider_kwargs = {"region": region}

    if provider_has_web_search:
        provider_kwargs["use_web_search"] = use_web_search

    # An entry may override its API model ID via the ``model`` attribute. This
    # lets the same underlying model be registered under multiple YAML keys —
    # e.g. distinct "reasoning effort" profiles that share one upstream SKU.
    api_model = model_attributes.get("model", model)

    llm = cls(
        model=api_model,
        input_cost=model_attributes["input_cost"],
        output_cost=model_attributes["output_cost"],
        supports_temperature_top_p=model_attributes.get("supports_temperature_top_p", True),
        api_key=api_key,
        base_url=base_url,
        default_headers=default_headers,
        **provider_kwargs,
    )

    # Only annotate the instance when a deprecated model was substituted.
    if deprecation_warning:
        llm.deprecation_warning = deprecation_warning
        llm.requested_model = requested_model

    return llm

Create LLM instances for all configured providers and models.

Yields LLM instances one at a time, which is useful for initialization or testing all available models.

Yields:

Type Description
LLM

LLM instances for each configured provider/model combination.

Example

>>> for llm in get_all_llm_instances():
...     print(llm.get_full_model_name())

Source code in majordomo_llm/factory.py
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
def get_all_llm_instances() -> Iterator[LLM]:
    """Lazily build one LLM instance per configured provider/model pair.

    Walks every provider in ``LLM_CONFIG`` and, for each key in that
    provider's ``models`` mapping (treated as empty when absent), creates the
    instance through ``get_llm_instance`` with default options. Instances are
    produced one at a time, which is handy for initialization or for
    exercising every available model in tests.

    Yields:
        An LLM instance for each configured provider/model combination.

    Example:
        >>> for llm in get_all_llm_instances():
        ...     print(llm.get_full_model_name())
    """
    for provider_name, settings in LLM_CONFIG.items():
        for model_name in settings.get("models", {}):
            logger.debug("Creating LLM instance: %s/%s", provider_name, model_name)
            yield get_llm_instance(provider_name, model_name)

Bases: LLM

LLM wrapper that tries multiple providers in priority order.

When a provider fails with a ProviderError, the next provider in the cascade is tried. This provides automatic failover for resilience.

The providers list defines priority order - first provider is tried first.

Attributes:

Name Type Description
llms

List of LLM instances in priority order.

Example

>>> cascade = LLMCascade([
...     ("anthropic", "claude-sonnet-4-20250514"),  # Primary
...     ("openai", "gpt-4o"),                       # First fallback
...     ("gemini", "gemini-2.5-flash"),             # Last resort
... ])
>>> response = await cascade.get_response("Hello!")

Source code in majordomo_llm/cascade.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
class LLMCascade(LLM):
    """LLM wrapper that tries multiple providers in priority order.

    When a provider fails with a ProviderError, the next provider in the
    cascade is tried. This provides automatic failover for resilience.
    Structured (JSON schema) requests additionally fail over when a provider
    raises ResponseParsingError.

    The providers list defines priority order - first provider is tried first.

    Attributes:
        llms: List of LLM instances in priority order.

    Example:
        >>> cascade = LLMCascade([
        ...     ("anthropic", "claude-sonnet-4-20250514"),  # Primary
        ...     ("openai", "gpt-4o"),                       # First fallback
        ...     ("gemini", "gemini-2.5-flash"),             # Last resort
        ... ])
        >>> response = await cascade.get_response("Hello!")
    """

    def __init__(
        self,
        providers: list[tuple[str, str]],
        *,
        api_key: str | None = None,
        base_url: str | None = None,
        default_headers: dict[str, str] | None = None,
        hook_pipeline: HookPipeline | None = None,
    ) -> None:
        """Initialize the cascade with a list of providers.

        The same ``api_key``, ``base_url`` and ``default_headers`` values are
        passed to every child instance.

        Args:
            providers: List of (provider, model) tuples in priority order.
                First provider is tried first.
            api_key: Optional API key. If not provided, each provider will fall
                back to its respective environment variable.
            base_url: Optional custom base URL for routing through a proxy.
            default_headers: Optional headers sent with every request.
            hook_pipeline: Optional pipeline that fires once at the cascade
                boundary. The pipeline is intentionally not propagated to
                child providers — hooks evaluate the eventual response from
                whichever provider succeeded, not each failover attempt.

        Raises:
            ValueError: If providers list is empty.
        """
        if not providers:
            raise ValueError("LLMCascade requires at least one provider")

        self.llms = [
            get_llm_instance(
                p,
                m,
                api_key=api_key,
                base_url=base_url,
                default_headers=default_headers,
            )
            for p, m in providers
        ]

        # Use primary provider's attributes for metadata
        primary = self.llms[0]
        super().__init__(
            provider="cascade",
            model=primary.model,
            input_cost=primary.input_cost,
            output_cost=primary.output_cost,
            supports_temperature_top_p=primary.supports_temperature_top_p,
            hook_pipeline=hook_pipeline,
        )

    async def _get_response_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Dispatch to the first child provider that succeeds."""
        return cast(
            LLMResponse,
            await self._cascade_call(
                "get_response",
                user_prompt=user_prompt,
                system_prompt=system_prompt,
                temperature=temperature,
                top_p=top_p,
                extra_headers=extra_headers,
            ),
        )

    async def _get_response_stream_impl(
        self,
        user_prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMStreamResponse:
        """Dispatch streaming to the first child provider that succeeds.

        Only exceptions raised while awaiting the child's
        ``get_response_stream`` call trigger failover; the object it returns
        is handed back to the caller unchanged.
        """
        return cast(
            LLMStreamResponse,
            await self._cascade_call(
                "get_response_stream",
                user_prompt=user_prompt,
                system_prompt=system_prompt,
                temperature=temperature,
                top_p=top_p,
                extra_headers=extra_headers,
            ),
        )

    async def _get_json_schema_response_retried(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Cascade overrides the retried wrapper to skip outer retries.

        Each child provider already retries via its own ``_get_response_impl``
        / ``_get_json_schema_response_retried``. Adding another retry layer
        at the cascade boundary would multiply attempts; the cascade's job
        is failover, not retry. Failover behavior is preserved by
        ``_cascade_call``, which for structured requests also moves on to the
        next provider on ``ResponseParsingError``.
        """
        return cast(
            LLMResponse,
            await self._cascade_call(
                "get_json_schema_response",
                user_prompt=user_prompt,
                response_schema=response_schema,
                system_prompt=system_prompt,
                schema_name=schema_name,
                schema_description=schema_description,
                temperature=temperature,
                top_p=top_p,
                extra_headers=extra_headers,
                failover_exceptions=(ProviderError, ResponseParsingError),
            ),
        )

    async def _get_json_schema_response(
        self,
        user_prompt: str,
        response_schema: dict[str, Any],
        system_prompt: str | None = None,
        schema_name: str = "Response",
        schema_description: str | None = None,
        temperature: float = 0.3,
        top_p: float = 1.0,
        extra_headers: dict[str, str] | None = None,
    ) -> LLMResponse:
        """Unused on the cascade — ``_get_json_schema_response_retried`` dispatches directly."""
        raise NotImplementedError(
            "LLMCascade dispatches via _get_json_schema_response_retried"
        )

    async def _cascade_call(
        self,
        method_name: str,
        failover_exceptions: tuple[type[Exception], ...] = (ProviderError,),
        **kwargs: Any,
    ) -> LLMResponse | LLMJSONResponse | LLMStreamResponse:
        """Try each provider in order until one succeeds.

        Args:
            method_name: Name of the method to look up and await on each
                child LLM.
            failover_exceptions: Exception types that cause the cascade to
                move on to the next provider. They are honored both when
                raised directly and when they are the last exception wrapped
                in a ``RetryError``. Any other exception propagates
                immediately.
            **kwargs: Arguments to pass to the method.

        Returns:
            The response from the first successful provider.

        Raises:
            ProviderError: If all providers fail. The last provider's error
                is attached as ``original_error`` and as the exception cause.
            RetryError: If a child's retries were exhausted by an exception
                that is not one of ``failover_exceptions``.
        """
        last_error: Exception | None = None

        for llm in self.llms:
            try:
                method = getattr(llm, method_name)
                result = await method(**kwargs)
                return cast(LLMResponse | LLMJSONResponse | LLMStreamResponse, result)
            except failover_exceptions as e:
                self._log_provider_failure(llm, e)
                last_error = e
            except RetryError as e:
                # A child's retry layer gave up. Unwrap the final attempt and
                # apply the same failover policy as for directly raised
                # errors, so a wrapped failure does not abort the cascade.
                exc = e.last_attempt.exception()
                if not isinstance(exc, failover_exceptions):
                    raise

                self._log_provider_failure(llm, exc)
                last_error = exc

        # Every provider failed; chain the last failure so the traceback
        # shows what actually went wrong.
        raise ProviderError(
            f"All providers in cascade failed. Last error: {last_error}",
            provider="cascade",
            original_error=last_error,
        ) from last_error

    def _log_provider_failure(self, llm: LLM, exc: Exception) -> None:
        """Log a provider failure before trying the next cascade entry."""
        logger.warning(
            "Provider %s/%s failed: %s. Trying next provider.",
            llm.provider,
            llm.model,
            exc,
        )

__init__

__init__(providers, *, api_key=None, base_url=None, default_headers=None, hook_pipeline=None)

Initialize the cascade with a list of providers.

Parameters:

Name Type Description Default
providers list[tuple[str, str]]

List of (provider, model) tuples in priority order. First provider is tried first.

required
api_key str | None

Optional API key. If not provided, each provider will fall back to its respective environment variable.

None
base_url str | None

Optional custom base URL for routing through a proxy.

None
default_headers dict[str, str] | None

Optional headers sent with every request.

None
hook_pipeline HookPipeline | None

Optional pipeline that fires once at the cascade boundary. The pipeline is intentionally not propagated to child providers — hooks evaluate the eventual response from whichever provider succeeded, not each failover attempt.

None

Raises:

Type Description
ValueError

If providers list is empty.

Source code in majordomo_llm/cascade.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def __init__(
    self,
    providers: list[tuple[str, str]],
    *,
    api_key: str | None = None,
    base_url: str | None = None,
    default_headers: dict[str, str] | None = None,
    hook_pipeline: HookPipeline | None = None,
) -> None:
    """Build the child LLMs for a cascade and register cascade metadata.

    Args:
        providers: (provider, model) pairs, highest priority first.
        api_key: Optional API key handed to every child. When omitted, each
            provider falls back to its respective environment variable.
        base_url: Optional custom base URL for routing through a proxy.
        default_headers: Optional headers sent with every request.
        hook_pipeline: Optional pipeline attached to the cascade itself. It
            is deliberately not passed to the children, so hooks see the
            response of whichever provider succeeded rather than every
            failover attempt.

    Raises:
        ValueError: If providers list is empty.
    """
    if not providers:
        raise ValueError("LLMCascade requires at least one provider")

    # Connection options shared by every child instance.
    shared_options = {
        "api_key": api_key,
        "base_url": base_url,
        "default_headers": default_headers,
    }

    children = []
    for provider_name, model_name in providers:
        children.append(get_llm_instance(provider_name, model_name, **shared_options))
    self.llms = children

    # The cascade reports the highest-priority child's model and pricing as
    # its own metadata.
    head = self.llms[0]
    super().__init__(
        provider="cascade",
        model=head.model,
        input_cost=head.input_cost,
        output_cost=head.output_cost,
        supports_temperature_top_p=head.supports_temperature_top_p,
        hook_pipeline=hook_pipeline,
    )