diff --git a/opto/optimizers/__init__.py b/opto/optimizers/__init__.py
index 482b1b2d..a41b6d34 100644
--- a/opto/optimizers/__init__.py
+++ b/opto/optimizers/__init__.py
@@ -4,7 +4,9 @@
from opto.optimizers.opro_v2 import OPROv2
from opto.optimizers.textgrad import TextGrad
from opto.optimizers.optoprime_v2 import OptoPrimeV2
+from opto.optimizers.optoprime_v3 import OptoPrimeV3
+from opto.optimizers.opro_v3 import OPROv3
OptoPrime = OptoPrimeV1
-__all__ = ["OPRO", "OptoPrime", "OptoPrimeMulti", "TextGrad", "OptoPrimeV2", "OptoPrimeV1", "OPROv2"]
\ No newline at end of file
+__all__ = ["OPRO", "OptoPrime", "OptoPrimeMulti", "TextGrad", "OptoPrimeV2", "OptoPrimeV1", "OPROv2", "OptoPrimeV3", "OPROv3"]
\ No newline at end of file
diff --git a/opto/optimizers/opro_v3.py b/opto/optimizers/opro_v3.py
new file mode 100644
index 00000000..473ac28c
--- /dev/null
+++ b/opto/optimizers/opro_v3.py
@@ -0,0 +1,541 @@
+"""
+Key difference to v2:
+1. Use the new backbone conversation history manager
+2. Support multimodal node (both trainable and non-trainable)
+3. Break from the OptoPrime style template, support more customizable template from user, for brevity and streamlined usage.
+"""
+
+from textwrap import dedent
+from dataclasses import dataclass
+from typing import Dict, Optional, List, Union
+from opto.trace.nodes import ParameterNode
+
+from opto.optimizers.optoprime_v3 import OptoPrimeV3, OptimizerPromptSymbolSet
+from opto.utils.backbone import (
+ ContentBase, ImageContent, ContentBlockList,
+ DEFAULT_IMAGE_PLACEHOLDER
+)
+
+# Subclasses the v3 OptimizerPromptSymbolSet but exposes a smaller set of prompt symbols
+class OPROPromptSymbolSet(OptimizerPromptSymbolSet):
+ """Prompt symbol set for OPRO optimizer.
+
+ This class defines the section titles and tag names used in the OPRO
+ optimizer's prompts and output parsing, and narrows
+ ``default_prompt_symbols`` to the four sections OPRO uses.
+
+ Attributes
+ ----------
+ instruction_section_title : str
+ Title for the instruction section in prompts.
+ variables_section_title : str
+ Title for the variable/solution section in prompts.
+ feedback_section_title : str
+ Title for the feedback section in prompts.
+ context_section_title : str
+ Title for the context section in prompts.
+ node_tag : str
+ Tag used to identify constant nodes in the computation graph.
+ variable_tag : str
+ Tag used to identify variable nodes that can be optimized.
+ value_tag : str
+ Tag used to wrap the value of a node.
+ constraint_tag : str
+ Tag used to wrap constraint expressions for nodes.
+ reasoning_tag : str
+ Tag used to wrap reasoning in the output.
+ improved_variable_tag : str
+ Tag used to wrap improved variable values in the output.
+ name_tag : str
+ Tag used to wrap variable names.
+ expect_json : bool
+ Whether to expect JSON output format (default: False).
+
+ Methods
+ -------
+ default_prompt_symbols
+ Property returning the mapping from section key ("variables", "feedback",
+ "instruction", "context") to its section title.
+
+ Notes
+ -----
+ This class inherits from OptimizerPromptSymbolSet but defines a smaller,
+ more focused set of symbols specifically for OPRO optimization.
+ """
+
+ instruction_section_title = "# Instruction"
+ variables_section_title = "# Solution"
+ feedback_section_title = "# Feedback"
+ context_section_title = "# Context"
+
+ node_tag = "node" # nodes that are constants in the graph
+ variable_tag = "solution" # nodes that can be changed
+ value_tag = "value" # inside node, we have value tag
+ constraint_tag = "constraint" # inside node, we have constraint tag
+
+ # output format
+ # Note: we currently don't support extracting formats like "```code```" because we assume the supplied tag is name-only, i.e., a bare tag name such as "reasoning".
+ reasoning_tag = "reasoning"
+ improved_variable_tag = "variable"
+ name_tag = "name"
+
+ expect_json = False # this will stop `enforce_json` arguments passed to LLM calls
+
+ @property
+ def default_prompt_symbols(self) -> Dict[str, str]:
+ # Only the four OPRO sections are exposed; the parent class lists more keys.
+ return {
+ "variables": self.variables_section_title,
+ "feedback": self.feedback_section_title,
+ "instruction": self.instruction_section_title,
+ "context": self.context_section_title
+ }
+
+@dataclass
+class ProblemInstance:
+    """Represents a problem instance for OPRO optimization.
+
+    This dataclass bundles the instruction, the current variables/solution,
+    the feedback received and an optional context block.
+
+    Supports multimodal content - variables can contain images.
+
+    Attributes
+    ----------
+    instruction : str
+        The instruction describing what needs to be done or the question to answer.
+    variables : Union[str, List[ContentBase]]
+        The current proposed solution that can be modified. Can contain images.
+    feedback : str
+        Feedback about the current solution.
+    context : Optional[ContentBlockList]
+        Optional context information that might be useful to solve the problem.
+        The context section is omitted when this is None or renders to blank text.
+    optimizer_prompt_symbol_set : OPROPromptSymbolSet
+        The symbol set associated with this problem instance.
+    problem_template : str
+        Class-level template used by ``__repr__`` for the text rendering.
+
+    Methods
+    -------
+    __repr__()
+        Returns a formatted string representation of the problem instance.
+    to_content_blocks()
+        Returns a ContentBlockList for multimodal prompts.
+    has_images()
+        Returns True if the variables contain images.
+    """
+    instruction: str
+    variables: Union[str, List[ContentBase]]
+    feedback: str
+    context: Optional[ContentBlockList]
+
+    optimizer_prompt_symbol_set: OPROPromptSymbolSet
+
+    # Not annotated on purpose: this stays a class attribute, not a dataclass field.
+    problem_template = dedent(
+        """
+        # Instruction
+        {instruction}
+
+        # Solution
+        {variables}
+
+        # Feedback
+        {feedback}
+        """
+    )
+
+    @staticmethod
+    def _content_to_text(content: Union[str, List[ContentBase]]) -> str:
+        """Convert content (str or list of content blocks) to text.
+
+        Strings are returned unchanged; anything else is rendered through
+        ``ContentBlockList.to_text`` with the default image placeholder.
+        """
+        if isinstance(content, str):
+            return content
+        return ContentBlockList.ensure(content).to_text(DEFAULT_IMAGE_PLACEHOLDER)
+
+    def _context_text(self) -> str:
+        """Return the context rendered as text, or "" when there is no context."""
+        return "" if self.context is None else self.context.to_text()
+
+    def __repr__(self) -> str:
+        """Return text-only representation for backward compatibility."""
+        optimization_query = self.problem_template.format(
+            instruction=self.instruction,
+            variables=self._content_to_text(self.variables),
+            feedback=self.feedback,
+        )
+
+        context_text = self._context_text()
+        if context_text.strip() != "":
+            context_section = dedent("""
+
+                # Context
+                {context}
+            """)
+            optimization_query += context_section.format(context=context_text)
+
+        return optimization_query
+
+    def to_content_blocks(self) -> ContentBlockList:
+        """Convert the problem instance to a ContentBlockList.
+
+        The instruction and feedback are appended as text; the variables are
+        appended block by block so image blocks stay separate from the text.
+
+        Returns:
+            ContentBlockList: the instruction, solution, feedback and (when
+            non-blank) context sections, in that order.
+        """
+        blocks = ContentBlockList()
+
+        # Instruction section, followed by the solution header
+        blocks.append(f"# Instruction\n{self.instruction}\n\n# Solution\n")
+
+        # `variables` may be a plain string (text-only mode). Normalise it first so
+        # extend() receives content blocks rather than iterating over characters.
+        blocks.extend(ContentBlockList.ensure(self.variables))
+
+        # Feedback section
+        blocks.append(f"\n\n# Feedback\n{self.feedback}")
+
+        # Context section (optional)
+        if self._context_text().strip() != "":
+            blocks.append("\n\n# Context\n")
+            blocks.extend(self.context)
+
+        return blocks
+
+    def has_images(self) -> bool:
+        """Check if this problem instance contains any images.
+
+        Returns:
+            bool: True if the variables field contains ImageContent blocks.
+            A plain-string ``variables`` never contains images.
+        """
+        if isinstance(self.variables, str):
+            return False
+        return any(isinstance(block, ImageContent) for block in self.variables)
+
+class OPROv3(OptoPrimeV3):
+ """OPRO (Optimization by PROmpting) optimizer version 3.
+
+ OPRO is an optimization algorithm that leverages large language models to
+ iteratively improve solutions based on feedback. It treats optimization as
+ a natural language problem where the LLM proposes improvements to variables
+ based on instruction and feedback.
+
+ Parameters
+ ----------
+ *args
+ Variable length argument list passed to parent class.
+ optimizer_prompt_symbol_set : OptimizerPromptSymbolSet, optional
+ The symbol set for formatting prompts and parsing outputs.
+ Defaults to OPROPromptSymbolSet().
+ include_example : bool, optional
+ Whether to include examples in the prompt. Default is False as
+ the default example in OptoPrimeV2 does not work well with OPRO.
+ memory_size : int, optional
+ Number of past optimization steps to remember. Default is 5.
+ problem_context : ContentBlockList, optional
+ Extra context forwarded to the parent class. Default is None.
+ **kwargs
+ Additional keyword arguments passed to parent class.
+
+ Attributes
+ ----------
+ representation_prompt : str
+ Template for explaining the problem representation to the LLM.
+ output_format_prompt_template : str
+ Template for specifying the expected output format.
+ user_prompt_template : str
+ Template for presenting the problem instance to the LLM.
+ context_prompt : str
+ Template for presenting additional context to the LLM.
+ final_prompt : str
+ Template for requesting the final revised solutions.
+ default_objective : str
+ Default objective when none is specified.
+
+ Methods
+ -------
+ problem_instance(summary, mask=None, use_content_blocks=False)
+ Creates a ProblemInstance from an optimization summary.
+ initialize_prompt()
+ Initializes and formats the prompt templates.
+
+ Notes
+ -----
+ OPRO differs from OptoPrime by focusing on simpler problem representations
+ and clearer feedback incorporation. It is particularly effective for
+ problems where the optimization can be expressed in natural language.
+
+ See Also
+ --------
+ OptoPrimeV3 : Parent class providing core optimization functionality.
+ OPROPromptSymbolSet : Symbol set used for formatting.
+
+ Examples
+ --------
+ >>> optimizer = OPROv3(memory_size=10)
+ >>> # Use optimizer to improve solutions based on feedback
+ """
+ representation_prompt = dedent(
+ """
+ You're tasked to change the proposed solution according to feedback.
+
+ Specifically, a problem will be composed of the following parts:
+ - {instruction_section_title}: the instruction which describes the things you need to do or the question you should answer.
+ - {variables_section_title}: the proposed solution that you can change/tweak (trainable).
+ - {feedback_section_title}: the feedback about the solution.
+ - {context_section_title}: the context information that might be useful to solve the problem.
+
+ If `data_type` is `code`, it means `{value_tag}` is the source code of a python code, which may include docstring and definitions.
+ """
+ )
+
+ output_format_prompt_template = dedent(
+ """
+ Output_format: Your output should be in the following XML/HTML format:
+
+ ```
+ {output_format}
+ ```
+
+ In <{reasoning_tag}>, explain the problem: 1. what the {instruction_section_title} means 2. what the {feedback_section_title} means to {variables_section_title} considering how {variables_section_title} follow {instruction_section_title}. 3. Reasoning about the suggested changes in {variables_section_title} (if needed) and the expected result.
+
+ If you need to suggest a change in the values of {variables_section_title}, write down the suggested values in <{improved_variable_tag}>. Remember you can change only the values in {variables_section_title}, not others. When `type` of a variable is `code`, you should write the new definition in the format of python code without syntax errors, and you should not change the function name or the function signature.
+
+ If no changes are needed, just output TERMINATE.
+ """
+ )
+
+ user_prompt_template = dedent(
+ """
+ Now you see problem instance:
+
+ ================================
+ {problem_instance}
+ ================================
+
+ """
+ )
+
+ context_prompt = dedent(
+ """
+ Here is some additional **context** to solving this problem:
+
+ {context}
+ """
+ )
+
+ final_prompt = dedent(
+ """
+ What are your revised solutions on {names}?
+
+ Your response:
+ """
+ )
+
+ # Default Objective becomes instruction for the next block
+ default_objective = "Propose a new solution that will incorporate the feedback."
+
+ def __init__(self, *args,
+ optimizer_prompt_symbol_set: Optional[OptimizerPromptSymbolSet] = None,
+ include_example=False, # default example in OptoPrimeV2 does not work in OPRO
+ memory_size=5,
+ problem_context: Optional[ContentBlockList] = None,
+ **kwargs):
+ """Initialize the OPROv3 optimizer.
+
+ Parameters
+ ----------
+ *args
+ Variable length argument list passed to parent class.
+ optimizer_prompt_symbol_set : OptimizerPromptSymbolSet, optional
+ The symbol set for formatting prompts and parsing outputs.
+ If None (or otherwise falsy), uses OPROPromptSymbolSet().
+ include_example : bool, optional
+ Whether to include examples in the prompt. Default is False.
+ memory_size : int, optional
+ Number of past optimization steps to remember. Default is 5.
+ problem_context : ContentBlockList, optional
+ Extra context forwarded unchanged to the parent class. Default is None.
+ **kwargs
+ Additional keyword arguments passed to parent class.
+ """
+ # Fall back to the OPRO-specific symbols before delegating to the parent.
+ optimizer_prompt_symbol_set = optimizer_prompt_symbol_set or OPROPromptSymbolSet()
+ super().__init__(*args, optimizer_prompt_symbol_set=optimizer_prompt_symbol_set,
+ include_example=include_example, memory_size=memory_size,
+ problem_context=problem_context,
+ **kwargs)
+
+    def parameter_check(self, parameters: List[ParameterNode]):
+        """Check if the parameters are valid.
+
+        This can be overloaded by subclasses to add more checks.
+
+        Args:
+            parameters: List[ParameterNode]
+                The parameters to check. Each one is read for ``is_image`` and ``name``.
+
+        Raises:
+            AssertionError: If more than one parameter contains image data.
+
+        Notes:
+            OPROv3 supports image parameters, but only one parameter can be
+            an image at a time since LLMs can only generate one image per inference.
+        """
+        # Collect the quoted names of image parameters for the error message.
+        image_names = [f"'{param.name}'" for param in parameters if param.is_image]
+
+        if len(image_names) > 1:
+            raise AssertionError(
+                f"OPROv3 supports at most one image parameter, but found {len(image_names)}: "
+                f"{', '.join(image_names)}. LLMs can only generate one image at a time."
+            )
+
+    def problem_instance(self, summary, mask=None, use_content_blocks=False):
+        """Create a ProblemInstance from an optimization summary.
+
+        Parameters
+        ----------
+        summary : object
+            The optimization summary; ``summary.variables`` and
+            ``summary.user_feedback`` are read.
+        mask : list, optional
+            List of sections to mask/hide in the problem instance.
+            Can include "#Instruction" (or the symbol set's instruction section
+            title), the variables section title, or the feedback section title.
+        use_content_blocks : bool, optional
+            If True, use content blocks for multimodal support (images).
+            If False, use text-only representation.
+
+        Returns
+        -------
+        ProblemInstance
+            A problem instance ready for presentation to the LLM.
+
+        Notes
+        -----
+        Masked sections are replaced with an empty value ("" or an empty
+        ContentBlockList) rather than removed.
+        """
+        mask = mask or []
+        symbols = self.optimizer_prompt_symbol_set
+
+        # "#Instruction" is the historical mask key. The symbol-set title is accepted
+        # too, so the instruction is masked the same way as the other sections.
+        hide_instruction = (
+            "#Instruction" in mask or symbols.instruction_section_title in mask
+        )
+
+        if symbols.variables_section_title in mask:
+            variables_content = ContentBlockList() if use_content_blocks else ""
+        else:
+            # Content blocks keep images; the compact text form is the backward-compatible path.
+            render = (
+                self.repr_node_value_compact_as_content_blocks
+                if use_content_blocks
+                else self.repr_node_value_compact
+            )
+            variables_content = render(
+                summary.variables,
+                node_tag=symbols.variable_tag,
+                value_tag=symbols.value_tag,
+                constraint_tag=symbols.constraint_tag,
+            )
+
+        return ProblemInstance(
+            instruction="" if hide_instruction else self.objective,
+            variables=variables_content,
+            feedback="" if symbols.feedback_section_title in mask else summary.user_feedback,
+            context=getattr(self, "problem_context", None),
+            optimizer_prompt_symbol_set=symbols,
+        )
+
+    def repr_node_value_compact_as_content_blocks(self, node_dict, node_tag="node",
+                                                  value_tag="value", constraint_tag="constraint") -> ContentBlockList:
+        """Return a ContentBlockList with a compact representation of the nodes, including images.
+
+        Each entry of ``node_dict`` maps a node name to a sequence whose first
+        item is the value and second item is the constraint (or None).
+
+        - Names containing "__code" are rendered as ``type="code"``: the signature
+          taken from the constraint is kept and only the remaining body is truncated.
+        - Values recognised as images are emitted as an opening text block, the
+          image block, and a closing text block.
+        - All other values are truncated to ``self.initial_var_char_limit``.
+
+        Constraints of non-code nodes are only shown when ``node_tag`` equals the
+        symbol set's variable tag.
+        """
+        from opto.optimizers.optoprime_v3 import value_to_image_content
+
+        blocks = ContentBlockList()
+        is_variable = node_tag == self.optimizer_prompt_symbol_set.variable_tag
+
+        for k, v in node_dict.items():
+            value_data = v[0]
+            constraint = v[1]
+
+            # Rendered constraint element (with trailing newline), or "" when absent.
+            constraint_expr = (
+                f"<{constraint_tag}>\n{constraint}\n</{constraint_tag}>\n"
+                if constraint is not None
+                else ""
+            )
+
+            if "__code" in k:
+                # Code node (never an image). A missing constraint means there is
+                # no signature to preserve, so the whole value is treated as body.
+                signature = (
+                    constraint.replace("The code should start with:\n", "")
+                    if constraint is not None
+                    else ""
+                )
+                func_body = value_data.replace(signature, "")
+                node_value = self.truncate_expression(func_body, self.initial_var_char_limit)
+                blocks.append(
+                    f"<{node_tag} name=\"{k}\" type=\"code\">\n<{value_tag}>\n{signature}{node_value}\n</{value_tag}>\n{constraint_expr}</{node_tag}>\n\n"
+                )
+                continue
+
+            shown_constraint = constraint_expr if is_variable else ""
+            image_content = value_to_image_content(value_data)
+
+            if image_content is not None:
+                # Image node: opening tags, then the image itself, then closing tags.
+                blocks.append(f"<{node_tag} name=\"{k}\" type=\"image\">\n<{value_tag}>\n")
+                blocks.append(image_content)  # Image breaks the text flow
+                blocks.append(f"\n</{value_tag}>\n{shown_constraint}</{node_tag}>\n\n")
+            else:
+                # Non-image node: truncated text representation
+                node_value = self.truncate_expression(value_data, self.initial_var_char_limit)
+                blocks.append(
+                    f"<{node_tag} name=\"{k}\" type=\"{type(value_data).__name__}\">\n<{value_tag}>\n{node_value}\n</{value_tag}>\n{shown_constraint}</{node_tag}>\n\n"
+                )
+
+        return blocks
+
+    def initialize_prompt(self):
+        """Initialize and format the prompt templates.
+
+        Formats ``representation_prompt`` in place and stores the formatted
+        ``output_format_prompt_template`` as ``self.output_format_prompt``, using
+        the tags and section titles of ``self.optimizer_prompt_symbol_set``.
+
+        Notes
+        -----
+        ``variable_expression_format`` and ``context_section_title`` are supplied
+        to both templates even where the default templates do not reference them;
+        ``str.format`` ignores unused keyword arguments.
+        """
+        symbols = self.optimizer_prompt_symbol_set
+
+        # Titles are quoted inline in the prompt prose, so spaces are removed
+        # (e.g. "# Solution" -> "#Solution").
+        titles = {
+            "variables_section_title": symbols.variables_section_title.replace(" ", ""),
+            "feedback_section_title": symbols.feedback_section_title.replace(" ", ""),
+            "instruction_section_title": symbols.instruction_section_title.replace(" ", ""),
+            "context_section_title": symbols.context_section_title.replace(" ", ""),
+        }
+
+        # Example of how a single variable is rendered, with well-formed closing tags.
+        variable_expression_format = dedent(f"""
+            <{symbols.variable_tag} name="variable_name" type="data_type">
+            <{symbols.value_tag}>
+            value
+            </{symbols.value_tag}>
+            <{symbols.constraint_tag}>
+            constraint_expression
+            </{symbols.constraint_tag}>
+            </{symbols.variable_tag}>
+        """)
+
+        self.representation_prompt = self.representation_prompt.format(
+            variable_expression_format=variable_expression_format,
+            value_tag=symbols.value_tag,
+            **titles,
+        )
+        self.output_format_prompt = self.output_format_prompt_template.format(
+            output_format=symbols.output_format,
+            reasoning_tag=symbols.reasoning_tag,
+            improved_variable_tag=symbols.improved_variable_tag,
+            **titles,
+        )
diff --git a/opto/optimizers/optoprime_v3.py b/opto/optimizers/optoprime_v3.py
new file mode 100644
index 00000000..dafbc8a3
--- /dev/null
+++ b/opto/optimizers/optoprime_v3.py
@@ -0,0 +1,1283 @@
+"""
+Key difference to v2:
+1. Use the new backbone conversation history manager
+2. Support multimodal node (both trainable and non-trainable)
+"""
+
+import re
+import json
+from typing import List, Union, Tuple, Optional
+from dataclasses import dataclass
+from opto.optimizers.optoprime import OptoPrime, node_to_function_feedback
+from opto.trace.utils import dedent
+from opto.optimizers.utils import truncate_expression, extract_xml_like_data, is_bedrock_model
+from opto.trace.nodes import ParameterNode, is_image
+from opto.trace.propagators import GraphPropagator
+from opto.trace.propagators.propagators import Propagator
+
+from opto.utils.llm import AbstractModel, LLM
+from opto.optimizers.buffers import FIFOBuffer
+from opto.utils.backbone import (
+ UserTurn, AssistantTurn, PromptTemplate,
+ TextContent, ImageContent, ContentBlockList,
+ DEFAULT_IMAGE_PLACEHOLDER, Content, to_messages
+)
+import copy
+import pickle
+from typing import Dict, Any
+
+
+def value_to_image_content(value: Any) -> Optional[ImageContent]:
+    """Return an ImageContent for ``value`` when it is an image, else None.
+
+    Detection is done by ``is_image()`` from ``opto.trace.nodes``; only values it
+    accepts are handed to ``ImageContent.build()``.
+
+    Per the original notes, ``is_image`` is stricter than ``ImageContent.build``
+    (e.g. URLs must carry an image extension) and recognises:
+    - Base64 data URL strings (data:image/...)
+    - HTTP/HTTPS URLs pointing to images
+    - PIL Image objects
+    - Raw image bytes
+    """
+    return ImageContent.build(value) if is_image(value) else None
+
+
+class OptimizerPromptSymbolSet:
+    """
+    Symbols used to build optimizer prompts and to parse the optimizer's output.
+
+    Subclass this and pass an instance to the optimizer to change how the prompt is worded.
+
+    This divides into three parts:
+    - Section titles: the title of each section in the prompt
+    - Node tags: the tags that capture the graph structure (only tag names are allowed to be changed)
+    - Output format: the format of the output of the optimizer
+    """
+
+    # Titles should be written as markdown titles (space between # and title)
+    # In text, we automatically remove space in the title, so it will become `#Title`
+    variables_section_title = "# Variables"
+    inputs_section_title = "# Inputs"
+    outputs_section_title = "# Outputs"
+    others_section_title = "# Others"
+    feedback_section_title = "# Feedback"
+    instruction_section_title = "# Instruction"
+    code_section_title = "# Code"
+    documentation_section_title = "# Documentation"
+    context_section_title = "# Context"
+
+    node_tag = "node"  # nodes that are constants in the graph
+    variable_tag = "variable"  # nodes that can be changed
+    value_tag = "value"  # inside node, we have value tag
+    constraint_tag = "constraint"  # inside node, we have constraint tag
+
+    # output format
+    # Note: we currently don't support extracting formats like "```code```" because
+    # we assume the supplied tag is name-only, i.e., a bare tag name such as "reasoning".
+    reasoning_tag = "reasoning"
+    improved_variable_tag = "variable"
+    name_tag = "name"
+
+    # only used by JSON format
+    suggestion_tag = "suggestion"
+
+    expect_json = False  # this will stop `enforce_json` arguments passed to LLM calls
+
+    # custom output format
+    # if this is not None, then the user needs to implement the following functions:
+    # - output_response_extractor
+    # - example_output
+    custom_output_format_instruction = None
+
+    @property
+    def output_format(self) -> str:
+        """
+        Text substituted for ``{output_format}`` in the optimizer's
+        ``output_format_prompt_template``.
+
+        Returns the default XML-like layout built from the tag names, or the
+        stripped ``custom_output_format_instruction`` when one is set.
+        """
+        if self.custom_output_format_instruction is not None:
+            return self.custom_output_format_instruction.strip()
+
+        # Default XML-like format; every opening tag has a matching closing tag.
+        return dedent(f"""
+            <{self.reasoning_tag}>
+            reasoning
+            </{self.reasoning_tag}>
+            <{self.improved_variable_tag}>
+            <{self.name_tag}>variable_name</{self.name_tag}>
+            <{self.value_tag}>
+            value
+            </{self.value_tag}>
+            </{self.improved_variable_tag}>
+        """)
+
+    def example_output(self, reasoning, variables):
+        """
+        Render an example answer in the same XML-like layout as ``output_format``.
+
+        reasoning: str; the reasoning block is omitted when this is ""
+        variables: dict mapping variable_name -> value (values are passed through ``str``)
+
+        Raises NotImplementedError when a custom output format is set.
+        """
+        if self.custom_output_format_instruction is not None:
+            raise NotImplementedError
+
+        output = []
+        if reasoning != "":
+            output.extend([
+                f"<{self.reasoning_tag}>",
+                reasoning,
+                f"</{self.reasoning_tag}>",
+            ])
+        for var_name, value in variables.items():
+            output.extend([
+                f"<{self.improved_variable_tag}>",
+                f"<{self.name_tag}>{var_name}</{self.name_tag}>",
+                f"<{self.value_tag}>",
+                str(value),
+                f"</{self.value_tag}>",
+                f"</{self.improved_variable_tag}>",
+            ])
+        return "\n".join(output)
+
+    def output_response_extractor(self, response: str) -> Dict[str, Any]:
+        """
+        Parse a plain-text LLM response with ``extract_xml_like_data``.
+
+        Suggestions for "__code" variables whose value is blank are dropped from
+        the returned ``'variables'`` mapping. Raises NotImplementedError when a
+        custom output format is set.
+        """
+        if self.custom_output_format_instruction is not None:
+            raise NotImplementedError(
+                "If you supplied a custom output format prompt template, you need to implement your own response extractor")
+
+        extracted_data = extract_xml_like_data(response,
+                                               reasoning_tag=self.reasoning_tag,
+                                               improved_variable_tag=self.improved_variable_tag,
+                                               name_tag=self.name_tag,
+                                               value_tag=self.value_tag)
+
+        # If the suggested value is code and the entire body is empty (not even the
+        # function signature is present), the suggestion is removed.
+        variables = extracted_data['variables']
+        empty_code_keys = [
+            key for key, value in variables.items()
+            if "__code" in key and isinstance(value, str) and value.strip() == ""
+        ]
+        for key in empty_code_keys:
+            del variables[key]
+
+        return extracted_data
+
+    @property
+    def default_prompt_symbols(self) -> Dict[str, str]:
+        """Mapping from symbol key to the section title or tag name it stands for."""
+        return {
+            "variables": self.variables_section_title,
+            "inputs": self.inputs_section_title,
+            "outputs": self.outputs_section_title,
+            "others": self.others_section_title,
+            "feedback": self.feedback_section_title,
+            "instruction": self.instruction_section_title,
+            "code": self.code_section_title,
+            "documentation": self.documentation_section_title,
+            "context": self.context_section_title,
+            "reasoning": self.reasoning_tag,
+            "suggestion": self.suggestion_tag
+        }
+
+
+class OptimizerPromptSymbolSetJSON(OptimizerPromptSymbolSet):
+    """We enforce a JSON output format extraction"""
+
+    expect_json = True
+
+    # Placeholders in angle brackets show the model what to fill in.
+    custom_output_format_instruction = dedent("""
+    {
+        "reasoning": <Your reasoning>,
+        "suggestion": {
+            <variable_1>: <suggested_value_1>,
+            <variable_2>: <suggested_value_2>,
+        }
+    }
+    """)
+
+    def example_output(self, reasoning, variables):
+        """
+        Render an example answer as a JSON string.
+
+        reasoning: str
+        variables: dict mapping variable_name -> value
+        """
+        # Same layout as described in custom_output_format_instruction
+        output = {
+            "reasoning": reasoning,
+            "suggestion": dict(variables.items())
+        }
+        return json.dumps(output, indent=2)
+
+    def output_response_extractor(self, response: str) -> Dict[str, Any]:
+        """
+        Extract the reasoning and the suggested variables from a JSON-formatted LLM response.
+
+        Steps:
+        1. If the response contains a fenced block, use the first ```json block,
+           else the first generic fenced block (dropping a leading language line).
+        2. Try ``json.loads``; on failure retry once on the outermost ``{...}`` span.
+        3. If no suggestion was obtained, fall back to a regex scan for
+           ``"<suggestion_tag>": { "name": "value" }`` pairs.
+        4. Drop blank suggestions for "__code" variables.
+
+        Returns ``{"reasoning": ..., "variables": ...}``; parsing failures yield an
+        empty ``variables`` dict rather than raising.
+        """
+        suggestion_tag = self.default_prompt_symbols.get("suggestion", "suggestion")
+        reasoning_tag = self.default_prompt_symbols.get("reasoning", "reasoning")
+
+        ignore_extraction_error = True
+
+        reasoning = "(Unable to extract, possibly due to parsing failure)"
+
+        if "```" in response:
+            # First try to extract from ```json ... ``` blocks
+            json_match = re.findall(r"```json\s*(.*?)```", response, re.DOTALL)
+            if len(json_match) > 0:
+                response = json_match[0].strip()
+            else:
+                # Fall back to regular ``` ... ``` blocks
+                match = re.findall(r"```(.*?)```", response, re.DOTALL)
+                if len(match) > 0:
+                    content = match[0].strip()
+                    # Remove a leading language identifier line if present (e.g., "json", "python")
+                    lines = content.split('\n', 1)
+                    if len(lines) > 1 and lines[0].strip().isalpha() and len(lines[0].strip()) < 20:
+                        response = lines[1].strip()
+                    else:
+                        response = content
+
+        json_extracted = {}
+        suggestion = {}
+        for _ in range(2):
+            try:
+                json_extracted = json.loads(response)
+            except json.JSONDecodeError:
+                # Retry on the outermost {...} span. `response` stays a string even
+                # when no span is found, so later regex searches see the real text.
+                candidates = re.findall(r"{.*}", response, re.DOTALL)
+                if len(candidates) > 0:
+                    response = candidates[0]
+                continue
+            if isinstance(json_extracted, dict):
+                # trim whitespace around keys
+                json_extracted = {k.strip(): v for k, v in json_extracted.items()}
+                suggestion = json_extracted.get(suggestion_tag, json_extracted)
+                reasoning = json_extracted.get(reasoning_tag, "")
+            break
+
+        if not isinstance(suggestion, dict):
+            suggestion = json_extracted if isinstance(json_extracted, dict) else {}
+
+        if len(suggestion) == 0:
+            pattern = rf'"{suggestion_tag}"\s*:\s*\{{(.*?)\}}'
+            suggestion_match = re.search(pattern, str(response), re.DOTALL)
+            if suggestion_match:
+                suggestion = {}
+                suggestion_content = suggestion_match.group(1)
+                # NOTE(review): the value group is greedy, so with several pairs the
+                # first value swallows the rest - confirm this is intended.
+                pair_pattern = r'"([a-zA-Z0-9_]+)"\s*:\s*"(.*)"'
+                pairs = re.findall(pair_pattern, suggestion_content, re.DOTALL)
+                for key, value in pairs:
+                    suggestion[key] = value
+
+        if len(suggestion) == 0 and not ignore_extraction_error:
+            print(f"Cannot extract {suggestion_tag} from LLM's response:\n{response}")
+
+        # JSON values need not be strings; only blank string bodies count as empty code.
+        keys_to_remove = [
+            key for key, value in suggestion.items()
+            if "__code" in key and isinstance(value, str) and value.strip() == ""
+        ]
+        for key in keys_to_remove:
+            del suggestion[key]
+
+        return {"reasoning": reasoning, "variables": suggestion}
+
+
+class OptimizerPromptSymbolSet2(OptimizerPromptSymbolSet):
+ """Alternative symbol set with shorter tag names (const / var / data / reason).
+
+ The section titles below repeat the values defined on OptimizerPromptSymbolSet;
+ only the node and output tags differ.
+ """
+ variables_section_title = "# Variables"
+ inputs_section_title = "# Inputs"
+ outputs_section_title = "# Outputs"
+ others_section_title = "# Others"
+ feedback_section_title = "# Feedback"
+ instruction_section_title = "# Instruction"
+ code_section_title = "# Code"
+ documentation_section_title = "# Documentation"
+ context_section_title = "# Context"
+
+ node_tag = "const" # nodes that are constants in the graph
+ variable_tag = "var" # nodes that can be changed
+ value_tag = "data" # inside node, we have value tag
+ constraint_tag = "constraint" # inside node, we have constraint tag
+
+ # output format
+ reasoning_tag = "reason"
+ improved_variable_tag = "var" # same tag name as variable_tag
+ name_tag = "name"
+
+
+@dataclass
+class FunctionFeedback:
+ """Container for structured feedback from function execution traces.
+
+ Used by OptoPrime to organize execution traces into a format suitable
+ for LLM-based optimization.
+
+ Attributes
+ ----------
+ graph : list[tuple[int, str]]
+ Topologically sorted function calls with (depth, representation) pairs.
+ documentation : dict[str, str]
+ Mapping of function names to their documentation strings.
+ others : dict[str, Any]
+ Intermediate variables with (data, description) tuples.
+ roots : dict[str, Any]
+ Input/root variables with (data, description) tuples.
+ output : dict[str, Any]
+ Output/leaf variables with (data, description) tuples.
+ user_feedback : Union[str, ContentBlockList]
+ User-provided feedback about the execution. May include images.
+
+ Notes
+ -----
+ This structure separates the execution trace into logical components
+ that can be formatted into prompts for LLM-based optimization.
+ """
+
+ graph: List[
+ Tuple[int, str]
+ ] # Each item is a representation of a function call. The items are topologically sorted.
+ documentation: Dict[str, str] # Function name and its documentation string
+ others: Dict[str, Any] # Intermediate variable names and their data
+ roots: Dict[str, Any] # Root variable name and its data
+ output: Dict[str, Any] # Leaf variable name and its data
+ user_feedback: Union[str, ContentBlockList] # User feedback at the leaf of the graph (may include images)
+
+
+@dataclass
+class ProblemInstance:
+    """Problem instance with multimodal content support.
+
+    A composite of several ContentBlockLists, one per section of a problem.
+    ``variables``, ``inputs``, ``others``, ``outputs``, ``feedback`` and the
+    optional ``context`` may mix text and images.
+
+    The class provides:
+    - ``__repr__``: text-only rendering, used for logging.
+    - ``to_content_blocks()``: ContentBlockList rendering for multimodal prompts.
+    - ``has_images()``: whether any content section (context included) holds
+      an image.
+    """
+
+    instruction: str
+    code: str
+    documentation: str
+    variables: ContentBlockList
+    inputs: ContentBlockList
+    others: ContentBlockList
+    outputs: ContentBlockList
+    feedback: ContentBlockList  # May contain images mixed with text
+    context: Optional[ContentBlockList]
+
+    optimizer_prompt_symbol_set: OptimizerPromptSymbolSet
+
+    def __post_init__(self):
+        # Normalize content fields so callers may pass plain strings (or None).
+        # ContentBlockList.ensure is idempotent for existing ContentBlockLists.
+        self.variables = ContentBlockList.ensure(self.variables)
+        self.inputs = ContentBlockList.ensure(self.inputs)
+        self.others = ContentBlockList.ensure(self.others)
+        self.outputs = ContentBlockList.ensure(self.outputs)
+        self.feedback = ContentBlockList.ensure(self.feedback)
+        # A missing context stays None so the Context section can be skipped.
+        if self.context is not None:
+            self.context = ContentBlockList.ensure(self.context)
+
+    # Text-only layout used by __repr__. It is dedented once, BEFORE any
+    # value is substituted, so multi-line values cannot disturb the margin.
+    problem_template = dedent(
+        """
+        # Instruction
+        {instruction}
+
+        # Code
+        {code}
+
+        # Documentation
+        {documentation}
+
+        # Variables
+        {variables}
+
+        # Inputs
+        {inputs}
+
+        # Others
+        {others}
+
+        # Outputs
+        {outputs}
+
+        # Context
+        {context}
+
+        # Feedback
+        {feedback}
+        """
+    )
+
+    def __repr__(self) -> str:
+        """Return the text-only rendering of the problem.
+
+        Content sections are flattened with ``ContentBlockList.to_text()``;
+        a missing context is rendered as an empty section.
+        """
+        return self.problem_template.format(
+            instruction=self.instruction,
+            code=self.code,
+            documentation=self.documentation,
+            variables=self.variables.to_text(),
+            inputs=self.inputs.to_text(),
+            outputs=self.outputs.to_text(),
+            others=self.others.to_text(),
+            context=self.context.to_text() if self.context is not None else "",
+            feedback=self.feedback.to_text(),
+        )
+
+    def to_content_blocks(self) -> ContentBlockList:
+        """Render the problem as a ContentBlockList.
+
+        Section headers are appended as text; the blocks of each section
+        (which may include images) follow their header. The Context section
+        is emitted only when a context exists and its text is non-blank.
+
+        Returns:
+            ContentBlockList: the complete problem, in section order
+            Instruction, Code, Documentation, Variables, Inputs, Others,
+            Outputs, Context (optional), Feedback.
+        """
+        blocks = ContentBlockList()
+
+        # The header is assembled explicitly instead of dedenting an f-string:
+        # dedent() looks at the text AFTER interpolation, so a multi-line
+        # `code` or `documentation` value starting at column 0 would cancel
+        # the dedent and leave the section titles indented.
+        header = (
+            f"\n# Instruction\n{self.instruction}\n\n"
+            f"# Code\n{self.code}\n\n"
+            f"# Documentation\n{self.documentation}\n\n"
+            "# Variables\n"
+        )
+        blocks.append(header)
+
+        # Variables section (may contain images)
+        blocks.extend(self.variables)
+
+        # Inputs section
+        blocks.append("\n\n# Inputs\n")
+        blocks.extend(self.inputs)
+
+        # Others section
+        blocks.append("\n\n# Others\n")
+        blocks.extend(self.others)
+
+        # Outputs section
+        blocks.append("\n\n# Outputs\n")
+        blocks.extend(self.outputs)
+
+        # Context section (optional)
+        if self.context is not None and self.context.to_text().strip() != "":
+            blocks.append("\n\n# Context\n")
+            blocks.extend(self.context)
+
+        # Feedback section (may contain images)
+        blocks.append("\n\n# Feedback\n")
+        blocks.extend(self.feedback)
+
+        return blocks
+
+    def has_images(self) -> bool:
+        """Check whether any content section contains an image.
+
+        Each ContentBlockList field is queried directly, without building
+        the full block list. The optional context is included because
+        user-supplied context may carry images as well.
+
+        Returns:
+            bool: True if any section reports images.
+        """
+        sections = [self.variables, self.inputs, self.others, self.outputs, self.feedback]
+        if self.context is not None:
+            sections.append(self.context)
+        return any(section.has_images() for section in sections)
+
+
+
+
+
+# Content is exposed under two additional names, Context and Feedback, so that
+# call sites can state which role a piece of content plays. Both names refer
+# to the very same class.
+Context = Content
+Feedback = Content
+
+class OptoPrimeV3(OptoPrime):
+ # Class-level prompt templates. The `{...}` placeholders in
+ # `representation_prompt` and `output_format_prompt_template` are filled in
+ # by `initialize_instruct_prompt()`, using the optimizer's prompt symbol set.
+ #
+ # Generic representation prompt: it only explains how to read a problem.
+ # NOTE(review): the Documentation bullet below hard-codes "#Code" and
+ # "#Others" instead of using the section-title placeholders - confirm this
+ # is intended when a custom symbol set renames those sections.
+ representation_prompt = dedent(
+ """You're tasked to solve a coding/algorithm problem. You will see the instruction, the code, the documentation of each function used in the code, and the feedback about the execution result.
+
+ Specifically, a problem will be composed of the following parts:
+ - {instruction_section_title}: the instruction which describes the things you need to do or the question you should answer.
+ - {code_section_title}: the code defined in the problem.
+ - {documentation_section_title}: the documentation of each function used in #Code. The explanation might be incomplete and just contain high-level description. You can use the values in #Others to help infer how those functions work.
+ - {variables_section_title}: the input variables that you can change/tweak (trainable).
+ - {inputs_section_title}: the values of fixed inputs to the code, which CANNOT be changed (fixed).
+ - {others_section_title}: the intermediate values created through the code execution.
+ - {outputs_section_title}: the result of the code output.
+ - {feedback_section_title}: the feedback about the code's execution result.
+ - {context_section_title}: the context information that might be useful to solve the problem.
+
+ In `{variables_section_title}`, `{inputs_section_title}`, `{outputs_section_title}`, and `{others_section_title}`, the format is:
+
+ For variables we express as this:
+ {variable_expression_format}
+
+ If `data_type` is `code`, it means `{value_tag}` is the source code of a python code, which may include docstring and definitions."""
+ )
+
+ # Optimization objective used when the caller does not pass `objective`;
+ # it is formatted in `__init__` with the value tag and section titles.
+ default_objective = "You need to change the `{value_tag}` of the variables in {variables_section_title} to improve the output in accordance to {feedback_section_title}."
+
+ # Describes the expected answer format; appended to the representation
+ # prompt to form the system prompt in `construct_prompt()`.
+ output_format_prompt_template = dedent(
+ """
+ Output_format: Your output should be in the following XML or JSON format:
+
+ {output_format}
+
+ In <{reasoning_tag}>, explain the problem: 1. what the {instruction_section_title} means 2. what the {feedback_section_title} on {outputs_section_title} means to {variables_section_title} considering how {variables_section_title} are used in {code_section_title} and other values in {documentation_section_title}, {inputs_section_title}, {others_section_title}. 3. Reasoning about the suggested changes in {variables_section_title} (if needed) and the expected result.
+
+ If you need to suggest a change in the values of {variables_section_title}, write down the suggested values in <{improved_variable_tag}>. Remember you can change only the values in {variables_section_title}, not others. When `type` of a variable is `code`, you should write the new definition in the format of python code without syntax errors, and you should not change the function name or the function signature.
+
+ If no changes are needed, just output TERMINATE.
+ """
+ )
+
+ # Wraps the worked example (problem + response); used by
+ # `construct_prompt()` only when `include_example` is set.
+ example_problem_template = PromptTemplate(dedent(
+ """
+ Here is an example of problem instance and response:
+
+ ================================
+ {example_problem}
+ ================================
+
+ Your response:
+ {example_response}
+ """
+ ))
+
+ # Wraps the actual problem instance in the user prompt.
+ user_prompt_template = PromptTemplate(dedent(
+ """
+ Now you see problem instance:
+
+ ================================
+ {problem_instance}
+ ================================
+
+ """
+ ))
+
+ # Closing question of the user prompt; `{names}` receives the
+ # comma-separated variable names in `construct_prompt()`.
+ final_prompt = dedent(
+ """
+ What are your suggestions on variables {names}?
+
+ Your response:
+ """
+ )
+
+    def __init__(
+        self,
+        parameters: List[ParameterNode],
+        llm: AbstractModel = None,
+        *args,
+        image_llm: AbstractModel = None,
+        propagator: Propagator = None,
+        objective: Union[None, str] = None,
+        ignore_extraction_error: bool = True,
+        # ignore the type conversion error when extracting updated values from LLM's suggestion
+        include_example=False,
+        memory_size=0,  # Memory size to store the past feedback
+        max_tokens=8192,
+        log=True,
+        initial_var_char_limit=2000,
+        optimizer_prompt_symbol_set: OptimizerPromptSymbolSet = OptimizerPromptSymbolSet(),
+        use_json_object_format=True,  # whether to use json object format for the response when calling LLM
+        truncate_expression=truncate_expression,
+        problem_context: Optional[ContentBlockList] = None,
+        **kwargs,
+    ):
+        """Set up the optimizer state and pre-render the instruction prompts.
+
+        Args:
+            parameters: Parameters to optimize; forwarded to the base class.
+            llm: Model used for optimization. Defaults to ``LLM(mm_beta=True)``;
+                a model whose ``mm_beta`` is falsy is rejected.
+            image_llm: Stored on the instance as ``image_llm``.
+            propagator: Forwarded to the base class.
+            objective: Instruction text; defaults to ``default_objective``
+                formatted with the symbol set's tags and titles.
+            ignore_extraction_error: When False, a failed extraction of the
+                model's suggestion is reported on stdout.
+            include_example: Whether the worked example is added to the prompt.
+            memory_size: Size of the feedback buffer and number of past
+                conversation rounds kept.
+            max_tokens: Token limit passed to the model call.
+            log: When truthy, ``log`` and ``summary_log`` start as empty
+                lists; otherwise both are None.
+            initial_var_char_limit: Character limit handed to
+                ``truncate_expression`` for compact renderings.
+            optimizer_prompt_symbol_set: Tags and section titles used in
+                prompts and when parsing responses.
+            use_json_object_format: Honoured only if the symbol set's
+                ``expect_json`` is truthy as well.
+            truncate_expression: Callable used to shorten long values.
+            problem_context: Optional initial user context.
+        """
+        super().__init__(parameters, *args, propagator=propagator, **kwargs)
+
+        self.truncate_expression = truncate_expression
+        self.problem_context: Optional[ContentBlockList] = problem_context
+        # NOTE(review): if the base __init__ invokes parameter_check(), the
+        # flag it sets is reset here; problem_instance() sets it again when an
+        # image variable is rendered. Confirm the ordering is intended.
+        self.output_contains_image = False
+
+        self.use_json_object_format = use_json_object_format if optimizer_prompt_symbol_set.expect_json and use_json_object_format else False
+        self.ignore_extraction_error = ignore_extraction_error
+        self.llm = llm or LLM(mm_beta=True)
+        self.image_llm = image_llm
+
+        assert self.llm.mm_beta, "OptoPrimeV3 enables multi-modal LLM backbone by default. Please use LLM(model='...', mm_beta=True)."
+
+        self.objective = objective or self.default_objective.format(
+            value_tag=optimizer_prompt_symbol_set.value_tag,
+            variables_section_title=optimizer_prompt_symbol_set.variables_section_title,
+            feedback_section_title=optimizer_prompt_symbol_set.feedback_section_title,
+        )
+        self.initial_var_char_limit = initial_var_char_limit
+        self.optimizer_prompt_symbol_set = optimizer_prompt_symbol_set
+
+        # Worked example: y = a + b, z = y - c with a=5, b=1, c=5 gives z=1.
+        self.example_problem_summary = FunctionFeedback(
+            graph=[(1, 'y = add(x=a,y=b)'), (2, "z = subtract(x=y, y=c)")],
+            documentation={'add': 'This is an add operator of x and y.',
+                           'subtract': "subtract y from x"},
+            others={'y': (6, None)},
+            roots={'a': (5, "a > 0"),
+                   'b': (1, None),
+                   'c': (5, None)},
+            output={'z': (1, None)},
+            user_feedback='The result of the code is not as expected. The result should be 10, but the code returns 1',
+        )
+        self.example_problem_summary.variables = {'a': (5, "a > 0")}
+        self.example_problem_summary.inputs = {'b': (1, None), 'c': (5, None)}
+
+        self.example_problem = self.problem_instance(self.example_problem_summary)
+        # The suggested value must agree with the reasoning text: z = a + 1 - 5,
+        # so a = 14 is the value that yields 10 (the example used to say 10).
+        self.example_response = self.optimizer_prompt_symbol_set.example_output(
+            reasoning="In this case, the desired response would be to change the value of input a to 14, as that would make the code return 10.",
+            variables={
+                'a': 14,
+            }
+        )
+
+        self.include_example = include_example
+        self.max_tokens = max_tokens
+        self.log = [] if log else None
+        self.summary_log = [] if log else None
+        self.memory = FIFOBuffer(memory_size)
+        # Plain list of LiteLLM-format message dicts (no Chat manager). The
+        # system prompt is rebuilt per step; this holds prior user/assistant rounds.
+        self.message_history: List[Dict[str, Any]] = []
+        self.conversation_length = memory_size  # Number of conversation rounds to keep
+
+        self.default_prompt_symbols = self.optimizer_prompt_symbol_set.default_prompt_symbols
+
+        # Work on a copy so later edits never touch the symbol set's defaults.
+        self.prompt_symbols = copy.deepcopy(self.default_prompt_symbols)
+        self.initialize_instruct_prompt()
+
+    def parameter_check(self, parameters: List[ParameterNode]):
+        """Validate the parameters; subclasses may override to add checks.
+
+        Args:
+            parameters: The parameters to check.
+
+        Raises:
+            AssertionError: If more than one parameter contains image data.
+
+        Notes:
+            At most one image parameter is accepted. When exactly one is
+            present, ``self.output_contains_image`` is set to True.
+        """
+        flagged = list(filter(lambda candidate: candidate.is_image, parameters))
+
+        if not flagged:
+            return
+
+        if len(flagged) == 1:
+            self.output_contains_image = True
+            return
+
+        quoted_names = ', '.join(f"'{p.name}'" for p in flagged)
+        raise AssertionError(
+            f"OptoPrimeV3 supports at most one image parameter, but found {len(flagged)}: "
+            f"{quoted_names}. LLMs can only generate one image at a time."
+        )
+
+    def add_context(self, *args, images: Optional[List[Any]] = None, format: str = "PNG"):
+        """Attach user-provided context (text and/or images) to the optimizer.
+
+        All arguments are handed to ``Content(*args, images=images,
+        format=format)``. Two calling patterns are intended:
+
+        **Usage 1: alternating text and images**
+
+            optimizer.add_context("text part 1", image_link, "text part 2", image_file)
+
+        **Usage 2: template with placeholders**
+
+            optimizer.add_context(
+                "text part 1 [IMAGE] text part 2 [IMAGE]",
+                images=[image_link, image_file]
+            )
+
+        In Usage 2 the ``[IMAGE]`` placeholders are replaced, in order, by
+        the entries of ``images``; the counts are expected to match.
+
+        Args:
+            *args: Text and image sources (Usage 1) or one template string
+                (Usage 2).
+            images: Image sources for Usage 2. Each may be a URL string, a
+                local file path, a PIL Image object or a numpy array.
+            format: Image format for numpy arrays (PNG, JPEG, ...). Default: PNG
+
+        Raises:
+            ValueError: Presumably raised by ``Content`` when placeholders
+                and images do not match in number - nothing in this method
+                raises it directly.
+
+        Examples:
+            optimizer.add_context("Here's the diagram:", "diagram.png", "And here's another:", "other.png")
+            optimizer.add_context("See [IMAGE] and compare with [IMAGE]", images=["a.png", "b.png"])
+            optimizer.add_context("Important background information")
+        """
+        new_context = Content(*args, images=images, format=format)
+
+        # First context: keep the object as-is.
+        # NOTE(review): this stores the Content object itself, while later
+        # calls extend it with to_content_blocks(); confirm Content offers
+        # the append/extend interface used below.
+        if self.problem_context is None:
+            self.problem_context = new_context
+            return
+
+        # Subsequent context: separate from what is already stored by a
+        # blank line, then add the new blocks.
+        existing = self.problem_context
+        existing.append("\n\n")
+        existing.extend(new_context.to_content_blocks())
+
+    def initialize_instruct_prompt(self):
+        """Fill the prompt templates with the symbol set's tags and titles.
+
+        Replaces ``self.representation_prompt`` with its formatted form and
+        stores the formatted output-format prompt in
+        ``self.output_format_prompt``. Section titles are inserted with their
+        spaces removed (e.g. "# Code" becomes "#Code").
+        """
+        symbols = self.optimizer_prompt_symbol_set
+
+        # Both templates use the same space-stripped titles; str.format
+        # ignores keyword arguments a template does not reference, so one
+        # mapping serves both calls.
+        section_titles = {
+            title_name: getattr(symbols, title_name).replace(" ", "")
+            for title_name in (
+                "variables_section_title",
+                "inputs_section_title",
+                "outputs_section_title",
+                "feedback_section_title",
+                "instruction_section_title",
+                "code_section_title",
+                "documentation_section_title",
+                "others_section_title",
+                "context_section_title",
+            )
+        }
+
+        # Example of how a single variable is written. Every element is
+        # closed with a proper `</tag>` so the example is well-formed.
+        variable_expression_format = dedent(f"""
+            <{symbols.variable_tag} name="variable_name" type="data_type">
+            <{symbols.value_tag}>
+            value
+            </{symbols.value_tag}>
+            <{symbols.constraint_tag}>
+            constraint_expression
+            </{symbols.constraint_tag}>
+            </{symbols.variable_tag}>
+            """)
+
+        self.representation_prompt = self.representation_prompt.format(
+            variable_expression_format=variable_expression_format,
+            value_tag=symbols.value_tag,
+            **section_titles,
+        )
+        self.output_format_prompt = self.output_format_prompt_template.format(
+            output_format=symbols.output_format,
+            reasoning_tag=symbols.reasoning_tag,
+            improved_variable_tag=symbols.improved_variable_tag,
+            **section_titles,
+        )
+
+    def _constraint_xml(self, constraint, constraint_tag) -> str:
+        """Wrap ``constraint`` in an opening and a closing ``constraint_tag``."""
+        return f"<{constraint_tag}>\n{constraint}\n</{constraint_tag}>"
+
+    def _node_xml(self, name, type_name, value_text, constraint_xml, node_tag, value_tag) -> str:
+        """Render one node element; an empty ``constraint_xml`` omits the constraint."""
+        constraint_part = f"{constraint_xml}\n" if constraint_xml else ""
+        return (
+            f"<{node_tag} name=\"{name}\" type=\"{type_name}\">\n"
+            f"<{value_tag}>\n{value_text}\n</{value_tag}>\n"
+            f"{constraint_part}</{node_tag}>\n"
+        )
+
+    def _node_as_text(self, name, entry, node_tag, value_tag, constraint_tag,
+                      compact, value_text=None) -> str:
+        """Render one ``(value, constraint)`` entry as text.
+
+        ``compact`` truncates the value (for code nodes: only the function
+        body) via ``self.truncate_expression``. ``value_text`` overrides the
+        rendered value of a non-code node, e.g. with an image placeholder.
+        """
+        value, constraint = entry[0], entry[1]
+
+        if "__code" in name:
+            # The constraint of a code node carries the required signature;
+            # the signature is kept intact and only the body may be truncated.
+            signature = constraint.replace("The code should start with:\n", "")
+            body = value.replace(signature, "")
+            if compact:
+                body = self.truncate_expression(body, self.initial_var_char_limit)
+            return self._node_xml(
+                name, "code", f"{signature}{body}",
+                self._constraint_xml(constraint, constraint_tag),
+                node_tag, value_tag,
+            )
+
+        if value_text is None:
+            if compact:
+                value_text = self.truncate_expression(value, self.initial_var_char_limit)
+            else:
+                value_text = str(value)
+
+        # Constraints of non-code nodes are shown for variables only.
+        show_constraint = (
+            constraint is not None
+            and node_tag == self.optimizer_prompt_symbol_set.variable_tag
+        )
+        constraint_xml = self._constraint_xml(constraint, constraint_tag) if show_constraint else ""
+        return self._node_xml(name, type(value).__name__, value_text, constraint_xml, node_tag, value_tag)
+
+    def _nodes_as_text(self, node_dict, node_tag, value_tag, constraint_tag, compact) -> str:
+        """Join the text rendering of every node; images become "[IMAGE]"."""
+        rendered = []
+        for name, entry in node_dict.items():
+            placeholder = "[IMAGE]" if "__code" not in name and is_image(entry[0]) else None
+            rendered.append(
+                self._node_as_text(name, entry, node_tag, value_tag, constraint_tag,
+                                   compact, value_text=placeholder)
+            )
+        return "\n".join(rendered)
+
+    def _nodes_as_content_blocks(self, node_dict, node_tag, value_tag, constraint_tag,
+                                 compact) -> ContentBlockList:
+        """Render every node into a ContentBlockList, keeping images as blocks."""
+        blocks = ContentBlockList()
+        variable_tag = self.optimizer_prompt_symbol_set.variable_tag
+
+        for name, entry in node_dict.items():
+            value, constraint = entry[0], entry[1]
+            # Code nodes are never treated as images.
+            image_content = None if "__code" in name else value_to_image_content(value)
+
+            if image_content is None:
+                text = self._node_as_text(name, entry, node_tag, value_tag, constraint_tag, compact)
+                # Entries are separated by a blank line.
+                blocks.append(text + "\n")
+                continue
+
+            # Image node: opening markup, the image block, closing markup.
+            show_constraint = constraint is not None and node_tag == variable_tag
+            constraint_part = (
+                f"{self._constraint_xml(constraint, constraint_tag)}\n" if show_constraint else ""
+            )
+            blocks.append(f"<{node_tag} name=\"{name}\" type=\"image\">\n<{value_tag}>\n")
+            blocks.append(image_content)
+            blocks.append(f"\n</{value_tag}>\n{constraint_part}</{node_tag}>\n\n")
+
+        return blocks
+
+    def repr_node_value(self, node_dict, node_tag="node",
+                        value_tag="value", constraint_tag="constraint") -> str:
+        """Return the text-only representation of node values.
+
+        Args:
+            node_dict: Maps a node name to a ``(value, constraint)`` pair.
+                Names containing ``"__code"`` are rendered as code nodes.
+            node_tag: Element name wrapped around each node.
+            value_tag: Element name wrapped around each value.
+            constraint_tag: Element name wrapped around each constraint.
+
+        Returns:
+            The node elements joined by newlines; image values are shown as
+            the text ``[IMAGE]``.
+        """
+        return self._nodes_as_text(node_dict, node_tag, value_tag, constraint_tag, compact=False)
+
+    def repr_node_value_compact(self, node_dict, node_tag="node",
+                                value_tag="value", constraint_tag="constraint") -> str:
+        """Return the text-only representation with truncated values.
+
+        Same layout as ``repr_node_value``; values (for code nodes only the
+        function body) are shortened with ``self.truncate_expression`` using
+        ``self.initial_var_char_limit``.
+        """
+        return self._nodes_as_text(node_dict, node_tag, value_tag, constraint_tag, compact=True)
+
+    def repr_node_value_as_content_blocks(self, node_dict, node_tag="node",
+                                          value_tag="value", constraint_tag="constraint") -> ContentBlockList:
+        """Return a ContentBlockList representing node values, including images.
+
+        For an image value the markup before and after the image is appended
+        as separate text, with the image block in between.
+        """
+        return self._nodes_as_content_blocks(node_dict, node_tag, value_tag, constraint_tag, compact=False)
+
+    def repr_node_value_compact_as_content_blocks(self, node_dict, node_tag="node",
+                                                  value_tag="value", constraint_tag="constraint") -> ContentBlockList:
+        """Return a ContentBlockList with truncated values, including images.
+
+        Non-image values are truncated as in ``repr_node_value_compact``;
+        images are kept as blocks and are not truncated.
+        """
+        return self._nodes_as_content_blocks(node_dict, node_tag, value_tag, constraint_tag, compact=True)
+
+    def summarize(self):
+        """Collect the feedback of all trainable parameters into one summary.
+
+        Returns
+        -------
+        FunctionFeedback
+            The result of ``node_to_function_feedback`` applied to the summed
+            feedback, with two attributes added here:
+            - variables: root nodes whose name belongs to a trainable parameter
+            - inputs: all remaining root nodes
+
+        Notes
+        -----
+        Steps performed:
+        1. Aggregate the feedback of each trainable parameter through the
+           propagator and sum the results (a TraceGraph).
+        2. Convert that sum into a FunctionFeedback structure.
+        3. Split the roots into ``variables`` and ``inputs``.
+
+        A trainable parameter that is not among the roots (for instance one
+        not connected to the output) does not appear in ``variables``.
+        """
+        aggregated = [
+            self.propagator.aggregate(param.feedback)
+            for param in self.parameters
+            if param.trainable
+        ]
+        summary = node_to_function_feedback(sum(aggregated))
+
+        # Only membership of the name matters for the split below.
+        trainable_names = {param.py_name for param in self.parameters if param.trainable}
+
+        variables, inputs = {}, {}
+        for py_name, data in summary.roots.items():
+            bucket = variables if py_name in trainable_names else inputs
+            bucket[py_name] = data
+
+        summary.variables = variables
+        summary.inputs = inputs  # non-variable roots
+
+        return summary
+
+    def construct_prompt(self, summary, mask=None, *args, **kwargs):
+        """Build the system prompt and the multimodal user prompt.
+
+        The user prompt consists of, in order: the worked example (only when
+        ``self.include_example`` is set), the problem instance rendered
+        through ``user_prompt_template``, and the final question naming the
+        variables. User-provided context travels inside the problem instance.
+
+        Args:
+            summary: The FunctionFeedback summary containing graph information.
+            mask: List of section titles to exclude from the problem instance.
+
+        Returns:
+            Tuple of (system_prompt, user_prompt):
+            - system_prompt: representation prompt followed by the output
+              format prompt (a string).
+            - user_prompt: a ContentBlockList.
+        """
+        instance = self.problem_instance(summary, mask=mask)
+
+        prompt_blocks = ContentBlockList()
+
+        if self.include_example:
+            # The example is always rendered as text.
+            prompt_blocks.append(
+                self.example_problem_template.format(
+                    example_problem=str(self.example_problem),
+                    example_response=self.example_response,
+                )
+            )
+
+        rendered_instance = self.user_prompt_template.format(
+            problem_instance=instance.to_content_blocks(),
+        )
+        prompt_blocks.append(rendered_instance)
+
+        variable_names = ", ".join(summary.variables.keys())
+        prompt_blocks.append(self.final_prompt.format(names=variable_names))
+
+        # generic representation + output rule
+        system_prompt = self.representation_prompt + self.output_format_prompt
+
+        return system_prompt, prompt_blocks
+
+    def problem_instance(self, summary: FunctionFeedback, mask=None):
+        """Create a ProblemInstance from the summary.
+
+        Args:
+            summary: The FunctionFeedback summary containing graph information.
+            mask: Section titles to leave empty. Titles are compared with the
+                ones of ``self.optimizer_prompt_symbol_set``; for the
+                instruction the legacy spelling ``"#Instruction"`` is
+                accepted too. The user context is never masked.
+
+        Returns:
+            ProblemInstance with content block fields for multimodal support.
+
+        Side effects:
+            Sets ``self.output_contains_image`` to True when the rendered
+            variables contain an image.
+        """
+        mask = mask or []
+        symbols = self.optimizer_prompt_symbol_set
+
+        def compact_section(nodes, section_title):
+            # Truncated rendering of a read-only section, or nothing if masked.
+            if section_title in mask:
+                return ContentBlockList()
+            return self.repr_node_value_compact_as_content_blocks(
+                nodes,
+                node_tag=symbols.node_tag,
+                value_tag=symbols.value_tag,
+                constraint_tag=symbols.constraint_tag,
+            )
+
+        # Variables are rendered in full (not truncated).
+        if symbols.variables_section_title in mask:
+            variables_content = ContentBlockList()
+        else:
+            variables_content = self.repr_node_value_as_content_blocks(
+                summary.variables,
+                node_tag=symbols.variable_tag,
+                value_tag=symbols.value_tag,
+                constraint_tag=symbols.constraint_tag,
+            )
+
+        # we add a temporary check here to ensure no more than 1 parameter is an image
+        num_images = sum(1 for block in variables_content if isinstance(block, ImageContent))
+        if num_images > 0:
+            assert num_images <= 1, "Currently we do not support generating multiple images (more than 1 parameter is an image)"
+            self.output_contains_image = True
+
+        inputs_content = compact_section(summary.inputs, symbols.inputs_section_title)
+        outputs_content = compact_section(summary.output, symbols.outputs_section_title)
+        others_content = compact_section(summary.others, symbols.others_section_title)
+
+        instruction_masked = (
+            "#Instruction" in mask or symbols.instruction_section_title in mask
+        )
+        # The code listing is governed by the Code title; it used to be tied
+        # to the Inputs title, so masking inputs silently removed the code.
+        code_masked = symbols.code_section_title in mask
+        documentation_masked = symbols.documentation_section_title in mask
+        feedback_masked = symbols.feedback_section_title in mask
+
+        return ProblemInstance(
+            instruction="" if instruction_masked else self.objective,
+            code=(
+                "" if code_masked
+                else "\n".join([v for k, v in sorted(summary.graph)])
+            ),
+            documentation=(
+                "" if documentation_masked
+                else "\n".join([f"[{k}] {v}" for k, v in summary.documentation.items()])
+            ),
+            variables=variables_content,
+            inputs=inputs_content,
+            outputs=outputs_content,
+            others=others_content,
+            feedback=Content("") if feedback_masked else Content(summary.user_feedback),
+            context=self.problem_context,
+            optimizer_prompt_symbol_set=symbols,
+        )
+
+    def _step(
+        self, verbose=False, mask=None, *args, **kwargs
+    ) -> Dict[ParameterNode, Any]:
+        """Execute one optimization step.
+
+        Args:
+            verbose: If True, print prompts and responses.
+            mask: List of section titles to exclude from the problem instance.
+
+        Returns:
+            Dictionary mapping parameters to their updated values; empty when
+            the response text contains ``TERMINATE``.
+
+        Notes:
+            Nothing is logged for a TERMINATE response. The logged user
+            prompt is the text form of the problem instance built WITHOUT the
+            mask, not the exact blocks sent to the model.
+        """
+        assert isinstance(self.propagator, GraphPropagator)
+        summary = self.summarize()
+
+        system_prompt, user_content_blocks = self.construct_prompt(summary, mask=mask)
+
+        response = self.call_llm(
+            system_prompt=system_prompt,
+            user_prompt=user_content_blocks,
+            verbose=verbose,
+            max_tokens=self.max_tokens,
+        )
+
+        # Flatten the response once and reuse the text below.
+        response_text = response.to_text()
+        if "TERMINATE" in response_text:
+            return {}
+
+        # suggestion has two keys: reasoning, and variables
+        suggestion = self.extract_llm_suggestion(response_text)
+        update_dict = self.construct_update_dict(suggestion['variables'])
+
+        # A returned image is assigned to the (single) image parameter by hand.
+        images = response.get_images()
+        if images.has_images():
+            assert len(images) == 1, "Currently we only allow at most one image parameter"
+            image_param = [param for param in self.parameters if param.is_image][0]
+            update_dict[image_param] = images[0].as_image()  # parameter as PIL Image
+
+        if self.log is not None:
+            # Build the instance once; both logs refer to the same rendering.
+            logged_instance = self.problem_instance(summary)
+            self.log.append(
+                {
+                    "system_prompt": system_prompt,
+                    "user_prompt": str(logged_instance),
+                    "response": response,
+                }
+            )
+            self.summary_log.append(
+                {"problem_instance": logged_instance, "summary": summary}
+            )
+
+        return update_dict
+
+    def extract_llm_suggestion(self, response: str):
+        """Parse the LLM response text into a suggestion.
+
+        Delegates to the symbol set's ``output_response_extractor``. When the
+        result is empty and ``ignore_extraction_error`` is falsy, the raw
+        response is printed for inspection. The suggestion is returned either way.
+        """
+        extractor = self.optimizer_prompt_symbol_set.output_response_extractor
+        suggestion = extractor(response)
+
+        nothing_extracted = len(suggestion) == 0
+        if nothing_extracted and not self.ignore_extraction_error:
+            print("Cannot extract suggestion from LLM's response:")
+            print(response)
+
+        return suggestion
+
+    def call_llm(
+        self,
+        system_prompt: str,
+        user_prompt: ContentBlockList,
+        verbose: Union[bool, str] = False,
+        max_tokens: int = 4096,
+    ) -> AssistantTurn:
+        """Call the LLM with a prompt and return the response.
+
+        The request is stateless: it is assembled from the system prompt, the
+        most recent rounds kept in ``self.message_history`` and the current
+        user turn. After the call, the user message and the assistant reply
+        are appended to ``self.message_history``.
+
+        Args:
+            system_prompt: The system prompt (always a string).
+            user_prompt: The user prompt as ContentBlockList for multimodal content.
+            verbose: If True, print the prompt and response. If "output", only print response.
+            max_tokens: Maximum tokens in the response.
+
+        Returns:
+            assistant_turn: AssistantTurn object
+        """
+        if verbose not in (False, "output"):
+            # Print text portions, indicate if images present
+            text_parts = [block.text for block in user_prompt if isinstance(block, TextContent)]
+            has_images = any(isinstance(block, ImageContent) for block in user_prompt)
+            suffix = f" [+ {DEFAULT_IMAGE_PLACEHOLDER}]" if has_images else ""
+            print("Prompt\n", system_prompt + "".join(text_parts) + suffix)
+
+        # Build the user message from the content blocks.
+        user_message = UserTurn(user_prompt).to_litellm_format()
+
+        # Keep the last `conversation_length` rounds (each round = user+assistant).
+        if self.conversation_length > 0:
+            history = self.message_history[-2 * self.conversation_length:]
+        else:
+            history = []
+
+        # Assemble the stateless request: system + history + current user turn.
+        messages = to_messages(system_prompt, history=history)
+        messages.append(user_message)
+
+        # Resolve the model name once. Not every LLM wrapper exposes
+        # `model_name`, so both provider checks below must tolerate its absence.
+        model_name = getattr(self.llm, "model_name", None)
+
+        # Bedrock doesn't support response_format natively - LiteLLM adds tools which breaks the response
+        use_json_format = self.use_json_object_format and not is_bedrock_model(model_name)
+        response_format = {"type": "json_object"} if use_json_format else None
+
+        # Prepare common arguments
+        llm_kwargs = {"messages": messages, "max_tokens": max_tokens, "response_format": response_format}
+
+        # Add image generation tool only for non-Gemini models when output contains image
+        is_gemini = "gemini" in (model_name or "").lower()
+        if self.output_contains_image and not is_gemini:
+            llm_kwargs["tools"] = [{"type": "image_generation"}]
+
+        assistant_turn = self.llm(**llm_kwargs)
+
+        if verbose:
+            print("LLM response:\n", assistant_turn)
+
+        # Append this round to the history we manage ourselves.
+        self.message_history.append(user_message)
+        self.message_history.append(assistant_turn.to_litellm_format())
+
+        return assistant_turn
+
+    def save(self, path: str):
+        """Pickle the optimizer's persistent attributes to ``path``.
+
+        The file holds a single dict keyed by attribute name, in the order
+        listed below.
+        """
+        persisted_attributes = (
+            "truncate_expression",
+            "use_json_object_format",
+            "ignore_extraction_error",
+            "objective",
+            "initial_var_char_limit",
+            "optimizer_prompt_symbol_set",
+            "include_example",
+            "max_tokens",
+            "memory",
+            "message_history",
+            "conversation_length",
+            "default_prompt_symbols",
+            "prompt_symbols",
+            "representation_prompt",
+            "output_format_prompt",
+        )
+        with open(path, 'wb') as f:
+            # Snapshot is built after the file is opened, as before.
+            snapshot = {name: getattr(self, name) for name in persisted_attributes}
+            pickle.dump(snapshot, f)
+
+    def load(self, path: str):
+        """Restore the optimizer's persistent attributes from a pickle file.
+
+        ``message_history`` and ``conversation_length`` fall back to ``[]`` and
+        ``0`` when absent from the file; every other key is required and a
+        missing one raises ``KeyError``.
+
+        NOTE: ``pickle.load`` can execute arbitrary code; only load trusted files.
+        """
+        with open(path, 'rb') as f:
+            state = pickle.load(f)
+
+        required = object()  # marker: no fallback, the key must be present
+        schema = (
+            ("truncate_expression", required),
+            ("use_json_object_format", required),
+            ("ignore_extraction_error", required),
+            ("objective", required),
+            ("initial_var_char_limit", required),
+            ("optimizer_prompt_symbol_set", required),
+            ("include_example", required),
+            ("max_tokens", required),
+            ("memory", required),
+            ("message_history", []),
+            ("conversation_length", 0),
+            ("default_prompt_symbols", required),
+            ("prompt_symbols", required),
+            ("representation_prompt", required),
+            ("output_format_prompt", required),
+        )
+        for name, fallback in schema:
+            if fallback is required:
+                value = state[name]
+            else:
+                value = state.get(name, fallback)
+            setattr(self, name, value)
diff --git a/opto/optimizers/utils.py b/opto/optimizers/utils.py
index 13a5ad01..ca967402 100644
--- a/opto/optimizers/utils.py
+++ b/opto/optimizers/utils.py
@@ -1,5 +1,26 @@
from typing import Dict, Any
+
+def is_bedrock_model(model_name: str) -> bool:
+    """Check whether a model name refers to an AWS Bedrock model.
+
+    Bedrock models in LiteLLM look like ``bedrock/us.anthropic.claude-...`` or
+    carry a geography prefix of a cross-region inference profile such as
+    ``us.``, ``eu.``, ``apac.``, ``us-gov.`` or ``global.`` (``ap.`` is also
+    accepted for backward compatibility).
+
+    This is a name-based heuristic: no request is made, and any name that
+    starts with one of the prefixes is reported as Bedrock.
+
+    Args:
+        model_name: The model name string to check (may be None).
+
+    Returns:
+        True if the model is a Bedrock model, False otherwise.
+    """
+    if model_name is None:
+        return False
+    # These are inference-profile geography prefixes ("us.", "apac.", ...),
+    # not region names such as "us-east-1".
+    bedrock_prefixes = (
+        'bedrock/',
+        'us.', 'eu.', 'ap.',
+        'apac.', 'us-gov.', 'global.',
+    )
+    return model_name.startswith(bedrock_prefixes)
+
+
def print_color(message, color=None, logger=None):
colors = {
"red": "\033[91m",
diff --git a/opto/trace/nodes.py b/opto/trace/nodes.py
index ad935015..8775deb9 100644
--- a/opto/trace/nodes.py
+++ b/opto/trace/nodes.py
@@ -285,6 +285,102 @@ def __len__(self):
T = TypeVar("T")
+def verify_data_is_image_url(url: str, timeout: float = 1.0) -> bool:
+    """Check over the network whether ``url`` serves an image.
+
+    Sends a HEAD request (following redirects) and inspects the returned
+    ``Content-Type`` header. Intended for cases where the pattern-based
+    :func:`is_image` check is not definitive enough.
+
+    Args:
+        url: The URL to check. Non-string values yield False.
+        timeout: Maximum time in seconds to wait for the request. Default 1.0.
+
+    Returns:
+        bool: True if the response Content-Type starts with ``image/``.
+        False for non-http(s) input, when ``requests`` is not installed
+        (a warning is emitted), or when the request fails for any reason.
+    """
+    if not isinstance(url, str):
+        return False
+
+    try:
+        from urllib.parse import urlparse
+        scheme = urlparse(url).scheme
+        if scheme not in ('http', 'https'):
+            return False
+
+        try:
+            import requests
+            head_response = requests.head(url, timeout=timeout, allow_redirects=True)
+            declared_type = head_response.headers.get('content-type', '')
+            return declared_type.lower().startswith('image/')
+        except ImportError:
+            warnings.warn(
+                "requests library not available. Install with: pip install requests",
+                ImportWarning,
+            )
+            return False
+        except Exception:
+            # Network errors, timeouts, invalid URLs, etc.
+            return False
+    except (ValueError, AttributeError):
+        # Raised by URL parsing on malformed input.
+        return False
+
+
+def is_image(data) -> bool:
+    """Heuristically decide whether ``data`` represents an image.
+
+    Recognized forms, checked in this order:
+      1. base64 data-URL strings (``data:image/...``),
+      2. PIL Image objects (only when Pillow is importable),
+      3. raw bytes that Pillow can open,
+      4. http/https URLs whose path ends in a known image extension
+         (no network request is made),
+      5. objects whose class name contains ``ImageContent`` (matched by name
+         to avoid an import cycle).
+
+    For network verification of URLs, use :func:`verify_data_is_image_url`.
+    Convert numpy arrays to PIL Images first.
+    """
+    data_is_str = isinstance(data, str)
+
+    # 1. Base64 data URL string
+    if data_is_str and data.startswith('data:image/'):
+        return True
+
+    # 2. PIL Image object
+    try:
+        from PIL import Image
+    except ImportError:
+        Image = None
+    if Image is not None and isinstance(data, Image.Image):
+        return True
+
+    # 3. Raw image bytes: accepted only if Pillow manages to open them.
+    if isinstance(data, bytes):
+        try:
+            from PIL import Image as _pil_image
+            from io import BytesIO
+            _pil_image.open(BytesIO(data))
+        except Exception:
+            pass
+        else:
+            return True
+
+    # 4. Image URL (pattern-based, no network request)
+    if data_is_str:
+        image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp',
+                            '.svg', '.ico', '.tiff', '.tif', '.heic', '.heif')
+        try:
+            from urllib.parse import urlparse
+            url = urlparse(data)
+            if url.scheme in ('http', 'https') and url.path.lower().endswith(image_extensions):
+                return True
+        except (ValueError, AttributeError):
+            pass
+
+    # 5. Specialized container class (e.g. ImageContent) checked by name to keep
+    # nodes.py free of external (opto.utils) dependencies.
+    try:
+        return 'ImageContent' in data.__class__.__name__
+    except AttributeError:
+        return False
+
+
class AbstractNode(Generic[T]):
"""AbstractNode represents an abstract data node in a directed graph.
@@ -362,6 +458,11 @@ def data(self):
current_used_nodes[-1].add(self)
return self.__getattribute__("_data")
+    @property
+    def is_image(self) -> bool:
+        """Whether this node's data represents an image (see :func:`is_image`).
+
+        Delegates to the module-level :func:`is_image` helper, passing the raw
+        ``_data`` attribute rather than going through the ``data`` property.
+        """
+        return is_image(self._data)
+
@property
def parents(self):
"""Get the parents of a node.
diff --git a/opto/utils/backbone/__init__.py b/opto/utils/backbone/__init__.py
new file mode 100644
index 00000000..22f5b87a
--- /dev/null
+++ b/opto/utils/backbone/__init__.py
@@ -0,0 +1,33 @@
+"""Minimal multimodal conversation primitives for Trace optimizers.
+
+Provides text/image content blocks, lightweight user/assistant turns, a prompt
+template, and a stateless :func:`to_messages` helper for building provider-ready
+messages lists. There is no conversation manager: optimizers own their own
+message history as a plain list of dicts.
+"""
+from .content import (
+ DEFAULT_IMAGE_PLACEHOLDER,
+ ContentBase,
+ ContentBlockList,
+ Content,
+ TextContent,
+ ImageContent,
+ ContentBlock,
+)
+from .template import PromptTemplate
+from .turns import Turn, UserTurn, AssistantTurn, to_messages
+
+__all__ = [
+ "DEFAULT_IMAGE_PLACEHOLDER",
+ "ContentBase",
+ "ContentBlockList",
+ "Content",
+ "ContentBlock",
+ "TextContent",
+ "ImageContent",
+ "PromptTemplate",
+ "Turn",
+ "UserTurn",
+ "AssistantTurn",
+ "to_messages",
+]
diff --git a/opto/utils/backbone/content.py b/opto/utils/backbone/content.py
new file mode 100644
index 00000000..374bc96a
--- /dev/null
+++ b/opto/utils/backbone/content.py
@@ -0,0 +1,463 @@
+"""Multimodal content blocks (text + image) for LLM conversations.
+
+Every class here is a small, picklable data class with a ``build``/``autocast``
+helper to construct itself from loosely typed input. These primitives are the
+minimal layer used by the optimizers to send text and images to an LLM.
+"""
+from typing import List, Dict, Any, Optional, Literal, Union, Iterable
+from dataclasses import dataclass
+import base64
+from pathlib import Path
+
+from PIL import Image
+import io
+
+
+# Placeholder used when rendering an image as plain text.
+DEFAULT_IMAGE_PLACEHOLDER = "\n[IMAGE]\n"
+
+
+@dataclass
+class ContentBase:
+    """Common base for content blocks.
+
+    Stores arbitrary keyword arguments as attributes and declares the three
+    operations every concrete block must provide: ``to_dict``, ``build`` and
+    ``is_empty``.
+    """
+
+    def __init__(self, **kwargs):
+        # Each keyword becomes an instance attribute of the same name.
+        for attribute in kwargs.items():
+            setattr(self, *attribute)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the block to a plain dict (implemented by subclasses)."""
+        raise NotImplementedError("Subclasses must implement this method")
+
+    @classmethod
+    def build(cls, value: Any, **kwargs) -> "ContentBase":
+        """Construct a block from a loosely typed value (implemented by subclasses)."""
+        raise NotImplementedError("Subclasses must implement this method")
+
+    def is_empty(self) -> bool:
+        """Report whether the block carries no content (implemented by subclasses)."""
+        raise NotImplementedError("Subclasses must implement this method")
+
+
+class ContentBlockList(list):
+    """List of content blocks with automatic type conversion.
+
+    Supports automatic conversion from str (-> TextContent), a single
+    ContentBlock, a list of ContentBlocks, or None (-> empty list). May contain
+    mixed types (text and images).
+
+    ``append`` and ``extend`` are overridden: adjacent text blocks are merged
+    into one ``TextContent`` joined by a single space, and both methods return
+    ``self`` (unlike the built-in ``list`` methods, which return ``None``).
+    """
+
+    def __init__(self, content: Union[str, "ContentBase", List["ContentBase"], None] = None):
+        """Create the list, normalizing ``content`` and merging adjacent text."""
+        super().__init__()
+        if content is not None:
+            # Routed through extend() so items pass through the merging append().
+            self.extend(self._normalize(content))
+
+    @staticmethod
+    def _normalize(content: Union[str, "ContentBase", List["ContentBase"], None]) -> List["ContentBase"]:
+        """Coerce ``content`` to a plain list of items.
+
+        None and the empty string become ``[]``; a non-empty string becomes a
+        single ``TextContent``; a list (including a ContentBlockList) is
+        returned as-is; anything else is wrapped in a one-element list.
+        """
+        if content is None:
+            return []
+        if isinstance(content, str):
+            return [TextContent(text=content)] if content else []
+        if isinstance(content, list):
+            return content
+        return [content]
+
+    @classmethod
+    def ensure(cls, content: Union[str, "ContentBase", List["ContentBase"], None]) -> "ContentBlockList":
+        """Return content as a ContentBlockList, converting if needed."""
+        if isinstance(content, cls):
+            return content
+        return cls(content)
+
+    def __getitem__(self, key: Union[int, slice]) -> Union["ContentBase", "ContentBlockList"]:
+        """Index like a list; slices come back as a new ContentBlockList."""
+        if isinstance(key, slice):
+            return ContentBlockList(list.__getitem__(self, key))
+        return list.__getitem__(self, key)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize as ``{"type": "list", "blocks": [...]}`` using each block's ``to_dict``."""
+        return {"type": "list", "blocks": [b.to_dict() for b in self]}
+
+    def append(self, item: Union[str, "ContentBase", "ContentBlockList"]) -> "ContentBlockList":
+        """Append a string or ContentBlock, merging consecutive text blocks.
+
+        When the last element is a ``TextContent`` and ``item`` is text, the
+        last element is replaced by a new ``TextContent`` holding both texts
+        joined by a space. A ``ContentBlockList`` item is spliced in with the
+        plain ``list.extend`` (no merging at the boundary). Returns ``self``.
+        """
+        if isinstance(item, str):
+            if self and isinstance(self[-1], TextContent):
+                self[-1] = TextContent(text=self[-1].text + " " + item)
+            else:
+                super().append(TextContent(text=item))
+        elif isinstance(item, TextContent):
+            if self and isinstance(self[-1], TextContent):
+                self[-1] = TextContent(text=self[-1].text + " " + item.text)
+            else:
+                super().append(item)
+        elif isinstance(item, ContentBlockList):
+            super().extend(item)
+        else:
+            super().append(item)
+        return self
+
+    def extend(self, blocks: Union[str, "ContentBase", List["ContentBase"], "ContentBlockList", None]) -> "ContentBlockList":
+        """Extend with blocks, merging consecutive TextContent."""
+        # Each item goes through append() so the text-merging rule applies.
+        for block in self._normalize(blocks):
+            self.append(block)
+        return self
+
+    def __add__(self, other) -> "ContentBlockList":
+        """Return a new list holding ``self`` followed by ``other`` (list or str)."""
+        if isinstance(other, (ContentBlockList, list)):
+            result = ContentBlockList(list(self))
+            result.extend(other)
+            return result
+        if isinstance(other, str):
+            result = ContentBlockList(list(self))
+            result.append(TextContent(text=other))
+            return result
+        return NotImplemented
+
+    def __radd__(self, other) -> "ContentBlockList":
+        """Support ``"text" + blocks``: the string becomes the leading text block."""
+        if isinstance(other, str):
+            result = ContentBlockList([TextContent(text=other)])
+            result.extend(self)
+            return result
+        return NotImplemented
+
+    def is_empty(self) -> bool:
+        """True when there are no blocks or every block reports ``is_empty()``."""
+        if len(self) == 0:
+            return True
+        return all(block.is_empty() for block in self)
+
+    def has_images(self) -> bool:
+        """True if at least one block is an ``ImageContent``."""
+        return any(isinstance(block, ImageContent) for block in self)
+
+    def has_text(self) -> bool:
+        """True if at least one block is a ``TextContent``."""
+        return any(isinstance(block, TextContent) for block in self)
+
+    def to_text(self, image_placeholder: str = DEFAULT_IMAGE_PLACEHOLDER) -> str:
+        """Text representation where images are replaced with a placeholder.
+
+        Nested ContentBlockLists are handled recursively. Parts are joined
+        with a single space; blocks of any other type are skipped.
+        """
+        parts = []
+        for block in self:
+            if isinstance(block, TextContent):
+                parts.append(block.text)
+            elif isinstance(block, ImageContent):
+                parts.append(image_placeholder)
+            elif isinstance(block, ContentBlockList):
+                nested = block.to_text(image_placeholder)
+                if nested:
+                    parts.append(nested)
+        return " ".join(parts)
+
+    def __bool__(self) -> bool:
+        """Truthy when any block is an image or a text block with non-whitespace text."""
+        for block in self:
+            if isinstance(block, ImageContent):
+                return True
+            if isinstance(block, TextContent) and block.text.strip():
+                return True
+        return False
+
+    def __repr__(self) -> str:
+        """Use the plain-text rendering (``to_text``) as the representation."""
+        return self.to_text()
+
+    def to_content_blocks(self) -> "ContentBlockList":
+        """Return self (interface compatibility with composite classes)."""
+        return self
+
+    def to_litellm_format(self, role: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Convert content blocks to LiteLLM/OpenAI Response API format.
+
+        Empty blocks are skipped. ``role`` defaults to ``"user"`` and is only
+        forwarded to text blocks. Blocks without a ``to_litellm_format``
+        method fall back to ``to_dict()``.
+        """
+        if role is None:
+            role = "user"
+        content = []
+        for block in self:
+            if block.is_empty():
+                continue
+            if isinstance(block, TextContent):
+                content.append(block.to_litellm_format(role=role))
+            elif isinstance(block, ImageContent):
+                content.append(block.to_litellm_format())
+            elif hasattr(block, "to_litellm_format"):
+                content.append(block.to_litellm_format())
+            else:
+                content.append(block.to_dict())
+        return content
+
+
+class Content(ContentBlockList):
+    """User-facing multimodal content builder for the optimizer.
+
+    Creation patterns:
+    - Variadic: ``Content("text", image, "more text")`` (each argument is
+      first tried as an image - path, URL, bytes, PIL, ... - and appended
+      as-is when it does not convert to one)
+    - Template: ``Content(f"See{DEFAULT_IMAGE_PLACEHOLDER}here", images=[img])``.
+      The template must contain ``DEFAULT_IMAGE_PLACEHOLDER`` exactly
+      (including its surrounding newlines) once per image.
+    - Empty: ``Content()``
+    """
+
+    def __init__(self, *args, images: Optional[List[Any]] = None, format: str = "PNG"):
+        """Build the content in template mode (``images`` given) or variadic mode.
+
+        Raises:
+            ValueError: In template mode, when ``args`` is not exactly one
+                string, when the placeholder count differs from
+                ``len(images)``, or when an image cannot be converted.
+        """
+        super().__init__()
+        if images is not None:
+            if len(args) != 1 or not isinstance(args[0], str):
+                raise ValueError(
+                    "Template mode requires exactly one template string as the first "
+                    f"argument. Got {len(args)} arguments."
+                )
+            self._build_from_template(args[0], images=images, format=format)
+        elif args:
+            self._build_from_variadic(*args)
+
+    def _build_from_variadic(self, *args) -> None:
+        """Append each argument, as an image when it converts to one."""
+        for arg in args:
+            image_content = ImageContent.build(arg)
+            if not image_content.is_empty():
+                self.append(image_content)
+            else:
+                self.append(arg)
+
+    def _build_from_template(self, template: str, images: List[Any], format: str = "PNG") -> None:
+        """Interleave template text with ``images`` at each placeholder."""
+        placeholder = DEFAULT_IMAGE_PLACEHOLDER
+        placeholder_count = template.count(placeholder)
+        if placeholder_count != len(images):
+            raise ValueError(
+                f"Number of {placeholder} placeholders ({placeholder_count}) "
+                f"does not match number of images ({len(images)})"
+            )
+        parts = template.split(placeholder)
+        for i, part in enumerate(parts):
+            if part:
+                self.append(part)
+            if i < len(images):
+                image_content = ImageContent.build(images[i], format=format)
+                # build() never returns None; an unconvertible value yields an
+                # *empty* ImageContent, so emptiness is the failure signal.
+                if image_content.is_empty():
+                    raise ValueError(
+                        f"Could not convert image at index {i} to ImageContent: {type(images[i])}"
+                    )
+                self.append(image_content)
+
+
+@dataclass
+class TextContent(ContentBase):
+    """A block of plain text.
+
+    Concatenation with ``+`` (against a str or another TextContent) yields a
+    new TextContent whose texts are separated by one space.
+    """
+    type: Literal["text"] = "text"
+    text: str = ""
+
+    def __init__(self, text: str = ""):
+        super().__init__(text=text)
+
+    def is_empty(self) -> bool:
+        """True when the text is empty (falsy)."""
+        if self.text:
+            return False
+        return True
+
+    @classmethod
+    def build(cls, value: Any = "", **kwargs) -> "TextContent":
+        """Create a block from ``value``, converting non-strings with ``str()``."""
+        if not isinstance(value, str):
+            value = str(value)
+        return cls(text=value)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize as ``{"type": ..., "text": ...}``."""
+        return {"type": self.type, "text": self.text}
+
+    def to_litellm_format(self, role: str = "user") -> Dict[str, Any]:
+        """Response API format: input_text for user, output_text for assistant."""
+        if role == "user":
+            text_type = "input_text"
+        else:
+            text_type = "output_text"
+        return {"type": text_type, "text": self.text}
+
+    def __add__(self, other) -> "TextContent":
+        if isinstance(other, str):
+            appended = other
+        elif isinstance(other, TextContent):
+            appended = other.text
+        else:
+            return NotImplemented
+        return TextContent(text=self.text + " " + appended)
+
+    def __radd__(self, other) -> "TextContent":
+        if not isinstance(other, str):
+            return NotImplemented
+        return TextContent(text=other + " " + self.text)
+
+
+@dataclass
+class ImageContent(ContentBase):
+    """Image content block - supports URLs, base64, file paths, bytes, PIL, numpy.
+
+    Storage: ``image_url`` (http/https or data URL), ``image_data`` (base64), or
+    ``image_bytes`` (raw bytes; Gemini prefers these). Use :meth:`build` to
+    construct from any supported value.
+    """
+    type: Literal["image"] = "image"
+    image_url: Optional[str] = None
+    image_data: Optional[str] = None  # base64 encoded
+    image_bytes: Optional[bytes] = None
+    media_type: str = "image/jpeg"
+    detail: Optional[str] = None  # OpenAI: "auto", "low", "high"
+
+    def __init__(self, value: Any = None, format: str = "PNG", **kwargs):
+        """Create from explicit field kwargs, or auto-detect from ``value``.
+
+        When any keyword argument is given, ``value`` is ignored and the
+        kwargs are stored as fields; otherwise ``value`` goes through
+        :meth:`autocast`.
+        """
+        if kwargs:
+            kwargs.setdefault("type", "image")
+            kwargs.setdefault("media_type", "image/jpeg")
+            super().__init__(**kwargs)
+        else:
+            super().__init__(**self.autocast(value, format=format))
+
+    def __repr__(self) -> str:
+        # Long payloads are truncated to their first 10 items for readability.
+        data = f"{self.image_data[:10]}..." if self.image_data and len(self.image_data) > 10 else self.image_data
+        raw = f"{str(self.image_bytes[:10])}..." if self.image_bytes and len(self.image_bytes) > 10 else self.image_bytes
+        return f"ImageContent(image_url={self.image_url}, image_data={data}, image_bytes={raw}, media_type={self.media_type})"
+
+    __str__ = __repr__
+
+    def is_empty(self) -> bool:
+        """True when no URL, base64 data or raw bytes are set."""
+        return not self.image_url and not self.image_data and not self.image_bytes
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize to a dict, omitting storage fields that are unset."""
+        result = {"type": self.type, "media_type": self.media_type}
+        if self.image_url:
+            result["image_url"] = self.image_url
+        if self.image_data:
+            result["image_data"] = self.image_data
+        if self.image_bytes:
+            result["image_bytes"] = self.image_bytes
+        if self.detail:
+            result["detail"] = self.detail
+        return result
+
+    def to_litellm_format(self) -> Dict[str, Any]:
+        """Response API format: {"type": "input_image", "image_url": "..."}.
+
+        base64 data and raw bytes are rendered as ``data:`` URLs using
+        ``media_type``; an empty block yields an empty ``image_url``.
+        """
+        if self.image_url:
+            url = self.image_url
+        elif self.image_data:
+            url = f"data:{self.media_type};base64,{self.image_data}"
+        elif self.image_bytes:
+            b64 = base64.b64encode(self.image_bytes).decode("utf-8")
+            url = f"data:{self.media_type};base64,{b64}"
+        else:
+            return {"type": "input_image", "image_url": ""}
+        result = {"type": "input_image", "image_url": url}
+        if self.detail:
+            result["detail"] = self.detail
+        return result
+
+    @staticmethod
+    def _sniff_media_type(data: bytes, default: str = "image/jpeg") -> str:
+        """Guess the media type of raw image bytes from their leading signature.
+
+        Recognizes PNG, JPEG, GIF and WebP; returns ``default`` otherwise.
+        """
+        if data.startswith(b"\x89PNG\r\n\x1a\n"):
+            return "image/png"
+        if data.startswith(b"\xff\xd8\xff"):
+            return "image/jpeg"
+        if data.startswith((b"GIF87a", b"GIF89a")):
+            return "image/gif"
+        if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
+            return "image/webp"
+        return default
+
+    @staticmethod
+    def autocast(value: Any, format: str = "PNG") -> Dict[str, Any]:
+        """Auto-detect value type and return image field values.
+
+        Accepts: None, ImageContent, URL/data-URL/file-path strings, raw bytes,
+        PIL Images, and numpy arrays. Values that match none of these yield
+        the all-empty field dict.
+
+        Args:
+            value: The value to convert.
+            format: Encoding used for numpy arrays and for PIL images that
+                carry no format of their own.
+        """
+        empty = {"image_url": None, "image_data": None, "image_bytes": None, "media_type": "image/jpeg"}
+        if value is None:
+            return dict(empty)
+
+        if isinstance(value, ImageContent):
+            return {
+                "image_url": value.image_url,
+                "image_data": value.image_data,
+                "image_bytes": value.image_bytes,
+                "media_type": value.media_type,
+            }
+
+        if isinstance(value, str):
+            if not value.strip():
+                return dict(empty)
+            if value.startswith("data:image/"):
+                try:
+                    header, b64 = value.split(",", 1)
+                    media_type = header.split(":")[1].split(";")[0]
+                    return {"image_url": None, "image_data": b64, "image_bytes": None, "media_type": media_type}
+                except (ValueError, IndexError):
+                    return {"image_url": None, "image_data": value.split(",")[-1], "image_bytes": None, "media_type": "image/jpeg"}
+            if value.startswith("http://") or value.startswith("https://"):
+                return {"image_url": value, "image_data": None, "image_bytes": None, "media_type": "image/jpeg"}
+            # Treat short strings as possible file paths.
+            # NOTE(review): any existing path is read as an image, whatever
+            # its extension - confirm this is intended for plain-text inputs.
+            if len(value) < 4096:
+                path = Path(value)
+                try:
+                    if path.exists():
+                        media_type = _ext_to_media_type(path.suffix)
+                        with open(value, "rb") as f:
+                            image_data = base64.b64encode(f.read()).decode("utf-8")
+                        return {"image_url": None, "image_data": image_data, "image_bytes": None, "media_type": media_type}
+                except (OSError, IOError):
+                    pass
+
+        if isinstance(value, bytes):
+            # Label the payload with its real format so the generated data URL
+            # does not claim JPEG for PNG/GIF/WebP bytes.
+            return {
+                "image_url": None,
+                "image_data": base64.b64encode(value).decode("utf-8"),
+                "image_bytes": None,
+                "media_type": ImageContent._sniff_media_type(value),
+            }
+
+        if isinstance(value, Image.Image):
+            buffer = io.BytesIO()
+            img_format = value.format or format.upper()
+            value.save(buffer, format=img_format)
+            buffer.seek(0)
+            return {
+                "image_url": None,
+                "image_data": base64.b64encode(buffer.getvalue()).decode("utf-8"),
+                "image_bytes": None,
+                "media_type": f"image/{img_format.lower()}",
+            }
+
+        try:
+            import numpy as np
+            if isinstance(value, np.ndarray) or hasattr(value, "__array__"):
+                arr = value if isinstance(value, np.ndarray) else np.array(value)
+                # Floats with max <= 1.0 are scaled by 255; all other values
+                # are cast to uint8 directly.
+                if arr.dtype in (np.float32, np.float64):
+                    arr = (arr * 255).astype(np.uint8) if arr.max() <= 1.0 else arr.astype(np.uint8)
+                elif arr.dtype != np.uint8:
+                    arr = arr.astype(np.uint8)
+                image = Image.fromarray(arr)
+                buffer = io.BytesIO()
+                image.save(buffer, format=format.upper())
+                buffer.seek(0)
+                return {
+                    "image_url": None,
+                    "image_data": base64.b64encode(buffer.getvalue()).decode("utf-8"),
+                    "image_bytes": None,
+                    "media_type": f"image/{format.lower()}",
+                }
+        except ImportError:
+            pass
+
+        return dict(empty)
+
+    @classmethod
+    def build(cls, value: Any, format: str = "PNG") -> "ImageContent":
+        """Construct an ImageContent from any supported value.
+
+        Never returns None: an unsupported value yields an empty block
+        (``is_empty()`` is True). An existing instance is returned unchanged.
+        """
+        if isinstance(value, cls):
+            return value
+        return cls(**cls.autocast(value, format=format))
+
+    def as_image(self) -> Image.Image:
+        """Return the image as a PIL Image, fetching from URL if needed.
+
+        Raises:
+            ValueError: If the block holds no bytes, base64 data or URL.
+        """
+        image_bytes = self.get_bytes()
+        if image_bytes:
+            return Image.open(io.BytesIO(image_bytes))
+        if self.image_url:
+            if self.image_url.startswith(("http://", "https://")):
+                try:
+                    import requests
+                    response = requests.get(self.image_url, timeout=30)
+                    response.raise_for_status()
+                    return Image.open(io.BytesIO(response.content))
+                except ImportError:
+                    # Fall back to the standard library when requests is missing.
+                    from urllib.request import urlopen
+                    with urlopen(self.image_url, timeout=30) as response:
+                        return Image.open(io.BytesIO(response.read()))
+            return Image.open(self.image_url)
+        raise ValueError("No image data available to convert to PIL Image")
+
+    def get_bytes(self) -> Optional[bytes]:
+        """Return raw bytes (decoding base64 data if needed), or None."""
+        if self.image_bytes:
+            return self.image_bytes
+        if self.image_data:
+            return base64.b64decode(self.image_data)
+        return None
+
+    def get_base64(self) -> Optional[str]:
+        """Return base64 text (encoding raw bytes if needed), or None."""
+        if self.image_data:
+            return self.image_data
+        if self.image_bytes:
+            return base64.b64encode(self.image_bytes).decode("utf-8")
+        return None
+
+
+def _ext_to_media_type(suffix: str) -> str:
+    """Map a file suffix (with leading dot, any case) to an image media type.
+
+    Unknown suffixes fall back to ``"image/jpeg"``.
+    """
+    return {
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".gif": "image/gif",
+        ".webp": "image/webp",
+        ".bmp": "image/bmp",
+        ".tif": "image/tiff",
+        ".tiff": "image/tiff",
+        ".svg": "image/svg+xml",
+        ".ico": "image/vnd.microsoft.icon",
+        ".heic": "image/heic",
+        ".heif": "image/heif",
+    }.get(suffix.lower(), "image/jpeg")
+
+
+# Union type alias for the supported content types (for type hints).
+ContentBlock = Union[TextContent, ImageContent]
diff --git a/opto/utils/backbone/template.py b/opto/utils/backbone/template.py
new file mode 100644
index 00000000..ae0b659d
--- /dev/null
+++ b/opto/utils/backbone/template.py
@@ -0,0 +1,194 @@
+"""PromptTemplate: ``str.format``-like templating that also supports
+multimodal :class:`ContentBlockList` values.
+"""
+from typing import Union
+
+from .content import ContentBlockList
+
+
+class PromptTemplate:
+    """Template for building ContentBlockLists with {placeholder} support.
+
+    Similar to str.format(), but supports multimodal content (ContentBlockList).
+
+    Return type depends on values:
+    - All strings → returns str (backward compatible)
+    - Any multimodal content → returns ContentBlockList
+
+    Features:
+    - Multiple placeholders: {a}, {b}, {c}
+    - Escaping: {{ and }} for literal braces
+    - Missing placeholders: left as-is in text
+    - Extra kwargs: silently ignored (no error)
+    - Nested templates: if value is PromptTemplate, formats it first
+    - Mixed values: str, ContentBlockList, or objects with to_content_blocks()
+    - Single pass: text inserted for one placeholder is never re-scanned for
+      other placeholders
+
+    Examples:
+        # Define template (can be class attribute)
+        user_prompt_template = PromptTemplate('''
+        Now you see problem instance:
+
+        ================================
+        {problem_instance}
+        ================================
+        ''')
+
+        # Format with ContentBlockList (may contain images)
+        content = user_prompt_template.format(
+            problem_instance=problem.to_content_blocks()
+        )
+        # Returns ContentBlockList: [TextContent("Now you see..."), *problem_blocks, TextContent("===...")]
+
+        # Multiple placeholders
+        template = PromptTemplate("User: {user}\\nAssistant: {assistant}")
+        result = template.format(user=user_blocks, assistant=assistant_blocks)
+
+        # Nested templates
+        outer = PromptTemplate("Header\\n{body}\\nFooter")
+        inner = PromptTemplate("Content: {data}")
+        result = outer.format(body=inner, data="some data")  # inner gets same kwargs
+
+        # Escaping braces
+        template = PromptTemplate('JSON example: {{"key": "{value}"}}')
+        result = template.format(value="hello")  # {"key": "hello"}
+
+        # Extra kwargs are ignored (no error)
+        result = template.format(value="hello", unused_key="ignored")
+
+        # Missing placeholders left as-is
+        template = PromptTemplate("Hello {name}, score: {score}")
+        result = template.format(name="Alice")  # "Hello Alice, score: {score}"
+    """
+
+    # Compiled regex matching {name}; escaped braces are masked with
+    # sentinels before it is applied. Lazily compiled by _get_pattern().
+    _PLACEHOLDER_PATTERN = None
+
+    def __init__(self, template: str):
+        """Initialize with a template string.
+
+        Args:
+            template: Template string with {placeholder} syntax.
+        """
+        self.template = template
+
+    @classmethod
+    def _get_pattern(cls):
+        """Lazily compile the placeholder regex pattern (cached on the class)."""
+        if cls._PLACEHOLDER_PATTERN is None:
+            import re
+            # Captures the placeholder name of a {name} token.
+            cls._PLACEHOLDER_PATTERN = re.compile(r'\{(\w+)\}')
+        return cls._PLACEHOLDER_PATTERN
+
+    def format(self, **kwargs) -> Union[str, 'ContentBlockList']:
+        """Format the template with the given values.
+
+        Similar to str.format(), but supports multimodal content.
+        Extra kwargs are silently ignored.
+
+        If all values are strings, returns a str (backward compatible).
+        If any value is a ContentBlockList or multimodal, returns ContentBlockList.
+
+        Args:
+            **kwargs: Placeholder values. Each value can be:
+                - str: inserted as text
+                - ContentBlockList: blocks spliced in at that position
+                - PromptTemplate: formatted first, then spliced in
+                - Object with to_content_blocks(): method called, result spliced
+                - Other: converted to str
+
+        Returns:
+            str: If all values are strings (backward compatible behavior).
+            ContentBlockList: If any value is multimodal content.
+        """
+        pattern = self._get_pattern()
+
+        # Only values whose placeholder occurs in the template decide the
+        # return type.
+        placeholder_names = set(pattern.findall(self.template))
+        relevant_values = [v for k, v in kwargs.items() if k in placeholder_names]
+
+        # Mask escaped braces so they are neither matched as placeholders nor
+        # altered by substitution; they are restored afterwards.
+        LBRACE_SENTINEL = "\x00LBRACE\x00"
+        RBRACE_SENTINEL = "\x00RBRACE\x00"
+
+        def restore_braces(chunk: str) -> str:
+            return chunk.replace(LBRACE_SENTINEL, "{").replace(RBRACE_SENTINEL, "}")
+
+        text = self.template.replace("{{", LBRACE_SENTINEL).replace("}}", RBRACE_SENTINEL)
+
+        if all(isinstance(v, str) for v in relevant_values):
+            # All strings: substitute in one regex pass so inserted text is
+            # never re-scanned and the result does not depend on the
+            # iteration order of the placeholder names.
+            def substitute(match):
+                name = match.group(1)
+                # Missing placeholders are left as-is.
+                return kwargs[name] if name in kwargs else match.group(0)
+
+            return restore_braces(pattern.sub(substitute, text))
+
+        # Multimodal content: build ContentBlockList
+        result = ContentBlockList()
+        last_end = 0
+
+        for match in pattern.finditer(text):
+            # Add text before this placeholder
+            prefix = text[last_end:match.start()]
+            if prefix:
+                result.append(restore_braces(prefix))
+
+            placeholder_name = match.group(1)
+            if placeholder_name in kwargs:
+                # Convert value to ContentBlockList and splice in
+                content = self._value_to_content(kwargs[placeholder_name], **kwargs)
+                result.extend(content)
+            else:
+                # Missing placeholder: leave as-is (restore original {name})
+                result.append("{" + placeholder_name + "}")
+
+            last_end = match.end()
+
+        # Add remaining text after last placeholder
+        suffix = text[last_end:]
+        if suffix:
+            result.append(restore_braces(suffix))
+
+        return result
+
+    def _value_to_content(self, value, **kwargs) -> 'ContentBlockList':
+        """Convert a value to content that can be spliced into the result.
+
+        Args:
+            value: The value to convert
+            **kwargs: Passed to a nested PromptTemplate's format()
+
+        Returns:
+            ContentBlockList: The value as content blocks. A nested
+            PromptTemplate returns whatever its ``format`` returns, which is
+            a plain str when all of its values are strings.
+        """
+        if isinstance(value, ContentBlockList):
+            return value
+        elif isinstance(value, PromptTemplate):
+            # Nested template: format it with the same kwargs
+            return value.format(**kwargs)
+        elif hasattr(value, 'to_content_blocks'):
+            # Object with to_content_blocks method (e.g., ProblemInstance)
+            return value.to_content_blocks()
+        elif isinstance(value, str):
+            return ContentBlockList(value)
+        else:
+            # Fallback: convert to string
+            return ContentBlockList(str(value))
+
+    def __repr__(self) -> str:
+        """Return a preview of the template (first 50 characters)."""
+        preview = self.template[:50] + "..." if len(self.template) > 50 else self.template
+        return f"PromptTemplate({preview!r})"
diff --git a/opto/utils/backbone/turns.py b/opto/utils/backbone/turns.py
new file mode 100644
index 00000000..c4fededd
--- /dev/null
+++ b/opto/utils/backbone/turns.py
@@ -0,0 +1,473 @@
+"""Conversation turns: :class:`UserTurn` and :class:`AssistantTurn`.
+
+``AssistantTurn.autocast`` parses raw responses from LiteLLM/OpenAI (Responses
+and Completion APIs), Bedrock Converse, and Google GenAI into a uniform shape.
+The :func:`to_messages` helper builds a provider-ready messages list from a
+system prompt and a user content block list (the minimal replacement for the
+old ``Chat`` manager).
+"""
+from typing import List, Dict, Any, Optional, Union
+from dataclasses import dataclass, field
+
+from .content import ContentBlockList, TextContent, ImageContent
+
+
+@dataclass
+class UserTurn:
+    """A user message turn (role + multimodal content).
+
+    ``content`` is always stored as a :class:`ContentBlockList`; the
+    constructor wraps any other input.
+    """
+    role: str = "user"
+    content: ContentBlockList = field(default_factory=ContentBlockList)
+
+    def __init__(self, content=None, **kwargs):
+        """Build a turn from another UserTurn, content blocks, or nothing.
+
+        Args:
+            content: ``None`` (empty turn), a ``UserTurn`` to copy, a
+                ``ContentBlockList``, a list of blocks, or a single item
+                (wrapped in a one-element list).
+            **kwargs: Only ``role`` is read (default ``"user"``). It is
+                ignored when copying from another ``UserTurn``.
+        """
+        if isinstance(content, UserTurn):
+            # Copy constructor: take the source's role and re-wrap its blocks
+            # in a new ContentBlockList.
+            self.role = content.role
+            self.content = ContentBlockList(content.content)
+            return
+        if content is None:
+            content = ContentBlockList()
+        elif not isinstance(content, ContentBlockList):
+            content = ContentBlockList(content) if isinstance(content, list) else ContentBlockList([content])
+        self.role = kwargs.get("role", "user")
+        self.content = content
+
+    def add_text(self, text: str) -> "UserTurn":
+        """Append a text block and return ``self`` for chaining."""
+        self.content.append(TextContent(text=text))
+        return self
+
+    def add_image(self, url: Optional[str] = None, data: Optional[str] = None,
+                  media_type: str = "image/jpeg") -> "UserTurn":
+        """Append an image block (by URL or encoded data) and return ``self``."""
+        self.content.append(ImageContent(image_url=url, image_data=data, media_type=media_type))
+        return self
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize as ``{"role": ..., "content": [...]}`` via each block's ``to_dict()``."""
+        # Use the stored role: the constructor accepts a custom ``role`` and
+        # the serialized form must not silently reset it to "user".
+        return {"role": self.role, "content": [c.to_dict() for c in self.content]}
+
+    def to_litellm_format(self) -> Dict[str, Any]:
+        """Return the LiteLLM message dict for this turn."""
+        # Same role handling as AssistantTurn.to_litellm_format.
+        return {"role": self.role, "content": self.content.to_litellm_format(role=self.role)}
+
+    def __repr__(self) -> str:
+        """Return a short preview (first 50 characters of the content)."""
+        preview = str(self.content)
+        preview = preview[:50] + "..." if len(preview) > 50 else preview
+        return f"UserTurn(content={preview!r})"
+
+
+@dataclass
+class Turn:
+    """Base turn: stores every keyword argument as an instance attribute."""
+
+    def __init__(self, **kwargs):
+        """Assign each ``name=value`` pair in ``kwargs`` with ``setattr``."""
+        for name in kwargs:
+            setattr(self, name, kwargs[name])
+
+
+@dataclass
+class AssistantTurn(Turn):
+    """An assistant message turn, parsed from a raw LLM response."""
+    role: str = "assistant"
+    content: ContentBlockList = field(default_factory=ContentBlockList)
+
+    # Optional details copied from the provider response when present.
+    reasoning: Optional[str] = None
+    finish_reason: Optional[str] = None
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    model: Optional[str] = None
+    timestamp: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def __init__(self, *args, **kwargs):
+        """Initialize empty, from another AssistantTurn, from a raw response
+        (single positional arg), or from explicit fields (kwargs).
+
+        Args:
+            *args: Either one ``AssistantTurn`` to copy, or one raw provider
+                response that is parsed with ``autocast`` (only when no
+                kwargs are given).
+            **kwargs: Explicit field values. Fields not supplied keep their
+                defaults; extra names are stored as attributes.
+        """
+        # Fresh per-instance defaults. ``content`` and ``metadata`` are
+        # default_factory fields, so the class itself carries no fallback for
+        # them: every construction path must set them explicitly or attribute
+        # access on the instance raises AttributeError.
+        values = {
+            "role": "assistant",
+            "content": ContentBlockList(),
+            "reasoning": None,
+            "finish_reason": None,
+            "prompt_tokens": None,
+            "completion_tokens": None,
+            "model": None,
+            "timestamp": None,
+            "metadata": {},
+        }
+        if len(args) == 1 and isinstance(args[0], AssistantTurn):
+            # Copy constructor: re-wrap the blocks and copy the metadata dict
+            # so the new turn does not share those containers.
+            other = args[0]
+            values.update(
+                role=other.role,
+                content=ContentBlockList(other.content),
+                reasoning=other.reasoning,
+                finish_reason=other.finish_reason,
+                prompt_tokens=other.prompt_tokens,
+                completion_tokens=other.completion_tokens,
+                model=other.model,
+                timestamp=other.timestamp,
+                metadata=dict(other.metadata),
+            )
+        elif len(args) > 0 and len(kwargs) == 0:
+            values.update(self.autocast(args[0]))
+        elif len(kwargs) > 0:
+            # Explicit fields override the defaults; the rest stay populated.
+            values.update(kwargs)
+        super().__init__(**values)
+
+    @staticmethod
+    def from_google_genai(value: Any) -> Dict[str, Any]:
+        """Parse a Google GenAI response (generate_content or Interactions API).
+
+        Args:
+            value: A raw GenAI response, or a wrapper exposing it as
+                ``raw_response``.
+
+        Returns:
+            A dict of AssistantTurn field values (``role``, ``content``,
+            ``reasoning``, ``finish_reason``, ``prompt_tokens``,
+            ``completion_tokens``, ``model``, ``timestamp``, ``metadata``).
+        """
+        result = {
+            "role": "assistant",
+            "content": ContentBlockList(),
+            "reasoning": None,
+            "finish_reason": None,
+            "prompt_tokens": None,
+            "completion_tokens": None,
+            "model": None,
+            "timestamp": None,
+            "metadata": {},
+        }
+
+        raw_response = value.raw_response if hasattr(value, "raw_response") else value
+
+        # Interactions API (new): has 'outputs'
+        if hasattr(raw_response, "outputs"):
+            interaction = raw_response
+            for output in interaction.outputs or ():
+                if getattr(output, "text", None):
+                    result["content"].append(TextContent(text=output.text))
+                elif hasattr(output, "content"):
+                    if isinstance(output.content, list):
+                        for part in output.content:
+                            if getattr(part, "text", None):
+                                result["content"].append(TextContent(text=part.text))
+                    elif output.content is not None:
+                        # A missing payload must not surface as the text "None".
+                        result["content"].append(TextContent(text=str(output.content)))
+            if hasattr(interaction, "model"):
+                result["model"] = interaction.model
+            if hasattr(interaction, "status"):
+                result["finish_reason"] = interaction.status
+            if hasattr(interaction, "usage"):
+                usage = interaction.usage
+                # Accept both naming schemes for the token counters.
+                if hasattr(usage, "input_tokens"):
+                    result["prompt_tokens"] = usage.input_tokens
+                elif hasattr(usage, "prompt_token_count"):
+                    result["prompt_tokens"] = usage.prompt_token_count
+                if hasattr(usage, "output_tokens"):
+                    result["completion_tokens"] = usage.output_tokens
+                elif hasattr(usage, "candidates_token_count"):
+                    result["completion_tokens"] = usage.candidates_token_count
+            if hasattr(interaction, "id"):
+                result["metadata"]["interaction_id"] = interaction.id
+            return result
+
+        # Legacy generate_content API
+        if getattr(raw_response, "thoughts", None):
+            result["reasoning"] = str(raw_response.thoughts)
+        if hasattr(raw_response, "model_version"):
+            result["model"] = raw_response.model_version
+        if hasattr(raw_response, "usage_metadata"):
+            usage = raw_response.usage_metadata
+            if hasattr(usage, "prompt_token_count"):
+                result["prompt_tokens"] = usage.prompt_token_count
+            if hasattr(usage, "candidates_token_count"):
+                result["completion_tokens"] = usage.candidates_token_count
+
+        content_extracted = False
+        if getattr(raw_response, "candidates", None):
+            # Only the first candidate is read.
+            candidate = raw_response.candidates[0]
+            # ``content`` or ``content.parts`` can be present but None;
+            # treat that as "no parts" instead of iterating None.
+            parts = getattr(getattr(candidate, "content", None), "parts", None) or ()
+            for part in parts:
+                if getattr(part, "text", None):
+                    result["content"].append(TextContent(text=part.text))
+                    content_extracted = True
+                elif hasattr(part, "inline_data"):
+                    inline = part.inline_data
+                    data = getattr(inline, "data", None)
+                    # Raw bytes and already-encoded strings go to different
+                    # ImageContent arguments.
+                    image_bytes = data if isinstance(data, bytes) else None
+                    image_data = data if isinstance(data, str) else None
+                    media_type = getattr(inline, "mime_type", None) or "image/jpeg"
+                    if image_data or image_bytes:
+                        result["content"].append(ImageContent(
+                            image_data=image_data,
+                            image_bytes=image_bytes,
+                            media_type=media_type,
+                        ))
+                        content_extracted = True
+            finish_reason = getattr(candidate, "finish_reason", None)
+            if finish_reason is not None:
+                # str() so enum-like values are stored as plain text.
+                result["finish_reason"] = str(finish_reason)
+
+        if not content_extracted:
+            # Fallbacks: a top-level ``text``, else a completion-style choice.
+            # Neither may contribute a None text block.
+            if hasattr(raw_response, "text"):
+                if raw_response.text is not None:
+                    result["content"].append(TextContent(text=raw_response.text))
+            elif getattr(value, "choices", None):
+                fallback_text = value.choices[0].message.content
+                if fallback_text is not None:
+                    result["content"].append(TextContent(text=fallback_text))
+
+        return result
+
+    @staticmethod
+    def from_litellm_openai_response_api(value: Any) -> Dict[str, Any]:
+        """Parse a LiteLLM/OpenAI response (Responses API, Completion API, or
+        Bedrock Converse).
+
+        The shape is detected in this order: Bedrock Converse (``output``
+        holding a ``message``), Responses API (``output`` and ``object``
+        attributes), then Completion API (``choices``). An unrecognized value
+        yields the default fields with empty content.
+
+        Args:
+            value: The raw response. Bedrock Converse and Completion
+                responses may be objects or plain dicts; the Responses API
+                shape is read through attributes only.
+
+        Returns:
+            A dict of AssistantTurn field values.
+        """
+        def _get(obj, key, default=None):
+            """Read ``key`` from a dict, or the same-named attribute otherwise."""
+            if isinstance(obj, dict):
+                return obj.get(key, default)
+            return getattr(obj, key, default)
+
+        result = {
+            "role": "assistant",
+            "content": ContentBlockList(),
+            "reasoning": None,
+            "finish_reason": None,
+            "prompt_tokens": None,
+            "completion_tokens": None,
+            "model": None,
+            "timestamp": None,
+            "metadata": {},
+        }
+
+        # Bedrock Converse: 'output' with nested 'message'
+        output_val = _get(value, "output")
+        if hasattr(output_val, "message") or (isinstance(output_val, dict) and "message" in output_val):
+            message = _get(output_val, "message")
+            if message:
+                result["role"] = _get(message, "role", "assistant")
+                for content_item in _get(message, "content") or ():
+                    text_val = _get(content_item, "text")
+                    if text_val:
+                        result["content"].append(TextContent(text=text_val))
+            stop_reason = _get(value, "stopReason")
+            if stop_reason:
+                result["finish_reason"] = stop_reason
+            usage = _get(value, "usage")
+            if usage:
+                result["prompt_tokens"] = _get(usage, "inputTokens")
+                result["completion_tokens"] = _get(usage, "outputTokens")
+            return result
+
+        # Responses API: 'output' list + 'object'
+        if hasattr(value, "output") and hasattr(value, "object"):
+            if hasattr(value, "id"):
+                result["metadata"]["response_id"] = value.id
+            if hasattr(value, "created_at"):
+                result["timestamp"] = str(value.created_at)
+            if hasattr(value, "model"):
+                result["model"] = value.model
+            if hasattr(value, "status"):
+                result["finish_reason"] = value.status
+            format_map = {
+                "png": "image/png",
+                "jpeg": "image/jpeg",
+                "jpg": "image/jpeg",
+                "webp": "image/webp",
+                "gif": "image/gif",
+            }
+            for output_item in value.output or ():
+                item_type = getattr(output_item, "type", None)
+                if item_type == "image_generation_call":
+                    if getattr(output_item, "result", None):
+                        # ``output_format`` may be absent or None; both fall
+                        # back to JPEG, as do unknown formats.
+                        output_format = getattr(output_item, "output_format", None) or ""
+                        media_type = format_map.get(output_format.lower(), "image/jpeg")
+                        result["content"].append(ImageContent(image_data=output_item.result, media_type=media_type))
+                        if getattr(output_item, "revised_prompt", None):
+                            result["metadata"].setdefault("image_generation", []).append({
+                                "id": getattr(output_item, "id", None),
+                                "revised_prompt": output_item.revised_prompt,
+                                "size": getattr(output_item, "size", None),
+                                "quality": getattr(output_item, "quality", None),
+                                "status": getattr(output_item, "status", None),
+                            })
+                elif item_type == "message":
+                    if hasattr(output_item, "role"):
+                        result["role"] = output_item.role
+                    # The response-level status wins; the item status is a fallback.
+                    if hasattr(output_item, "status") and not result["finish_reason"]:
+                        result["finish_reason"] = output_item.status
+                    for content_item in getattr(output_item, "content", None) or ():
+                        text = getattr(content_item, "text", None)
+                        if not text:
+                            continue
+                        if getattr(content_item, "type", None) != "output_text":
+                            text = str(text)
+                        result["content"].append(TextContent(text=text))
+            if hasattr(value, "reasoning"):
+                if isinstance(value.reasoning, dict):
+                    reasoning_parts = []
+                    if value.reasoning.get("summary"):
+                        reasoning_parts.append(f"Summary: {value.reasoning['summary']}")
+                    if value.reasoning.get("effort"):
+                        reasoning_parts.append(f"Effort: {value.reasoning['effort']}")
+                    if reasoning_parts:
+                        result["reasoning"] = "\n".join(reasoning_parts)
+                elif value.reasoning:
+                    result["reasoning"] = str(value.reasoning)
+            if hasattr(value, "usage"):
+                if hasattr(value.usage, "input_tokens"):
+                    result["prompt_tokens"] = value.usage.input_tokens
+                if hasattr(value.usage, "output_tokens"):
+                    result["completion_tokens"] = value.usage.output_tokens
+            return result
+
+        # Legacy Completion API: 'choices'. Read through ``_get`` so dict
+        # responses (which ``autocast`` also routes here) are parsed instead
+        # of silently producing an empty turn, and a None ``choices`` is
+        # tolerated.
+        choices = _get(value, "choices")
+        if choices:
+            # Only the first choice is read.
+            choice = choices[0]
+            message = _get(choice, "message", choice)
+            content = _get(message, "content")
+            if content:
+                result["content"].append(TextContent(text=str(content)))
+            result["finish_reason"] = _get(choice, "finish_reason")
+            # LiteLLM exposes provider reasoning as ``reasoning_content``.
+            reasoning = _get(message, "reasoning") or _get(message, "reasoning_content")
+            if reasoning:
+                result["reasoning"] = reasoning
+            usage = _get(value, "usage")
+            result["prompt_tokens"] = _get(usage, "prompt_tokens")
+            result["completion_tokens"] = _get(usage, "completion_tokens")
+            result["model"] = _get(value, "model")
+
+        return result
+
+    @staticmethod
+    def autocast(value: Any) -> Dict[str, Any]:
+        """Pick the parser matching ``value``'s shape and return its field dict.
+
+        Google GenAI markers are checked first, then LiteLLM/OpenAI/Bedrock
+        markers (attribute or dict key), then a bare ``text`` attribute.
+        Anything else yields the default fields with empty content.
+        """
+        unwrapped = getattr(value, "raw_response", value)
+
+        # Google GenAI: Interactions 'outputs', or generate_content markers.
+        if hasattr(unwrapped, "outputs"):
+            return AssistantTurn.from_google_genai(value)
+        if hasattr(unwrapped, "candidates") and not hasattr(value, "choices"):
+            return AssistantTurn.from_google_genai(value)
+        if hasattr(unwrapped, "usage_metadata"):
+            return AssistantTurn.from_google_genai(value)
+
+        # LiteLLM / OpenAI / Bedrock: 'output' or 'choices', as attribute or key.
+        is_mapping = isinstance(value, dict)
+        for marker in ("output", "choices"):
+            if hasattr(value, marker) or (is_mapping and marker in value):
+                return AssistantTurn.from_litellm_openai_response_api(value)
+
+        # Last resort: anything exposing plain ``text``.
+        if hasattr(unwrapped, "text"):
+            return AssistantTurn.from_google_genai(value)
+
+        return dict(
+            role="assistant",
+            content=ContentBlockList(),
+            reasoning=None,
+            finish_reason=None,
+            prompt_tokens=None,
+            completion_tokens=None,
+            model=None,
+            timestamp=None,
+            metadata={},
+        )
+
+    def add_text(self, text: str) -> "AssistantTurn":
+        """Append ``text`` as a TextContent block and return ``self`` for chaining."""
+        # Wrap in TextContent (as UserTurn.add_text does) so the block is
+        # matched by the isinstance filter in get_text().
+        self.content.append(TextContent(text=text))
+        return self
+
+    def add_image(self, url: Optional[str] = None, data: Optional[str] = None,
+                  media_type: str = "image/jpeg") -> "AssistantTurn":
+        """Append an image block built from ``url`` or ``data``; return ``self``."""
+        image = ImageContent(image_url=url, image_data=data, media_type=media_type)
+        self.content.append(image)
+        return self
+
+    def to_text(self) -> str:
+        """Return the text rendering produced by the content list's ``to_text()``."""
+        blocks = self.content
+        return blocks.to_text()
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize every field of the turn to a plain dict.
+
+        ``content`` is expanded with each block's ``to_dict()``; all other
+        fields are emitted as stored.
+        """
+        return {
+            "role": self.role,
+            "content": [c.to_dict() for c in self.content],
+            "reasoning": self.reasoning,
+            "finish_reason": self.finish_reason,
+            "prompt_tokens": self.prompt_tokens,
+            "completion_tokens": self.completion_tokens,
+            "model": self.model,
+            # Emitted with the other declared fields so it is not lost.
+            "timestamp": self.timestamp,
+            "metadata": self.metadata,
+        }
+
+    def get_text(self) -> ContentBlockList:
+        """Return a new ContentBlockList holding only the TextContent blocks."""
+        text_only = ContentBlockList()
+        for item in filter(lambda b: isinstance(b, TextContent), self.content):
+            text_only.append(item)
+        return text_only
+
+    def get_images(self) -> ContentBlockList:
+        """Return a new ContentBlockList holding only the ImageContent blocks."""
+        images_only = ContentBlockList()
+        for item in filter(lambda b: isinstance(b, ImageContent), self.content):
+            images_only.append(item)
+        return images_only
+
+    def to_litellm_format(self) -> Dict[str, Any]:
+        """Return the LiteLLM message dict: this turn's role plus its formatted content."""
+        role = self.role
+        formatted = self.content.to_litellm_format(role=role)
+        return {"role": role, "content": formatted}
+
+    def __repr__(self) -> str:
+        """Return a compact summary: content preview plus any set model/token counts."""
+        text = str(self.content)
+        if len(text) > 50:
+            # Keep the preview to 50 characters and mark the cut.
+            text = text[:50] + "..."
+        summary = f"AssistantTurn(content={text!r}"
+        model = getattr(self, "model", None)
+        if model:
+            summary += f", model={model!r}"
+        # Token counts are shown only when present and non-zero.
+        for name in ("prompt_tokens", "completion_tokens"):
+            count = getattr(self, name, None)
+            if count:
+                summary += f", {name}={count}"
+        return summary + ")"
+
+
+def to_messages(
+    system_prompt: Optional[str],
+    user_content: Union[str, ContentBlockList, TextContent, ImageContent, None] = None,
+    history: Optional[List[Dict[str, Any]]] = None,
+) -> List[Dict[str, Any]]:
+    """Assemble a LiteLLM/OpenAI-format messages list.
+
+    Stateless replacement for the old ``Chat`` manager: the caller keeps its
+    own ``history`` (a plain list of message dicts) and passes it in:
+
+        messages = to_messages(system_prompt, user_blocks, history=self.history)
+
+    Args:
+        system_prompt: System message text; skipped when empty or None.
+        user_content: Content of the new user turn (str or content blocks).
+            When None, no user message is added.
+        history: Earlier message dicts, already in LiteLLM format, placed
+            after the system message and before the new user message.
+
+    Returns:
+        A list of message dicts suitable for ``LLM(messages=...)``.
+    """
+    assembled: List[Dict[str, Any]] = (
+        [{"role": "system", "content": system_prompt}] if system_prompt else []
+    )
+    assembled.extend(history or [])
+    if user_content is None:
+        return assembled
+    # Normalize whatever the caller passed into a block list before formatting.
+    user_blocks = ContentBlockList.ensure(user_content)
+    assembled.append({"role": "user", "content": user_blocks.to_litellm_format(role="user")})
+    return assembled
diff --git a/opto/utils/llm.py b/opto/utils/llm.py
index 97377c16..825e207b 100644
--- a/opto/utils/llm.py
+++ b/opto/utils/llm.py
@@ -1,4 +1,12 @@
-from typing import List, Tuple, Dict, Any, Callable, Union
+"""
+When MM (multimodal) mode is enabled, we primarily use one of:
+1. LiteLLM's Responses API
+2. Google's Interactions API (not supported by LiteLLM's Responses API)
+When MM is disabled, for backward compatibility, we use:
+1. LiteLLM's Completion API
+"""
+
+from typing import List, Tuple, Dict, Any, Callable, Union, Optional
import os
import time
import json
@@ -6,11 +14,35 @@
import warnings
from .auto_retry import retry_with_exponential_backoff
+# Import the assistant-turn parser for mm_beta mode.
+# Heavy/optional SDKs (openai, google-genai) are imported lazily inside the
+# backends that need them, so importing this module never requires them.
+from .backbone import AssistantTurn
+
try:
import autogen # We import autogen here to avoid the need of installing autogen
except ImportError:
pass
+
+def _is_image_generation_model(model_name: str) -> bool:
+    """Tell whether a model name denotes an image-generation model.
+
+    The check is a case-insensitive substring match on 'image' or 'dall-e',
+    which covers names such as:
+    - OpenAI: gpt-image-1, gpt-image-1.5, gpt-image-1-mini, dall-e-2, dall-e-3
+    - Gemini: gemini-2.5-flash-image, gemini-2.5-pro-image, etc.
+
+    Args:
+        model_name: The model name to inspect; None is allowed.
+
+    Returns:
+        bool: True when the name contains one of the markers, False otherwise
+        (including when ``model_name`` is None).
+    """
+    if model_name is None:
+        return False
+    lowered = model_name.lower()
+    return any(marker in lowered for marker in ('image', 'dall-e'))
+
class AbstractModel:
"""Abstract base class for LLM model wrappers with automatic refreshing.
@@ -24,6 +56,12 @@ class AbstractModel:
reset_freq : int or None, optional
Number of seconds after which to refresh the model. If None, the model
is never refreshed.
+ mm_beta : bool, optional
+ If True, returns AssistantTurn objects with rich multimodal content.
+ If False (default), returns raw API responses in legacy format.
+ model_name : str or None, optional
+ The name of the model being used (e.g., "gpt-4o", "claude-3-5-sonnet-latest").
+ If None, no model name is stored.
Attributes
----------
@@ -31,13 +69,21 @@ class AbstractModel:
The factory function for creating model instances.
reset_freq : int or None
Refresh frequency in seconds.
+ mm_beta : bool
+ Whether to use multimodal beta mode.
+ model_name : str or None
+ The name of the model being used.
+ is_image_model : bool
+ Whether the model is for image generation (auto-detected from model name).
+
model : Any
Property that returns the current model instance.
Methods
-------
__call__(*args, **kwargs)
- Execute the model, refreshing if needed.
+ Execute the model, refreshing if needed. Returns AssistantTurn if mm_beta=True,
+ otherwise returns raw API response.
Notes
-----
@@ -45,8 +91,9 @@ class AbstractModel:
1. **Automatic Refreshing**: Recreates the model instance periodically
to prevent issues with long-running connections.
2. **Serialization**: Supports pickling by recreating the model on load.
- 3. **Consistent Interface**: Ensures responses are available at
- `response['choices'][0]['message']['content']`.
+ 3. **Response Formats**:
+ - Legacy (mm_beta=False): `response['choices'][0]['message']['content']`
+ - Multimodal (mm_beta=True): AssistantTurn object with .content, .reasoning, etc.
Subclasses should override the `model` property to customize behavior.
@@ -56,32 +103,58 @@ class AbstractModel:
LiteLLM : Concrete implementation using LiteLLM
"""
- def __init__(self, factory: Callable, reset_freq: Union[int, None] = None) -> None:
+ def __init__(self, factory: Callable, reset_freq: Union[int, None] = None,
+ mm_beta: bool = False, model_name: Union[str, None] = None) -> None:
"""
Args:
factory: A function that takes no arguments and returns a model that is callable.
reset_freq: The number of seconds after which the model should be
refreshed. If None, the model is never refreshed.
+ mm_beta: If True, returns AssistantTurn objects with rich multimodal content.
+ If False (default), returns raw API responses in legacy format.
+ model_name: The name of the model being used (e.g., "gpt-4o", "claude-3-5-sonnet-latest").
+ If None, no model name is stored.
"""
self.factory = factory
self._model = self.factory()
self.reset_freq = reset_freq
self._init_time = time.time()
+ self.mm_beta = mm_beta
+ self.model_name = model_name
# Overwrite this `model` property when subclassing.
@property
def model(self):
"""When self.model is called, text responses should always be available at `response['choices'][0]['message']['content']`"""
return self._model
+
+    @property
+    def is_image_model(self) -> bool:
+        """Whether ``model_name`` identifies an image-generation model.
+
+        Delegates to ``_is_image_generation_model``: True when the name
+        contains 'image' or 'dall-e', False otherwise.
+        """
+        name = self.model_name
+        return _is_image_generation_model(name)
# This is the main API
def __call__(self, *args, **kwargs) -> Any:
""" The call function handles refreshing the model if needed.
+
+ Returns:
+ If mm_beta=False: Raw completion API response (backward compatible)
+ If mm_beta=True: AssistantTurn object with parsed multimodal content
"""
if self.reset_freq is not None and time.time() - self._init_time > self.reset_freq:
self._model = self.factory()
self._init_time = time.time()
- return self.model(*args, **kwargs)
+
+ response = self.model(*args, **kwargs)
+
+ # Parse to AssistantTurn if mm_beta mode is enabled
+ if self.mm_beta:
+ return AssistantTurn(response)
+
+ return response
def __getstate__(self):
state = self.__dict__.copy()
@@ -151,7 +224,8 @@ class AutoGenLLM(AbstractModel):
>>> response = llm(messages=[{"role": "user", "content": "Hello"}])
"""
- def __init__(self, config_list: List = None, filter_dict: Dict = None, reset_freq: Union[int, None] = None) -> None:
+ def __init__(self, config_list: List = None, filter_dict: Dict = None,
+ reset_freq: Union[int, None] = None, mm_beta: bool = False) -> None:
if config_list is None:
try:
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")
@@ -163,8 +237,11 @@ def __init__(self, config_list: List = None, filter_dict: Dict = None, reset_fre
if filter_dict is not None:
config_list = autogen.filter_config(config_list, filter_dict)
+ # Extract model name from config_list if available
+ model_name = config_list[0].get('model') if config_list and len(config_list) > 0 else None
+
factory = lambda *args, **kwargs: self._factory(config_list)
- super().__init__(factory, reset_freq)
+ super().__init__(factory, reset_freq, mm_beta=mm_beta, model_name=model_name)
@classmethod
def _factory(cls, config_list):
@@ -270,16 +347,55 @@ class LiteLLM(AbstractModel):
This is an LLM backend supported by LiteLLM library.
https://docs.litellm.ai/docs/completion/input
+ https://docs.litellm.ai/docs/response_api
+ https://docs.litellm.ai/docs/image_generation
To use this, set the credentials through the environment variable as
instructed in the LiteLLM documentation. For convenience, you can set the
default model name through the environment variable TRACE_LITELLM_MODEL.
- When using Azure models via token provider, you can set the Azure token
- provider scope through the environment variable AZURE_TOKEN_PROVIDER_SCOPE.
+
+ Azure OpenAI Authentication:
+ Two authentication methods are supported for Azure OpenAI:
+
+ 1. API Key Authentication (Recommended for most users):
+ Set these environment variables:
+ - AZURE_API_KEY: Your Azure OpenAI API key
+ - AZURE_API_BASE: Your Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)
+ - AZURE_API_VERSION: API version (e.g., 2024-08-01-preview)
+ - TRACE_LITELLM_MODEL: Model name with azure/ prefix (e.g., azure/o4-mini)
+
+ Do NOT set AZURE_TOKEN_PROVIDER_SCOPE for this method.
+
+ 2. Azure AD Credential Authentication (For enterprise users):
+ Set AZURE_TOKEN_PROVIDER_SCOPE (e.g., https://cognitiveservices.azure.com/.default)
+ to use Azure Identity credential-based authentication.
+ This method does NOT use AZURE_API_KEY.
+
+ This class now supports storing default completion parameters (like temperature,
+ top_p, max_tokens, etc.) that will be used for all calls unless overridden.
+
+ Text Generation:
+ When mm_beta=True, the Responses API is used for rich multimodal content.
+ When mm_beta=False (default), the Completion API is used for backward compatibility.
+
+ See: https://docs.litellm.ai/docs/response_api
+
+ Image Generation:
+ Automatically detects image generation models (containing 'image' or 'dall-e' in name).
+ Uses litellm.image_generation() API for models like:
+ - gpt-image-1, gpt-image-1.5, gpt-image-1-mini
+ - dall-e-2, dall-e-3
+
+ Image models require a single string prompt:
+ llm = LLM(model="gpt-image-1.5")
+ result = llm(prompt="A serene mountain landscape")
+
+ Check llm.is_image_model to determine if a model is for image generation.
"""
def __init__(self, model: Union[str, None] = None, reset_freq: Union[int, None] = None,
- cache=True, max_retries=10, base_delay=1.0) -> None:
+ cache=True, max_retries=1, base_delay=1.0,
+ mm_beta: bool = False, **default_params) -> None:
if model is None:
model = os.environ.get('TRACE_LITELLM_MODEL')
if model is None:
@@ -288,38 +404,164 @@ def __init__(self, model: Union[str, None] = None, reset_freq: Union[int, None]
self.model_name = model
self.cache = cache
- factory = lambda: self._factory(self.model_name, max_retries=max_retries, base_delay=base_delay) # an LLM instance uses a fixed model
- super().__init__(factory, reset_freq)
+ self.default_params = default_params # Store default completion parameters
+
+ factory = lambda: self._factory(
+ self.model_name,
+ self.default_params,
+ mm_beta,
+ max_retries=max_retries,
+ base_delay=base_delay
+ )
+ super().__init__(factory, reset_freq, mm_beta=mm_beta, model_name=model)
@classmethod
- def _factory(cls, model_name: str, max_retries=10, base_delay=1.0):
+ def _factory(cls, model_name: str, default_params: dict, mm_beta: bool,
+ max_retries=1, base_delay=1.0):
import litellm
+
+ # For Azure models, set global litellm variables as a fallback
+ # (workaround for potential litellm.responses API issues)
+ if model_name.startswith('azure/'):
+ if os.environ.get('AZURE_API_VERSION') and not hasattr(litellm, '_azure_api_version_set'):
+ try:
+ litellm.api_version = os.environ.get('AZURE_API_VERSION')
+ litellm._azure_api_version_set = True # Mark to avoid setting multiple times
+ except:
+ pass # Ignore if litellm doesn't support this
+
+ # Check if this is an image generation model
+ is_image_model = _is_image_generation_model(model_name)
+
+ if is_image_model:
+ # Image generation API
+ api_func = litellm.image_generation
+ operation_name = "LiteLLM_image_generation"
+
+ # Standard image generation wrapper
+ def image_wrapper(prompt, **kwargs):
+ assert isinstance(prompt, str), (
+ f"Image generation requires a single string prompt. "
+ f"Got {type(prompt).__name__}. "
+ f"Usage: llm(prompt='your prompt here')"
+ )
+ return retry_with_exponential_backoff(
+ lambda: api_func(model=model_name, prompt=prompt, **{**default_params, **kwargs}),
+ max_retries=max_retries,
+ base_delay=base_delay,
+ operation_name=operation_name
+ )
+ return image_wrapper
+
+ # Use Responses API when mm_beta=True, otherwise use Completion API
+ api_func = litellm.responses if mm_beta else litellm.completion
+ operation_name = "LiteLLM_responses" if mm_beta else "LiteLLM_completion"
+
if model_name.startswith('azure/'): # azure model
azure_token_provider_scope = os.environ.get('AZURE_TOKEN_PROVIDER_SCOPE', None)
if azure_token_provider_scope is not None:
+ # Azure AD credential-based authentication
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
credential = get_bearer_token_provider(DefaultAzureCredential(), azure_token_provider_scope)
- return lambda *args, **kwargs: retry_with_exponential_backoff(
- lambda: litellm.completion(model_name, *args,
- azure_ad_token_provider=credential, **kwargs),
+ if mm_beta:
+ # Responses API: model as keyword argument, convert messages to input
+ def azure_responses_wrapper(*args, **kwargs):
+ # Convert 'messages' to 'input' for Responses API
+ if 'messages' in kwargs and 'input' not in kwargs:
+ kwargs['input'] = kwargs.pop('messages')
+ return retry_with_exponential_backoff(
+ lambda: api_func(model=model_name,
+ azure_ad_token_provider=credential, **{**default_params, **kwargs}),
+ max_retries=max_retries,
+ base_delay=base_delay,
+ operation_name=operation_name
+ )
+ return azure_responses_wrapper
+ else:
+ # Completion API: model as positional argument
+ return lambda *args, **kwargs: retry_with_exponential_backoff(
+ lambda: api_func(model_name, *args,
+ azure_ad_token_provider=credential, **{**default_params, **kwargs}),
+ max_retries=max_retries,
+ base_delay=base_delay,
+ operation_name=operation_name
+ )
+ else:
+ # Azure API key authentication - explicitly pass Azure env vars if available
+ azure_params = {}
+ if 'api_key' not in default_params:
+ azure_api_key = os.environ.get('AZURE_API_KEY')
+ if azure_api_key:
+ azure_params['api_key'] = azure_api_key
+ if 'api_base' not in default_params:
+ azure_api_base = os.environ.get('AZURE_API_BASE')
+ if azure_api_base:
+ azure_params['api_base'] = azure_api_base
+ if 'api_version' not in default_params:
+ azure_api_version = os.environ.get('AZURE_API_VERSION')
+ if azure_api_version:
+ azure_params['api_version'] = azure_api_version
+
+ if mm_beta:
+ # Responses API: model as keyword argument, convert messages to input
+ def azure_key_responses_wrapper(*args, **kwargs):
+ # Convert 'messages' to 'input' for Responses API
+ if 'messages' in kwargs and 'input' not in kwargs:
+ kwargs['input'] = kwargs.pop('messages')
+ return retry_with_exponential_backoff(
+ lambda: api_func(model=model_name, **{**azure_params, **default_params, **kwargs}),
+ max_retries=max_retries,
+ base_delay=base_delay,
+ operation_name=operation_name
+ )
+ return azure_key_responses_wrapper
+ else:
+ # Completion API: model as positional argument
+ return lambda *args, **kwargs: retry_with_exponential_backoff(
+ lambda: api_func(model_name, *args, **{**azure_params, **default_params, **kwargs}),
+ max_retries=max_retries,
+ base_delay=base_delay,
+ operation_name=operation_name
+ )
+
+ if mm_beta:
+ # Responses API: model as keyword argument, convert messages to input
+ def responses_wrapper(*args, **kwargs):
+ # Convert 'messages' to 'input' for Responses API
+ if 'messages' in kwargs and 'input' not in kwargs:
+ kwargs['input'] = kwargs.pop('messages')
+ return retry_with_exponential_backoff(
+ lambda: api_func(model=model_name, **{**default_params, **kwargs}),
max_retries=max_retries,
base_delay=base_delay,
- operation_name="LiteLLM_completion"
+ operation_name=operation_name
)
- return lambda *args, **kwargs: retry_with_exponential_backoff(
- lambda: litellm.completion(model_name, *args, **kwargs),
- max_retries=max_retries,
- base_delay=base_delay,
- operation_name="LiteLLM_completion"
- )
+ return responses_wrapper
+ else:
+ # Completion API: model as positional argument
+ return lambda *args, **kwargs: retry_with_exponential_backoff(
+ lambda: api_func(model_name, *args, **{**default_params, **kwargs}),
+ max_retries=max_retries,
+ base_delay=base_delay,
+ operation_name=operation_name
+ )
@property
def model(self):
"""
- response = litellm.completion(
- model=self.model,
- messages=[{"content": message, "role": "user"}]
- )
+ Calls either litellm.completion() or litellm.responses() depending on mm_beta.
+
+ For completion API (mm_beta=False):
+ response = litellm.completion(
+ model=self.model,
+ messages=[{"content": message, "role": "user"}]
+ )
+
+ For responses API (mm_beta=True):
+ response = litellm.responses(
+ model=self.model,
+ input="Your input text"
+ )
"""
return lambda *args, **kwargs: self._model(*args, **kwargs)
@@ -331,7 +573,7 @@ class CustomLLM(AbstractModel):
"""
def __init__(self, model: Union[str, None] = None, reset_freq: Union[int, None] = None,
- cache=True) -> None:
+ cache=True, mm_beta: bool = False) -> None:
if model is None:
model = os.environ.get('TRACE_CUSTOMLLM_MODEL', 'gpt-4o')
base_url = os.environ.get('TRACE_CUSTOMLLM_URL', 'http://xx.xx.xxx.xx:4000/')
@@ -342,7 +584,7 @@ def __init__(self, model: Union[str, None] = None, reset_freq: Union[int, None]
self.model_name = model
self.cache = cache
factory = lambda: self._factory(base_url, server_api_key) # an LLM instance uses a fixed model
- super().__init__(factory, reset_freq)
+ super().__init__(factory, reset_freq, mm_beta=mm_beta, model_name=model)
@classmethod
def _factory(cls, base_url: str, server_api_key: str):
@@ -359,83 +601,417 @@ def create(self, **config: Any):
config['model'] = self.model_name
return self._model.chat.completions.create(**config)
+class GoogleGenAILLM(AbstractModel):
+    """
+    This is an LLM backend using Google's GenAI SDK with the Interactions API.
+
+    https://ai.google.dev/gemini-api/docs/text-generation
+    https://ai.google.dev/gemini-api/docs/image-generation
+
+    The Interactions API is a unified interface for interacting with Gemini models,
+    similar to OpenAI's Response API. It provides better state management, tool
+    orchestration, and support for long-running tasks.
+
+    To use this, set the GEMINI_API_KEY environment variable with your API key.
+    For convenience, you can set the default model name through the environment
+    variable TRACE_GOOGLE_GENAI_MODEL.
+
+    Supported models:
+    - Text: gemini-2.5-flash, gemini-2.5-pro, gemini-2.5-flash-lite
+    - Image: gemini-2.5-flash-image, gemini-2.5-pro-image
+
+    This class supports storing default generation parameters (like temperature,
+    max_output_tokens, etc.) that will be used for all calls unless overridden.
+
+    Text Generation:
+        Pass standard OpenAI-style messages; the backend converts them to the
+        format expected by Google GenAI (and extracts the system instruction).
+
+        Example:
+            from opto.utils.llm import LLM
+            from opto.utils.backbone import to_messages
+
+            llm = LLM(model="gemini-2.5-flash", mm_beta=True)
+            messages = to_messages("You are a helpful assistant.", "What is AI?")
+            response = llm(messages=messages, max_tokens=100)
+            print(response.get_text())
+
+    Image Generation:
+        Automatically detects image generation models (containing 'image' in name).
+        Uses client.models.generate_images() API for models like gemini-2.5-flash-image.
+
+        Image models require a single string prompt:
+            llm = LLM(model="gemini-2.5-flash-image")
+            result = llm(prompt="A serene mountain landscape", number_of_images=2)
+
+    Check llm.is_image_model to determine if a model is for image generation.
+    """
+
+    def __init__(self, model: Union[str, None] = None, reset_freq: Union[int, None] = None,
+                 cache=True, mm_beta: bool = False, max_retries: int = 1,
+                 base_delay: float = 1.0, **default_params) -> None:
+        if model is None:
+            model = os.environ.get('TRACE_GOOGLE_GENAI_MODEL', 'gemini-2.5-flash')
+
+        self.model_name = model
+        self.cache = cache
+        self.default_params = default_params  # Store default generation parameters
+        factory = lambda: self._factory(self.model_name, self.default_params,
+                                        max_retries=max_retries, base_delay=base_delay)
+        super().__init__(factory, reset_freq, mm_beta=mm_beta, model_name=model)
+
+    @classmethod
+    def _factory(cls, model_name: str, default_params: dict,
+                 max_retries: int = 1, base_delay: float = 1.0):
+        """Create the retrying callable that wraps the Google GenAI client for ``model_name``."""
+        from google import genai
+        from google.genai import types
+        # Get API key from environment variable
+        api_key = os.environ.get('GEMINI_API_KEY')
+        if api_key:
+            client = genai.Client(api_key=api_key)
+        else:
+            # Try without API key (will use default credentials or fail gracefully)
+            client = genai.Client()
+
+        # Check if this is an image generation model
+        is_image_model = _is_image_generation_model(model_name)
+
+        if is_image_model:
+            # Image generation for Gemini
+            def image_api_func(prompt, **kwargs):
+                assert isinstance(prompt, str), (
+                    f"Image generation requires a single string prompt. "
+                    f"Got {type(prompt).__name__}. "
+                    f"Usage: llm(prompt='your prompt here')"
+                )
+
+                # Gemini image generation API
+                # https://ai.google.dev/gemini-api/docs/image-generation
+                # Filter kwargs to only valid parameters for generate_images
+                valid_params = {
+                    k: v for k, v in kwargs.items()
+                    if k in ['number_of_images', 'aspect_ratio', 'safety_filter_level']
+                }
+                response = client.models.generate_images(
+                    model=model_name,
+                    prompt=prompt,
+                    **valid_params
+                )
+                return response
+
+            return lambda *args, **kwargs: retry_with_exponential_backoff(
+                lambda: image_api_func(*args, **{**default_params, **kwargs}),
+                max_retries=max_retries,
+                base_delay=base_delay,
+                operation_name=f"{model_name}_image_gen"
+            )
+
+        # Build config if there are generation parameters
+        config_params = {}
+
+        # Thinking config for Gemini 2.5+ models. Read, don't pop: default_params is the instance's
+        # own dict (pop would lose the budget on a later build); api_func's whitelist drops the key.
+        if 'thinking_budget' in default_params:
+            config_params['thinking_config'] = types.ThinkingConfig(
+                thinking_budget=default_params['thinking_budget']
+            )
+
+        def api_func(model_name, *args, **kwargs):
+            # Extract system_instruction if present (needs to be at config level, not in kwargs)
+            system_instruction = kwargs.pop('system_instruction', None)
+
+            # Handle messages parameter (OpenAI-style or Gemini-native dicts)
+            messages = kwargs.pop('messages', None)
+            contents = kwargs.pop('contents', None)
+
+            if messages:
+                # Detect format: OpenAI-style has 'content' key, Gemini-native has 'parts' key.
+                # If OpenAI-style, convert via _gemini_messages_to_contents so the SDK accepts it.
+                first_non_system = next(
+                    (m for m in messages if m.get('role') != 'system'), None
+                )
+                is_openai_format = (
+                    first_non_system is not None and
+                    'content' in first_non_system and
+                    'parts' not in first_non_system
+                )
+                if is_openai_format:
+                    converted, extracted_sys = _gemini_messages_to_contents(messages)
+                    if system_instruction is None:
+                        system_instruction = extracted_sys
+                    contents = converted
+                else:
+                    # Already in Gemini native format ('parts' keys)
+                    if messages[0].get('role') == 'system':
+                        if system_instruction is None:
+                            system_instruction = messages[0].get('content')
+                        contents = messages[1:]
+                    else:
+                        contents = messages
+
+            # Use contents if provided, otherwise use positional args.
+            # If a positional arg is a list of OpenAI-format dicts, convert it.
+            if contents is None and args:
+                raw = args[0]
+                if (
+                    isinstance(raw, list) and raw and
+                    isinstance(raw[0], dict) and
+                    'content' in raw[0] and 'parts' not in raw[0]
+                ):
+                    converted, extracted_sys = _gemini_messages_to_contents(raw)
+                    if system_instruction is None:
+                        system_instruction = extracted_sys
+                    contents = converted
+                else:
+                    contents = raw
+            contents_to_use = contents
+
+            # Map max_tokens to max_output_tokens for Google GenAI
+            if 'max_tokens' in kwargs:
+                kwargs['max_output_tokens'] = kwargs.pop('max_tokens')
+
+            # Remove any other parameters that shouldn't go to GenerateContentConfig
+            # Keep only valid config parameters
+            valid_config_params = {
+                'temperature', 'max_output_tokens', 'top_p', 'top_k',
+                'stop_sequences', 'candidate_count', 'presence_penalty',
+                'frequency_penalty', 'response_mime_type', 'response_schema'
+            }
+            config_kwargs = {k: v for k, v in kwargs.items() if k in valid_config_params}
+
+            if system_instruction:
+                config_params_with_system = {**config_params, 'system_instruction': system_instruction}
+            else:
+                config_params_with_system = config_params
+
+            response = client.models.generate_content(
+                model=model_name,
+                contents=contents_to_use,
+                config=types.GenerateContentConfig(**{**config_params_with_system, **config_kwargs})
+            )
+
+            return response
+
+        return lambda *args, **kwargs: retry_with_exponential_backoff(
+            lambda: api_func(model_name, *args, **{**default_params, **kwargs}),
+            max_retries=max_retries,
+            base_delay=base_delay,
+            operation_name=f"{model_name}"
+        )
+
+    @property
+    def model(self):
+        """
+        Wrapper that injects the model name into calls.
+
+        Example:
+            response = llm(contents="How does AI work?")
+        """
+        return lambda *args, **kwargs: self._model(model=self.model_name, *args, **kwargs)
+
+# ---------------------------------------------------------------------------
+# Helper to convert OpenAI-style messages into Gemini 'contents' format.
+# ---------------------------------------------------------------------------
+
+def _gemini_messages_to_contents(messages):
+    """Convert an OpenAI-style messages list to Gemini ``contents`` format.
+
+    Returns ``(contents, system_instruction)``. ``system_instruction`` is the
+    newline-joined text of all ``role="system"`` messages, or None when there
+    are none. Non-text items inside a list ``content`` are not converted.
+    """
+    contents = []
+    system_texts = []
+
+    def text_items(content):
+        # Text of every {'type': 'text'} item; a missing 'text' key counts as ''.
+        return [
+            item.get('text', '') for item in content
+            if isinstance(item, dict) and item.get('type') == 'text'
+        ]
+
+    for msg in messages:
+        role = msg.get('role', 'user')
+        content = msg.get('content')
+        if content is None:
+            # Absent or explicit None (e.g. tool-call turns): empty text, not 'None'.
+            content = ''
+
+        if role == 'system':
+            # Keep every system message instead of letting the last one win.
+            if isinstance(content, list):
+                system_texts.append('\n'.join(text_items(content)))
+            else:
+                system_texts.append(str(content))
+            continue
+
+        if isinstance(content, list):
+            # Fall back to the list's repr when it holds no text items.
+            parts = [{'text': t} for t in text_items(content)] or [{'text': str(content)}]
+        else:
+            parts = [{'text': str(content)}]
+        gemini_role = 'model' if role == 'assistant' else 'user'
+        contents.append({'role': gemini_role, 'parts': parts})
+
+    system_instruction = '\n'.join(t for t in system_texts if t) if system_texts else None
+    return contents, system_instruction
+
+
# Registry of available backends
_LLM_REGISTRY = {
"LiteLLM": LiteLLM,
"AutoGen": AutoGenLLM,
"CustomLLM": CustomLLM,
+ "GoogleGenAI": GoogleGenAILLM,
}
class LLMFactory:
- """Factory for creating LLM instances with predefined profiles.
-
- The code comes with these built-in profiles:
-
- llm_default = LLM(profile="default") # gpt-4o-mini
- llm_premium = LLM(profile="premium") # gpt-4
- llm_cheap = LLM(profile="cheap") # gpt-4o-mini
- llm_fast = LLM(profile="fast") # gpt-3.5-turbo-mini
- llm_reasoning = LLM(profile="reasoning") # o1-mini
-
- You can override those built-in profiles:
+ """Factory for creating LLM instances with named profiles.
- LLMFactory.register_profile("default", "LiteLLM", model="gpt-4o", temperature=0.5)
- LLMFactory.register_profile("premium", "LiteLLM", model="o1-preview", max_tokens=8000)
- LLMFactory.register_profile("cheap", "LiteLLM", model="gpt-3.5-turbo", temperature=0.9)
- LLMFactory.register_profile("fast", "LiteLLM", model="gpt-3.5-turbo", max_tokens=500)
- LLMFactory.register_profile("reasoning", "LiteLLM", model="o1-preview")
+ Profiles store reusable backend configurations (model + any
+ LiteLLM-supported params like temperature, top_p, max_tokens, ...). The
+ default profile uses 'gpt-4o' via LiteLLM.
- An Example of using Different Backends
+ Example:
+ LLMFactory.create_profile("creative", model="gpt-4o", temperature=0.9)
+ llm = LLM(profile="creative")
+ LLMFactory.list_profiles()
+ LLMFactory.get_profile_info("creative")
- # Register custom profiles for different use cases
- LLMFactory.register_profile("advanced_reasoning", "LiteLLM", model="o1-preview", max_tokens=4000)
- LLMFactory.register_profile("claude_sonnet", "LiteLLM", model="claude-3-5-sonnet-latest", temperature=0.3)
- LLMFactory.register_profile("custom_server", "CustomLLM", model="llama-3.1-8b")
-
- # Use in different contexts
- reasoning_llm = LLM(profile="advanced_reasoning") # For complex reasoning
- claude_llm = LLM(profile="claude_sonnet") # For Claude responses
- local_llm = LLM(profile="custom_server") # For local deployment
-
- # Single LLM optimizer with custom profile
- optimizer1 = OptoPrime(parameters, llm=LLM(profile="advanced_reasoning"))
-
- # Multi-LLM optimizer with multiple profiles
- optimizer2 = OptoPrimeMulti(parameters, llm_profiles=["cheap", "premium", "claude_sonnet"], generation_technique="multi_llm")
+ See https://docs.litellm.ai/docs/completion/input for the full parameter list.
"""
- # Default profiles for different use cases
+    # Default profile - just gpt-4o with no opinionated settings
_profiles = {
- 'default': {'backend': 'LiteLLM', 'params': {'model': 'gpt-4o-mini'}},
- 'premium': {'backend': 'LiteLLM', 'params': {'model': 'gpt-4'}},
- 'cheap': {'backend': 'LiteLLM', 'params': {'model': 'gpt-4o-mini'}},
- 'fast': {'backend': 'LiteLLM', 'params': {'model': 'gpt-3.5-turbo-mini'}},
- 'reasoning': {'backend': 'LiteLLM', 'params': {'model': 'o1-mini'}},
+ 'default': {'backend': 'LiteLLM', 'params': {'model': 'gpt-4o'}},
}
-
@classmethod
- def get_llm(cls, profile: str = 'default') -> AbstractModel:
- """Get an LLM instance for the specified profile."""
+ def get_llm(cls, profile: str = 'default', model: str = None, mm_beta: bool = False, **kwargs) -> AbstractModel:
+ """Get an LLM instance for the specified profile or model.
+
+ Args:
+ profile: Name of the profile to use. Defaults to 'default'.
+ model: Model name to use directly. If provided, overrides profile.
+ mm_beta: If True, returns AssistantTurn objects with rich multimodal content.
+ If False (default), returns raw API responses in legacy format.
+ **kwargs: Additional parameters to pass to the backend (e.g., temperature, top_p).
+ These override profile settings if both are specified.
+
+ Returns:
+ An LLM instance configured according to the profile/model and parameters.
+
+ Examples:
+ # Use default profile
+ llm = LLMFactory.get_llm()
+
+ # Use specific model
+ llm = LLMFactory.get_llm(model="gpt-4o")
+
+ # Use named profile
+ llm = LLMFactory.get_llm(profile="creative_writer")
+
+ # Use model with custom parameters
+ llm = LLMFactory.get_llm(model="gpt-4o", temperature=0.7, max_tokens=1000)
+
+ # Override profile settings
+ llm = LLMFactory.get_llm(profile="creative_writer", temperature=0.5)
+
+ # Use mm_beta mode for multimodal responses
+ llm = LLMFactory.get_llm(model="gpt-4o", mm_beta=True)
+ """
+ # If model is specified directly, create a simple config
+ if model is not None:
+ backend = kwargs.pop('backend', None)
+
+ # Determine backend with priority:
+ # 1. Explicit backend kwarg (always wins)
+ # 2. Gemini model name -> GoogleGenAI
+ # 3. Default -> LiteLLM
+ if backend is not None:
+ backend_cls = _LLM_REGISTRY[backend]
+ elif model.startswith('gemini'):
+ # Gemini models default to GoogleGenAILLM backend
+ backend_cls = _LLM_REGISTRY['GoogleGenAI']
+ # Strip 'gemini/' prefix if present (LiteLLM format: gemini/gemini-pro)
+ if model.startswith('gemini/'):
+ model = model[len('gemini/'):]
+ else:
+ # Default to LiteLLM for other models
+ backend_cls = _LLM_REGISTRY['LiteLLM']
+
+ params = {'model': model, 'mm_beta': mm_beta, **kwargs}
+ return backend_cls(**params)
+ # Otherwise use profile
if profile not in cls._profiles:
- raise ValueError(f"Unknown profile '{profile}'. Available profiles: {list(cls._profiles.keys())}")
+ raise ValueError(
+ f"Unknown profile '{profile}'. Available profiles: {list(cls._profiles.keys())}. "
+ f"Use LLMFactory.create_profile() to create custom profiles, or pass model= directly."
+ )
- config = cls._profiles[profile]
+ config = cls._profiles[profile].copy()
backend_cls = _LLM_REGISTRY[config['backend']]
- return backend_cls(**config['params'])
+
+ # Merge profile params with any override kwargs
+ params = config['params'].copy()
+ params['mm_beta'] = mm_beta
+ params.update(kwargs)
+
+ return backend_cls(**params)
@classmethod
- def register_profile(cls, name: str, backend: str, **params):
- """Register a new LLM profile."""
+ def create_profile(cls, name: str, backend: str = 'LiteLLM', **params):
+ """Register a new LLM profile with custom configuration.
+
+ Args:
+ name: Profile name to register.
+            backend: Backend to use ('LiteLLM', 'AutoGen', 'CustomLLM', or 'GoogleGenAI'). Defaults to 'LiteLLM'.
+ **params: Configuration parameters for the backend. For LiteLLM, this can include
+ any parameters from https://docs.litellm.ai/docs/completion/input
+
+ Examples:
+ # Simple profile with just a model
+ LLMFactory.create_profile("gpt4", model="gpt-4o")
+
+ # Profile with temperature and token settings
+ LLMFactory.create_profile(
+ "creative",
+ model="gpt-4o",
+ temperature=0.9,
+ max_tokens=2000
+ )
+
+ # Profile with advanced settings
+ LLMFactory.create_profile(
+ "structured_json",
+ model="gpt-4o-mini",
+ temperature=0.3,
+ response_format={"type": "json_object"},
+ max_tokens=1500,
+ top_p=0.9
+ )
+ """
+ if backend not in _LLM_REGISTRY:
+ raise ValueError(
+ f"Unknown backend '{backend}'. Valid options: {list(_LLM_REGISTRY.keys())}"
+ )
cls._profiles[name] = {'backend': backend, 'params': params}
@classmethod
def list_profiles(cls):
- """List all available profiles."""
+ """List all available profile names."""
return list(cls._profiles.keys())
@classmethod
def get_profile_info(cls, profile: str = None):
- """Get information about a profile or all profiles."""
+ """Get configuration information about one or all profiles.
+
+ Args:
+ profile: Profile name to get info for. If None, returns all profiles.
+
+ Returns:
+ Dictionary with profile configuration(s).
+ """
if profile:
return cls._profiles.get(profile)
return cls._profiles
@@ -446,10 +1022,12 @@ class DummyLLM(AbstractModel):
def __init__(self,
callable,
- reset_freq: Union[int, None] = None) -> None:
+ reset_freq: Union[int, None] = None,
+ mm_beta: bool = False,
+ model_name: Union[str, None] = None) -> None:
# self.message = message
self.callable = callable
- super().__init__(self._factory, reset_freq)
+ super().__init__(self._factory, reset_freq, mm_beta=mm_beta, model_name=model_name)
def _factory(self):
@@ -465,29 +1043,71 @@ def __init__(self, content):
class Response:
def __init__(self, content):
self.choices = [Choice(content)]
+ self.content = content # for the AssistantTurn API
return lambda *args, **kwargs: Response(self.callable(*args, **kwargs))
-
class LLM:
+ """Unified entry point for all supported LLM backends.
+
+ Defaults to gpt-4o via LiteLLM unless ``TRACE_LITELLM_MODEL`` is set. Pass
+ ``mm_beta=True`` to receive parsed :class:`AssistantTurn` responses (with
+ ``.get_text()`` / ``.get_images()``); otherwise the raw API response is
+ returned.
+
+ Common usage:
+ llm = LLM() # default model
+ llm = LLM(model="gpt-4o", temperature=0.7) # explicit model + params
+ llm = LLM(model="claude-3-5-sonnet-latest")
+ llm = LLM(model="gemini-2.5-flash", mm_beta=True)
+ llm = LLM(profile="creative") # named profile
+        llm = LLM(profile=None, backend="AutoGen", config_list=cfg)  # profile=None so that backend= is used
+
+ Image generation (models with 'image'/'dall-e' in the name) take a single
+ prompt string:
+ img = LLM(model="gpt-image-1.5")
+ result = img(prompt="A serene mountain landscape")
+
+    Model selection priority (first match wins):
+        1. explicit ``model=`` argument
+        2. ``TRACE_LITELLM_MODEL`` environment variable
+        3. named ``profile`` (default 'default')
+        4. backend-specific defaults (only when ``profile=None`` is passed)
+
+ System messages: for LiteLLM backends include a ``role="system"`` message;
+ for GoogleGenAI pass ``system_instruction=`` (the backend also extracts a
+ leading system message automatically).
+
+ See Also:
+ - LLMFactory: managing named profiles
+ - AssistantTurn: returned when mm_beta=True
+ - https://docs.litellm.ai/docs/completion/input
"""
- A unified entry point for all supported LLM backends.
-
- Usage:
- # pick by env var (default: LiteLLM)
- llm = LLM()
- # or override explicitly
- llm = LLM(backend="AutoGen", config_list=my_configs)
- # or use predefined profiles
- llm = LLM(profile="premium") # Use premium model
- llm = LLM(profile="cheap") # Use cheaper model
- llm = LLM(profile="reasoning") # Use reasoning/thinking model
- """
- def __new__(cls, *args, profile: str = None, backend: str = None, **kwargs):
- # New: if profile is specified, use LLMFactory
+ def __new__(cls, model: str = None, profile: str = 'default', backend: str = None,
+ mm_beta: bool = False, **kwargs):
+
+ if _is_image_generation_model(model):
+ mm_beta = True
+
+ # Priority 1: If model is specified, use LLMFactory with model
+ if model:
+ if backend is not None:
+ kwargs['backend'] = backend
+ return LLMFactory.get_llm(model=model, mm_beta=mm_beta, **kwargs)
+
+ # Priority 2: Check if TRACE_LITELLM_MODEL is set (honor user's explicit env config)
+ env_model = os.environ.get('TRACE_LITELLM_MODEL')
+ if env_model is not None:
+ if backend is not None:
+ kwargs['backend'] = backend
+ return LLMFactory.get_llm(model=env_model, mm_beta=mm_beta, **kwargs)
+
+ # Priority 3: If profile is specified, use LLMFactory
if profile:
- return LLMFactory.get_llm(profile)
- # Decide which backend to use
+ return LLMFactory.get_llm(profile=profile, mm_beta=mm_beta, **kwargs)
+
+        # Priority 4: Use backend-specific instantiation (for AutoGen, CustomLLM, etc.)
+        # Only reached when no model is set and profile=None is passed explicitly (profile defaults to 'default')
name = backend or os.getenv("TRACE_DEFAULT_LLM_BACKEND", "LiteLLM")
try:
backend_cls = _LLM_REGISTRY[name]
@@ -495,4 +1115,5 @@ def __new__(cls, *args, profile: str = None, backend: str = None, **kwargs):
raise ValueError(f"Unknown LLM backend: {name}. "
f"Valid options are: {list(_LLM_REGISTRY)}")
# Instantiate and return the chosen subclass
- return backend_cls(*args, **kwargs)
\ No newline at end of file
+ kwargs['mm_beta'] = mm_beta
+ return backend_cls(**kwargs)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index dbd60be5..394d4046 100644
--- a/setup.py
+++ b/setup.py
@@ -11,9 +11,11 @@
install_requires = [
"graphviz>=0.20.1",
"pytest",
- "litellm==1.75.0",
+ "litellm==1.80.8",
+ "google-genai",
"black",
"scikit-learn",
+ "pillow",
"tensorboardX",
"tensorboard"
]
diff --git a/tests/llm_optimizers_tests/test_optoprime_v3.py b/tests/llm_optimizers_tests/test_optoprime_v3.py
new file mode 100644
index 00000000..f124c5ec
--- /dev/null
+++ b/tests/llm_optimizers_tests/test_optoprime_v3.py
@@ -0,0 +1,510 @@
+import os
+import pytest
+from opto.trace import GRAPH
+from opto.utils.llm import LLM
+
+from opto.trace import node, bundle
+from opto.optimizers.optoprime_v3 import (
+ OptoPrimeV3, OptimizerPromptSymbolSet2, ProblemInstance,
+ OptimizerPromptSymbolSet, value_to_image_content
+)
+from opto.utils.backbone import TextContent, ImageContent
+
+# You can override for temporarily testing a specific optimizer ALL_OPTIMIZERS = [TextGrad] # [OptoPrimeMulti] ALL_OPTIMIZERS = [OptoPrime]
+
+# Tests that issue real LLM calls are opt-in: set RUN_LIVE_LLM_TESTS=1 to run
+# them. CI runs against a text-only stub that cannot satisfy the multimodal
+# optimizer steps, so they are skipped there.
+SKIP_REASON = "Live LLM test; set RUN_LIVE_LLM_TESTS=1 to run"
+HAS_CREDENTIALS = os.environ.get("RUN_LIVE_LLM_TESTS") == "1"
+llm = LLM()
+
+
+@pytest.fixture(autouse=True)
+def clear_graph():
+ """Reset the graph before each test"""
+ GRAPH.clear()
+ yield
+ GRAPH.clear()
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_response_extraction():
+ pass
+
+
+def test_tag_template_change():
+ num_1 = node(1, trainable=True)
+ num_2 = node(2, trainable=True, description="<=5")
+ result = num_1 + num_2
+ optimizer = OptoPrimeV3([num_1, num_2], use_json_object_format=False,
+ ignore_extraction_error=False,
+ include_example=True,
+ optimizer_prompt_symbol_set=OptimizerPromptSymbolSet2())
+
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'make this number bigger')
+
+ summary = optimizer.summarize()
+ system_prompt, user_prompt = optimizer.construct_prompt(summary)
+
+ # system_prompt is a string, user_prompt is a ContentBlockList
+ system_prompt = optimizer.replace_symbols(system_prompt, optimizer.prompt_symbols)
+
+ # Convert ContentBlockList to text for symbol replacement
+ user_prompt_text = "".join(block.text for block in user_prompt if isinstance(block, TextContent))
+ user_prompt_text = optimizer.replace_symbols(user_prompt_text, optimizer.prompt_symbols)
+
+ assert """""" in system_prompt, "Expected tag to be present in system_prompt"
+ assert """""" in user_prompt_text, "Expected tag to be present in user_prompt"
+
+ print(system_prompt)
+ print(user_prompt_text)
+
+
+@bundle()
+def transform(num):
+ """Add number"""
+ return num + 1
+
+
+@bundle(trainable=True)
+def multiply(num):
+ return num * 5
+
+
+def test_function_repr():
+ num_1 = node(1, trainable=False)
+
+ result = multiply(transform(num_1))
+ optimizer = OptoPrimeV3([multiply.parameter], use_json_object_format=False,
+ ignore_extraction_error=False,
+ include_example=True)
+
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'make this number bigger')
+
+ summary = optimizer.summarize()
+ system_prompt, user_prompt = optimizer.construct_prompt(summary)
+
+ system_prompt = optimizer.replace_symbols(system_prompt, optimizer.prompt_symbols)
+ # Convert ContentBlockList to text for symbol replacement
+ user_prompt_text = "".join(block.text for block in user_prompt if isinstance(block, TextContent))
+ user_prompt_text = optimizer.replace_symbols(user_prompt_text, optimizer.prompt_symbols)
+
+ function_repr = """
+
+def multiply(num):
+ return num * 5
+
+
+The code should start with:
+def multiply(num):
+
+"""
+
+ assert function_repr in user_prompt_text, "Expected function representation to be present in user_prompt"
+
+def test_big_data_truncation():
+ num_1 = node("**2", trainable=True)
+
+ list_1 = node("12345691912338" * 10, trainable=False)
+
+ result = list_1 + num_1
+
+ optimizer = OptoPrimeV3([num_1], use_json_object_format=False,
+ ignore_extraction_error=False,
+ include_example=True, initial_var_char_limit=10)
+
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'compute the expression')
+
+ summary = optimizer.summarize()
+ system_prompt, user_prompt = optimizer.construct_prompt(summary)
+
+ system_prompt = optimizer.replace_symbols(system_prompt, optimizer.prompt_symbols)
+ # Convert ContentBlockList to text for symbol replacement
+ user_prompt_text = "".join(block.text for block in user_prompt if isinstance(block, TextContent))
+ user_prompt_text = optimizer.replace_symbols(user_prompt_text, optimizer.prompt_symbols)
+
+ truncated_repr = """1234569191...(skipped due to length limit)"""
+
+ assert truncated_repr in user_prompt_text, "Expected truncated list representation to be present in user_prompt"
+
+def test_extraction_pipeline():
+ num_1 = node(1, trainable=True)
+ num_2 = node(2, trainable=True, description="<=5")
+ result = num_1 + num_2
+ optimizer = OptoPrimeV3([num_1, num_2], use_json_object_format=False,
+ ignore_extraction_error=False,
+ include_example=True,
+ optimizer_prompt_symbol_set=OptimizerPromptSymbolSet2())
+
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'make this number bigger')
+
+ summary = optimizer.summarize()
+ system_prompt, user_prompt = optimizer.construct_prompt(summary)
+
+ # Verify construct_prompt returns expected types
+ assert isinstance(system_prompt, str)
+ assert isinstance(user_prompt, list)
+
+ # Test extraction from a mock response
+ response = """
+The instruction suggests that the output, `add0`, needs to be made bigger than it currently is (3). The code performs an addition of `int0` and `int1` to produce `add0`. To increase `add0`, we can increase the values of `int0` or `int1`, or both. Given that `int1` has a constraint of being less than or equal to 5, we can set `int0` to a higher value, since it has no explicit constraint. By adjusting `int0` to a higher value, the output can be made larger in accordance with the feedback.
+
+
+
+int0
+
+5
+
+
+
+
+int1
+
+5
+
+"""
+ suggestion = optimizer.extract_llm_suggestion(response)
+
+ assert 'reasoning' in suggestion, "Expected 'reasoning' in suggestion"
+ assert 'variables' in suggestion, "Expected 'variables' in suggestion"
+ assert 'int0' in suggestion['variables'], "Expected 'int0' variable in suggestion"
+ assert 'int1' in suggestion['variables'], "Expected 'int1' variable in suggestion"
+ assert suggestion['variables']['int0'] == '5', "Expected int0 to be incremented to 5"
+ assert suggestion['variables']['int1'] == '5', "Expected int1 to be incremented to 5"
+
+
+# ==================== Multimodal / Content Block Tests ====================
+
+def test_problem_instance_text_only():
+ """Test that ProblemInstance with text-only content works correctly."""
+ from opto.utils.backbone import ContentBlockList
+ symbol_set = OptimizerPromptSymbolSet()
+
+ instance = ProblemInstance(
+ instruction="Test instruction",
+ code="y = add(x=a, y=b)",
+ documentation="[add] Adds two numbers",
+ variables=ContentBlockList("5"),
+ inputs=ContentBlockList("3"),
+ others=ContentBlockList(),
+ outputs=ContentBlockList("8"),
+ feedback="Result should be 10",
+ context="Some context",
+ optimizer_prompt_symbol_set=symbol_set
+ )
+
+ # Test __repr__ returns string
+ text_repr = str(instance)
+ assert "Test instruction" in text_repr
+ assert "y = add(x=a, y=b)" in text_repr
+ assert "Result should be 10" in text_repr
+ assert "Some context" in text_repr
+
+ # Test to_content_blocks returns list
+ blocks = instance.to_content_blocks()
+ assert isinstance(blocks, list)
+ assert len(blocks) > 0
+ assert all(isinstance(b, (TextContent, ImageContent)) for b in blocks)
+
+ # Test has_images returns False for text-only
+ assert not instance.has_images()
+
+
+def test_problem_instance_with_content_blocks():
+ """Test ProblemInstance with ContentBlockList fields containing images."""
+ from opto.utils.backbone import ContentBlockList
+ symbol_set = OptimizerPromptSymbolSet()
+
+ # Create content blocks with an image
+ variables_blocks = ContentBlockList([
+ TextContent(text=""),
+ ImageContent(image_url="https://example.com/test.jpg"),
+ TextContent(text="")
+ ])
+
+ instance = ProblemInstance(
+ instruction="Analyze the image",
+ code="result = analyze(img)",
+ documentation="[analyze] Analyzes an image",
+ variables=variables_blocks,
+ inputs=ContentBlockList(),
+ others=ContentBlockList(),
+ outputs=ContentBlockList("cat"),
+ feedback="Result should be 'dog'",
+ context=None,
+ optimizer_prompt_symbol_set=symbol_set
+ )
+
+ # Test __repr__ handles content blocks (should show [IMAGE] placeholder)
+ text_repr = str(instance)
+ assert "Analyze the image" in text_repr
+ assert "[IMAGE]" in text_repr
+
+ # Test to_content_blocks includes the image
+ blocks = instance.to_content_blocks()
+ assert isinstance(blocks, list)
+
+ # Find the ImageContent block
+ image_blocks = [b for b in blocks if isinstance(b, ImageContent)]
+ assert len(image_blocks) == 1
+ assert image_blocks[0].image_url == "https://example.com/test.jpg"
+
+ # Test has_images returns True
+ assert instance.has_images()
+
+
+def test_problem_instance_mixed_content():
+ """Test ProblemInstance with mixed text and image content in multiple fields."""
+ from opto.utils.backbone import ContentBlockList
+ symbol_set = OptimizerPromptSymbolSet()
+
+ # Variables with image
+ variables_blocks = ContentBlockList([
+ TextContent(text="Hello\n"),
+ TextContent(text=""),
+ ImageContent(image_data="base64data", media_type="image/png"),
+ TextContent(text="")
+ ])
+
+ # Inputs with image
+ inputs_blocks = ContentBlockList([
+ TextContent(text=""),
+ ImageContent(image_url="https://example.com/ref.png"),
+ TextContent(text="")
+ ])
+
+ instance = ProblemInstance(
+ instruction="Compare images",
+ code="result = compare(img, reference)",
+ documentation="[compare] Compares two images",
+ variables=variables_blocks,
+ inputs=inputs_blocks,
+ others=ContentBlockList(),
+ outputs=ContentBlockList("0.8"),
+ feedback="Similarity should be higher",
+ context="Context text",
+ optimizer_prompt_symbol_set=symbol_set
+ )
+
+ # Test has_images
+ assert instance.has_images()
+
+ # Test to_content_blocks
+ blocks = instance.to_content_blocks()
+ image_blocks = [b for b in blocks if isinstance(b, ImageContent)]
+ assert len(image_blocks) == 2 # One from variables, one from inputs
+
+
+def test_value_to_image_content_url():
+ """Test value_to_image_content with URL strings."""
+ # Valid image URL
+ result = value_to_image_content("https://example.com/image.jpg")
+ assert result is not None
+ assert isinstance(result, ImageContent)
+ assert result.image_url == "https://example.com/image.jpg"
+
+ # Non-image URL (no image extension) - is_image returns False for pattern check
+ result = value_to_image_content("https://example.com/page.html")
+ assert result is None
+
+ # Non-URL string
+ result = value_to_image_content("just a regular string")
+ assert result is None
+
+
+def test_value_to_image_content_base64():
+ """Test value_to_image_content with base64 data URLs."""
+ # Valid base64 data URL
+ data_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg=="
+ result = value_to_image_content(data_url)
+ assert result is not None
+ assert isinstance(result, ImageContent)
+ assert result.image_data == "iVBORw0KGgoAAAANSUhEUg=="
+ assert result.media_type == "image/png"
+
+
+def test_value_to_image_content_non_image():
+ """Test value_to_image_content with non-image values."""
+ # Integer
+ assert value_to_image_content(42) is None
+
+ # List
+ assert value_to_image_content([1, 2, 3]) is None
+
+ # Dict
+ assert value_to_image_content({"key": "value"}) is None
+
+ # Regular string
+ assert value_to_image_content("hello world") is None
+
+
+def test_construct_prompt():
+ """Test construct_prompt returns ContentBlockList for multimodal support."""
+ num_1 = node(1, trainable=True)
+ num_2 = node(2, trainable=True)
+ result = num_1 + num_2
+
+ optimizer = OptoPrimeV3([num_1, num_2], use_json_object_format=False)
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'make this number bigger')
+
+ summary = optimizer.summarize()
+ system_prompt, user_prompt = optimizer.construct_prompt(summary)
+
+ # system_prompt should be string, user_prompt should be ContentBlockList
+ assert isinstance(system_prompt, str)
+ assert isinstance(user_prompt, list)
+ assert all(isinstance(b, (TextContent, ImageContent)) for b in user_prompt)
+
+ # Check that text content contains expected info
+ text_parts = [b.text for b in user_prompt if isinstance(b, TextContent)]
+ full_text = "".join(text_parts)
+ assert "int0" in full_text or "int1" in full_text
+
+
+def test_repr_node_value_as_content_blocks():
+ """Test repr_node_value_as_content_blocks method."""
+ num_1 = node(1, trainable=True)
+ result = num_1 + 1
+
+ optimizer = OptoPrimeV3([num_1], use_json_object_format=False)
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'test')
+
+ # Test with non-image nodes
+ summary = optimizer.summarize()
+ blocks = optimizer.repr_node_value_as_content_blocks(
+ summary.variables,
+ node_tag=optimizer.optimizer_prompt_symbol_set.variable_tag,
+ value_tag=optimizer.optimizer_prompt_symbol_set.value_tag,
+ constraint_tag=optimizer.optimizer_prompt_symbol_set.constraint_tag
+ )
+
+ assert isinstance(blocks, list)
+ assert len(blocks) > 0
+ assert all(isinstance(b, TextContent) for b in blocks) # No images in this case
+
+
+def test_repr_node_value_compact_as_content_blocks():
+ """Test repr_node_value_compact_as_content_blocks method."""
+ long_string = "x" * 5000 # Long string that will be truncated
+ str_node = node(long_string, trainable=True)
+ result = str_node + "!"
+
+ optimizer = OptoPrimeV3([str_node], use_json_object_format=False, initial_var_char_limit=100)
+ optimizer.zero_feedback()
+ optimizer.backward(result, 'test')
+
+ summary = optimizer.summarize()
+ blocks = optimizer.repr_node_value_compact_as_content_blocks(
+ summary.inputs,
+ node_tag=optimizer.optimizer_prompt_symbol_set.node_tag,
+ value_tag=optimizer.optimizer_prompt_symbol_set.value_tag,
+ constraint_tag=optimizer.optimizer_prompt_symbol_set.constraint_tag
+ )
+
+ # Should be truncated
+ text_parts = [b.text for b in blocks if isinstance(b, TextContent)]
+ full_text = "".join(text_parts)
+ assert "skipped due to length limit" in full_text or len(full_text) < len(long_string)
+
+
+# ==================== Real LLM Call Tests ====================
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_optimizer_step_real_llm_call():
+ """Test a real optimization step with LLM call."""
+ # Create a simple optimization problem
+ greeting = node("Hello", trainable=True, description="A greeting message")
+
+ @bundle()
+ def make_sentence(word):
+ """Create a sentence from a word."""
+ return f"{word}, how are you today?"
+
+ result = make_sentence(greeting)
+
+ # Create optimizer
+ optimizer = OptoPrimeV3(
+ [greeting],
+ use_json_object_format=False,
+ ignore_extraction_error=True,
+ include_example=False,
+ )
+
+ # Setup feedback
+ optimizer.zero_feedback()
+ optimizer.backward(result, "The greeting should be more formal and professional")
+
+ # Execute optimization step - this makes a real LLM call
+ update_dict = optimizer.step(verbose=True)
+
+ # Verify the optimizer produced a suggestion
+ print(f"Update dict: {update_dict}")
+
+ # The LLM should have suggested a new value
+ # We don't assert specific content since LLM output varies
+ # but we verify the step completed without error
+ assert optimizer.log is not None
+ assert len(optimizer.log) > 0
+
+ # Check that the log contains the expected structure
+ last_log = optimizer.log[-1]
+ assert "system_prompt" in last_log
+ assert "user_prompt" in last_log
+ assert "response" in last_log
+
+ print(f"LLM Response: {last_log['response'][:500]}...")
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_optimizer_step_with_content_blocks():
+ """Test optimization step using content blocks (multimodal mode)."""
+ # Create trainable parameters
+ num_1 = node(5, trainable=True, description="A number to optimize")
+ num_2 = node(3, trainable=True, description="Another number")
+
+ result = num_1 + num_2
+
+ # Create optimizer
+ optimizer = OptoPrimeV3(
+ [num_1, num_2],
+ use_json_object_format=False,
+ ignore_extraction_error=True,
+ include_example=False,
+ )
+
+ # Setup feedback
+ optimizer.zero_feedback()
+ optimizer.backward(result, "The sum should be exactly 100")
+
+ # Test that construct_prompt returns ContentBlockList
+ summary = optimizer.summarize()
+ system_prompt, user_prompt = optimizer.construct_prompt(summary)
+
+ # Verify content blocks structure
+ from opto.utils.backbone import ContentBlockList
+ assert isinstance(user_prompt, ContentBlockList)
+ assert len(user_prompt) > 0
+
+ # Verify text is merged (should be fewer blocks than if not merged)
+ text_blocks = [b for b in user_prompt if isinstance(b, TextContent)]
+ print(f"Number of text blocks after merging: {len(text_blocks)}")
+
+ # Execute the step (this makes a real LLM call)
+ update_dict = optimizer.step(verbose=True)
+
+ print(f"Update dict: {update_dict}")
+
+ # Verify the step completed
+ assert optimizer.log is not None
+ assert len(optimizer.log) > 0
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_optimizer_multimodal_parameter_update():
+ pass
\ No newline at end of file
diff --git a/tests/unit_tests/test_backbone.py b/tests/unit_tests/test_backbone.py
new file mode 100644
index 00000000..2f192e26
--- /dev/null
+++ b/tests/unit_tests/test_backbone.py
@@ -0,0 +1,256 @@
+"""Tests for the minimal multimodal backbone.
+
+Covers:
+1. Content blocks (TextContent, ImageContent, ContentBlockList, Content)
+2. UserTurn / AssistantTurn building and LiteLLM-format rendering
+3. The stateless ``to_messages`` helper (system + history + user content)
+4. Opt-in live LLM calls (set RUN_LIVE_LLM_TESTS=1) and raw-response parsing
+"""
+import os
+import base64
+import pytest
+
+from opto.utils.backbone import (
+ TextContent,
+ ImageContent,
+ ContentBlockList,
+ Content,
+ UserTurn,
+ AssistantTurn,
+ to_messages,
+ DEFAULT_IMAGE_PLACEHOLDER,
+)
+
+# Live tests make real LLM calls. They are opt-in: set RUN_LIVE_LLM_TESTS=1.
+SKIP_REASON = "Live LLM test; set RUN_LIVE_LLM_TESTS=1 to run"
+HAS_CREDENTIALS = os.environ.get("RUN_LIVE_LLM_TESTS") == "1"
+
+
+# ============================================================================
+# Content block tests
+# ============================================================================
+
+def test_text_content_merge():
+ """Consecutive text blocks merge into one."""
+ blocks = ContentBlockList()
+ blocks.append("Hello")
+ blocks.append("world")
+ assert len(blocks) == 1
+ assert isinstance(blocks[0], TextContent)
+ assert blocks[0].text == "Hello world"
+
+
+def test_image_content_url_format():
+ img = ImageContent(image_url="https://example.com/a.png")
+ fmt = img.to_litellm_format()
+ assert fmt["type"] == "input_image"
+ assert fmt["image_url"] == "https://example.com/a.png"
+
+
+def test_image_content_base64_format():
+ data = base64.b64encode(b"fake image").decode("utf-8")
+ img = ImageContent(image_data=data, media_type="image/png")
+ fmt = img.to_litellm_format()
+ assert fmt["type"] == "input_image"
+ assert fmt["image_url"].startswith("data:image/png;base64,")
+
+
+def test_content_block_list_has_images_and_to_text():
+ blocks = ContentBlockList()
+ blocks.append("before")
+ blocks.append(ImageContent(image_url="https://example.com/a.png"))
+ blocks.append("after")
+ assert blocks.has_images()
+ text = blocks.to_text()
+ assert "before" in text and "after" in text
+ assert DEFAULT_IMAGE_PLACEHOLDER.strip() in text
+
+
+def test_content_variadic_builder():
+ ctx = Content("some text", "more text")
+ assert not ctx.has_images()
+ assert "some text" in ctx.to_text()
+
+
+def test_content_template_builder():
+ img = ImageContent(image_url="https://example.com/a.png")
+ ctx = Content(f"See {DEFAULT_IMAGE_PLACEHOLDER} here", images=[img])
+ assert ctx.has_images()
+
+
+def test_content_blocks_to_litellm_format_mixed():
+ blocks = ContentBlockList()
+ blocks.append("text")
+ blocks.append(ImageContent(image_url="https://example.com/a.png"))
+ fmt = blocks.to_litellm_format(role="user")
+ assert len(fmt) == 2
+ assert fmt[0]["type"] == "input_text"
+ assert fmt[1]["type"] == "input_image"
+
+
+# ============================================================================
+# Turn tests
+# ============================================================================
+
+def test_user_turn_multiple_images():
+ user_turn = (UserTurn()
+ .add_text("What are in these images?")
+ .add_image(url="https://example.com/one.jpg")
+ .add_image(url="https://example.com/two.jpg"))
+ msg = user_turn.to_litellm_format()
+ assert msg["role"] == "user"
+ assert len(msg["content"]) == 3
+ assert msg["content"][0]["type"] == "input_text"
+ assert msg["content"][1]["type"] == "input_image"
+ assert msg["content"][2]["type"] == "input_image"
+
+
+def test_user_turn_base64_images():
+ d1 = base64.b64encode(b"img1").decode("utf-8")
+ d2 = base64.b64encode(b"img2").decode("utf-8")
+ user_turn = (UserTurn()
+ .add_text("Compare:")
+ .add_image(data=d1, media_type="image/png")
+ .add_image(data=d2, media_type="image/jpeg"))
+ msg = user_turn.to_litellm_format()
+ assert msg["content"][1]["image_url"].startswith("data:image/png;base64,")
+ assert msg["content"][2]["image_url"].startswith("data:image/jpeg;base64,")
+
+
+def test_assistant_turn_text_format():
+ at = AssistantTurn().add_text("Here is the answer.")
+ msg = at.to_litellm_format()
+ assert msg["role"] == "assistant"
+ assert any("answer" in item.get("text", "") for item in msg["content"])
+
+
+def test_assistant_turn_get_text_and_images():
+ at = (AssistantTurn()
+ .add_text("caption")
+ .add_image(url="https://example.com/gen.png"))
+ assert "caption" in at.to_text()
+ assert at.get_images().has_images()
+ assert len(at.get_images()) == 1
+ assert at.get_text().to_text().strip() == "caption"
+
+
+# ============================================================================
+# to_messages helper tests
+# ============================================================================
+
+def test_to_messages_system_and_user():
+ messages = to_messages("You are helpful.", "Hello")
+ assert len(messages) == 2
+ assert messages[0] == {"role": "system", "content": "You are helpful."}
+ assert messages[1]["role"] == "user"
+ assert messages[1]["content"][0]["text"] == "Hello"
+
+
+def test_to_messages_no_system():
+ messages = to_messages(None, "Hi")
+ assert len(messages) == 1
+ assert messages[0]["role"] == "user"
+
+
+def test_to_messages_with_history():
+ history = [
+ UserTurn().add_text("Q1").to_litellm_format(),
+ AssistantTurn().add_text("A1").to_litellm_format(),
+ ]
+ messages = to_messages("sys", "Q2", history=history)
+ # system + 2 history + current user
+ assert len(messages) == 4
+ assert messages[0]["role"] == "system"
+ assert messages[1]["role"] == "user"
+ assert messages[2]["role"] == "assistant"
+ assert messages[3]["role"] == "user"
+ assert messages[3]["content"][0]["text"] == "Q2"
+
+
+def test_to_messages_multimodal_user_content():
+ blocks = ContentBlockList()
+ blocks.append("Analyze")
+ blocks.append(ImageContent(image_url="https://example.com/a.png"))
+ messages = to_messages("sys", blocks)
+ user_content = messages[-1]["content"]
+ assert len(user_content) == 2
+ assert user_content[1]["type"] == "input_image"
+
+
+# ============================================================================
+# Live LLM call tests (opt-in)
+# ============================================================================
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_real_llm_call_with_multiple_images():
+ from opto.utils.llm import LLM
+
+ user_turn = (UserTurn()
+ .add_text("What are in these images? Describe each briefly.")
+ .add_image(url="https://images.pexels.com/photos/736230/pexels-photo-736230.jpeg")
+ .add_image(url="https://images.contentstack.io/v3/assets/bltcedd8dbd5891265b/blt134818d279038650/6668df6434f6fb5cd48aac34/beautiful-flowers-rose.jpeg"))
+
+ messages = to_messages("You analyze images.", user_turn.content)
+
+ llm = LLM(mm_beta=True)
+ response = llm(messages=messages, max_tokens=500)
+ text = response.to_text()
+
+ assert text is not None and len(text) > 50
+ assert any(w in text.lower() for w in ["flower", "image", "rose", "pink", "red", "petal"])
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_real_llm_multi_turn_with_images():
+ from opto.utils.llm import LLM
+
+ llm = LLM(mm_beta=True)
+ history = []
+
+ user1 = (UserTurn()
+ .add_text("What type of flowers are shown in these images?")
+ .add_image(url="https://images.pexels.com/photos/736230/pexels-photo-736230.jpeg"))
+ user1_msg = user1.to_litellm_format()
+ messages = to_messages("You analyze images.", history=history)
+ messages.append(user1_msg)
+
+ response1 = llm(messages=messages, max_tokens=300)
+ history.append(user1_msg)
+ history.append(response1.to_litellm_format())
+
+ user2_msg = UserTurn().add_text("Which would be better for a gift and why?").to_litellm_format()
+ messages = to_messages("You analyze images.", history=history)
+ messages.append(user2_msg)
+
+ response2 = llm(messages=messages, max_tokens=300)
+ text2 = response2.to_text()
+
+ assert response1.to_text() and len(response1.to_text()) > 20
+ assert text2 and len(text2) > 20
+ assert any(w in text2.lower() for w in ["flower", "rose", "gift", "love"])
+
+
+# ============================================================================
+# Raw-response parsing into AssistantTurn (opt-in)
+# ============================================================================
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_openai_raw_response_parsing():
+ import litellm
+
+ response = litellm.responses(model="openai/gpt-4o", input="Hello, how are you?")
+ at = AssistantTurn(response)
+ assert "Hello" in at.content[0].text or len(at.to_text()) > 0
+
+
+@pytest.mark.skipif(not os.environ.get("GEMINI_API_KEY"), reason="No GEMINI_API_KEY found")
+def test_google_generate_content_parsing():
+ from google import genai
+
+ client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
+ response = client.models.generate_content(
+ model="gemini-2.5-flash-image",
+ contents="A kawaii sticker of a happy red panda. White background.",
+ )
+ at = AssistantTurn(response)
+ assert len(at.content) > 0
diff --git a/tests/unit_tests/test_llm.py b/tests/unit_tests/test_llm.py
index 9435bf33..a7d14bd1 100644
--- a/tests/unit_tests/test_llm.py
+++ b/tests/unit_tests/test_llm.py
@@ -1,8 +1,23 @@
-from opto.utils.llm import LLM
+from opto.utils.llm import LLM, LLMFactory
from opto.optimizers.utils import print_color
import os
+import pytest
+from opto.utils.backbone import (
+ UserTurn,
+ AssistantTurn
+)
+
+# These tests hit a real LLM provider with specific models (e.g. gpt-4o-mini)
+# and multimodal inputs. They are opt-in: set RUN_LIVE_LLM_TESTS=1 to run them.
+# CI runs against a text-only stub that cannot satisfy these requirements, so by
+# default they are skipped there.
+SKIP_REASON = "Live LLM test; set RUN_LIVE_LLM_TESTS=1 to run"
+HAS_CREDENTIALS = os.environ.get("RUN_LIVE_LLM_TESTS") == "1"
+
+
def test_llm_init():
+ """Test basic LLM initialization with legacy mode (mm_beta=False)"""
if os.path.exists("OAI_CONFIG_LIST") or os.environ.get("TRACE_LITELLM_MODEL") or os.environ.get("OPENAI_API_KEY"):
llm = LLM()
system_prompt = 'You are a helpful assistant.'
@@ -22,3 +37,432 @@ def test_llm_init():
print_color(f'System: {system_prompt}', 'red')
print_color(f'User: {user_prompt}', 'blue')
print_color(f'LLM: {response}', 'green')
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+class TestLLMMMBetaMode:
+ """Test suite for LLM class with mm_beta=True and mm_beta=False modes"""
+
+ def test_mm_beta_false_legacy_response_format(self):
+ """Test that mm_beta=False returns raw API response (legacy format)"""
+ llm = LLM(mm_beta=False)
+ messages = [{"role": "user", "content": "Say 'test' and nothing else."}]
+
+ response = llm(messages=messages)
+
+ # Legacy mode should return raw API response with .choices attribute
+ assert hasattr(response, 'choices'), "Legacy mode should return raw API response"
+ assert hasattr(response.choices[0], 'message'), "Response should have message attribute"
+ assert hasattr(response.choices[0].message, 'content'), "Message should have content attribute"
+
+ # Should NOT be an AssistantTurn object
+ assert not isinstance(response, AssistantTurn), "Legacy mode should not return AssistantTurn"
+
+ content = response.choices[0].message.content
+ assert isinstance(content, str), "Content should be a string"
+ assert len(content) > 0, "Content should not be empty"
+
+ print_color(f"✓ Legacy mode (mm_beta=False) returns raw API response", 'green')
+
+ def test_mm_beta_true_assistant_turn_response(self):
+ """Test that mm_beta=True returns AssistantTurn object"""
+ llm = LLM(mm_beta=True)
+ messages = [{"role": "user", "content": "Say 'test' and nothing else."}]
+
+ response = llm(messages=messages)
+
+ # mm_beta mode should return AssistantTurn object
+ assert isinstance(response, AssistantTurn), "mm_beta mode should return AssistantTurn object"
+
+ # Check AssistantTurn attributes
+ assert hasattr(response, 'content'), "AssistantTurn should have content attribute"
+ assert hasattr(response, 'role'), "AssistantTurn should have role attribute"
+ assert response.role == "assistant", "Role should be 'assistant'"
+
+ # Content should be accessible
+ assert response.content is not None, "Content should not be None"
+
+ print_color(f"✓ Multimodal mode (mm_beta=True) returns AssistantTurn object", 'green')
+
+ def test_mm_beta_with_explicit_model(self):
+ """Test mm_beta parameter works with explicit model specification"""
+ # Test with mm_beta=False
+ llm_legacy = LLM(model="gpt-4o-mini", mm_beta=False)
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response_legacy = llm_legacy(messages=messages)
+ assert hasattr(response_legacy, 'choices'), "Should return raw API response"
+ assert not isinstance(response_legacy, AssistantTurn), "Should not be AssistantTurn"
+
+ # Test with mm_beta=True
+ llm_mm = LLM(model="gpt-4o-mini", mm_beta=True)
+ response_mm = llm_mm(messages=messages)
+ assert isinstance(response_mm, AssistantTurn), "Should return AssistantTurn"
+
+ print_color(f"✓ mm_beta parameter works correctly with explicit model", 'green')
+
+ def test_mm_beta_with_profile(self):
+ """Test mm_beta parameter works with profile-based instantiation"""
+ # Create a test profile
+ LLMFactory.create_profile("test_profile", backend="LiteLLM", model="gpt-4o-mini", temperature=0.7)
+
+ # Test with mm_beta=False
+ llm_legacy = LLM(profile="test_profile", mm_beta=False)
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response_legacy = llm_legacy(messages=messages)
+ assert hasattr(response_legacy, 'choices'), "Profile with mm_beta=False should return raw API response"
+
+ # Test with mm_beta=True
+ llm_mm = LLM(profile="test_profile", mm_beta=True)
+ response_mm = llm_mm(messages=messages)
+ assert isinstance(response_mm, AssistantTurn), "Profile with mm_beta=True should return AssistantTurn"
+
+ print_color(f"✓ mm_beta parameter works correctly with profiles", 'green')
+
+ def test_mm_beta_with_litellm_parameters(self):
+ """Test mm_beta works with various LiteLLM parameters"""
+ # Test with temperature and max_tokens
+ llm = LLM(
+ model="gpt-4o-mini",
+ mm_beta=True,
+ temperature=0.3,
+ max_tokens=100
+ )
+
+ messages = [{"role": "user", "content": "Say hello"}]
+ response = llm(messages=messages)
+
+ assert isinstance(response, AssistantTurn), "Should return AssistantTurn with LiteLLM params"
+ assert response.content is not None, "Should have content"
+
+ print_color(f"✓ mm_beta works with LiteLLM parameters", 'green')
+
+ def test_mm_beta_default_is_false(self):
+ """Test that mm_beta defaults to False for backward compatibility"""
+ llm = LLM() # No mm_beta specified
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response = llm(messages=messages)
+
+ # Default should be legacy mode (mm_beta=False)
+ assert hasattr(response, 'choices'), "Default should be legacy mode"
+ assert not isinstance(response, AssistantTurn), "Default should not return AssistantTurn"
+
+ print_color(f"✓ mm_beta defaults to False (backward compatible)", 'green')
+
+ def test_mm_beta_content_accessibility(self):
+ """Test that content is accessible in both modes"""
+ messages = [{"role": "user", "content": "Say 'hello'"}]
+
+ # Legacy mode
+ llm_legacy = LLM(mm_beta=False)
+ response_legacy = llm_legacy(messages=messages)
+ content_legacy = response_legacy.choices[0].message.content
+ assert isinstance(content_legacy, str), "Legacy content should be string"
+ assert len(content_legacy) > 0, "Legacy content should not be empty"
+
+ # mm_beta mode
+ llm_mm = LLM(mm_beta=True)
+ response_mm = llm_mm(messages=messages)
+ # AssistantTurn content is a list of ContentBlock objects
+ assert response_mm.content is not None, "mm_beta content should not be None"
+
+ print_color(f"✓ Content accessible in both modes", 'green')
+
+ def test_mm_beta_with_different_backends(self):
+ """Test mm_beta parameter with different backend specifications"""
+ # Test with explicit LiteLLM backend
+ llm = LLM(backend="LiteLLM", model="gpt-4o-mini", mm_beta=True)
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response = llm(messages=messages)
+ assert isinstance(response, AssistantTurn), "LiteLLM backend with mm_beta=True should return AssistantTurn"
+
+ print_color(f"✓ mm_beta works with explicit backend specification", 'green')
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+class TestLLMConstructorPriorities:
+ """Test the priority logic in LLM constructor"""
+
+ def test_priority_profile_over_default(self):
+ """Test that profile parameter takes priority"""
+ LLMFactory.create_profile("priority_test", backend="LiteLLM", model="gpt-4o-mini", temperature=0.5)
+
+ llm = LLM(profile="priority_test", mm_beta=True)
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response = llm(messages=messages)
+ assert isinstance(response, AssistantTurn), "Profile-based LLM should respect mm_beta"
+
+ print_color(f"✓ Profile parameter takes priority", 'green')
+
+ def test_priority_model_over_profile(self):
+ """Test that model parameter takes priority over default profile"""
+ # When model is specified, it should use that model regardless of default profile
+ llm = LLM(model="gpt-4o-mini", mm_beta=True)
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response = llm(messages=messages)
+ assert isinstance(response, AssistantTurn), "Model-based LLM should respect mm_beta"
+
+ print_color(f"✓ Model parameter creates correct LLM instance", 'green')
+
+ def test_backend_fallback(self):
+ """Test that backend parameter works when neither profile nor model specified"""
+ # This tests the Priority 3 path in __new__
+ llm = LLM(backend="LiteLLM", mm_beta=True, model="gpt-4o-mini")
+ messages = [{"role": "user", "content": "Hi"}]
+
+ response = llm(messages=messages)
+ assert isinstance(response, AssistantTurn), "Backend-based LLM should respect mm_beta"
+
+ print_color(f"✓ Backend parameter works correctly", 'green')
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+class TestLLMDocumentationExamples:
+ """Test examples from LLM class documentation"""
+
+ def test_basic_usage_default_model(self):
+ """Test: llm = LLM()"""
+ llm = LLM()
+ messages = [{"role": "user", "content": "Hi"}]
+ response = llm(messages=messages)
+
+ # Default is mm_beta=False
+ assert hasattr(response, 'choices'), "Default usage should return raw API response"
+ print_color(f"✓ Basic usage with default model works", 'green')
+
+ def test_specify_model_directly(self):
+ """Test: llm = LLM(model='gpt-4o')"""
+ llm = LLM(model="gpt-4o-mini") # Using mini for cost efficiency
+ messages = [{"role": "user", "content": "Hi"}]
+ response = llm(messages=messages)
+
+ assert hasattr(response, 'choices'), "Model specification should work"
+ print_color(f"✓ Model specification works", 'green')
+
+ def test_multimodal_beta_mode_example(self):
+ """Test example from 'Using Multimodal Beta Mode' section"""
+ # Enable mm_beta for rich AssistantTurn responses
+ llm = LLM(model="gpt-4o-mini", mm_beta=True)
+ response = llm(messages=[{"role": "user", "content": "Hello"}])
+
+ # response is now an AssistantTurn object with .content, .tool_calls, etc.
+ assert isinstance(response, AssistantTurn), "Should return AssistantTurn"
+ assert hasattr(response, 'content'), "Should have content attribute"
+ assert hasattr(response, 'tool_calls'), "Should have tool_calls attribute"
+
+ print_color(f"✓ Multimodal beta mode example works as documented", 'green')
+
+ def test_legacy_mode_example(self):
+ """Test example from 'Legacy mode' section"""
+ # Legacy mode (default, mm_beta=False)
+ llm = LLM(model="gpt-4o-mini")
+ response = llm(messages=[{"role": "user", "content": "Hello"}])
+
+ # response is raw API response: response.choices[0].message.content
+ assert hasattr(response, 'choices'), "Should return raw API response"
+ content = response.choices[0].message.content
+ assert isinstance(content, str), "Content should be string"
+
+ print_color(f"✓ Legacy mode example works as documented", 'green')
+
+ def test_litellm_parameters_example(self):
+ """Test examples with LiteLLM parameters"""
+ # High creativity example
+ llm = LLM(
+ model="gpt-4o-mini",
+ temperature=0.9,
+ top_p=0.95,
+ presence_penalty=0.6
+ )
+ messages = [{"role": "user", "content": "Hi"}]
+ response = llm(messages=messages)
+
+ assert hasattr(response, 'choices'), "LiteLLM parameters should work"
+
+ print_color(f"✓ LiteLLM parameters example works", 'green')
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+def test_mm_beta_integration_with_conversation():
+ """Test mm_beta mode with a multi-turn conversation"""
+ llm = LLM(model="gpt-4o-mini", mm_beta=True)
+
+ # First turn
+ messages = [
+ {"role": "user", "content": "My name is Alice."}
+ ]
+ response1 = llm(messages=messages)
+ assert isinstance(response1, AssistantTurn), "First response should be AssistantTurn"
+
+ # Second turn - reference previous context
+ messages.append({"role": "assistant", "content": str(response1.content)})
+ messages.append({"role": "user", "content": "What is my name?"})
+
+ response2 = llm(messages=messages)
+ assert isinstance(response2, AssistantTurn), "Second response should be AssistantTurn"
+
+ print_color(f"✓ mm_beta mode works with multi-turn conversations", 'green')
+
+
+@pytest.mark.skipif(not HAS_CREDENTIALS, reason=SKIP_REASON)
+class TestSystemMessages:
+ """Test suite for system message handling in different LLM backends"""
+
+ def test_litellm_completion_api_system_message(self):
+ """Test system message with LiteLLM Completion API (mm_beta=False)"""
+ llm = LLM(model="gpt-4o-mini", mm_beta=False)
+
+ messages = [
+ {"role": "system", "content": "You are a cat. Your name is Neko. Always respond as a cat would."},
+ {"role": "user", "content": "What is your name?"}
+ ]
+
+ response = llm(messages=messages)
+
+ # Legacy mode should return raw API response
+ assert hasattr(response, 'choices'), "Should return raw API response"
+ content = response.choices[0].message.content
+ assert isinstance(content, str), "Content should be a string"
+ assert len(content) > 0, "Content should not be empty"
+
+ # Check that the response reflects the system message (should mention being a cat or Neko)
+ content_lower = content.lower()
+ assert 'neko' in content_lower or 'cat' in content_lower, \
+ f"Response should reflect system message about being a cat named Neko. Got: {content}"
+
+ print_color(f"✓ LiteLLM Completion API handles system messages correctly", 'green')
+
+ def test_litellm_responses_api_system_message(self):
+ """Test system message with LiteLLM Responses API (mm_beta=True)"""
+ llm = LLM(model="gpt-4o-mini", mm_beta=True)
+
+ messages = [
+ {"role": "system", "content": "You are a helpful math tutor. Always explain concepts clearly."},
+ {"role": "user", "content": "What is 2+2?"}
+ ]
+
+ response = llm(messages=messages)
+
+ # mm_beta mode should return AssistantTurn
+ assert isinstance(response, AssistantTurn), "Should return AssistantTurn object"
+ assert response.content is not None, "Content should not be None"
+
+ # Get text content
+ text_content = response.to_text()
+ assert isinstance(text_content, str), "Text content should be a string"
+ assert len(text_content) > 0, "Text content should not be empty"
+ assert '4' in text_content, f"Response should contain the answer '4'. Got: {text_content}"
+
+ print_color(f"✓ LiteLLM Responses API handles system messages correctly", 'green')
+
+    @pytest.mark.skipif(not os.environ.get("GEMINI_API_KEY"), reason="No Gemini API key found")
+    def test_gemini_system_instruction_legacy_mode(self):
+        """Check that system_instruction shapes the Gemini reply in legacy mode (mm_beta=False)."""
+        llm = LLM(backend="GoogleGenAI", model="gemini-2.5-flash", mm_beta=False)
+
+        # The system prompt is supplied via the system_instruction keyword, not a message list
+        response = llm(
+            "Hello there",
+            system_instruction="You are a cat. Your name is Neko. Always respond as a cat would."
+        )
+
+        # In this mode the reply is read through the response's .text attribute
+        assert hasattr(response, 'text'), "Gemini response should have text attribute"
+        content = response.text
+        assert isinstance(content, str), "Content should be a string"
+        assert len(content) > 0, "Content should not be empty"
+
+        # Case-insensitive substring check for any marker of the cat persona
+        content_lower = content.lower()
+        assert 'neko' in content_lower or 'cat' in content_lower or 'meow' in content_lower, \
+            f"Response should reflect system instruction about being a cat named Neko. Got: {content}"
+
+        print_color("✓ Gemini API handles system_instruction correctly (legacy mode)", 'green')
+
+    @pytest.mark.skipif(not os.environ.get("GEMINI_API_KEY"), reason="No Gemini API key found")
+    def test_gemini_system_instruction_mm_beta_mode(self):
+        """Check that system_instruction shapes the Gemini reply when mm_beta=True."""
+        llm = LLM(backend="GoogleGenAI", model="gemini-2.5-flash", mm_beta=True)
+
+        # The system prompt is supplied via the system_instruction keyword, not a message list
+        response = llm(
+            "What is your name?",
+            system_instruction="You are a helpful assistant named Claude. Always introduce yourself."
+        )
+
+        # With mm_beta=True the call is expected to come back as an AssistantTurn
+        assert isinstance(response, AssistantTurn), "Should return AssistantTurn object"
+        assert response.content is not None, "Content should not be None"
+
+        # Flatten the turn to plain text before inspecting it
+        text_content = response.to_text()
+        assert isinstance(text_content, str), "Text content should be a string"
+        assert len(text_content) > 0, "Text content should not be empty"
+
+        # Case-insensitive substring check; either persona keyword is accepted
+        text_lower = text_content.lower()
+        assert 'claude' in text_lower or 'assistant' in text_lower, \
+            f"Response should reflect system instruction about being Claude. Got: {text_content}"
+
+        print_color("✓ Gemini API handles system_instruction correctly (mm_beta mode)", 'green')
+
+    def test_litellm_system_message_with_conversation(self):
+        """Check that a system message still steers the reply on the second turn of a conversation."""
+        llm = LLM(model="gpt-4o-mini", mm_beta=True)
+
+        # Turn 1: system persona plus the opening user message
+        messages = [
+            {"role": "system", "content": "You are a pirate. Always talk like a pirate."},
+            {"role": "user", "content": "Hello"}
+        ]
+
+        response1 = llm(messages=messages)
+        assert isinstance(response1, AssistantTurn), "First response should be AssistantTurn"
+        text1 = response1.to_text()
+
+        # Substring match on lowercased text, so short markers such as 'ye' also hit words like "yes"
+        pirate_indicators = ['arr', 'matey', 'ahoy', 'ye', 'aye']
+        has_pirate_language = any(indicator in text1.lower() for indicator in pirate_indicators)
+        assert has_pirate_language, f"First response should use pirate language. Got: {text1}"
+
+        # Turn 2: replay the full history (system message included) with a new user question
+        messages.append({"role": "assistant", "content": text1})
+        messages.append({"role": "user", "content": "What's the weather like?"})
+
+        response2 = llm(messages=messages)
+        assert isinstance(response2, AssistantTurn), "Second response should be AssistantTurn"
+        text2 = response2.to_text()
+
+        # Same substring check against the second reply
+        has_pirate_language_2 = any(indicator in text2.lower() for indicator in pirate_indicators)
+        assert has_pirate_language_2, f"Second response should still use pirate language. Got: {text2}"
+
+        print_color("✓ System message persists across conversation turns", 'green')
+
+    @pytest.mark.skipif(not os.environ.get("GEMINI_API_KEY"), reason="No Gemini API key found")
+    def test_gemini_system_instruction_with_config_params(self):
+        """Check that system_instruction can be combined with temperature and max_output_tokens settings."""
+        llm = LLM(
+            backend="GoogleGenAI",
+            model="gemini-2.5-flash",
+            mm_beta=True,
+            temperature=0.7,
+            max_output_tokens=100
+        )
+
+        response = llm(
+            "Tell me a short joke",
+            system_instruction="You are a comedian who tells very short jokes."
+        )
+
+        assert isinstance(response, AssistantTurn), "Should return AssistantTurn object"
+        text_content = response.to_text()
+        assert len(text_content) > 0, "Should have content"
+
+        print_color("✓ Gemini system_instruction works with other config parameters", 'green')
+