# -*- coding: utf-8 -*-
"""
Wrap OpenAI API calls as tools. Refer to the official
`OpenAI API documentation <https://platform.openai.com/docs/overview>`_ for
more details.
"""
import base64
from io import BytesIO
import os
from typing import Literal, IO
import requests

from .. import ToolResponse
from ...formatter._openai_formatter import _to_openai_image_url
from ...message import (
    ImageBlock,
    TextBlock,
    Base64Source,
    URLSource,
    AudioBlock,
)


def _parse_url(url: str) -> BytesIO | IO[bytes]:
    """
    If url is a local file path, return an open binary file handle.
    If url is a web URL, fetch the content and return it as a BytesIO.
    """
    if url.startswith(("http://", "https://")):
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return BytesIO(response.content)
    else:
        if not os.path.exists(url):
            raise FileNotFoundError(f"File not found: {url}")
        return open(os.path.abspath(url), "rb")
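
# Behaviour sketch for ``_parse_url`` (hypothetical paths, for illustration
# only): the remote branch yields an in-memory BytesIO, while the local
# branch yields an open file handle that the caller is expected to close.
#
#     buf = _parse_url("https://example.com/cat.png")  # BytesIO
#     fh = _parse_url("./cat.png")                     # open file handle
#     fh.close()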


def openai_text_to_image(
    prompt: str,
    api_key: str,
    n: int = 1,
    model: Literal["dall-e-2", "dall-e-3", "gpt-image-1"] = "dall-e-2",
    size: Literal[
        "256x256",
        "512x512",
        "1024x1024",
        "1792x1024",
        "1024x1792",
    ] = "256x256",
    quality: Literal[
        "auto",
        "standard",
        "hd",
        "high",
        "medium",
        "low",
    ] = "auto",
    style: Literal["vivid", "natural"] = "vivid",
    response_format: Literal["url", "b64_json"] = "url",
) -> ToolResponse:
    """Generate image(s) based on the given prompt, and return image URL(s)
    or base64 data.

    Args:
        prompt (`str`):
            The text prompt to generate images.
        api_key (`str`):
            The API key for the OpenAI API.
        n (`int`, defaults to `1`):
            The number of images to generate.
        model (`Literal["dall-e-2", "dall-e-3", "gpt-image-1"]`, defaults \
         to `"dall-e-2"`):
            The model to use for image generation.
        size (`Literal["256x256", "512x512", "1024x1024", "1792x1024", \
         "1024x1792"]`, defaults to `"256x256"`):
            The size of the generated images. Must be one of 1024x1024,
            1536x1024 (landscape), 1024x1536 (portrait), or auto (default
            value) for gpt-image-1, one of 256x256, 512x512, or 1024x1024
            for dall-e-2, and one of 1024x1024, 1792x1024, or 1024x1792
            for dall-e-3.
        quality (`Literal["auto", "standard", "hd", "high", "medium", \
         "low"]`, defaults to `"auto"`):
            The quality of the image that will be generated.

            - `auto` (default value) will automatically select the best
              quality for the given model.
            - `high`, `medium` and `low` are supported for gpt-image-1.
            - `hd` and `standard` are supported for dall-e-3.
            - `standard` is the only option for dall-e-2.
        style (`Literal["vivid", "natural"]`, defaults to `"vivid"`):
            The style of the generated images. This parameter is only
            supported for dall-e-3. Must be one of `vivid` or `natural`.

            - `vivid` causes the model to lean towards generating
              hyper-real and dramatic images.
            - `natural` causes the model to produce more natural, less
              hyper-real looking images.
        response_format (`Literal["url", "b64_json"]`, defaults to `"url"`):
            The format in which generated images with dall-e-2 and
            dall-e-3 are returned.

            - Must be one of "url" or "b64_json".
            - URLs are only valid for 60 minutes after the image has been
              generated.
            - This parameter isn't supported for gpt-image-1, which always
              returns base64-encoded images.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    kwargs = {
        "model": model,
        "prompt": prompt,
        "n": n,
        "size": size,
    }
    if model == "dall-e-3":
        kwargs["style"] = style
    if model != "dall-e-2":
        kwargs["quality"] = quality
    if model != "gpt-image-1":
        kwargs["response_format"] = response_format
    else:
        # gpt-image-1 always returns base64-encoded images.
        response_format = "b64_json"

    try:
        import openai

        client = openai.OpenAI(api_key=api_key)
        response = client.images.generate(**kwargs)

        image_blocks: list = []
        if response_format == "url":
            image_urls = [_.url for _ in response.data]
            for image_url in image_urls:
                image_blocks.append(
                    ImageBlock(
                        type="image",
                        source=URLSource(
                            type="url",
                            url=image_url,
                        ),
                    ),
                )
        else:
            image_datas = [_.b64_json for _ in response.data]
            for image_data in image_datas:
                image_blocks.append(
                    ImageBlock(
                        type="image",
                        source=Base64Source(
                            type="base64",
                            media_type="image/png",
                            data=image_data,
                        ),
                    ),
                )
        return ToolResponse(content=image_blocks)
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to generate image: {str(e)}",
                ),
            ],
        )
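
# Usage sketch for ``openai_text_to_image`` (hypothetical prompt; reading
# the key from OPENAI_API_KEY is an assumption, not part of this module):
#
#     res = openai_text_to_image(
#         prompt="A watercolor fox in the snow",
#         api_key=os.environ["OPENAI_API_KEY"],
#         model="dall-e-3",
#         size="1024x1024",
#         quality="hd",
#     )
#     # With the default response_format="url", res.content is a list of
#     # ImageBlock entries whose URL sources expire after 60 minutes.
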
def openai_edit_image(
    image_url: str,
    prompt: str,
    api_key: str,
    model: Literal["dall-e-2", "gpt-image-1"] = "dall-e-2",
    mask_url: str | None = None,
    n: int = 1,
    size: Literal[
        "256x256",
        "512x512",
        "1024x1024",
    ] = "256x256",
    response_format: Literal["url", "b64_json"] = "url",
) -> ToolResponse:
    """Edit an image based on the provided mask and prompt, and return the
    edited image URL(s) or base64 data.

    Args:
        image_url (`str`):
            The file path or URL to the image that needs editing.
        prompt (`str`):
            The text prompt describing the edits to be made to the image.
        api_key (`str`):
            The API key for the OpenAI API.
        model (`Literal["dall-e-2", "gpt-image-1"]`, defaults to \
         `"dall-e-2"`):
            The model to use for image editing.
        mask_url (`str | None`, defaults to `None`):
            The file path or URL to the mask image that specifies the
            regions to be edited.
        n (`int`, defaults to `1`):
            The number of edited images to generate.
        size (`Literal["256x256", "512x512", "1024x1024"]`, defaults to \
         `"256x256"`):
            The size of the edited images.
        response_format (`Literal["url", "b64_json"]`, defaults to `"url"`):
            The format in which generated images are returned. Must be one
            of "url" or "b64_json". URLs are only valid for 60 minutes
            after generation. This parameter isn't supported for
            gpt-image-1, which always returns base64-encoded images.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    try:
        import openai

        client = openai.OpenAI(api_key=api_key)

        def prepare_image(url_or_path: str) -> BytesIO:
            """Load a local or remote image and normalize it to RGBA PNG."""
            from PIL import Image

            if url_or_path.startswith(("http://", "https://")):
                response = requests.get(url_or_path)
                response.raise_for_status()
                img = Image.open(BytesIO(response.content))
            else:
                img = Image.open(url_or_path)

            if img.mode != "RGBA":
                img = img.convert("RGBA")

            img_buffer = BytesIO()
            img.save(img_buffer, format="PNG")
            img_buffer.seek(0)
            img_buffer.name = "image.png"
            return img_buffer

        image_file = prepare_image(image_url)

        kwargs = {
            "model": model,
            "image": image_file,
            "prompt": prompt,
            "n": n,
            "size": size,
        }
        if mask_url:
            kwargs["mask"] = prepare_image(mask_url)
        if model == "dall-e-2":
            kwargs["response_format"] = response_format
        else:
            response_format = "b64_json"

        response = client.images.edit(**kwargs)

        image_blocks: list = []
        if response_format == "url":
            urls = [_.url for _ in response.data]
            for url in urls:
                image_blocks.append(
                    ImageBlock(
                        type="image",
                        source=URLSource(
                            type="url",
                            url=url,
                        ),
                    ),
                )
        else:
            image_datas = [_.b64_json for _ in response.data]
            for image_data in image_datas:
                image_blocks.append(
                    ImageBlock(
                        type="image",
                        source=Base64Source(
                            type="base64",
                            media_type="image/png",
                            data=image_data,
                        ),
                    ),
                )
        return ToolResponse(content=image_blocks)
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to edit image: {str(e)}",
                ),
            ],
        )
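
# Usage sketch for ``openai_edit_image`` (hypothetical local paths): for
# dall-e-2, fully transparent pixels in the mask mark the regions that may
# be repainted.
#
#     res = openai_edit_image(
#         image_url="./room.png",
#         mask_url="./room_mask.png",
#         prompt="Add a sleeping cat on the sofa",
#         api_key=os.environ["OPENAI_API_KEY"],
#         size="1024x1024",
#     )
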
def openai_create_image_variation(
    image_url: str,
    api_key: str,
    n: int = 1,
    model: Literal["dall-e-2"] = "dall-e-2",
    size: Literal[
        "256x256",
        "512x512",
        "1024x1024",
    ] = "256x256",
    response_format: Literal["url", "b64_json"] = "url",
) -> ToolResponse:
    """Create variations of an image and return the image URL(s) or base64
    data.

    Args:
        image_url (`str`):
            The file path or URL to the image from which variations will
            be generated.
        api_key (`str`):
            The API key for the OpenAI API.
        n (`int`, defaults to `1`):
            The number of image variations to generate.
        model (`Literal["dall-e-2"]`, defaults to `"dall-e-2"`):
            The model to use for image variation.
        size (`Literal["256x256", "512x512", "1024x1024"]`, defaults to \
         `"256x256"`):
            The size of the generated image variations.
        response_format (`Literal["url", "b64_json"]`, defaults to `"url"`):
            The format in which generated images are returned. Must be one
            of "url" or "b64_json". URLs are only valid for 60 minutes
            after the image has been generated.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    try:
        import openai

        client = openai.OpenAI(api_key=api_key)

        # _parse_url handles both local file paths and web URLs.
        image = _parse_url(image_url)
        response = client.images.create_variation(
            model=model,
            image=image,
            n=n,
            size=size,
            response_format=response_format,
        )

        image_blocks: list = []
        if response_format == "url":
            urls = [_.url for _ in response.data]
            for url in urls:
                image_blocks.append(
                    ImageBlock(
                        type="image",
                        source=URLSource(
                            type="url",
                            url=url,
                        ),
                    ),
                )
        else:
            image_datas = [_.b64_json for _ in response.data]
            for image_data in image_datas:
                image_blocks.append(
                    ImageBlock(
                        type="image",
                        source=Base64Source(
                            type="base64",
                            media_type="image/png",
                            data=image_data,
                        ),
                    ),
                )
        return ToolResponse(content=image_blocks)
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to generate image: {str(e)}",
                ),
            ],
        )
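
# Usage sketch for ``openai_create_image_variation`` (hypothetical path;
# dall-e-2 is the only model the variations endpoint supports):
#
#     res = openai_create_image_variation(
#         image_url="./logo.png",
#         api_key=os.environ["OPENAI_API_KEY"],
#         n=2,
#         response_format="b64_json",
#     )
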
def openai_image_to_text(
    image_urls: str | list[str],
    api_key: str,
    prompt: str = "Describe the image",
    model: str = "gpt-4o",
) -> ToolResponse:
    """Generate descriptive text for given image(s) using a specified
    model, and return the generated text.

    Args:
        image_urls (`str | list[str]`):
            The URL or list of URLs pointing to the images that need to be
            described.
        api_key (`str`):
            The API key for the OpenAI API.
        prompt (`str`, defaults to `"Describe the image"`):
            The prompt that instructs the model on how to describe the
            image(s).
        model (`str`, defaults to `"gpt-4o"`):
            The model to use for generating the text descriptions.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    if isinstance(image_urls, str):
        image_urls = [image_urls]

    content = []
    for url in image_urls:
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": _to_openai_image_url(url),
                },
            },
        )
    content.append(
        {
            "type": "text",
            "text": prompt,
        },
    )
    messages = [
        {
            "role": "user",
            "content": content,
        },
    ]

    try:
        import openai

        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            messages=messages,
            model=model,
        )
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=response.choices[0].message.content,
                ),
            ],
        )
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to generate text: {str(e)}",
                ),
            ],
        )
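
# Usage sketch for ``openai_image_to_text`` (hypothetical paths; local
# files should also work, assuming _to_openai_image_url converts them to
# base64 data URLs):
#
#     res = openai_image_to_text(
#         image_urls=["./photo_1.jpg", "./photo_2.jpg"],
#         api_key=os.environ["OPENAI_API_KEY"],
#         prompt="Compare these two photos",
#     )
#     # res.content holds a single TextBlock with the description.
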
def openai_text_to_audio(
    text: str,
    api_key: str,
    model: Literal["tts-1", "tts-1-hd", "gpt-4o-mini-tts"] = "tts-1",
    voice: Literal[
        "alloy",
        "ash",
        "ballad",
        "coral",
        "echo",
        "fable",
        "nova",
        "onyx",
        "sage",
        "shimmer",
    ] = "alloy",
    speed: float = 1.0,
    res_format: Literal[
        "mp3",
        "opus",
        "aac",
        "flac",
        "wav",
        "pcm",
    ] = "mp3",
) -> ToolResponse:
    """Convert text to an audio file using a specified model and voice.

    Args:
        text (`str`):
            The text to convert to audio.
        api_key (`str`):
            The API key for the OpenAI API.
        model (`Literal["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]`, defaults \
         to `"tts-1"`):
            The model to use for text-to-speech conversion.
        voice (`Literal["alloy", "ash", "ballad", "coral", "echo", \
         "fable", "nova", "onyx", "sage", "shimmer"]`, defaults to \
         `"alloy"`):
            The voice to use for the audio output.
        speed (`float`, defaults to `1.0`):
            The speed of the audio playback. A value of 1.0 is normal
            speed.
        res_format (`Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]`, \
         defaults to `"mp3"`):
            The format of the audio file.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    try:
        import openai

        client = openai.OpenAI(api_key=api_key)
        response = client.audio.speech.create(
            model=model,
            voice=voice,
            speed=speed,
            input=text,
            response_format=res_format,
        )
        audio_bytes = response.content
        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
        return ToolResponse(
            [
                AudioBlock(
                    type="audio",
                    source=Base64Source(
                        type="base64",
                        media_type=f"audio/{res_format}",
                        data=audio_base64,
                    ),
                ),
            ],
        )
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Error: Failed to generate audio. {str(e)}",
                ),
            ],
        )
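
# Usage sketch for ``openai_text_to_audio``: the AudioBlock carries base64
# data, so decode it to write a file (hypothetical output path; assumes
# the block is dict-like, matching the TypedDict-style blocks used here):
#
#     res = openai_text_to_audio(
#         text="Hello from AgentScope!",
#         api_key=os.environ["OPENAI_API_KEY"],
#         voice="nova",
#         res_format="wav",
#     )
#     with open("speech.wav", "wb") as f:
#         f.write(base64.b64decode(res.content[0]["source"]["data"]))
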
def openai_audio_to_text(
    audio_file_url: str,
    api_key: str,
    language: str = "en",
    temperature: float = 0.2,
) -> ToolResponse:
    """Convert an audio file to text using OpenAI's transcription service.

    Args:
        audio_file_url (`str`):
            The file path or URL to the audio file that needs to be
            transcribed.
        api_key (`str`):
            The API key for the OpenAI API.
        language (`str`, defaults to `"en"`):
            The language of the input audio in `ISO-639-1 format \
            <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_
            (e.g., "en", "zh", "fr"). Improves accuracy and latency.
        temperature (`float`, defaults to `0.2`):
            The temperature for the transcription, which affects the
            randomness of the output.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    try:
        import openai

        client = openai.OpenAI(api_key=api_key)

        if audio_file_url.startswith(("http://", "https://")):
            response = requests.get(audio_file_url)
            response.raise_for_status()
            audio_buffer = BytesIO(response.content)

            import urllib.parse
            from pathlib import Path

            # The OpenAI SDK infers the upload's file type from its name,
            # so give the in-memory buffer a sensible filename.
            parsed_url = urllib.parse.urlparse(audio_file_url)
            filename = Path(parsed_url.path).name or "audio.mp3"
            audio_buffer.name = filename

            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_buffer,
                language=language,
                temperature=temperature,
            )
        else:
            if not os.path.exists(audio_file_url):
                raise FileNotFoundError(
                    f"File not found: {audio_file_url}",
                )
            with open(audio_file_url, "rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language=language,
                    temperature=temperature,
                )

        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=transcription.text,
                ),
            ],
        )
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Error: Failed to transcribe audio: {str(e)}",
                ),
            ],
        )
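
# Usage sketch for ``openai_audio_to_text`` (hypothetical file; the model
# is hardcoded to whisper-1 above):
#
#     res = openai_audio_to_text(
#         audio_file_url="./meeting.mp3",
#         api_key=os.environ["OPENAI_API_KEY"],
#         language="en",
#     )
#     # On success, res.content[0]["text"] holds the transcription.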