# Source code for agentscope.tts._gemini_tts_model

# -*- coding: utf-8 -*-
"""Gemini TTS model implementation."""
import base64
from typing import TYPE_CHECKING, Any, Literal, AsyncGenerator, Iterator

from ._tts_base import TTSModelBase
from ._tts_response import TTSResponse
from ..message import Msg, AudioBlock, Base64Source
from ..types import JSONSerializableObject

if TYPE_CHECKING:
    from google.genai import Client
    from google.genai.types import GenerateContentResponse
else:
    Client = "google.genai.Client"
    GenerateContentResponse = "google.genai.types.GenerateContentResponse"


class GeminiTTSModel(TTSModelBase):
    """Gemini TTS model implementation.

    For more details, please see the `official document
    <https://ai.google.dev/gemini-api/docs/speech-generation>`_.
    """

    supports_streaming_input: bool = False
    """Whether the model supports streaming input."""

    def __init__(
        self,
        api_key: str,
        model_name: str = "gemini-2.5-flash-preview-tts",
        voice: Literal["Zephyr", "Kore", "Orus", "Autonoe"] | str = "Kore",
        stream: bool = True,
        client_kwargs: dict[str, JSONSerializableObject] | None = None,
        generate_kwargs: dict[str, JSONSerializableObject] | None = None,
    ) -> None:
        """Initialize the Gemini TTS model.

        .. note::
            More details about the parameters, such as `model_name` and
            `voice` can be found in the `official document
            <https://ai.google.dev/gemini-api/docs/speech-generation>`_.

        Args:
            api_key (`str`):
                The Gemini API key.
            model_name (`str`, defaults to "gemini-2.5-flash-preview-tts"):
                The TTS model name. Supported models are
                "gemini-2.5-flash-preview-tts",
                "gemini-2.5-pro-preview-tts", etc.
            voice (`Literal["Zephyr", "Kore", "Orus", "Autonoe"] | str`, \
            defaults to "Kore"):
                The voice name to use. Supported voices are "Zephyr",
                "Kore", "Orus", "Autonoe", etc.
            stream (`bool`, defaults to `True`):
                Whether to use streaming synthesis if supported by the model.
            client_kwargs (`dict[str, JSONSerializableObject] | None`, \
            optional):
                The extra keyword arguments to initialize the Gemini client.
            generate_kwargs (`dict[str, JSONSerializableObject] | None`, \
            optional):
                The extra keyword arguments used in Gemini API generation,
                e.g. `temperature`, `seed`.
        """
        super().__init__(model_name=model_name, stream=stream)

        self.api_key = api_key
        self.voice = voice

        # Imported lazily so that the module can be imported without the
        # `google-genai` package being installed.
        from google import genai

        self._client = genai.Client(
            api_key=self.api_key,
            **(client_kwargs or {}),
        )

        self.generate_kwargs = generate_kwargs or {}

    async def synthesize(
        self,
        msg: Msg | None = None,
        **kwargs: Any,
    ) -> TTSResponse | AsyncGenerator[TTSResponse, None]:
        """Synthesize speech for the text content of the given message.

        Args:
            msg (`Msg | None`, optional):
                The message to be synthesized. When `None`, no API call is
                made and a `TTSResponse` with `content=None` is returned.
            **kwargs (`Any`):
                Additional keyword arguments forwarded to
                `GenerateContentConfig`. They take precedence over the
                `generate_kwargs` given at construction time when the same
                key appears in both.

        Returns:
            `TTSResponse | AsyncGenerator[TTSResponse, None]`:
                The TTSResponse object in non-streaming mode, or an async
                generator yielding TTSResponse objects in streaming mode.
        """
        if msg is None:
            return TTSResponse(content=None)

        from google.genai import types

        # NOTE(review): presumably `get_text_content()` can return None for a
        # message without text blocks; the value is forwarded as-is - verify
        # against the callers whether that case needs explicit handling.
        text = msg.get_text_content()

        # Merge the two keyword sources first: unpacking both directly into
        # the call raises TypeError when they share a key. Per-call kwargs
        # override the instance-level defaults.
        config_kwargs = {**self.generate_kwargs, **kwargs}

        config = types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=self.voice,
                    ),
                ),
            ),
            **config_kwargs,
        )

        api_kwargs: dict[str, Any] = {
            "model": self.model_name,
            "contents": text,
            "config": config,
        }

        # NOTE(review): both calls below use the synchronous client inside a
        # coroutine, so they block the event loop while waiting on the API.
        if self.stream:
            response = self._client.models.generate_content_stream(
                **api_kwargs,
            )
            return self._parse_into_async_generator(response)

        response = self._client.models.generate_content(**api_kwargs)

        inline_data = self._first_inline_data(response)
        if inline_data is None:
            # The response carries no audio payload: return an empty block
            # with the default PCM media type.
            return TTSResponse(
                content=AudioBlock(
                    type="audio",
                    source=Base64Source(
                        type="base64",
                        data="",
                        media_type="audio/pcm;rate=24000",
                    ),
                ),
            )

        # Convert the raw audio bytes to base64
        audio_base64 = base64.b64encode(inline_data.data).decode("utf-8")
        return TTSResponse(
            content=AudioBlock(
                type="audio",
                source=Base64Source(
                    type="base64",
                    data=audio_base64,
                    media_type=inline_data.mime_type,
                ),
            ),
        )

    @staticmethod
    def _first_inline_data(response: GenerateContentResponse) -> Any:
        """Return the inline data of the first part of the first candidate.

        Args:
            response (`GenerateContentResponse`):
                A complete response or a single streamed chunk.

        Returns:
            `Any`:
                The `inline_data` object (exposing `data` and `mime_type`),
                or `None` if any level of the response is missing/empty or
                the inline data has no payload.
        """
        candidates = response.candidates
        if not candidates:
            return None

        content = candidates[0].content
        if not content or not content.parts:
            return None

        inline_data = content.parts[0].inline_data
        if not inline_data or not inline_data.data:
            return None
        return inline_data

    @staticmethod
    async def _parse_into_async_generator(
        response: Iterator[GenerateContentResponse],
    ) -> AsyncGenerator[TTSResponse, None]:
        """Parse the TTS response into an async generator.

        Each yielded response contains the base64 encoding of ALL audio
        received so far (accumulated), not only the latest chunk. After the
        stream is exhausted, a final `TTSResponse(content=None)` is yielded.

        Args:
            response (`Iterator[GenerateContentResponse]`):
                The streaming response from Gemini TTS API.

        Returns:
            `AsyncGenerator[TTSResponse, None]`:
                An async generator yielding TTSResponse objects.
        """
        # Base64 works on 3-byte groups. Encoding every chunk on its own and
        # concatenating the strings puts "=" padding in the middle whenever a
        # chunk length is not a multiple of 3, which is not a valid encoding
        # of the concatenated audio. So only complete groups are committed to
        # `encoded`, and the (< 3 bytes) tail is carried to the next chunk.
        encoded = ""
        pending = b""

        for chunk in response:
            inline_data = GeminiTTSModel._first_inline_data(chunk)
            if inline_data is None:
                # Chunk without an audio payload: nothing to emit.
                continue

            raw = pending + inline_data.data
            cut = len(raw) - len(raw) % 3
            encoded += base64.b64encode(raw[:cut]).decode("utf-8")
            pending = raw[cut:]

            # The tail is encoded (with padding) for this snapshot only.
            snapshot = encoded + base64.b64encode(pending).decode("utf-8")

            yield TTSResponse(
                content=AudioBlock(
                    type="audio",
                    source=Base64Source(
                        type="base64",
                        data=snapshot,
                        media_type=inline_data.mime_type,
                    ),
                ),
            )

        yield TTSResponse(content=None)