Source code for agentscope.tts._tts_base

# -*- coding: utf-8 -*-
"""The TTS model base class."""

from abc import ABC, abstractmethod
from typing import Any, AsyncGenerator

from agentscope.message import Msg

from ._tts_response import TTSResponse


class TTSModelBase(ABC):
    """Base class for TTS models in AgentScope.

    This base class provides a general abstraction for both realtime and
    non-realtime TTS models (depending on whether streaming input is
    supported).

    For non-realtime TTS models, the `synthesize` method is used to
    synthesize speech from the input text. You only need to implement the
    `_call_api` method to handle the TTS API calls.

    For realtime TTS models, the lifecycle is managed via the async context
    manager or by calling the `connect` and `close` methods. The `push`
    method appends text chunks and returns the received TTS response, while
    the `synthesize` method blocks until the full speech is synthesized.
    You need to implement the `connect`, `close`, and `_call_api` methods to
    handle the TTS API calls and resource management.
    """

    supports_streaming_input: bool = False
    """Whether the TTS model class supports streaming input."""

    model_name: str
    """The name of the TTS model."""

    stream: bool
    """Whether to use streaming synthesis if supported by the model."""

    def __init__(self, model_name: str, stream: bool) -> None:
        """Initialize the TTS model base class.

        Args:
            model_name (`str`):
                The name of the TTS model.
            stream (`bool`):
                Whether to use streaming synthesis if supported by the
                model.
        """
        self.model_name = model_name
        self.stream = stream

async def __aenter__(self) -> "TTSModelBase": """Enter the async context manager and initialize resources if needed.""" if self.supports_streaming_input: await self.connect() return self async def __aexit__( self, exc_type: Any, exc_value: Any, traceback: Any, ) -> None: """Exit the async context manager and clean up resources if needed.""" if self.supports_streaming_input: await self.close()
    async def connect(self) -> None:
        """Connect to the TTS model and initialize resources. For
        non-realtime TTS models, leave this method empty.

        .. note:: Only needs to be implemented for realtime TTS models.
        """
        raise NotImplementedError(
            f"The connect method is not implemented for "
            f"{self.__class__.__name__} class.",
        )

    async def close(self) -> None:
        """Close the connection to the TTS model and clean up resources.
        For non-realtime TTS models, leave this method empty.

        .. note:: Only needs to be implemented for realtime TTS models.
        """
        raise NotImplementedError(
            "The close method is not implemented for "
            f"{self.__class__.__name__} class.",
        )

    async def push(
        self,
        msg: Msg,
        **kwargs: Any,
    ) -> TTSResponse:
        """Append text to be synthesized and return the received TTS
        response. Note this method is non-blocking and may return an empty
        response if no audio has been received yet. To receive all the
        synthesized speech, call the `synthesize` method after pushing all
        the text chunks.

        .. note:: Only needs to be implemented for realtime TTS models.

        Args:
            msg (`Msg`):
                The message to be synthesized. The `msg.id` identifies the
                streaming input request.
            **kwargs (`Any`):
                Additional keyword arguments to pass to the TTS API call.

        Returns:
            `TTSResponse`:
                The TTSResponse containing the audio block.
        """
        raise NotImplementedError(
            "The push method is not implemented for "
            f"{self.__class__.__name__} class.",
        )

    @abstractmethod
    async def synthesize(
        self,
        msg: Msg | None = None,
        **kwargs: Any,
    ) -> TTSResponse | AsyncGenerator[TTSResponse, None]:
        """Synthesize speech from the appended text. Different from the
        `push` method, this method blocks until the full speech is
        synthesized.

        Args:
            msg (`Msg | None`, defaults to `None`):
                The message to be synthesized. If `None`, this method waits
                for all previously pushed text to be synthesized and returns
                the last synthesized TTSResponse.
            **kwargs (`Any`):
                Additional keyword arguments to pass to the TTS API call.

        Returns:
            `TTSResponse | AsyncGenerator[TTSResponse, None]`:
                The TTSResponse containing audio blocks, or an async
                generator yielding TTSResponse objects in streaming mode.
        """

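The sketch below is a minimal usage example and is not part of this module: it assumes a concrete subclass of `TTSModelBase` has already been instantiated as `tts_model`, and that `synthesize` returns a single `TTSResponse` rather than an async generator (i.e. non-streaming output). The hypothetical `speak` helper only exercises the base-class interface shown above.

async def speak(tts_model: TTSModelBase, msg: Msg) -> None:
    """Synthesize `msg` with either a realtime or non-realtime TTS model."""
    if tts_model.supports_streaming_input:
        # Realtime model: the async context manager calls connect()/close(),
        # push() sends the text chunk, and synthesize() with no argument
        # blocks until all pushed text has been turned into audio.
        async with tts_model:
            await tts_model.push(msg)
            response = await tts_model.synthesize()
    else:
        # Non-realtime model: a single blocking synthesize() call.
        response = await tts_model.synthesize(msg)
    # In streaming mode (stream=True), synthesize() may instead return an
    # async generator of TTSResponse objects; this sketch assumes a single
    # TTSResponse for simplicity.
    print(response)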