# Source code for agentscope.tts._dashscope_tts_model

# -*- coding: utf-8 -*-
"""DashScope SDK TTS model implementation using MultiModalConversation API."""
from typing import (
    Any,
    Literal,
    AsyncGenerator,
    Generator,
    TYPE_CHECKING,
)

from ._tts_base import TTSModelBase
from ._tts_response import TTSResponse
from ..message import Msg, AudioBlock, Base64Source
from ..types import JSONSerializableObject

# The response type is imported only for static type checkers, so loading
# this module does not import `dashscope`.
if TYPE_CHECKING:
    from dashscope.api_entities.dashscope_response import (
        MultiModalConversationResponse,
    )

else:
    # At runtime the name is bound to the dotted path as a plain string, so
    # annotations that mention it (see `_parse_into_async_generator`) still
    # evaluate without the SDK being imported here.
    MultiModalConversationResponse = (
        "dashscope.api_entities.dashscope_response."
        "MultiModalConversationResponse"
    )



# [docs]
class DashScopeTTSModel(TTSModelBase):
    """DashScope TTS model implementation using MultiModalConversation API.
    For more details, please see the `official document
    <https://bailian.console.aliyun.com/?tab=doc#/doc/?type=model&url=2879134>`_.
    """

    supports_streaming_input: bool = False
    """Whether the model supports streaming input."""

    def __init__(
        self,
        api_key: str,
        model_name: str = "qwen3-tts-flash",
        voice: Literal["Cherry", "Serena", "Ethan", "Chelsie"]
        | str = "Cherry",
        language_type: str = "Auto",
        stream: bool = True,
        generate_kwargs: dict[str, JSONSerializableObject] | None = None,
    ) -> None:
        """Initialize the DashScope SDK TTS model.

        .. note:: More details about the parameters, such as `model_name`,
            `voice`, and `language_type` can be found in the `official
            document
            <https://bailian.console.aliyun.com/?tab=doc#/doc/?type=model&url=2879134>`_.

        Args:
            api_key (`str`):
                The DashScope API key. Required.
            model_name (`str`, defaults to "qwen3-tts-flash"):
                The TTS model name. Supported models are qwen3-tts-flash,
                qwen-tts, etc.
            voice (`Literal["Cherry", "Serena", "Ethan", "Chelsie"] | str`, \
             defaults to "Cherry"):
                The voice to use. Supported voices are "Cherry", "Serena",
                "Ethan", "Chelsie", etc.
            language_type (`str`, defaults to "Auto"):
                The language type. Should match the text language for
                correct pronunciation and natural intonation.
            stream (`bool`, defaults to `True`):
                Whether `synthesize` returns an async generator of
                incremental responses (`True`) or a single response holding
                the complete audio (`False`).
            generate_kwargs (`dict[str, JSONSerializableObject] | None`, \
             optional):
               The extra keyword arguments used in Dashscope TTS API
               generation, e.g. `temperature`, `seed`.
        """
        super().__init__(model_name=model_name, stream=stream)

        self.api_key = api_key
        self.voice = voice
        self.language_type = language_type
        self.generate_kwargs = generate_kwargs or {}

    async def synthesize(
        self,
        msg: Msg | None = None,
        **kwargs: Any,
    ) -> TTSResponse | AsyncGenerator[TTSResponse, None]:
        """Call the DashScope TTS API to synthesize speech from text.

        Args:
            msg (`Msg | None`, optional):
                The message to be synthesized. When `None`, an empty
                response is returned without calling the API.
            **kwargs (`Any`):
                Additional keyword arguments to pass to the TTS API call.
                They override entries with the same key in the
                `generate_kwargs` given at construction time.

        Returns:
            `TTSResponse | AsyncGenerator[TTSResponse, None]`:
                The TTS response or an async generator yielding TTSResponse
                objects in streaming mode.
        """

        if msg is None:
            return TTSResponse(content=None)

        text = msg.get_text_content()

        # Imported at call time rather than at module import time.
        import dashscope

        # Merge instance-level and call-level options into one mapping so a
        # key present in both does not raise "got multiple values for keyword
        # argument"; the call-level value wins.
        extra_kwargs = {**self.generate_kwargs, **kwargs}

        # The API is always called in streaming mode; `self.stream` only
        # decides whether the chunks are forwarded or collected below.
        # NOTE(review): the SDK call and the chunk iteration are plain
        # synchronous code inside a coroutine, so they presumably block the
        # event loop while waiting on the network - confirm whether this
        # should be offloaded to a thread.
        response = dashscope.MultiModalConversation.call(
            model=self.model_name,
            api_key=self.api_key,
            text=text,
            voice=self.voice,
            language_type=self.language_type,
            stream=True,
            **extra_kwargs,
        )

        if self.stream:
            return self._parse_into_async_generator(response)

        # Non-streaming mode: gather every audio fragment, skipping chunks
        # that carry no audio payload (same rule as the streaming parser).
        fragments = []
        for chunk in response:
            fragment = self._extract_audio_data(chunk)
            if fragment:
                fragments.append(fragment)

        return TTSResponse(content=self._audio_block("".join(fragments)))

    @staticmethod
    def _extract_audio_data(chunk: Any) -> str | None:
        """Return the audio payload of one response chunk, or `None` when the
        chunk has no output, no audio, or empty audio data."""
        if chunk.output is None:
            return None
        audio = chunk.output.audio
        if not audio:
            return None
        return audio.data or None

    @staticmethod
    def _audio_block(data: str) -> AudioBlock:
        """Wrap base64 audio data into an `AudioBlock` with the media type
        used for every response of this model."""
        return AudioBlock(
            type="audio",
            source=Base64Source(
                type="base64",
                data=data,
                media_type="audio/pcm;rate=24000",
            ),
        )

    @staticmethod
    async def _parse_into_async_generator(
        response: Generator[MultiModalConversationResponse, None, None],
    ) -> AsyncGenerator[TTSResponse, None]:
        """Parse the TTS response into an async generator.

        Each yielded response carries all audio data received so far (not
        only the newest fragment). After the stream is exhausted, one final
        response with the complete data and `is_last=True` is yielded.

        Args:
            response (`Generator[MultiModalConversationResponse, None, None]`):
                The streaming response from DashScope TTS API.

        Returns:
            `AsyncGenerator[TTSResponse, None]`:
                An async generator yielding TTSResponse objects.
        """
        audio_data = ""
        for chunk in response:
            fragment = DashScopeTTSModel._extract_audio_data(chunk)
            if not fragment:
                continue
            audio_data += fragment
            yield TTSResponse(
                content=DashScopeTTSModel._audio_block(audio_data),
                is_last=False,
            )

        yield TTSResponse(
            content=DashScopeTTSModel._audio_block(audio_data),
            is_last=True,
        )