Source code for agentscope.tool._multi_modality._dashscope_tools

# -*- coding: utf-8 -*-
"""Use DashScope API to generate images,
convert text to audio, and convert images to text.
Please refer to the `official documentation <https://dashscope.aliyun.com/>`_
for more details.
"""
import base64
import os
from typing import Literal, Sequence

from ..._utils._common import _get_bytes_from_web_url
from ...message import ImageBlock, TextBlock, AudioBlock
from ...tool import ToolResponse


def dashscope_text_to_image(
    prompt: str,
    api_key: str,
    n: int = 1,
    size: Literal["1024*1024", "720*1280", "1280*720"] = "1024*1024",
    model: str = "wanx-v1",
    use_base64: bool = False,
) -> ToolResponse:
    """Generate image(s) based on the given prompt, and return image url(s)
    or base64 data.

    Args:
        prompt (`str`):
            The text prompt to generate images from.
        api_key (`str`):
            The api key for the dashscope api.
        n (`int`, defaults to `1`):
            The number of images to generate.
        size (`Literal["1024*1024", "720*1280", "1280*720"]`, defaults to \
            `"1024*1024"`):
            Size of the image.
        model (`str`, defaults to `"wanx-v1"`):
            The model to use, such as "wanx-v1", "qwen-image",
            "wan2.2-t2i-flash", etc.
        use_base64 (`bool`, defaults to `False`):
            Whether to use base64 data for images.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    try:
        import dashscope

        response = dashscope.ImageSynthesis.call(
            model=model,
            prompt=prompt,
            api_key=api_key,
            n=n,
            size=size,
        )
        images = response.output["results"]
        urls = [_["url"] for _ in images]

        image_blocks: list = []
        if urls is not None:
            for url in urls:
                if use_base64:
                    extension = url.split(".")[-1].lower()
                    image_data = _get_bytes_from_web_url(url)
                    image_blocks.append(
                        ImageBlock(
                            type="image",
                            source={
                                "type": "base64",
                                "media_type": f"image/{extension}",
                                "data": image_data,
                            },
                        ),
                    )
                else:
                    image_blocks.append(
                        ImageBlock(
                            type="image",
                            source={
                                "type": "url",
                                "url": url,
                            },
                        ),
                    )
            return ToolResponse(
                content=image_blocks,
            )
        else:
            return ToolResponse(
                [
                    TextBlock(
                        type="text",
                        text="Error: Failed to generate images",
                    ),
                ],
            )
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to generate images: {str(e)}",
                ),
            ],
        )
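

# Illustrative usage sketch for ``dashscope_text_to_image``; the API key is a
# placeholder and must be replaced with a real DashScope key:
#
#   res = dashscope_text_to_image(
#       prompt="A watercolor painting of a lighthouse at dusk",
#       api_key="YOUR_DASHSCOPE_API_KEY",
#       n=1,
#       size="1024*1024",
#   )
#   # On success, res.content is a list of ImageBlock dicts with "url"
#   # sources (or "base64" sources when use_base64=True).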


def dashscope_image_to_text(
    image_urls: str | Sequence[str],
    api_key: str,
    prompt: str = "Describe the image",
    model: str = "qwen-vl-plus",
) -> ToolResponse:
    """Generate text based on the given images.

    Args:
        image_urls (`str | Sequence[str]`):
            The url of a single image or a sequence of image urls.
        api_key (`str`):
            The api key for the dashscope api.
        prompt (`str`, defaults to `"Describe the image"`):
            The text prompt.
        model (`str`, defaults to `"qwen-vl-plus"`):
            The model to use in the DashScope MultiModal API.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    if isinstance(image_urls, str):
        image_urls = [image_urls]

    # Check if the local url is valid
    img_abs_urls = []
    for url in image_urls:
        if os.path.exists(url):
            if os.path.isfile(url):
                img_abs_urls.append(os.path.abspath(url))
            else:
                return ToolResponse(
                    [
                        TextBlock(
                            type="text",
                            text=f'Error: The input image url "{url}" is '
                            f"not a file.",
                        ),
                    ],
                )
        else:
            # Maybe a web url or an invalid url; we leave it to the API
            # to handle
            img_abs_urls.append(url)

    # Convert image paths according to the model requirements
    contents = []
    for url in img_abs_urls:
        contents.append(
            {
                "image": url,
            },
        )
    contents.append({"text": prompt})

    # Currently only one round of conversation is supported. If multiple
    # rounds of conversation are needed, it would be better to implement
    # an Agent class.
    sys_message = {
        "role": "system",
        "content": [{"text": "You are a helpful assistant."}],
    }
    user_message = {
        "role": "user",
        "content": contents,
    }
    messages = [sys_message, user_message]

    try:
        import dashscope

        response = dashscope.MultiModalConversation.call(
            model=model,
            messages=messages,
            api_key=api_key,
        )
        content = response.output["choices"][0]["message"]["content"]
        if isinstance(content, list):
            content = content[0]["text"]

        if content is not None:
            return ToolResponse(
                [
                    TextBlock(
                        type="text",
                        text=content,
                    ),
                ],
            )
        else:
            return ToolResponse(
                [
                    TextBlock(
                        type="text",
                        text="Error: Failed to generate text",
                    ),
                ],
            )
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to generate text: {str(e)}",
                ),
            ],
        )
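

# Illustrative usage sketch for ``dashscope_image_to_text``; the API key and
# image locations are placeholders:
#
#   res = dashscope_image_to_text(
#       image_urls=["./photo.jpg", "https://example.com/cat.png"],
#       api_key="YOUR_DASHSCOPE_API_KEY",
#       prompt="What is shown in these images?",
#   )
#   # On success, res.content contains a single TextBlock with the model's
#   # description of the images.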


def dashscope_text_to_audio(
    text: str,
    api_key: str,
    model: str = "sambert-zhichu-v1",
    sample_rate: int = 48000,
) -> ToolResponse:
    """Convert the given text to audio.

    Args:
        text (`str`):
            The text to be converted into audio.
        api_key (`str`):
            The api key for the dashscope API.
        model (`str`, defaults to `"sambert-zhichu-v1"`):
            The model to use. The full model list can be found at
            https://help.aliyun.com/zh/dashscope/model-list
        sample_rate (`int`, defaults to `48000`):
            Sample rate of the audio.

    Returns:
        `ToolResponse`:
            A ToolResponse containing the generated content
            (ImageBlock/TextBlock/AudioBlock) or error information if the
            operation failed.
    """
    try:
        import dashscope

        dashscope.api_key = api_key
        res = dashscope.audio.tts.SpeechSynthesizer.call(
            model=model,
            text=text,
            sample_rate=sample_rate,
            format="wav",
        )
        audio_data = res.get_audio_data()

        if audio_data is not None:
            audio_base64 = base64.b64encode(audio_data).decode("utf-8")
            return ToolResponse(
                [
                    AudioBlock(
                        type="audio",
                        source={
                            "type": "base64",
                            "media_type": "audio/wav",
                            "data": audio_base64,
                        },
                    ),
                ],
            )
        else:
            return ToolResponse(
                [
                    TextBlock(
                        type="text",
                        text="Error: Failed to generate audio",
                    ),
                ],
            )
    except Exception as e:
        return ToolResponse(
            [
                TextBlock(
                    type="text",
                    text=f"Failed to generate audio: {str(e)}",
                ),
            ],
        )
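

# Illustrative usage sketch for ``dashscope_text_to_audio``; the API key is a
# placeholder, and the field access assumes the dict-style blocks constructed
# above:
#
#   res = dashscope_text_to_audio(
#       text="Hello from AgentScope!",
#       api_key="YOUR_DASHSCOPE_API_KEY",
#   )
#   block = res.content[0]
#   if block["type"] == "audio":
#       with open("output.wav", "wb") as f:
#           f.write(base64.b64decode(block["source"]["data"]))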