Source code for agentscope.service.multi_modality.dashscope_services

# -*- coding: utf-8 -*-
"""Use DashScope API to generate images,
convert text to audio, and convert images to text.
Please refer to the official documentation for more details:
https://dashscope.aliyun.com/
"""

from typing import Union, Optional, Literal, Sequence

import os

from ...models import (
    DashScopeImageSynthesisWrapper,
    DashScopeMultiModalWrapper,
)

from ..service_response import (
    ServiceResponse,
    ServiceExecStatus,
)
from ...utils.common import _download_file


[docs] def dashscope_text_to_image( prompt: str, api_key: str, n: int = 1, size: Literal["1024*1024", "720*1280", "1280*720"] = "1024*1024", model: str = "wanx-v1", save_dir: Optional[str] = None, ) -> ServiceResponse: """Generate image(s) based on the given prompt, and return image url(s). Args: prompt (`str`): The text prompt to generate image. api_key (`str`): The api key for the dashscope api. n (`int`, defaults to `1`): The number of images to generate. size (`Literal["1024*1024", "720*1280", "1280*720"]`, defaults to `"1024*1024"`): Size of the image. model (`str`, defaults to '"wanx-v1"'): The model to use. save_dir (`Optional[str]`, defaults to 'None'): The directory to save the generated images. If not specified, will return the web urls. Returns: ServiceResponse: A dictionary with two variables: `status` and`content`. If `status` is ServiceExecStatus.SUCCESS, the `content` is a dict with key 'fig_paths" and value is a list of the paths to the generated images. Example: .. code-block:: python prompt = "A beautiful sunset in the mountains" print(dashscope_text_to_image(prompt, "{api_key}")) > { > 'status': 'SUCCESS', > 'content': {'image_urls': ['IMAGE_URL1', 'IMAGE_URL2']} > } """ text2img = DashScopeImageSynthesisWrapper( config_name="dashscope-text-to-image-service", # Just a placeholder model_name=model, api_key=api_key, ) try: res = text2img( prompt=prompt, n=n, size=size, ) urls = res.image_urls # save images to save_dir if urls is not None: if save_dir: os.makedirs(save_dir, exist_ok=True) urls_local = [] # Obtain the image file names in the url for url in urls: image_name = url.split("/")[-1] image_path = os.path.abspath( os.path.join(save_dir, image_name), ) # Download the image _download_file(url, image_path) urls_local.append(image_path) return ServiceResponse( ServiceExecStatus.SUCCESS, {"image_urls": urls_local}, ) else: # Return the web urls return ServiceResponse( ServiceExecStatus.SUCCESS, {"image_urls": urls}, ) else: return ServiceResponse( ServiceExecStatus.ERROR, "Error: Failed to generate images", ) except Exception as e: return ServiceResponse( ServiceExecStatus.ERROR, str(e), )
[docs] def dashscope_image_to_text( image_urls: Union[str, Sequence[str]], api_key: str, prompt: str = "Describe the image", model: str = "qwen-vl-plus", ) -> ServiceResponse: """Generate text based on the given images. Args: image_urls (`Union[str, Sequence[str]]`): The url of single or multiple images. api_key (`str`): The api key for the dashscope api. prompt (`str`, defaults to 'Describe the image' ): The text prompt. model (`str`, defaults to 'qwen-vl-plus'): The model to use in DashScope MultiModal API. Returns: `ServiceResponse`: A dictionary with two variables: `status` and`content`. If `status` is ServiceExecStatus.SUCCESS, the `content` is the generated text. Example: .. code-block:: python image_url = "image.jpg" prompt = "Describe the image" print(image_to_text(image_url, prompt)) > {'status': 'SUCCESS', 'content': 'A beautiful sunset in the mountains'} """ img2text = DashScopeMultiModalWrapper( config_name="dashscope-image-to-text-service", # Just a placeholder model_name=model, api_key=api_key, ) if isinstance(image_urls, str): image_urls = [image_urls] # Check if the local url is valid img_abs_urls = [] for url in image_urls: if os.path.exists(url): if os.path.isfile(url): img_abs_urls.append(os.path.abspath(url)) else: return ServiceResponse( ServiceExecStatus.ERROR, f'Error: The input image url "{url}" is not a file.', ) else: # Maybe a web url or an invalid url, we leave it to the API # to handle img_abs_urls.append(url) # Convert image paths according to the model requirements contents = img2text.convert_url(img_abs_urls) contents.append({"text": prompt}) # currently only support one round of conversation # if multiple rounds of conversation are needed, # it would be better to implement an Agent class sys_message = { "role": "system", "content": [{"text": "You are a helpful assistant."}], } user_message = { "role": "user", "content": contents, } messages = [sys_message, user_message] try: res = img2text(messages, stream=False) description = res.text if description is not None: return ServiceResponse( ServiceExecStatus.SUCCESS, description, ) else: return ServiceResponse( ServiceExecStatus.ERROR, "Error: Failed to generate text", ) except Exception as e: return ServiceResponse( ServiceExecStatus.ERROR, str(e), )
[docs] def dashscope_text_to_audio( text: str, api_key: str, save_dir: str, model: str = "sambert-zhichu-v1", sample_rate: int = 48000, ) -> ServiceResponse: """Convert the given text to audio. Args: text (`str`): The text to be converted into audio. api_key (`str`): The api key for the dashscope API. save_dir (`str`): The directory to save the generated audio. model (`str`, defaults to 'sambert-zhichu-v1'): The model to use. Full model list can be found in https://help.aliyun.com/zh/dashscope/model-list sample_rate (`int`, defaults to 48000): Samplerate of the audio. Returns: `ServiceResponse`: A dictionary with two variables: `status` and`content`. If `status` is ServiceExecStatus.SUCCESS, the `content` contains a dictionary with key "audio_path" and value is the path to the generated audio. Example: .. code-block:: python text = "How is the weather today?" print(text_to_audio(text)) gives: > {'status': 'SUCCESS', 'content': {"audio_path": "AUDIO_PATH"}} """ try: import dashscope except ImportError as e: raise ImportError( "The package 'dashscope' is not installed. Please install it by " "running `pip install dashscope==1.14.1`", ) from e dashscope.api_key = api_key res = dashscope.audio.tts.SpeechSynthesizer.call( model=model, text=text, sample_rate=sample_rate, format="wav", ) audio_data = res.get_audio_data() if audio_data is not None: if save_dir is not None: os.makedirs(save_dir, exist_ok=True) # Save locally text = text[0:15] if len(text) > 15 else text audio_path = os.path.join(save_dir, f"{text.strip()}.wav") with open(audio_path, "wb") as f: f.write(audio_data) return ServiceResponse( ServiceExecStatus.SUCCESS, {"audio_path": audio_path}, ) else: return ServiceResponse( ServiceExecStatus.ERROR, "Error: Failed to generate audio", )