Source code for agentscope.service.web.web_digest

# -*- coding: utf-8 -*-
"""parsing and digesting the web pages"""
import json
from urllib.parse import urlparse
from typing import Optional, Callable, Sequence, Any
import requests
from loguru import logger


from agentscope.service.service_response import ServiceResponse
from agentscope.service.service_status import ServiceExecStatus
from agentscope.models.model import ModelWrapperBase
from agentscope.service import summarization


DEFAULT_WEB_SYS_PROMPT = (
    "You're a web page analyser. You job is to extract important"
    "and useful information from html or webpage description.\n"
)


[docs] def is_valid_url(url: str) -> bool: """ Use urlparse to check if a URL is valid Args: url (str): string to be checked Returns: bool: True if url is valid, False otherwise """ try: result = urlparse(url) # Check if the URL has both a scheme # (e.g., "http" or "https") and a netloc (domain). return all([result.scheme, result.netloc]) except ValueError: return False # A ValueError indicates that the URL is not valid.
[docs] def load_web( url: str, keep_raw: bool = True, html_selected_tags: Optional[Sequence[str]] = None, self_parse_func: Optional[Callable[[requests.Response], Any]] = None, timeout: int = 5, ) -> ServiceResponse: """Function for parsing and digesting the web page. Args: url (str): the url of the web page keep_raw (bool): Whether to keep raw HTML. If True, the content is stored with key "raw". html_selected_tags (Optional[Sequence[str]]): the text in elements of `html_selected_tags` will be extracted and stored with "html_to_text" key in return. self_parse_func (Optional[Callable]): if "self_parse_func" is not None, then the function will be invoked with the requests.Response as input. The result is stored with `self_define_func` key timeout (int): timeout parameter for requests. Returns: `ServiceResponse`: If successful, `ServiceResponse` object is returned with `content` field is a dict, where keys are subset of: "raw": exists if `keep_raw` is True, store raw HTML content`; "self_define_func": exists if `self_parse_func` is provided, store the return of self_define_func; "html_to_text": exists if `html_selected_tags` is provided and not empty; "json": exists if url links to a json webpage, then it is parsed as json. For example, `ServiceResponse.content` field is .. code-block:: python { "raw": xxxxx, "selected_tags_text": xxxxx } """ header = { "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", "Cache-Control": "max-age=0", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)" " AppleWebKit/537.36 (KHTML, like Gecko) ", } try: response = requests.get(url=url, headers=header, timeout=timeout) if response.status_code == 200: results = {} if keep_raw: results["raw"] = response.content if self_parse_func: results["self_define_func"] = self_parse_func(response) content_type = response.headers["Content-Type"].lower() if "html" in content_type and html_selected_tags: html_clean_text = parse_html_to_text( response.text, html_selected_tags, ) results["html_to_text"] = html_clean_text elif "pdf" in content_type: # TODO: support pdf in the future logger.warning( "Current version does not parse url with pdf " "Content-Types", ) elif "json" in content_type: results["json"] = json.loads(response.text) elif "image" in content_type: # TODO: to support image (gif, jpeg, png) data logger.warning( "Current implementation returns binary " "response.content for url with image Content-Types", ) else: raise NotImplementedError( f"Unsupported content type ({content_type}) " f"with url: ({url})", ) return ServiceResponse( ServiceExecStatus.SUCCESS, content=results, ) else: logger.warning( f"Fail to load web page, " f"status code {response.status_code}", ) return ServiceResponse( ServiceExecStatus.ERROR, content="", ) except Exception as e: logger.warning(e) return ServiceResponse(ServiceExecStatus.ERROR, content="")
[docs] def parse_html_to_text( html_text: str, html_selected_tags: Optional[Sequence[str]] = None, ) -> str: """ Parse the obtained HTML file. Args: html_text (str): HTML source code html_selected_tags (Optional[Sequence[str]]): the text in elements of `html_selected_tags` will be extracted and returned. Returns: `ServiceResponse`: If successful, `ServiceResponse` object is returned with `content` field is processed text content of the selected tags, """ if html_selected_tags: logger.info( f"extracting text information from tags: " f"{html_selected_tags}", ) try: from bs4 import BeautifulSoup, NavigableString, Tag except ImportError as exc: raise ImportError( "BeautifulSoup4 is required for processing the " "web page without model." "Please install with `pip install bs4` .", ) from exc doc = BeautifulSoup(html_text, "html.parser") def get_navigable_strings( e: Tag, ) -> str: # pylint: disable=cell-var-from-loop text = "" for child in e.children: if isinstance(child, Tag): # pylint: disable=cell-var-from-loop text += get_navigable_strings(child).strip(" \n\t") elif isinstance(child, NavigableString): if (e.name == "a") and (href := e.get("href")): if is_valid_url(href): text += f"[{child.strip()}]({href})" else: text += child.text return " ".join(text.split()) text_parts = "" for element in doc.find_all(recursive=True): if element.name in html_selected_tags: text_parts += get_navigable_strings(element).strip(" \n\t") element.decompose() else: text_parts = "" return text_parts
[docs] def digest_webpage( web_text_or_url: str, model: ModelWrapperBase = None, html_selected_tags: Sequence[str] = ("h", "p", "li", "div", "a"), digest_prompt: str = DEFAULT_WEB_SYS_PROMPT, ) -> ServiceResponse: """Digest the given webpage. Args: web_text_or_url (str): preprocessed web text or url to the web page model (ModelWrapperBase): the model to digest the web content html_selected_tags (Sequence[str]): the text in elements of `html_selected_tags` will be extracted and feed to the model digest_prompt (str): system prompt for the model to digest the web content Returns: `ServiceResponse`: If successful, `ServiceResponse` object is returned with `content` field filled with the model output. """ if is_valid_url(web_text_or_url): # if an url is provided, then # load the content of the url first if html_selected_tags is None or len(html_selected_tags) == 0: html_selected_tags = ["h", "p", "li", "div", "a"] response = load_web( url=web_text_or_url, html_selected_tags=html_selected_tags, ) if response.status == ServiceExecStatus.SUCCESS: web_text = response.content["html_to_text"] else: return response else: web_text = web_text_or_url return summarization( model=model, text=web_text, system_prompt=digest_prompt, )