# Source code for agentscope.service.web.arxiv

# -*- coding: utf-8 -*-
"""Search papers in arXiv API. This implementation refers to the repository
https://github.com/lukasschwab/arxiv.py, which is MIT licensed.
"""
import json
import re
import time
import urllib
from calendar import timegm
from datetime import datetime, timezone
from typing import List, Optional, Union

try:
    import feedparser
except ImportError:
    feedparser = None
from loguru import logger

from agentscope.service.service_response import (
    ServiceResponse,
    ServiceExecStatus,
)

# URL template of the arXiv query API; `{parameters_str}` is replaced with the
# "&"-joined `key=value` request parameters.
ARXIV_SEARCH_URL = "http://export.arxiv.org/api/query?{parameters_str}"

# Boolean operators recognised in a search query. "ANDNOT" precedes "AND" so
# that a regex alternation built in this order tries the longer operator first.
LOGIC_OPERATORS = ["ANDNOT", "AND", "OR"]

# Grouping symbols recognised in a search query.
SYMBOLS = ["(", ")"]

# Field prefixes recognised in a search query.
QUERY_PREFIX = ["all:", "ti:", "au:", "abs:", "co:", "jr:", "cat:", "rn:"]


class _Result(dict):
    """The class for arXiv search results."""

    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

    id: str
    """A url of the form `https://arxiv.org/abs/{id}`."""

    title: str
    """The title of the result."""

    updated: str
    """When the result was last updated."""

    published: str
    """When the result was published."""

    summary: str
    """The summary of the search result."""

    authors: List[str]
    """The authors of the search result."""

    comment: Optional[str]
    """The authors' comment if present."""

    primary_category: Optional[str]
    """The result's primary arXiv category. See [arXiv: Category
    Taxonomy](https://arxiv.org/category_taxonomy)."""

    tags: List[str]
    """All of the result's tags. See [arXiv: Category
    Taxonomy](https://arxiv.org/category_taxonomy)."""

    journal_ref: Optional[str]
    """A journal reference if present."""

    doi: Optional[str]
    """A URL for the resolved DOI to an external resource if present."""

    def __init__(
        self,
        entry_id: str,
        title: str,
        updated: str,
        published: str,
        summary: str,
        authors: List[str],
        pdf_url: Optional[str] = None,
        comment: Optional[str] = None,
        primary_category: Optional[str] = None,
        tags: List[str] = None,
        journal_ref: Optional[str] = None,
        doi: Optional[str] = None,
    ) -> None:
        """The class for arXiv search results."""
        self.entry_id = entry_id
        self.title = title
        self.updated = updated
        self.published = published
        self.summary = summary
        self.authors = authors
        self.pdf_url = pdf_url
        self.comment = comment
        self.primary_category = primary_category
        self.tags = tags
        self.journal_ref = journal_ref
        self.doi = doi

    def __str__(self) -> str:
        cleaned_dict = {}
        for key in self:
            if self[key] is not None:
                cleaned_dict[key] = self[key]
        return json.dumps(cleaned_dict, ensure_ascii=False)

    def __repr__(self) -> str:
        return self.__str__()


def _parse_pdf_url(links: List) -> Union[str, None]:
    """Parse the pdf url from the links."""
    for link in links:
        if link.get("title") == "pdf":
            return link.get("href")
    return None


def _parse_timestamp(timestamp: time.struct_time) -> str:
    """Parse the timestamp to a string."""
    timestamp = datetime.fromtimestamp(timegm(timestamp), tz=timezone.utc)
    return timestamp.strftime("%Y-%m-%d %H:%M:%S")


def _clean_arxiv_search_results(result: dict) -> dict:
    """Clean the arXiv search results, and remove unnecessary information.

    Args:
        result (`dict`):
            The parsed feed. It is accessed through the attributes `feed`
            and `entries`, i.e. it is expected to be the object returned
            by `feedparser.parse`.

    Returns:
        `dict`: A dictionary with the keys `updated`,
        `opensearch_total_results`, `opensearch_start_index`,
        `opensearch_itemsperpage` and `entries`, where `entries` is a list
        of `_Result` objects, one per feed entry.
    """
    feed = result.feed

    # Basic information
    cleaned_dict = {
        "updated": _parse_timestamp(feed.updated_parsed),
        "opensearch_total_results": int(feed.opensearch_totalresults),
        "opensearch_start_index": int(feed.opensearch_startindex),
        "opensearch_itemsperpage": int(feed.opensearch_itemsperpage),
    }

    # Entries
    entries = []
    for entry in result.entries:
        if hasattr(entry, "title"):
            title = entry.title
        else:
            title = "0"
            # loguru formats messages with `str.format`-style braces; a
            # `%s` placeholder would be logged literally.
            logger.warning(
                "Result {} is missing title attribute; defaulting to '0'",
                entry.id,
            )

        # An empty tag list is reported as None so that it is dropped when
        # the result is printed.
        tags = [tag.get("term") for tag in entry.tags] or None

        entries.append(
            _Result(
                # Basic properties
                entry_id=entry.id,
                title=title,
                updated=_parse_timestamp(entry.updated_parsed),
                published=_parse_timestamp(entry.published_parsed),
                summary=entry.summary,
                authors=[author.name for author in entry.authors],
                # Optional properties
                pdf_url=_parse_pdf_url(entry.links),
                comment=entry.get("arxiv_comment"),
                primary_category=entry.arxiv_primary_category.get("term"),
                tags=tags,
                journal_ref=entry.get("arxiv_journal_ref"),
                doi=entry.get("arxiv_doi"),
            ),
        )

    cleaned_dict["entries"] = entries

    return cleaned_dict


def _reformat_query(query: str) -> str:
    """Reformat the query string for arxiv search, refer to
    https://info.arxiv.org/help/api/user-manual.html.

    The query is split into boolean operators, field prefixes, parentheses
    and search terms. Search terms are wrapped in url-encoded double quotes
    (`%22`) with spaces replaced by `+`, parentheses are url-encoded, and
    operators are surrounded by `+`. Field prefixes are kept as they are.

    Args:
        query (`str`):
            The raw query, e.g. `ti:"Deep Learning" AND au:"LeCun"`.

    Returns:
        `str`: The query string ready to be placed in the request url, e.g.
        `ti:%22Deep+Learning%22+AND+au:%22LeCun%22`.
    """
    # Operators only count as whole words; otherwise uppercase terms such as
    # "TRANSFORMER" or "ANDROID" would be split at the embedded "OR"/"AND".
    operator_regex = (
        r"\b(?:" + "|".join(map(re.escape, LOGIC_OPERATORS)) + r")\b"
    )
    other_regex = "|".join(map(re.escape, QUERY_PREFIX + SYMBOLS))
    # The outer capture group makes `re.split` keep the delimiters.
    delimiter_regex = f"({operator_regex}|{other_regex})"

    parts = [
        part.strip()
        for part in re.split(delimiter_regex, query)
        if part.strip()
    ]

    for i, part in enumerate(parts):
        if part in SYMBOLS:
            parts[i] = part.replace("(", "%28").replace(")", "%29")
        elif part in LOGIC_OPERATORS:
            parts[i] = f"+{part}+"
        elif part not in QUERY_PREFIX:
            # A search term: add double quotes if it does not contain them
            part = part.replace('"', "%22").replace(" ", "+")

            if not part.startswith("%22"):
                part = f"%22{part}"
            if not part.endswith("%22"):
                part = f"{part}%22"
            parts[i] = part

    return "".join(parts)



# [docs]
def arxiv_search(
    search_query: str,
    id_list: Optional[List[str]] = None,
    start: int = 0,
    max_results: Optional[int] = None,
) -> ServiceResponse:
    """Search arXiv paper by a given query string.

    Args:
        search_query (`str`):
            The query string, supporting prefixes "all:", "ti:", "au:",
            "abs:", "co:", "jr:", "cat:", and "rn:", boolean operators "AND",
            "OR" and "ANDNOT". For example, searching for papers with
            title "Deep Learning" and author "LeCun" by a
            search_query ti:"Deep Learning" AND au:"LeCun"
        id_list (`Optional[List[str]]`, defaults to `None`):
            A list of arXiv IDs to search.
        start (`int`, defaults to `0`):
            The index of the first search result to return.
        max_results (`Optional[int]`, defaults to `None`):
            The maximum number of search results to return.

    Returns:
        `ServiceResponse`: A dictionary with two variables: `status` and
        `content`. The `status` variable is from the ServiceExecStatus enum.
        On success `content` is a dictionary holding feed information
        (`updated`, `opensearch_total_results`, `opensearch_start_index`,
        `opensearch_itemsperpage`) and the list of search results under
        `entries`; on failure it is a string with the error information.

    Raises:
        ImportError: If the `feedparser` module is not installed.
    """
    # `import urllib` alone does not load the `urllib.request` submodule, so
    # import it explicitly instead of relying on another module having done
    # so already.
    import urllib.request

    if feedparser is None:
        raise ImportError(
            "The `feedparser` module is not installed. Please install it by "
            "running `pip install feedparser`.",
        )

    # construct url
    parameters = {"search_query": _reformat_query(search_query)}

    if id_list:
        parameters["id_list"] = ",".join(id_list)

    if start > 0:
        parameters["start"] = str(start)

    if max_results:
        parameters["max_results"] = str(max_results)

    parameters_str = "&".join([f"{k}={v}" for k, v in parameters.items()])

    url = ARXIV_SEARCH_URL.format(parameters_str=parameters_str)

    try:
        logger.debug(f"Searching arXiv by url: {url}")

        with urllib.request.urlopen(url) as data:
            status_code = data.code
            payload = data.read().decode("utf-8")

        # Check the status before parsing, so that an unexpected response
        # is reported with its status code rather than as a parsing error.
        if status_code != 200:
            return ServiceResponse(
                status=ServiceExecStatus.ERROR,
                content=f"Error: {status_code}, {data}",
            )

        # Parse the results by feedparser and remove unnecessary information
        results = _clean_arxiv_search_results(feedparser.parse(payload))

        # Return the searching results
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=results,
        )
    except Exception as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=f"Error: {e}",
        )