Source code for agentscope.service.web.arxiv

# -*- coding: utf-8 -*-
"""Search papers in arXiv API. This implementation refers to the repository
https://github.com/lukasschwab/arxiv.py, which is MIT licensed.
"""
import json
import re
import time
import urllib
from calendar import timegm
from datetime import datetime, timezone
from typing import List, Optional, Union

try:
    import feedparser
except ImportError:
    feedparser = None
from loguru import logger

from agentscope.service.service_response import (
    ServiceResponse,
    ServiceExecStatus,
)

# URL template of the arXiv query endpoint; `{parameters_str}` is the
# placeholder for the already-encoded query-string part of the request.
ARXIV_SEARCH_URL = "http://export.arxiv.org/api/query?{parameters_str}"

# Boolean operators recognized by `_reformat_query`. "ANDNOT" is listed before
# "AND" so that the regex alternation built from this list tries the longer
# operator first.
LOGIC_OPERATORS = ["ANDNOT", "AND", "OR"]

# Grouping symbols; `_reformat_query` percent-encodes them as %28 and %29.
SYMBOLS = ["(", ")"]

# Field prefixes that `_reformat_query` treats as delimiters and leaves
# unchanged in the reformatted query.
QUERY_PREFIX = ["all:", "ti:", "au:", "abs:", "co:", "jr:", "cat:", "rn:"]


class _Result(dict):
    """A single arXiv search result, stored as a dict with attribute access.

    Every field is kept as a dictionary item, so ``result.title`` and
    ``result["title"]`` refer to the same value. Looking up an attribute that
    is not stored raises ``AttributeError`` (not ``KeyError``), which keeps
    ``hasattr``, ``getattr`` with a default and ``copy.deepcopy`` working.
    """

    # Attribute writes are redirected to dictionary items.
    __setattr__ = dict.__setitem__

    entry_id: str
    """A url of the form `https://arxiv.org/abs/{id}`."""

    title: str
    """The title of the result."""

    updated: str
    """When the result was last updated."""

    published: str
    """When the result was published."""

    summary: str
    """The summary of the search result."""

    authors: List[str]
    """The authors of the search result."""

    pdf_url: Optional[str]
    """The URL of the result's PDF if present."""

    comment: Optional[str]
    """The authors' comment if present."""

    primary_category: Optional[str]
    """The result's primary arXiv category. See [arXiv: Category
    Taxonomy](https://arxiv.org/category_taxonomy)."""

    tags: Optional[List[str]]
    """All of the result's tags, or None. See [arXiv: Category
    Taxonomy](https://arxiv.org/category_taxonomy)."""

    journal_ref: Optional[str]
    """A journal reference if present."""

    doi: Optional[str]
    """A URL for the resolved DOI to an external resource if present."""

    def __init__(
        self,
        entry_id: str,
        title: str,
        updated: str,
        published: str,
        summary: str,
        authors: List[str],
        pdf_url: Optional[str] = None,
        comment: Optional[str] = None,
        primary_category: Optional[str] = None,
        tags: Optional[List[str]] = None,
        journal_ref: Optional[str] = None,
        doi: Optional[str] = None,
    ) -> None:
        """Store every argument as a dictionary item under its own name.

        Args:
            entry_id (`str`): The entry url/identifier of the result.
            title (`str`): The title of the result.
            updated (`str`): When the result was last updated.
            published (`str`): When the result was published.
            summary (`str`): The summary of the result.
            authors (`List[str]`): The authors of the result.
            pdf_url (`Optional[str]`): The PDF url if present.
            comment (`Optional[str]`): The authors' comment if present.
            primary_category (`Optional[str]`): The primary category.
            tags (`Optional[List[str]]`): All tags of the result.
            journal_ref (`Optional[str]`): A journal reference if present.
            doi (`Optional[str]`): The DOI if present.
        """
        self.entry_id = entry_id
        self.title = title
        self.updated = updated
        self.published = published
        self.summary = summary
        self.authors = authors
        self.pdf_url = pdf_url
        self.comment = comment
        self.primary_category = primary_category
        self.tags = tags
        self.journal_ref = journal_ref
        self.doi = doi

    def __getattr__(self, name: str) -> object:
        """Return the dictionary item called `name`.

        Only invoked when normal attribute lookup fails. A missing key is
        reported as ``AttributeError`` so that the attribute protocol
        (``hasattr``, ``getattr`` defaults, ``copy``) behaves as expected.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}",
            ) from None

    def __str__(self) -> str:
        """Serialize the result to JSON, leaving out fields that are None."""
        present = {
            key: value for key, value in self.items() if value is not None
        }
        return json.dumps(present, ensure_ascii=False)

    def __repr__(self) -> str:
        """Return the same JSON text as `__str__`."""
        return self.__str__()


def _parse_pdf_url(links: List) -> Union[str, None]:
    """Return the ``href`` of the first link whose title is ``"pdf"``.

    ``None`` is returned when no link carries that title.
    """
    pdf_hrefs = (
        item.get("href") for item in links if item.get("title") == "pdf"
    )
    return next(pdf_hrefs, None)


def _parse_timestamp(timestamp: time.struct_time) -> str:
    """Format a UTC time tuple as a ``YYYY-MM-DD HH:MM:SS`` string."""
    # timegm reads the tuple as UTC, so the round trip keeps the wall time.
    epoch_seconds = timegm(timestamp)
    utc_moment = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    return f"{utc_moment:%Y-%m-%d %H:%M:%S}"


def _clean_arxiv_search_results(result: dict) -> dict:
    """Clean the arXiv search results, and remove unnecessary information.

    Args:
        result (`dict`):
            The parsed arXiv feed. It is read through attribute access
            (`result.feed`, `result.entries`), so a plain `dict` is not
            enough -- presumably this is the object returned by
            `feedparser.parse`.

    Returns:
        `dict`: A dictionary with the keys "updated",
        "opensearch_total_results", "opensearch_start_index",
        "opensearch_itemsperpage" and "entries", where "entries" is a list
        with one `_Result` per feed entry.
    """
    feed = result.feed

    # Feed-level information
    cleaned_dict = {
        "updated": _parse_timestamp(feed.updated_parsed),
        "opensearch_total_results": int(feed.opensearch_totalresults),
        "opensearch_start_index": int(feed.opensearch_startindex),
        "opensearch_itemsperpage": int(feed.opensearch_itemsperpage),
    }

    # Entries
    entries = []
    for entry in result.entries:
        if hasattr(entry, "title"):
            title = entry.title
        else:
            # Keep the entry and fall back to the placeholder title "0"
            title = "0"
            # loguru formats messages with `str.format`, so the placeholder
            # must be "{}"; a "%s" would be logged literally and the entry
            # id would be lost.
            logger.warning(
                "Result {} is missing title attribute; defaulting to '0'",
                entry.id,
            )

        # An empty tag list is stored as None, so `_Result.__str__` omits it
        tags = [tag.get("term") for tag in entry.tags] or None

        entry_dict = _Result(
            # Basic properties
            entry_id=entry.id,
            title=title,
            updated=_parse_timestamp(entry.updated_parsed),
            published=_parse_timestamp(entry.published_parsed),
            summary=entry.summary,
            authors=[author.name for author in entry.authors],
            # Optional properties
            pdf_url=_parse_pdf_url(entry.links),
            comment=entry.get("arxiv_comment"),
            primary_category=entry.arxiv_primary_category.get("term"),
            tags=tags,
            journal_ref=entry.get("arxiv_journal_ref"),
            doi=entry.get("arxiv_doi"),
        )

        entries.append(entry_dict)

    cleaned_dict["entries"] = entries

    return cleaned_dict


def _reformat_query(query: str) -> str:
    """Reformat the query string for arxiv search, refer to
    https://info.arxiv.org/help/api/user-manual.html.

    The query is split on logic operators, field prefixes and parentheses.
    Search terms are wrapped in ``%22`` (an encoded double quote) with spaces
    turned into ``+``, parentheses are percent-encoded, logic operators are
    surrounded by ``+`` and field prefixes are kept unchanged.

    Args:
        query (`str`):
            The raw query, e.g. ``"ti:Agent AND au:Gao"``.

    Returns:
        `str`: The reformatted query, e.g.
        ``"ti:%22Agent%22+AND+au:%22Gao%22"``.
    """
    # A logic operator only counts as a delimiter when it is not glued to
    # other letters; otherwise upper-case terms such as "CORD-19" or "TENSOR"
    # would be cut apart at the embedded "OR". The operators keep the order of
    # LOGIC_OPERATORS so that "ANDNOT" is tried before "AND".
    operator_regex = (
        "(?<![A-Za-z])(?:"
        + "|".join(map(re.escape, LOGIC_OPERATORS))
        + ")(?![A-Za-z])"
    )
    # One capturing group around the whole alternation makes `re.split` keep
    # the delimiters in its output.
    delimiter_regex = (
        "("
        + "|".join(
            [operator_regex] + list(map(re.escape, QUERY_PREFIX + SYMBOLS)),
        )
        + ")"
    )

    reserved_tokens = set(LOGIC_OPERATORS + QUERY_PREFIX + SYMBOLS)

    parts = [
        piece.strip()
        for piece in re.split(delimiter_regex, query)
        if piece.strip()
    ]

    for i, part in enumerate(parts):
        if part not in reserved_tokens:
            # A search term: encode quotes and spaces, then make sure the
            # term is enclosed in (encoded) double quotes
            part = part.replace('"', "%22").replace(" ", "+")

            if not part.startswith("%22"):
                part = f"%22{part}"
            if not part.endswith("%22"):
                part = f"{part}%22"
            parts[i] = part
        elif part in SYMBOLS:
            parts[i] = part.replace("(", "%28").replace(")", "%29")
        elif part in LOGIC_OPERATORS:
            parts[i] = f"+{part}+"
        # Field prefixes fall through unchanged

    return "".join(parts)