Source code for swh.fuse.backends.web_api
# Copyright (C) 2025  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import asyncio
from functools import partial
import logging
from typing import Any, Dict, List, Tuple
from urllib.parse import unquote_plus
from requests import HTTPError
from swh.fuse import LOGGER_NAME
from swh.fuse.backends import GraphBackend, ObjBackend
from swh.model.swhids import CoreSWHID
from swh.web.client.client import WebAPIClient
[docs]
class WebApiBackend(GraphBackend, ObjBackend):
    """
    A Backend querying everything via Software Heritage's public API.
    This is simpler to configure and deploy, but expect long response times.
    """
    def __init__(self, conf: Dict):
        """
        Build the backend from the `web-api` section of `conf`.

        Args:
            conf: configuration mapping. Only its `web-api` key is read; that
                section must provide `url` and may provide `auth-token`.

        Raises:
            KeyError: if `web-api` or its `url` entry is missing.
        """
        web_api_conf = conf["web-api"]
        # `auth-token` is optional: pass None when it is not configured
        # instead of failing with KeyError.
        # NOTE(review): None is presumably WebAPIClient's own "no token"
        # default - confirm against swh.web.client.
        self.web_api = WebAPIClient(
            web_api_conf["url"], web_api_conf.get("auth-token")
        )
        self.logger = logging.getLogger(LOGGER_NAME)
[docs]
    async def get_blob(self, swhid) -> bytes:
        """
        Download the raw content of `swhid` through the Web API.

        Args:
            swhid: identifier of the object to fetch, passed unchanged to
                `WebAPIClient.content_raw`.

        Returns:
            The chunks produced by `content_raw`, concatenated into one
            `bytes` object.

        Raises:
            HTTPError: logged, then re-raised to the caller.
        """

        def fetch() -> bytes:
            # content_raw() hands back an iterable of chunks. Consume it here,
            # in the worker thread, so that any network read triggered while
            # iterating never blocks the event loop.
            return b"".join(self.web_api.content_raw(swhid))

        try:
            self.logger.debug("Retrieving blob %s via web API...", swhid)
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(None, fetch)
        except HTTPError as err:
            self.logger.error("Cannot fetch blob for object %s: %s", swhid, err)
            raise
[docs]
    async def get_history(self, swhid: CoreSWHID) -> List[Tuple[str, str]]:
        """
        Fetch the `rev:rev` edges reachable from `swhid` via the graph API.

        The response body is read as text, one edge per line, with two
        space-separated fields per edge. Lines that do not split into exactly
        two fields are skipped.

        Args:
            swhid: identifier interpolated into the `graph/visit/edges` query.

        Returns:
            One `(first, second)` tuple of strings per well-formed line
            (presumably source and destination of the edge), or an empty list
            when the body is blank or the request fails with `HTTPError`.
        """
        try:
            # Use the swh-graph API to retrieve the full history very fast
            self.logger.debug("Retrieving history of %s via graph API...", swhid)
            endpoint = f"graph/visit/edges/{swhid}?edges=rev:rev"
            response = await asyncio.get_event_loop().run_in_executor(
                None, self.web_api._call, endpoint
            )
            body = response.text.strip()
            if not body:
                return []
            fields_per_line = (line.split(" ") for line in body.split("\n"))
            return [
                (fields[0], fields[1])
                for fields in fields_per_line
                if len(fields) == 2
            ]
        except HTTPError as err:
            self.logger.error("Cannot fetch history for object %s: %s", swhid, err)
            # Ignore exception since swh-graph does not necessarily contain the
            # most recent artifacts from the archive. Computing the full history
            # from the Web API is too computationally intensive so simply return
            # an empty list.
            return []
[docs]
    async def get_visits(self, url_encoded: str) -> List[Dict[str, Any]]:
        """
        List the visits of an origin through the Web API.

        Args:
            url_encoded: URL-encoded origin URL. It is decoded with
                `unquote_plus` before being handed to the Web API client.

        Returns:
            The visits yielded by `WebAPIClient.visits(url, typify=False)`,
            collected into a list.

        Raises:
            ValueError: if the Web API reports that the origin does not exist.
            HTTPError: if a Web API request fails.
            Both are logged, then re-raised to the caller.
        """

        def fetch_visits(url: str) -> List[Dict[str, Any]]:
            # visits() returns an iterator that may fetch its results lazily.
            # Materialise it here, in the worker thread, so the HTTP requests
            # it performs while being iterated never block the event loop.
            return list(self.web_api.visits(url, typify=False))

        try:
            self.logger.debug(
                "Retrieving visits for origin '%s' via web API...", url_encoded
            )
            loop = asyncio.get_event_loop()
            # Web API only takes non-encoded URL
            url = unquote_plus(url_encoded)
            origin_exists = await loop.run_in_executor(
                None, self.web_api.origin_exists, url
            )
            if not origin_exists:
                raise ValueError("origin does not exist")
            return await loop.run_in_executor(None, fetch_visits, url)
        except (ValueError, HTTPError) as err:
            self.logger.error(
                "Cannot fetch visits for origin '%s': %s", url_encoded, err
            )
            raise