Source code for swh.fuse.backends.web_api
# Copyright (C) 2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import asyncio
from functools import partial
import logging
from typing import Any, Dict, List, Tuple
from urllib.parse import unquote_plus
from requests import HTTPError
from swh.fuse import LOGGER_NAME
from swh.fuse.backends import GraphBackend, ObjBackend
from swh.model.swhids import CoreSWHID
from swh.web.client.client import WebAPIClient
[docs]
class WebApiBackend(GraphBackend, ObjBackend):
    """
    A Backend querying everything via Software Heritage's public API.

    This is simpler to configure and deploy, but expect long response times.

    The underlying ``WebAPIClient`` is synchronous, so every request is
    dispatched to the event loop's default executor. Results that the client
    hands back as iterables are consumed inside that executor call too, so
    that any I/O performed while iterating does not run on the loop thread.
    """

    def __init__(self, conf: Dict):
        """
        Only needs the `web-api` key of `conf`, searching for `url` and maybe
        `auth-token` keys.

        Args:
            conf: configuration mapping. ``conf["web-api"]["url"]`` is
                required; ``conf["web-api"]["auth-token"]`` is optional.

        Raises:
            KeyError: if the ``web-api`` section or its ``url`` entry is
                missing.
        """
        web_api_conf = conf["web-api"]
        # `auth-token` is documented as optional, so do not fail when it is
        # absent: pass None instead (presumably treated by WebAPIClient as
        # "no authentication" -- confirm against the client documentation).
        self.web_api = WebAPIClient(
            web_api_conf["url"], web_api_conf.get("auth-token")
        )
        self.logger = logging.getLogger(LOGGER_NAME)

    async def get_blob(self, swhid: CoreSWHID) -> bytes:
        """
        Fetch the raw bytes of a content object through the Web API.

        Args:
            swhid: identifier of the content object to download.

        Returns:
            The concatenation of all chunks returned by the client.

        Raises:
            HTTPError: if the request fails; the error is logged and then
                re-raised.
        """

        def download() -> bytes:
            # `content_raw` returns an iterable of byte chunks. Joining it
            # here, in the worker thread, keeps any I/O triggered by the
            # iteration off the event loop.
            return b"".join(self.web_api.content_raw(swhid))

        try:
            self.logger.debug("Retrieving blob %s via web API...", swhid)
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, download)
        except HTTPError as err:
            self.logger.error("Cannot fetch blob for object %s: %s", swhid, err)
            raise

    async def get_history(self, swhid: CoreSWHID) -> List[Tuple[str, str]]:
        """
        Fetch the revision-to-revision edges reachable from `swhid`.

        The response body is read as text, one edge per line, with the two
        endpoints separated by a single space. Lines that do not split into
        exactly two fields are skipped.

        Args:
            swhid: identifier of the object whose history is requested.

        Returns:
            A list of ``(source, destination)`` pairs, or an empty list when
            the response is empty or the request fails with an ``HTTPError``.
        """
        try:
            # Use the swh-graph API to retrieve the full history very fast
            self.logger.debug("Retrieving history of %s via graph API...", swhid)
            call = f"graph/visit/edges/{swhid}?edges=rev:rev"
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(None, self.web_api._call, call)
        except HTTPError as err:
            self.logger.error("Cannot fetch history for object %s: %s", swhid, err)
            # Ignore exception since swh-graph does not necessarily contain the
            # most recent artifacts from the archive. Computing the full history
            # from the Web API is too computationally intensive so simply return
            # an empty list.
            return []

        # An empty body yields a single empty line, which fails the two-field
        # check below, so the result is naturally an empty list.
        rows = (line.split(" ") for line in response.text.strip().split("\n"))
        return [(row[0], row[1]) for row in rows if len(row) == 2]

    async def get_visits(self, url_encoded: str) -> List[Dict[str, Any]]:
        """
        List the visits of an origin through the Web API.

        Args:
            url_encoded: origin URL in URL-encoded form; it is decoded with
                ``unquote_plus`` before being sent to the Web API.

        Returns:
            The visits returned by the client, as a list of dictionaries
            (requested with ``typify=False``).

        Raises:
            ValueError: if the client reports that the origin does not exist.
            HTTPError: if a request fails.
            Both are logged and then re-raised.
        """
        try:
            self.logger.debug(
                "Retrieving visits for origin '%s' via web API...", url_encoded
            )
            loop = asyncio.get_running_loop()
            # Web API only takes non-encoded URL
            url = unquote_plus(url_encoded)
            origin_exists = await loop.run_in_executor(
                None, self.web_api.origin_exists, url
            )
            if not origin_exists:
                raise ValueError("origin does not exist")

            list_visits = partial(self.web_api.visits, url, typify=False)

            def collect_visits() -> List[Dict[str, Any]]:
                # The client returns an iterable; materialise it in the worker
                # thread so that any requests made while iterating do not
                # block the event loop.
                return list(list_visits())

            return await loop.run_in_executor(None, collect_visits)
        except (ValueError, HTTPError) as err:
            self.logger.error(
                "Cannot fetch visits for origin '%s': %s", url_encoded, err
            )
            raise