# Source code for swh.web.api.views.metadata

# Copyright (C) 2021-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
from typing import Any, Dict

from django.http import HttpResponse
from django.shortcuts import redirect
from rest_framework import serializers
from rest_framework.request import Request

from swh.model import hashutil, swhids
from swh.model.model import MetadataAuthority, MetadataAuthorityType, Origin
from swh.web import config
from swh.web.api.apidoc import api_doc, format_docstring
from swh.web.api.apiurls import api_route
from swh.web.api.serializers import SoftLimitsIntegerField
from swh.web.utils import archive, reverse
from swh.web.utils.exc import BadInputExc, NotFoundExc


class MetadataAuthorityField(serializers.Field):
    """DRF field converting between ``type IRI`` strings and metadata authorities."""

    def to_representation(self, value: MetadataAuthority) -> str:
        """Render a metadata authority as its textual identifier.

        Args:
            value: the MetadataAuthority to serialize

        Returns:
            The authority identifier, formatted as ``type IRI``
        """
        authority_type = value.type.value
        return f"{authority_type} {value.url}"

    def to_internal_value(self, data: str) -> MetadataAuthority:
        """Parse and validate a ``type IRI`` identifier.

        Surrounding whitespace is ignored; the text before the first space is
        the authority type and everything after it is the authority IRI.

        Args:
            data: a metadata authority identifier, formatted as ``type IRI``

        Raises:
            serializers.ValidationError: the identifier has no space, or its
                type is not a value of MetadataAuthorityType.

        Returns:
            The corresponding MetadataAuthority
        """
        identifier = data.strip()

        # An empty separator means the identifier did not contain any space.
        type_name, separator, iri = identifier.partition(" ")
        if not separator:
            raise serializers.ValidationError(
                "The 'authority' query parameter must contain a space."
            )

        known_types = [member.value for member in MetadataAuthorityType]
        if type_name not in known_types:
            raise serializers.ValidationError(
                f"Invalid type {type_name}, must be one of: {', '.join(known_types)}"
            )

        return MetadataAuthority(MetadataAuthorityType(type_name), iri)
class RawExtrinsicMetadataQuerySerializer(serializers.Serializer):
    """Serializer validating the query parameters of raw extrinsic metadata
    requests."""

    # Mandatory authority identifier, parsed by MetadataAuthorityField.
    authority = MetadataAuthorityField(required=True)

    # Optional timestamp; None when the parameter is not provided.
    after = serializers.DateTimeField(default=None, required=False)

    # Optional page size, 100 when omitted, declared with bounds 1 and 10000.
    # NOTE(review): how SoftLimitsIntegerField treats out-of-bounds values is
    # defined in swh.web.api.serializers - confirm there.
    limit = SoftLimitsIntegerField(
        min_value=1, max_value=10000, default=100, required=False
    )

    # Optional pagination token; None when the parameter is not provided.
    page_token = serializers.CharField(default=None, required=False)
@api_route(
    "/raw-extrinsic-metadata/swhid/<swhid:target>/",
    "api-1-raw-extrinsic-metadata-swhid",
    query_params_serializer=RawExtrinsicMetadataQuerySerializer,
)
@api_doc("/raw-extrinsic-metadata/swhid/", category="Metadata")
@format_docstring()
def api_raw_extrinsic_metadata_swhid(
    request: Request,
    target: str,
    validated_query_params: dict[str, Any],
):
    """
    .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target)

        Returns raw `extrinsic metadata <https://docs.softwareheritage.org/devel/glossary.html#term-extrinsic-metadata>`__
        collected on a given object.

        :param string target: The core SWHID of the object whose metadata
            should be returned
        :query string authority: A metadata authority identifier, formatted as
            ``<type> <IRI>``. Required.
        :query string after: ISO8601 representation of the minimum timestamp of
            metadata to fetch. Defaults to allowing all metadata.
        :query int limit: Maximum number of metadata objects to return
            (default to 100).
        :query string page_token: optional opaque token, used to get the next
            page of results

        {common_headers}

        :>jsonarr string target: SWHID of the object described by this metadata
            (absent when ``target`` is not a core SWHID, ie. it does not have
            type ``cnt``/``dir``/``rev``/``rel``/``snp``)
        :>jsonarr string discovery_date: ISO8601/RFC3339 timestamp of the moment
            this metadata was collected.
        :>jsonarr object authority: authority this metadata is coming from
        :>jsonarr object fetcher: tool used to fetch the metadata
        :>jsonarr string format: short identifier of the format of the metadata
        :>jsonarr string metadata_url: link to download the metadata "blob" itself
        :>jsonarr string origin: URL of the origin in whose context the metadata
            is valid, if any
        :>jsonarr int visit: identifier of the visit in whose context the
            metadata is valid, if any
        :>jsonarr string snapshot: SWHID of the snapshot in whose context the
            metadata is valid, if any
        :>jsonarr string release: SWHID of the release in whose context the
            metadata is valid, if any
        :>jsonarr string revision: SWHID of the revision in whose context the
            metadata is valid, if any
        :>jsonarr string path: SWHID of the path in whose context the metadata
            is valid, if any, relative to a release or revision as anchor
        :>jsonarr string directory: SWHID of the directory in whose context the
            metadata is valid, if any

        :statuscode 200: no error

        **Example:**

        .. parsed-literal::

            :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/?authority=forge%20https://pypi.org/`
    """  # noqa B950
    authority, after, page_token, limit = (
        validated_query_params[name]
        for name in ("authority", "after", "page_token", "limit")
    )

    try:
        swhid = swhids.ExtendedSWHID.from_string(target)
    except swhids.ValidationError as e:
        raise BadInputExc(f"Invalid target SWHID: {e}") from None

    page = archive.lookup_raw_extrinsic_metadata(
        swhid, authority, after, page_token, limit
    )

    # Name suggested for the downloaded blobs: derived from the SWHID, unless
    # the target is an origin whose URL could be looked up.
    download_name = f"{target}_metadata"
    if swhid.object_type == swhids.ExtendedObjectType.ORIGIN:
        sha1_hex = hashutil.hash_to_hex(swhid.object_id)
        origins = list(archive.lookup_origins_by_sha1s([sha1_hex]))
        (origin,) = origins
        if origin is not None:
            download_name = re.sub("[:/_.]+", "_", origin["url"]) + "_metadata"

    def _expose_download_url(entry):
        # The metadata itself is a bytestring, which cannot be reliably
        # embedded in a JSON document; publish a download link in place of
        # the internal id.
        entry["metadata_url"] = reverse(
            "api-1-raw-extrinsic-metadata-get",
            url_args={"id": entry["id"]},
            query_params={"filename": download_name},
            request=request,
        )
        del entry["id"]
        return entry

    results = [_expose_download_url(entry) for entry in page.results]

    headers: Dict[str, str] = {}
    next_token = page.next_page_token
    if next_token is not None:
        # Same query as the current one, only moved to the next page token.
        next_page_params = {
            "authority": request.query_params["authority"],
            "after": str(after) if after else None,
            "limit": limit,
            "page_token": next_token,
        }
        headers["link-next"] = reverse(
            "api-1-raw-extrinsic-metadata-swhid",
            url_args={"target": target},
            query_params=next_page_params,
            request=request,
        )

    return {"results": results, "headers": headers}
@api_route(
    "/raw-extrinsic-metadata/get/(?P<id>[0-9a-z]+)/",
    "api-1-raw-extrinsic-metadata-get",
)
def api_raw_extrinsic_metadata_get(request: Request, id: str):
    # This is an internal endpoint that should only be accessed via URLs given
    # by /raw-extrinsic-metadata/swhid/; so it is not documented.
    #
    # It returns the raw metadata blob identified by the hexadecimal ``id`` as
    # an ``application/octet-stream`` attachment. The optional ``filename``
    # query parameter is used as the suggested download name when it is made
    # only of characters from ``[a-zA-Z0-9:._-]``.
    #
    # Raises BadInputExc when ``id`` cannot be decoded, and NotFoundExc when
    # the storage returns no metadata for it.

    # The route pattern also lets through letters g-z and odd-length values,
    # which are not valid hexadecimal; report them as bad input instead of
    # letting the decoding error propagate.
    # NOTE(review): assumes hash_to_bytes signals malformed hex with
    # ValueError - confirm against swh.model.hashutil.
    try:
        metadata_id = hashutil.hash_to_bytes(id)
    except ValueError:
        raise BadInputExc(f"Invalid metadata id: {id}") from None

    metadata = config.storage().raw_extrinsic_metadata_get_by_ids([metadata_id])
    if not metadata:
        raise NotFoundExc(
            "Metadata not found. Use /raw-extrinsic-metadata/swhid/ to access metadata."
        )

    response = HttpResponse(
        metadata[0].metadata, content_type="application/octet-stream"
    )

    filename = request.query_params.get("filename")
    # The whole value must be made of safe characters: re.match() would only
    # anchor the start, letting quotes or other unsafe characters through
    # into the quoted header value below.
    if filename and re.fullmatch(r"[a-zA-Z0-9:._-]+", filename):
        response["Content-disposition"] = f'attachment; filename="{filename}"'
    else:
        # The filename is missing or contains characters outside the safe
        # set (which can happen even for links built by
        # /raw-extrinsic-metadata/swhid/, e.g. for origin URLs with unusual
        # characters); we're better safe than sorry.
        response["Content-disposition"] = "attachment"
    return response
@api_route(
    "/raw-extrinsic-metadata/swhid/<swhid:target>/authorities/",
    "api-1-raw-extrinsic-metadata-swhid-authorities",
)
@api_doc("/raw-extrinsic-metadata/swhid/authorities/", category="Metadata")
@format_docstring()
def api_raw_extrinsic_metadata_swhid_authorities(request: Request, target: str):
    """
    .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target)/authorities/

        Returns a list of metadata authorities that provided metadata on the
        given target.
        They can then be used to get the raw `extrinsic metadata <https://docs.softwareheritage.org/devel/glossary.html#term-extrinsic-metadata>`__
        collected on that object from each of the authorities.

        This endpoint should only be used directly to retrieve metadata from
        core SWHIDs (with type ``cnt``, ``dir``, ``rev``, ``rel``, and ``snp``).
        For "extended" SWHIDs such as origins,
        :http:get:`/api/1/raw-extrinsic-metadata/origin/(origin_url)/authorities/`
        should be used instead of building this URL directly.

        :param string target: The core SWHID of the object whose
            metadata-providing authorities should be returned

        {common_headers}

        :>jsonarr string type: Type of authority (deposit_client, forge, registry)
        :>jsonarr string url: Unique IRI identifying the authority
        :>jsonarr object metadata_list_url: URL to get the list of metadata
            objects on the given object from this authority

        :statuscode 200: no error

        **Example:**

        .. parsed-literal::

            :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/authorities/`
    """  # noqa
    try:
        swhid = swhids.ExtendedSWHID.from_string(target)
    except swhids.ValidationError as e:
        raise BadInputExc(f"Invalid target SWHID: {e}") from None

    storage = config.storage()
    authorities = storage.raw_extrinsic_metadata_get_authorities(target=swhid)

    results = []
    for authority in authorities:
        entry = dict(authority.to_dict())
        # Identifier in the ``type IRI`` form taken by the ``authority``
        # query parameter of the metadata listing endpoint.
        identifier = f"{authority.type.value} {authority.url}"
        entry["metadata_list_url"] = reverse(
            "api-1-raw-extrinsic-metadata-swhid",
            url_args={"target": target},
            query_params={"authority": identifier},
            request=request,
        )
        results.append(entry)

    return {"results": results, "headers": {}}
@api_route(
    "/raw-extrinsic-metadata/origin/(?P<origin_url>.*)/authorities/",
    "api-1-raw-extrinsic-metadata-origin-authorities",
)
@api_doc("/raw-extrinsic-metadata/origin/authorities/", category="Metadata")
@format_docstring()
def api_raw_extrinsic_metadata_origin_authorities(request: Request, origin_url: str):
    """
    .. http:get:: /api/1/raw-extrinsic-metadata/origin/(origin_url)/authorities/

        Similar to
        :http:get:`/api/1/raw-extrinsic-metadata/swhid/(target)/authorities/`
        but to get metadata on origins instead of objects

        :param string origin_url: The URL of the origin whose
            metadata-providing authorities should be returned

        {common_headers}

        :>jsonarr string type: Type of authority (deposit_client, forge, registry)
        :>jsonarr string url: Unique IRI identifying the authority
        :>jsonarr object metadata_list_url: URL to get the list of metadata
            objects on the given object from this authority

        :statuscode 200: no error

        **Example:**

        .. parsed-literal::

            :swh_web_api:`raw-extrinsic-metadata/origin/https://github.com/rdicosmo/parmap/authorities/`
    """  # noqa
    # The origin is addressed through the SWHID computed from its URL, then
    # the request is redirected to the SWHID-based authorities endpoint.
    origin_swhid = Origin(url=origin_url).swhid()
    authorities_url = reverse(
        "api-1-raw-extrinsic-metadata-swhid-authorities",
        url_args={"target": origin_swhid},
    )
    return redirect(authorities_url)