# Source code for swh.deposit.parsers

# Copyright (C) 2017-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


"""Module in charge of defining parsers with SWORD 2.0 supported mediatypes."""

import logging
from xml.etree import ElementTree

from django.conf import settings
from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser

from swh.deposit.errors import ParserError

logger = logging.getLogger(__name__)



class SWHFileUploadZipParser(FileUploadParser):
    """File upload parser limited to zip archives.

    Only the advertised media type is overridden; all upload handling is
    inherited unchanged from ``FileUploadParser``.
    """

    # Media type this parser declares itself responsible for.
    media_type = "application/zip"




class SWHFileUploadTarParser(FileUploadParser):
    """File upload parser limited to tarball (tar, tar.gz, tar.*) archives.

    Only the advertised media type is overridden; all upload handling is
    inherited unchanged from ``FileUploadParser``.
    """

    # Media type this parser declares itself responsible for.
    media_type = "application/x-tar"




class SWHXMLParser(BaseParser):
    """XML parser for ``application/xml`` request bodies."""

    # Media type this parser declares itself responsible for.
    media_type = "application/xml"

    def parse(self, stream, media_type=None, parser_context=None):
        """Parse the incoming bytestream as XML and return the resulting tree.

        Args:
            stream: source handed to ``ElementTree.parse`` (a file-like
                object holding the raw XML body)
            media_type: not used by this implementation
            parser_context (dict): optional context; when it holds an
                ``encoding`` entry, that value is used instead of
                ``settings.DEFAULT_CHARSET``

        Raises:
            xml.etree.ElementTree.ParseError: propagated as-is when the
                content is not well-formed XML

        Returns:
            xml.etree.ElementTree.ElementTree: the parsed document

        """
        # NOTE(review): the stdlib ElementTree parser is not hardened against
        # maliciously constructed XML (entity expansion attacks) -- confirm
        # that request bodies are size-limited or otherwise trusted.
        parser_context = parser_context or {}
        encoding = parser_context.get("encoding", settings.DEFAULT_CHARSET)
        parser = ElementTree.XMLParser(encoding=encoding)
        return ElementTree.parse(stream, parser=parser)





class SWHAtomEntryParser(SWHXMLParser):
    """Atom entry parser limited to the atom entry media type."""

    # Media type this parser declares itself responsible for.
    media_type = "application/atom+xml;type=entry"

    def parse(self, stream, media_type=None, parser_context=None):
        """Return ``stream`` untouched, deferring the actual XML parsing.

        Args:
            stream: the incoming request body stream
            media_type: not used by this implementation
            parser_context: not used by this implementation

        Returns:
            The very same ``stream`` object that was passed in.

        """
        # We do not actually want to parse the stream yet
        # because we want to keep the raw data as well
        # this is done later in the atom entry call
        # (cf. swh.deposit.api.common.APIBase._atom_entry)
        return stream





class SWHMultiPartParser(MultiPartParser):
    """Multipart parser limited to a subset of mediatypes.

    Only the advertised media type is overridden; all multipart handling is
    inherited unchanged from ``MultiPartParser``.
    """

    # Media type pattern this parser declares itself responsible for.
    media_type = "multipart/*; *"




def parse_xml(raw_content):
    """Parse an XML body.

    Args:
        raw_content (bytes): The content to parse

    Raises:
        ParserError: in case of malformed xml; the message is the text of
            the underlying ``ElementTree.ParseError``, which is kept as the
            exception's ``__cause__``

    Returns:
        xml.etree.ElementTree.Element: the root element of the parsed
        document.

    """
    try:
        return ElementTree.fromstring(raw_content)
    except ElementTree.ParseError as e:
        # Chain explicitly so the original parse failure (with its position
        # information) stays attached to the project-level error.
        raise ParserError(str(e)) from e