# Source code for swh.deposit.parsers
# Copyright (C) 2017-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of defining parsers with SWORD 2.0 supported mediatypes."""
import logging
from xml.etree import ElementTree
from django.conf import settings
from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser
from swh.deposit.errors import ParserError
logger = logging.getLogger(__name__)
[docs]
class SWHFileUploadZipParser(FileUploadParser):
    """Raw file-upload parser that only advertises the zip media type."""

    # Media type this parser declares for itself.
    media_type = "application/zip"
[docs]
class SWHFileUploadTarParser(FileUploadParser):
    """Raw file-upload parser for tarballs (tar, tar.gz, tar.*)."""

    # Media type this parser declares for itself.
    media_type = "application/x-tar"
[docs]
class SWHXMLParser(BaseParser):
    """Parser turning an ``application/xml`` body into an element tree."""

    media_type = "application/xml"

    def parse(self, stream, media_type=None, parser_context=None):
        """Read ``stream`` as XML and return the parsed tree.

        Args:
            stream: file-like object holding the XML payload
            media_type: accepted for interface compatibility; not used here
            parser_context: optional mapping; its ``"encoding"`` entry, when
                present, overrides ``settings.DEFAULT_CHARSET``

        Returns:
            The object produced by ``ElementTree.parse`` for the stream.

        """
        context = parser_context if parser_context else {}
        charset = context.get("encoding", settings.DEFAULT_CHARSET)
        xml_parser = ElementTree.XMLParser(encoding=charset)
        return ElementTree.parse(stream, parser=xml_parser)
 
[docs]
class SWHAtomEntryParser(SWHXMLParser):
    """Parser restricted to the atom entry media type."""

    media_type = "application/atom+xml;type=entry"

    def parse(self, stream, media_type=None, parser_context=None):
        """Hand the stream back untouched instead of parsing it.

        The raw data must stay available alongside the parsed form, so the
        actual parsing is deferred to the atom entry call
        (cf. swh.deposit.api.common.APIBase._atom_entry).
        """
        return stream
 
[docs]
class SWHMultiPartParser(MultiPartParser):
    """Multipart parser restricted to a subset of media types."""

    # Media type pattern this parser declares for itself.
    media_type = "multipart/*; *"
[docs]
def parse_xml(raw_content):
    """Parse an XML body and return its root element.

    Args:
        raw_content (bytes): The content to parse

    Raises:
        ParserError: in case of a malformed xml; the underlying
            ``ElementTree.ParseError`` is attached as the cause

    Returns:
        xml.etree.ElementTree.Element: root element of the parsed document
        (not a dict).

    """
    # NOTE(review): the stdlib ElementTree parser is documented as not secure
    # against maliciously constructed data; if raw_content can come from
    # untrusted clients, consider a hardened parser -- confirm with callers.
    try:
        return ElementTree.fromstring(raw_content)
    except ElementTree.ParseError as e:
        # Chain explicitly so the original parse failure stays in the traceback.
        raise ParserError(str(e)) from e