Source code for PyXWF.Parsers.PyWebXML

# encoding=utf-8
# File name: PyWebXML.py
# This file is part of: pyxwf
#
# LICENSE
#
# The contents of this file are subject to the Mozilla Public License
# Version 1.1 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
#
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public license (the  "GPL License"), in which case
# the provisions of GPL License are applicable instead of those above.
#
# FEEDBACK & QUESTIONS
#
# For feedback and questions about pyxwf please e-mail one of the
# authors named in the AUTHORS file.
########################################################################
from __future__ import unicode_literals

import itertools
from datetime import datetime

from PyXWF.utils import ET
import PyXWF.utils as utils
import PyXWF.Registry as Registry
import PyXWF.Parsers as Parsers
import PyXWF.Document as Document
import PyXWF.Namespaces as NS

# Rebind the module-level name ``map`` to ``itertools.imap`` (Python 2), so
# that ``map(...)`` calls in this module produce lazy iterators rather than
# lists. Note that this shadows the built-in for the whole module.
map = itertools.imap

class PyWebXML(Parsers.ParserBase):
    """
    This class parses PyWebXML documents. Usually, you don't create
    instances of this, you just access it via the
    :attr:`~PyXWF.Site.parser_registry` attribute of your
    :class:`~PyXWF.Site` instance.
    """
    __metaclass__ = Registry.SitletonMeta

    # MIME types passed to the base class as ``parser_mimetypes``.
    mimetypes = ["application/x-pywebxml"]

    # Tags of metadata children which this parser interprets itself. Any
    # other non-comment child of the meta element is treated as an
    # extension node (see :meth:`_is_ext_node`).
    _known_nodes = {
        NS.XHTML.meta,
        NS.PyWebXML.title,
        NS.PyWebXML.link,
        NS.PyWebXML.author,
        NS.PyWebXML.date,
        NS.PyWebXML.description,
        NS.PyWebXML.kw,
        NS.PyWebXML.script,
    }

    def __init__(self, site):
        super(PyWebXML, self).__init__(
            site,
            parser_mimetypes=self.mimetypes
        )

    def _link_from_node(self, node):
        """
        Convert a link or script *node* into the object stored in the
        document's link list. This implementation returns the node
        unchanged.
        """
        return node

    def _is_ext_node(self, node):
        """
        Return :data:`True` if *node* is neither an XML comment nor one of
        the tags listed in :attr:`_known_nodes`.
        """
        return (node.tag is not ET.Comment and
                node.tag not in self._known_nodes)

    def get_links(self, meta):
        """
        Return a list of all link elements followed by all script elements
        found directly below *meta*, each passed through
        :meth:`_link_from_node`.
        """
        nodes = itertools.chain(
            meta.findall(NS.PyWebXML.link),
            meta.findall(NS.PyWebXML.script)
        )
        return [self._link_from_node(node) for node in nodes]

    @classmethod
    def get_keywords(cls, meta):
        """
        Return a list with the text of each keyword element in *meta*,
        converted to unicode.
        """
        return [unicode(node.text)
                for node in meta.findall(NS.PyWebXML.kw)]

    @classmethod
    def get_authors(cls, meta):
        """
        Return a list of objects created by
        :meth:`Document.Author.from_node` from the author elements in
        *meta*.
        """
        return [Document.Author.from_node(node)
                for node in meta.findall(NS.PyWebXML.author)]

    @classmethod
    def get_date(cls, meta):
        """
        Return the result of :func:`utils.parse_iso_date` applied to the
        text of the date element in *meta*.
        """
        # NOTE(review): findtext yields None if there is no date element;
        # this relies on parse_iso_date accepting None -- confirm.
        datetext = meta.findtext(NS.PyWebXML.date)
        return utils.parse_iso_date(datetext)

    @classmethod
    def get_meta(cls, meta):
        """
        Return the list of XHTML meta elements found in *meta*.
        """
        return meta.findall(NS.XHTML.meta)

    @classmethod
    def get_description(cls, meta):
        """
        Return the text of the description element in *meta*, or
        :data:`None` if there is no such element.
        """
        return meta.findtext(NS.PyWebXML.description)

    def parse_tree(self, root, header_offset=1):
        """
        Take the root element of an ElementTree and interpret it as
        PyWebXML document. Return the resulting
        :class:`~PyXWF.Document.Document` instance on success and raise on
        error.

        *header_offset* works as documented in the base class'
        :meth:`~PyXWF.Parsers.ParserBase.transform_headers` method.

        :class:`ValueError` is raised if *root* is not a PyWebXML page
        element, or if the metadata element, the title or the XHTML body
        is missing.
        """
        if root.tag != NS.PyWebXML.page:
            raise ValueError("This is not a pyxwf-xml document.")

        meta = root.find(NS.PyWebXML.meta)
        if meta is None:
            raise ValueError("Metadata is missing.")

        # Check for a missing title *before* converting to unicode:
        # unicode(None) is the string u"None", which would hide the error
        # and give the document a bogus title.
        title = meta.findtext(NS.PyWebXML.title)
        if title is None:
            raise ValueError("Title is missing.")
        title = unicode(title)

        keywords = self.get_keywords(meta)
        links = self.get_links(meta)

        body = root.find(NS.XHTML.body)
        if body is None:
            raise ValueError("No body tag found")
        self.transform_headers(body, header_offset)

        date = self.get_date(meta)
        authors = self.get_authors(meta)
        hmeta = self.get_meta(meta)
        description = self.get_description(meta)

        # Everything in the metadata which this parser does not interpret
        # itself is handed on to the document as extension nodes.
        ext = [node for node in meta if self._is_ext_node(node)]

        return Document.Document(title, keywords, links, body,
            ext=ext,
            date=date,
            authors=authors,
            hmeta=hmeta,
            description=description)

    def parse(self, fileref, **kwargs):
        """
        Parse the file referenced by *fileref* as PyWebXML document and
        return the resulting :class:`~PyXWF.Document.Document` instance.

        Any keyword arguments are forwarded to :meth:`parse_tree`.
        """
        tree = ET.parse(fileref)
        root = tree.getroot()
        return self.parse_tree(root, **kwargs)