# Source code for PyXWF.Parsers.PyWebXML
# encoding=utf-8
# File name: PyWebXML.py
# This file is part of: pyxwf
#
# LICENSE
#
# The contents of this file are subject to the Mozilla Public License
# Version 1.1 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
#
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public license (the "GPL License"), in which case
# the provisions of GPL License are applicable instead of those above.
#
# FEEDBACK & QUESTIONS
#
# For feedback and questions about pyxwf please e-mail one of the
# authors named in the AUTHORS file.
########################################################################
from __future__ import unicode_literals
import itertools
from datetime import datetime
from PyXWF.utils import ET
import PyXWF.utils as utils
import PyXWF.Registry as Registry
import PyXWF.Parsers as Parsers
import PyXWF.Document as Document
import PyXWF.Namespaces as NS
# Rebind ``map`` to ``itertools.imap`` (Python 2), which yields results lazily
# instead of building an intermediate list. Note that this shadows the builtin
# ``map`` for the whole module.
map = itertools.imap
class PyWebXML(Parsers.ParserBase):
    """
    This class parses PyWebXML documents. Usually, you don't create instances of
    this, you just access it via the :attr:`~PyXWF.Site.parser_registry`
    attribute of your :class:`~PyXWF.Site` instance.
    """
    __metaclass__ = Registry.SitletonMeta

    # MIME types handed to the base class constructor as ``parser_mimetypes``.
    mimetypes = ["application/x-pywebxml"]

    # Tags of metadata children which are not reported as extension nodes;
    # see :meth:`_is_ext_node`.
    _known_nodes = {
        NS.XHTML.meta,
        NS.PyWebXML.title,
        NS.PyWebXML.link,
        NS.PyWebXML.author,
        NS.PyWebXML.date,
        NS.PyWebXML.description,
        NS.PyWebXML.kw,
        NS.PyWebXML.script,
    }

    def __init__(self, site):
        """
        Initialise the parser for *site*, passing :attr:`mimetypes` to the
        base class as ``parser_mimetypes``.
        """
        super(PyWebXML, self).__init__(
            site,
            parser_mimetypes=self.mimetypes
        )

    def _link_from_node(self, node):
        """
        Convert a link or script *node* into its link representation.
        Currently the node itself is returned unchanged.
        """
        return node

    def _is_ext_node(self, node):
        """
        Return :data:`True` if *node* is neither an XML comment nor one of
        the tags listed in :attr:`_known_nodes`.
        """
        tag = node.tag
        if tag is ET.Comment:
            return False
        return tag not in self._known_nodes

    def get_links(self, meta):
        """
        Return a list with all ``link`` children of *meta* followed by all
        ``script`` children of *meta*, each passed through
        :meth:`_link_from_node`.
        """
        nodes = itertools.chain(
            meta.findall(NS.PyWebXML.link),
            meta.findall(NS.PyWebXML.script)
        )
        return [self._link_from_node(node) for node in nodes]

    @classmethod
    def get_keywords(cls, meta):
        """
        Return a list with the text of every ``kw`` child of *meta*,
        converted to :class:`unicode`.
        """
        # NOTE(review): a keyword element without any text has ``text`` set to
        # None and therefore shows up as the string "None" in the result.
        return [unicode(node.text) for node in meta.findall(NS.PyWebXML.kw)]

    @classmethod
    def get_authors(cls, meta):
        """
        Return a list with one :class:`~PyXWF.Document.Author`, created via
        ``Author.from_node``, for each ``author`` child of *meta*.
        """
        from_node = Document.Author.from_node
        return [from_node(node) for node in meta.findall(NS.PyWebXML.author)]

    @classmethod
    def get_date(cls, meta):
        """
        Return the result of :func:`PyXWF.utils.parse_iso_date` applied to the
        text of the ``date`` child of *meta*.
        """
        # findtext() yields None if there is no date element; that value is
        # forwarded as-is to parse_iso_date.
        datetext = meta.findtext(NS.PyWebXML.date)
        return utils.parse_iso_date(datetext)

    @classmethod
    def get_meta(cls, meta):
        """
        Return the list of XHTML ``meta`` elements which are children of
        *meta*.
        """
        return meta.findall(NS.XHTML.meta)

    @classmethod
    def get_description(cls, meta):
        """
        Return the text of the ``description`` child of *meta*, or
        :data:`None` if there is no such child.
        """
        return meta.findtext(NS.PyWebXML.description)

    def parse_tree(self, root, header_offset=1):
        """
        Take the root element of an ElementTree and interpret it as PyWebXML
        document. Return the resulting :class:`~PyXWF.Document.Document`
        instance on success and raise on error.

        *header_offset* works as documented in the base class'
        :meth:`~PyXWF.Parsers.ParserBase.transform_headers` method.

        :class:`ValueError` is raised if *root* is not a PyWebXML ``page``
        element or if the metadata element, the title or the XHTML body is
        missing.
        """
        if root.tag != NS.PyWebXML.page:
            raise ValueError("This is not a pyxwf-xml document.")

        meta = root.find(NS.PyWebXML.meta)
        if meta is None:
            raise ValueError("Metadata is missing.")

        # Check for a missing title *before* converting to unicode:
        # unicode(None) is the string "None", which would hide the error.
        title = meta.findtext(NS.PyWebXML.title)
        if title is None:
            raise ValueError("Title is missing.")
        title = unicode(title)

        keywords = self.get_keywords(meta)
        links = self.get_links(meta)

        body = root.find(NS.XHTML.body)
        if body is None:
            raise ValueError("No body tag found")
        self.transform_headers(body, header_offset)

        date = self.get_date(meta)
        authors = self.get_authors(meta)
        hmeta = self.get_meta(meta)
        description = self.get_description(meta)

        # Everything in the metadata which is neither a comment nor a known
        # tag is handed to the document as extension nodes.
        ext = [node for node in meta if self._is_ext_node(node)]

        return Document.Document(
            title, keywords, links, body,
            ext=ext, date=date, authors=authors, hmeta=hmeta,
            description=description)

    def parse(self, fileref, **kwargs):
        """
        Parse the file referenced by *fileref* as PyWebXML document and return
        the resulting :class:`~PyXWF.Document.Document` instance.

        Any keyword arguments are forwarded to :meth:`parse_tree`.
        """
        root = ET.parse(fileref).getroot()
        return self.parse_tree(root, **kwargs)