# encoding=utf-8
# File name: utils.py
# This file is part of: pyxwf
# The contents of this file are subject to the Mozilla Public License
# Version 1.1 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public license (the "GPL License"), in which case
# the provisions of GPL License are applicable instead of those above.
# For feedback and questions about pyxwf please e-mail one of the
# authors named in the AUTHORS file.
import abc, os, re, logging
from datetime import datetime
import lxml.etree as ET
# http://plumberjack.blogspot.de/2010/10/supporting-alternative-formatting.html
class BraceMessage(object):
    """
    Log message wrapper which uses ``str.format`` style placeholders.

    The format string *fmt* and its positional and keyword arguments are only
    stored on construction; the actual formatting happens when the object is
    converted to a string.
    """

    def __init__(self, fmt, *args, **kwargs):
        self.fmt, self.args, self.kwargs = fmt, args, kwargs

    def __str__(self):
        template = self.fmt
        return template.format(*self.args, **self.kwargs)


# Short alias for use at logging call sites.
_F = BraceMessage
class NoInstance(type):
    """
    Metaclass for classes which must never be instantiated.

    Every class created through this metaclass gets a ``__new__`` which
    unconditionally raises :class:`TypeError`.
    """

    def _not_instanciable(cls, *args, **kwargs):
        # Installed as ``__new__`` of the created class, so the first
        # positional argument is the class whose instantiation was attempted.
        raise TypeError("Cannot instanciate {0}".format(cls.__name__))

    def __new__(mcls, name, bases, dct):
        # Override any ``__new__`` supplied in the class body.
        dct["__new__"] = mcls._not_instanciable
        return super(NoInstance, mcls).__new__(mcls, name, bases, dct)
def split_tag(tag):
    """
    Split an ElementTree tag into its namespace and XML local-name and return
    these as a tuple ``(namespace, localname)``. If the tag has no namespace
    associated, :data:`None` is returned for *namespace*.

    An empty *tag* yields ``(None, "")``. Raises :class:`ValueError` if *tag*
    starts with ``{`` but has no closing ``}``.
    """
    assert isinstance(tag, basestring)
    if not tag.startswith("{"):
        # Also covers the empty string.
        return None, tag
    ns, closing_brace, name = tag[1:].partition("}")
    if not closing_brace:
        # Without a closing brace there is no way to tell namespace and
        # local-name apart; refuse instead of returning a mangled pair.
        raise ValueError("unterminated namespace in tag: {0!r}".format(tag))
    return ns, name
def add_class(node, cls):
    """
    Take the ``@class`` attribute of *node*, split it at whitespace, put it
    into a :class:`set`, add *cls* to the set and write the space-joined
    result back to the attribute.

    Duplicate class names are collapsed. The names are written in sorted
    order so that the resulting attribute value is deterministic.
    """
    classes = set(node.get("class", "").split())
    classes.add(cls)
    node.set("class", " ".join(sorted(classes)))
def file_last_modified(fileref, float_times=False):
    """
    If *fileref* is a file name or a file like with associated fileno which
    points to an actual file, return the date of last modification
    stored in the filesystem, :data:`None` otherwise.

    The result is a naive :class:`~datetime.datetime` in UTC.

    By default, the times are truncated to full seconds. If you need the
    floating point part of the times (if supported by the platform), pass
    ``True`` to *float_times*.
    """
    try:
        if isinstance(fileref, basestring):
            st = os.stat(fileref)
        else:
            fno = fileref.fileno()
            if fno >= 0:
                st = os.fstat(fno)
            else:
                return None
    except (OSError, IOError, ValueError, AttributeError):
        # OSError: missing or inaccessible file; AttributeError: object
        # without fileno(); ValueError / IOError: closed files and in-memory
        # streams whose fileno() is unsupported.
        return None

    if float_times:
        mtime = st.st_mtime
    else:
        mtime = int(st.st_mtime)
    return datetime.utcfromtimestamp(mtime)
def unicode2xpathstr(value):
    """
    Return an XPath 1.0 expression which evaluates to the text of *value*.

    XPath 1.0 string literals have no escape mechanism, so the quoting
    depends on the contents:

    * without ``"``, the text is wrapped in double quotes,
    * with ``"`` but without ``'``, it is wrapped in single quotes,
    * with both, a ``concat(...)`` call is built from double-quoted pieces
      joined by ``'"'`` literals.
    """
    text = unicode(value)
    if '"' not in text:
        return '"' + text + '"'
    if "'" not in text:
        return "'" + text + "'"

    # Both quote characters occur: split at the double quotes and glue the
    # pieces back together inside concat(). As the text contains a single
    # quote, at least one piece is non-empty, so concat() always receives
    # two or more arguments.
    arguments = []
    for index, piece in enumerate(text.split('"')):
        if index:
            arguments.append("'\"'")
        if piece:
            arguments.append('"' + piece + '"')
    return "concat(" + ", ".join(arguments) + ")"
def parse_iso_date(s):
    """
    Parse a date like returned with :meth:`~datetime.datetime.isoformat`, but
    with a trailing `Z` to indicate the UTC timezone.

    Both ``YYYY-MM-DDTHH:MM:SSZ`` and the variant with fractional seconds
    (``YYYY-MM-DDTHH:MM:SS.ffffffZ``, as produced by ``isoformat`` for
    non-zero microseconds) are accepted. Returns a naive
    :class:`~datetime.datetime`, or :data:`None` if *s* is :data:`None`.
    Raises :class:`ValueError` if *s* does not match either format.
    """
    if s is None:
        return None
    if "." in s:
        fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    else:
        fmt = "%Y-%m-%dT%H:%M:%SZ"
    return datetime.strptime(s, fmt)
def XHTMLToHTML(tree):
    """
    Converts the given ETree *tree* from XHTML to HTML *in-place* by replacing
    each element tag with its local-name. Raises :class:`ValueError` if a
    non-XHTML namespace is encountered.

    The conversion is not atomic: elements visited before the offending one
    keep their already rewritten tag when the exception is raised.
    """
    import PyXWF.Namespaces as NS
    xhtml_ns = str(NS.XHTML)
    for item in tree.iter():
        if not isinstance(item.tag, basestring):
            # Nodes whose tag is not a string (with lxml e.g. comments and
            # processing instructions) carry no namespace to strip.
            continue
        ns, name = split_tag(item.tag)
        if ns != xhtml_ns:
            logging.debug("offending element: {0}".format(ET.tostring(item)))
            raise ValueError("tree contains non-xhtml elements: {0}:{1}".format(ns, name))
        item.tag = name
# Matches the "Mobile" token of a User-Agent header, either as a separate
# word or followed by a build identifier ("Mobile/<hex digits>").
mobile_useragent_re = re.compile(r"(\sMobile\s|\sMobile/[0-9a-fA-F]+)")
# Pattern tail matching a ``major[.minor]`` version number; the match is
# exposed as the named group "version".
_UA_VERSION = r"(?P<version>[0-9]+(\.[0-9]+)?)"

# Ordered list of ``(agentname, compiled regex)`` pairs. The order matters:
# a header is attributed to the first entry whose pattern is found in it, so
# more specific agents must come before agents whose token they also send
# (e.g. "chrome" before "safari").
useragent_regexes = [
    ("googlebot", re.compile(r"Googlebot/" + _UA_VERSION)),
    ("googlebot", re.compile(r"Googlebot-Image/" + _UA_VERSION)),
    ("bingbot", re.compile(r"bingbot/" + _UA_VERSION)),
    ("ahrefsbot", re.compile(r"AhrefsBot/" + _UA_VERSION)),
    ("yandexbot", re.compile(r"YandexBot/" + _UA_VERSION)),
    ("yahoo-slurp", re.compile(r"Yahoo! Slurp/" + _UA_VERSION)),
    ("yahoo-slurp", re.compile(r"Yahoo! Slurp")),
    ("speedy-spider", re.compile(r"Speedy Spider")),
    ("sistrix-crawler", re.compile(r"SISTRIX Crawler")),
    ("msnbot", re.compile(r"msnbot/" + _UA_VERSION)),
    ("msnbot", re.compile(r"msnbot-media/" + _UA_VERSION)),
    ("konqueror", re.compile(r"Konqueror/" + _UA_VERSION)),
    ("chrome", re.compile(r"Chrome/" + _UA_VERSION)),
    # The version group used to be unnamed here, which meant that the
    # version of Internet Explorer was never reported.
    ("ie", re.compile(r"MSIE " + _UA_VERSION)),
    ("firefox", re.compile(r"Firefox/" + _UA_VERSION)),
    ("firefox", re.compile(r"Gecko/[0-9]+\s+Firefox[0-9]+")),
    ("firefox", re.compile(r"Minefield/" + _UA_VERSION)),
    ("firefox", re.compile(r"Iceape/" + _UA_VERSION)),
    ("firefox", re.compile(r"Iceweasel/" + _UA_VERSION)),
    ("seamonkey", re.compile(r"SeaMonkey/" + _UA_VERSION)),
    ("safari", re.compile(r"Safari/" + _UA_VERSION)),
    ("opera", re.compile(r"Opera/([0-9.]+).*Version/" + _UA_VERSION)),
    ("opera", re.compile(r"Opera/" + _UA_VERSION)),
    ("lynx", re.compile(r"Lynx/" + _UA_VERSION)),
    ("links", re.compile(r"Links ")),
    ("w3m", re.compile(r"w3m/" + _UA_VERSION)),
    ("wget", re.compile(r"[Ww]get/" + _UA_VERSION)),
    ("rotfuchs", re.compile(r"Gecko Rotfuchs")),
    ("epiphany", re.compile(r"Epiphany/" + _UA_VERSION)),
    ("rssowl", re.compile(r"RSSOwl/" + _UA_VERSION)),
    ("askbot", re.compile(r"Ask Jeeves")),
    ("exabot", re.compile(r"Exabot/" + _UA_VERSION)),
    ("seekbot", re.compile(r"Seekbot/" + _UA_VERSION)),
    ("libwww-perl", re.compile(r"libwww-perl/" + _UA_VERSION)),
    ("blank", re.compile(r"^\s*-\s*$")),
]
# Maps an agent name (first item of the ``useragent_regexes`` pairs) to a
# coarse class of user agent. Agent names without an entry here have no
# class.
useragent_classes = {
    # search engine indexers
    "googlebot": "indexer",
    "bingbot": "indexer",
    "yahoo-slurp": "indexer",
    "msnbot": "indexer",
    "yandexbot": "indexer",
    "askbot": "indexer",
    "exabot": "indexer",
    "seekbot": "indexer",
    # other crawlers
    "speedy-spider": "crawler",
    "sistrix-crawler": "crawler",
    "wget": "crawler",
    "ahrefsbot": "crawler",
    # interactive browsers
    "firefox": "browser",
    "seamonkey": "browser",
    "safari": "browser",
    "opera": "browser",
    "links": "browser",
    "lynx": "browser",
    "rotfuchs": "browser",
    "chrome": "browser",
    "ie": "browser",
    "w3m": "browser",
    "konqueror": "browser",
    "epiphany": "browser",
    # feed readers
    "rssowl": "feedreader",
}
def guess_useragent(headerval):
    """
    Return a tuple ``(useragent, version)`` for the ``User-Agent`` header
    value *headerval*.

    *useragent* is the name of the first entry in ``useragent_regexes`` whose
    pattern is found in *headerval* (for example ``firefox``, ``chrome``,
    ``ie``, ``opera``, ``safari``, ``googlebot`` or ``wget``), or
    :data:`None` if no entry matches or *headerval* is empty or
    :data:`None`.

    *version* will be either the version number of the user agent or None if
    the matching pattern does not capture a version. The version number is
    represented as a floating point value.
    """
    if not headerval:
        # Missing or empty header: nothing to match against.
        return None, None
    for agentname, regex in useragent_regexes:
        m = regex.search(headerval)
        if m is None:
            continue
        # Not every pattern has a "version" group.
        raw_version = m.groupdict().get("version")
        version = float(raw_version) if raw_version is not None else None
        return agentname, version
    return None, None
def classify_useragent(uaname):
    """
    Return the class registered for the agent name *uaname* in
    ``useragent_classes``, or :data:`None` if the name has no entry there.
    """
    try:
        return useragent_classes[uaname]
    except KeyError:
        return None
def is_mobile_useragent(headerval):
    """
    Return :data:`True` if ``mobile_useragent_re`` is found in the
    ``User-Agent`` header value *headerval*, :data:`False` otherwise. A
    missing (:data:`None`) or empty header is never considered mobile.
    """
    if not headerval:
        return False
    return mobile_useragent_re.search(headerval) is not None
def chunk_string(s, chunk_size=1024):
    """
    Generate consecutive slices of *s*, each at most *chunk_size* items long.

    At least one chunk is always produced, so an empty *s* yields a single
    empty slice. Raises :class:`ValueError` when iteration starts if
    *chunk_size* is not positive, as the offset could then never reach the
    end of *s*.
    """
    if chunk_size <= 0:
        raise ValueError(
            "chunk_size must be positive, got {0!r}".format(chunk_size))
    total = len(s)
    off = 0
    while True:
        yield s[off:off + chunk_size]
        off += chunk_size
        if off >= total:
            break
# Prefer the real threading module; fall back to the API-compatible dummy
# implementation on interpreters without thread support.
try:
    import threading
except ImportError as err:
    logging.warning(_F("Could not import threading: {0}", err))
    logging.warning("Will fallback to dummy_threading; expect promlems")
    import dummy_threading as threading
# Prefer the native blist package; fall back to the (slower) surrogate
# shipped with PyXWF if it is not installed.
try:
    import blist
except ImportError as err:
    logging.warning(_F("Could not import blist: {0}", err))
    logging.warning("Will fallback to surrogate; sortedlist will be slow")
    import PyXWF.Surrogates.blist as blist
else:
    # NOTE(review): reconstructed from the ``except AttributeError`` handler;
    # only the native module gets the marker, the surrogate is left as-is.
    if not hasattr(blist, "__version__"):
        # blist doesn't have version tag
        blist.__version__ = "native"