#!/usr/bin/python3.4
# -*- coding: iso-8859-15 -*-
# $Id$
# $HeadURL$
"""This module contains general functions that help parsing the mediawiki markup.
I looked for an existing MediaWiki parser written in Python but did not find one
that convinced me. For reference, these are the candidates I looked at:
* py-wikimarkup https://github.com/dcramer/py-wikimarkup
* mwlib http://code.pediapress.com/wiki/wiki
"""
import re
import xml.etree.ElementTree
import collections
import formencode
class ParseError(RuntimeError):
    """Signals that a piece of wikitext could not be parsed.

    It is raised by parse_googlemap; it carries no data beyond the
    message passed to the constructor."""
def find_template(wikitext, template_title):
    """Returns the tuple (start, end) of the first occurrence of the template '{{template ...}}' within wikitext.
    (None, None) is returned if the template is not found.

    If you are sure that the wikitext contains the template, the template could be extracted like follows:

    >>> wikitext = 'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, 'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    or just:

    >>> print(wikitext[slice(*find_template(wikitext, 'Color'))])
    {{Color|red|red text}}

    The search is done with a regular expression. The parameter part must not contain the
    character "}" (e.g. a nested template); such a template is not found.

    :param wikitext: The text that has the template in it.
    :param template_title: The page title of the template with or without namespace (but as in the wikitext).
        It is matched literally, i.e. regular expression metacharacters in the title have no special meaning.
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start.
        (None, None) if the template is not found.
    """
    # The title is escaped so that characters like '(', '.', or '?' in a page
    # title are matched literally instead of being interpreted as regex syntax.
    pattern = r"\{\{" + re.escape(template_title) + r"\s*(\|[^\}]*)?\}\}"
    match = re.search(pattern, wikitext, re.DOTALL)
    if match is None:
        return None, None
    return match.start(), match.end()
class TemplateValidator(formencode.FancyValidator):
    """Converts between MediaWiki template markup, e.g. '{{Color|red|text=Any text}}',
    and the Python tuple (title, anonym_params, named_params)."""

    def __init__(self, strip=True, as_table=False, as_table_keylen=None):
        """Validates a MediaWiki template, e.g. {{Color|red}}

        :param strip: If strip is True, the title, and the parameter keys and values are stripped in to_python.
        :param as_table: formats the returned template in one row for each parameter
        :param as_table_keylen: length of the key field for from_python. None for "automatic"."""
        # self.strip is stored as a callable so that to_python can apply it unconditionally.
        self.strip = (lambda s: s.strip()) if strip else (lambda s: s)
        self.as_table = as_table
        self.as_table_keylen = as_table_keylen

    def to_python(self, value, state=None):
        """Takes a template, like '{{Color|red|text=Any text}}' and translates it to a Python tuple
        (title, anonym_params, named_params) where title is the template title,
        anonym_params is a list of anonymous parameters and named_params is an OrderedDict
        of named parameters. Whitespace of the title, keys and values is stripped
        unless the validator was created with strip=False.

        The template is split at every '|', so parameter values that contain a '|'
        themselves (e.g. nested templates or piped links) are not supported.

        :param value: the template markup, starting with '{{' and ending with '}}'.
        :param state: passed through to formencode.Invalid.
        :raises formencode.Invalid: if the markup does not start with '{{' or end with '}}',
            the title is empty, an anonymous parameter follows a named one,
            a key is empty or a key occurs twice."""
        if not value.startswith('{{'):
            raise formencode.Invalid('Template does not start with "{{"', value, state)
        if not value.endswith('}}'):
            raise formencode.Invalid('Template does not end with "}}"', value, state)
        parts = value[2:-2].split('|')

        # template name
        title = self.strip(parts[0])
        if len(title) == 0:
            raise formencode.Invalid('Empty template tilte.', value, state)
        params = parts[1:]

        # anonymous parameters: the leading run of parameters without '='
        first_named = 0
        while first_named < len(params) and '=' not in params[first_named]:
            first_named += 1
        anonym_params = [self.strip(param) for param in params[:first_named]]

        # named or numbered parameters
        named_params = collections.OrderedDict()
        for param in params[first_named:]:
            # A dedicated name is used for the parameter value so that 'value'
            # keeps referring to the whole template in the error reports below.
            key, sep, param_value = param.partition('=')
            if not sep:
                raise formencode.Invalid('Anonymous parameter after named parameter.', value, state)
            key = self.strip(key)
            if len(key) == 0:
                raise formencode.Invalid('Empty key.', value, state)
            if key in named_params:
                raise formencode.Invalid('Duplicate key: "{0}"'.format(key), value, state)
            named_params[key] = self.strip(param_value)
        return title, anonym_params, named_params

    def from_python(self, value, state=None):
        """Formats a MediaWiki template.

        value is a tuple: (title, anonym_params, named_params)
        where title is the template title, anonym_params is a list of anonymous parameters and
        named_params is a dict or OrderedDict of named parameters.

        If the validator was created with as_table=True, each parameter is put on its own line,
        the keys are padded to as_table_keylen characters (or to the length of the longest key
        if as_table_keylen is None) and trailing whitespace of each parameter line is removed.

        :param value: the tuple (title, anonym_params, named_params).
        :param state: not used.
        :return: the template markup as string."""
        title, anonym_params, named_params = value
        if self.as_table:
            pipe_char, equal_char, end_char = '\n| ', ' = ', '\n}}'
        else:
            pipe_char, equal_char, end_char = '|', '=', '}}'
        parts = ["{{" + title]
        parts += anonym_params
        as_table_keylen = self.as_table_keylen
        if self.as_table and as_table_keylen is None:
            # default=0 keeps templates without named parameters from failing in max().
            as_table_keylen = max(map(len, named_params), default=0)
        for k, v in named_params.items():
            if self.as_table:
                k = k.ljust(as_table_keylen)
                # rstrip avoids trailing blanks on lines with an empty value
                parts.append((k + equal_char + v).rstrip())
            else:
                parts.append(k + equal_char + v)
        return pipe_char.join(parts) + end_char
def split_template(template):
    """Deprecated legacy function.

    Takes a template, like '{{Color|red|text=Any text}}' and translates it to a Python tuple
    (template_title, parameters) where parameters is a Python dictionary {'1': 'red', 'text': 'Any text'}.
    Anonymous parameters get integer keys (converted to strings) starting with 1
    like in MediaWiki, named parameters keep their names.
    If a named parameter uses the same number as an anonymous parameter,
    the anonymous parameter wins.
    Whitespace is stripped.

    :param template: the template markup.
    :return: the tuple (template_title, parameters).
    :raises ValueError: if an unexpected format is encountered."""
    try:
        title, anonym_params, named_params = TemplateValidator().to_python(template)
    except formencode.Invalid as e:
        # Exceptions are not subscriptable in Python 3, so the message is taken via str().
        raise ValueError(str(e)) from e
    parameters = dict(named_params)
    for number, param in enumerate(anonym_params, start=1):
        parameters[str(number)] = param
    return title, parameters
def create_template(template_title, anonym_params=[], named_param_keys=[], named_param_values=[], as_table=False, as_table_keylen=None):
    """Deprecated legacy function.

    Builds the markup of a MediaWiki template by delegating to TemplateValidator.from_python.

    :param template_title: string with the template name
    :param anonym_params: list with parameters without keys
    :param named_param_keys: list with keys of named parameters
    :param named_param_values: list with values of named parameters, corresponding to named_param_keys.
        Keys and values are paired by position.
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic".
    :return: the template as string"""
    # NOTE: the list defaults are only read by this function, never modified.
    keyed_params = collections.OrderedDict()
    for key, val in zip(named_param_keys, named_param_values):
        keyed_params[key] = val
    formatter = TemplateValidator(as_table=as_table, as_table_keylen=as_table_keylen)
    template_data = (template_title, anonym_params, keyed_params)
    return formatter.from_python(template_data)
def find_tag(wikitext, tagname, pos=0):
    """Returns position information of the first occurrence of the tag '<tag ...>...</tag>'
    or '<tag ... />'.

    If you are sure that the wikitext contains the tag, the tag could be extracted like follows:

    >>> wikitext = 'This is a <tag>mytag</tag> tag.'
    >>> start, content, endtag, end = find_tag(wikitext, 'tag')
    >>> print(wikitext[start:end])
    <tag>mytag</tag>

    The first end tag after the start tag is used, so nested tags of the same name
    are not supported.

    :param wikitext: The text that has the tag in it.
    :param tagname: Name of the tag, e.g. 'tag' for <tag>. It is matched literally.
    :param pos: position within wikitext to start searching the tag.
    :return:
        (start, content, endtag, end). start is the position of '<' of the tag,
        content is the beginning of the content (after '>'), endtag is the
        beginning of the end tag ('</') and end is one position after the end tag.
        For single tags, (start, None, None, end) is returned.
        If the tag is not found (or only the start tag is present),
        (None, None, None, None) is returned.
    """
    name = re.escape(tagname)

    # Find start tag. The lookahead requires the tag name to end right there,
    # so that searching for 'tag' does not match '<tags>'.
    regexp_starttag = re.compile(r"<{0}(?=[\s/>]).*?(/?)>".format(name), re.DOTALL)
    match_starttag = regexp_starttag.search(wikitext, pos)
    if match_starttag is None:
        return None, None, None, None

    # does the tag have content?
    if match_starttag.group(1):  # group(1) is either '' or '/'.
        # single tag
        return match_starttag.start(), None, None, match_starttag.end()

    # tag with content
    regexp_endtag = re.compile(r"</{0}>".format(name))
    match_endtag = regexp_endtag.search(wikitext, match_starttag.end())
    if match_endtag is None:
        # No closing tag - error in wikitext
        return None, None, None, None
    return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end()
def parse_googlemap(wikitext):
    """Parses the '<googlemap ...>content</googlemap>' of the googlemap extension.
    If wikitext does not contain the <googlemap> tag or if the <googlemap> tag contains
    invalid formatted lines, a ParseError is raised.
    Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary
    wikitext before using this function.

    :param wikitext: wikitext containing the <googlemap> tag. Example:

    wikitext = '''
    <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
    (Parkplatz)47.114958,11.266026
    Parkplatz
    (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
    6#FF014E9A
    47.114715,11.266262
    47.114135,11.268381
    47.113421,11.269322
    47.11277,11.269979
    47.112408,11.271119
    </googlemap>
    '''

    :returns: The tuple (attributes, coords, paths) is returned.
        attributes is a dict that contains the attributes that are present
        (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or int.
        coords is a list of (lon, lat, symbol, title) tuples.
        paths is a list of (style, coords) tuples.
        coords is again a list of (lon, lat, symbol, title) tuples.
    :raises ParseError: if the tag is missing, its start tag is not well-formed XML,
        an attribute cannot be converted or a content line is not understood."""

    def is_coord(line):
        """Returns True if the line contains a coordinate."""
        return re.search(r'[0-9]{1,2}\.[0-9]+, ?[0-9]{1,2}\.[0-9]+', line) is not None

    def is_path(line):
        """Returns True if the line contains a path style definition."""
        return re.match(r'[0-9]#[0-9a-fA-F]{8}', line) is not None

    def parse_coord(line):
        """Returns (lon, lat, symbol, title). If symbol or title is not present, None is returned
        in its place. The line lists the latitude first, the result lists the longitude first.
        A ParseError is raised if the line does not start with a coordinate."""
        match = re.match(r'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
        if match is not None:
            return (float(match.group(3)), float(match.group(2)), match.group(1), match.group(4))
        match = re.match(r'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
        if match is not None:
            return (float(match.group(3)), float(match.group(2)), match.group(1), None)
        match = re.match(r'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
        if match is not None:
            return (float(match.group(2)), float(match.group(1)), None, match.group(3))
        match = re.match(r'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
        if match is not None:
            return (float(match.group(2)), float(match.group(1)), None, None)
        # is_coord() searches anywhere in the line while the patterns above are
        # anchored at its start, so this point is reachable for malformed lines.
        raise ParseError('Could not parse line ' + line)

    start, content, endtag, end = find_tag(wikitext, 'googlemap')
    if start is None:
        raise ParseError('<googlemap> tag not found.')
    if content is None:
        # Single tag like <googlemap ... />: there is no content and no end tag.
        xml_only = wikitext[start:end]
        body = ''
    else:
        # Only the start and end tag are handed to the XML parser;
        # the content has its own line based syntax.
        xml_only = wikitext[start:content] + wikitext[endtag:end]
        body = wikitext[content:endtag]

    try:
        gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8'))
    except xml.etree.ElementTree.ParseError as e:
        row, column = e.position
        raise ParseError("XML parse error in <googlemap ...> (row {0}, column {1}).".format(row, column))

    # parse attributes
    attributes = {}
    try:
        for key in ['lon', 'lat']:
            if gm.get(key) is not None:
                attributes[key] = float(gm.get(key))
        for key in ['zoom', 'width', 'height']:
            if gm.get(key) is not None:
                attributes[key] = int(gm.get(key))
    except ValueError as error:
        raise ParseError('Error at parsing attribute {0} of <googlemap>: {1}'.format(key, str(error)))

    # parse points and lines
    coords = []
    paths = []
    lines = body.split("\n")
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        i += 1

        # Skip whitespace
        if len(line) == 0:
            continue

        # Handle a path
        if is_path(line):
            match = re.match(r'([0-9]#[0-9a-fA-F]{8})', line)
            style = match.group(1)
            local_coords = []
            # All coordinates up to the next path definition belong to this path;
            # lines that are neither a path nor a coordinate are ignored.
            while i < len(lines):
                line = lines[i].strip()
                i += 1
                if is_path(line):
                    i -= 1  # let the outer loop handle the next path
                    break
                if is_coord(line):
                    lon, lat, symbol, title = parse_coord(line)
                    local_coords.append((lon, lat, symbol, title))
            paths.append((style, local_coords))
            continue

        # Handle a coordinate
        if is_coord(line):
            lon, lat, symbol, title = parse_coord(line)
            # The first non-empty line that follows is used as title
            # if the coordinate line itself has none.
            while i < len(lines):
                line = lines[i].strip()
                i += 1
                if is_path(line) or is_coord(line):
                    i -= 1  # let the outer loop handle this line
                    break
                if len(line) > 0 and title is None:
                    title = line
            coords.append((lon, lat, symbol, title))
            continue

        raise ParseError('Unknown line syntax: ' + line)
    return (attributes, coords, paths)