#!/usr/bin/python2.6
# -*- coding: iso-8859-15 -*-
# $Id$
# $HeadURL$
"""This module contains general functions that help parsing the mediawiki markup.
I looked for an already existing MediaWiki parser in Python but I didn't find anything
that convinced me. However, here are the links:
* py-wikimarkup https://github.com/dcramer/py-wikimarkup
* mwlib http://code.pediapress.com/wiki/wiki
"""
import re
import xml.etree.ElementTree
def find_template(wikitext, template_title):
    """Locate the first occurrence of the template '{{template_title ...}}' in wikitext.

    The search is done with a regular expression. The title is matched
    literally (regular expression metacharacters in it are escaped) and must
    be followed, after optional whitespace, by '|' or '}', so that searching
    for u'Color' does not match u'{{Colorful|...}}'.

    Limitation: the template body is matched up to the first '}' character,
    therefore a template whose parameters contain nested templates or any
    other '}' is not matched as a whole.

    If you are sure that the wikitext contains the template, it can be
    extracted as follows:

    >>> wikitext = u'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, u'Color')
    >>> wikitext[start:end] == u'{{Color|red|red text}}'
    True

    :param wikitext: The text (preferably unicode) that has the template in it.
    :param template_title: The page title of the template with or without
        namespace (but exactly as written in the wikitext).
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start,
        so that wikitext[start:end] is the complete template.
        (None, None) if the template is not found.
    """
    pattern = (
        r'\{\{' + re.escape(template_title)  # literal title, no regex injection
        + r'\s*(?=[|}])'                     # title must end here (no prefix matches)
        + r'[^}]*\}\}'                       # parameters up to the closing braces
    )
    # DOTALL is kept from the original call; the pattern contains no '.',
    # newlines inside the template are already covered by the character classes.
    match = re.search(pattern, wikitext, re.DOTALL)
    if match is None:
        return None, None
    return match.start(), match.end()
def split_template(template):
    """Split a template like u'{{Color|red|text=Any text}}' into its title and parameters.

    The text between the outer braces is split at every '|' character, so
    parameter values that themselves contain '|' (nested templates, piped
    links) are not supported.

    :param template: the complete template text, starting with '{{' and
        ending with '}}'.
    :return: the tuple (template_title, parameters). For the example above
        this is (u'Color', {u'1': u'red', u'text': u'Any text'}).
        Anonymous parameters get their 1-based position converted to a
        unicode string as key (like in MediaWiki), named parameters use
        their name as key. Whitespace around the title, keys and values is
        stripped. A named value is everything after the first '='.
    :raises ValueError: if the template does not start with '{{' or end with
        '}}', if the title or a key is empty, if an anonymous parameter
        follows a named one, or if a key occurs twice (this includes a named
        key like u'1' that collides with an anonymous parameter).
    """
    if not template.startswith(u'{{'):
        raise ValueError(u'Template does not start with "{{"')
    if not template.endswith(u'}}'):
        raise ValueError(u'Template does not end with "}}"')
    parts = template[2:-2].split(u'|')

    # template name
    template_title = parts[0].strip()
    if len(template_title) == 0:
        raise ValueError(u'Empty template tilte.')

    params = {}  # result dictionary
    param_num = 1  # key of the next anonymous parameter
    named_seen = False  # True as soon as the first 'key=value' part was read
    for part in parts[1:]:
        key, sep, value = part.partition(u'=')
        if not sep:
            # anonymous parameter: only allowed before the first named one
            if named_seen:
                raise ValueError(u'Anonymous parameter after named parameter.')
            # format() yields a unicode key on Python 2 and str on Python 3
            params[u'{0}'.format(param_num)] = part.strip()
            param_num += 1
            continue
        # named or numbered parameter
        named_seen = True
        key = key.strip()
        if len(key) == 0:
            raise ValueError(u'Empty key.')
        if key in params:
            raise ValueError(u'Duplicate key: "{0}"'.format(key))
        params[key] = value.strip()
    return template_title, params
def create_template(template_title, anonym_params=None, named_param_keys=None, named_param_values=None, as_table=False, as_table_keylen=None):
    """Formats a MediaWiki template.

    Anonymous parameters are written first, followed by the named parameters
    in the order given. Without as_table everything is put on one line, like
    u'{{Color|red|text=Any text}}'. With as_table every parameter (and the
    closing braces) starts on a new line and the keys are padded with spaces
    on the right so that the equal signs line up.

    :param template_title: Unicode string with the template name
    :param anonym_params: list with parameters without keys (None means no such parameters)
    :param named_param_keys: list with keys of named parameters (None means no named parameters)
    :param named_param_values: list with values of named parameters, corresponding to named_param_keys.
        An IndexError is raised if it is shorter than named_param_keys; surplus values are ignored.
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic"
        (the length of the longest key).
    :return: unicode template"""
    # None defaults instead of shared mutable list defaults
    if anonym_params is None:
        anonym_params = []
    if named_param_keys is None:
        named_param_keys = []
    if named_param_values is None:
        named_param_values = []

    if as_table:
        pipe_char, equal_char, end_char = u'\n| ', u' = ', u'\n}}'
    else:
        pipe_char, equal_char, end_char = u'|', u'=', u'}}'

    parts = [u"{{" + template_title]
    parts.extend(anonym_params)

    if as_table and as_table_keylen is None:
        # "or [0]" keeps max() from failing when there are no named parameters
        as_table_keylen = max([len(k) for k in named_param_keys] or [0])

    for i, key in enumerate(named_param_keys):
        if as_table:
            key = key.ljust(as_table_keylen)
        parts.append(key + equal_char + named_param_values[i])
    return pipe_char.join(parts) + end_char
def parse_googlemap(wikitext):
    """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension
    out of a page. If wikitext does not contain the googlemap extension text, None is returned.
    If the googlemap contains invalid formatted lines, a RuntimeError is raised.

    The attributes of the opening tag are read with an XML parser; errors of
    that parser and of the int()/float() conversions are not caught here.

    :param wikitext: wikitext containing the googlemap element. Example:

        wikitext = '''
        <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
        (Parkplatz)47.114958,11.266026
        Parkplatz

        (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
        6#FF014E9A
        47.114715,11.266262
        47.114135,11.268381
        47.113421,11.269322
        47.11277,11.269979
        47.112408,11.271119
        </googlemap>
        '''

    :returns: the tuple (center, zoom, coords, paths).
        center is the tuple (lon, lat) taken from the 'lon' and 'lat'
        attributes; a missing attribute gives None in its place.
        zoom is the 'zoom' attribute as integer or None if not provided.
        coords is a list of (lon, lat, symbol, title) tuples. symbol is the
        text in parentheses in front of the coordinate, title is the text
        after the second comma (taken as it is, including a leading space) or,
        if there is none, the first non-empty line following the coordinate.
        Both are None if not present.
        paths is a list of (style, coords) tuples where style is a text like
        u'6#FF014E9A' and coords is again a list of
        (lon, lat, symbol, title) tuples.
    :raises RuntimeError: if a line is neither empty, a path style nor a
        coordinate, or if a line contains a coordinate but not in a supported
        format at the beginning of the line.
    """
    number = r'[0-9]{1,2}\.[0-9]+'
    # a coordinate pair anywhere in the line
    coord_anywhere = re.compile(number + r', ?' + number)
    # '(symbol) lat, lon,title' at the start of the line; symbol and title are optional
    coord_line = re.compile(r'(?:\(([^)]+)\) ?)?(' + number + r'), ?(' + number + r')(?:,(.*))?')
    # path style definition at the start of the line
    path_style = re.compile(r'([0-9]#[0-9a-fA-F]{8})')

    def is_coord(line):
        """Returns True if the line contains a coordinate."""
        return coord_anywhere.search(line) is not None

    def is_path(line):
        """Returns True if the line starts with a path style definition."""
        return path_style.match(line) is not None

    def parse_coord(line):
        """Returns (lon, lat, symbol, title). If symbol or title is not present, None is returned in its place."""
        match = coord_line.match(line)
        if match is None:
            raise RuntimeError(u'Could not parse line ' + line)
        symbol, lat, lon, title = match.groups()
        return (float(lon), float(lat), symbol, title)

    regexp = re.compile(r"(<googlemap[^>]*>)(.*)(</googlemap>)", re.DOTALL)
    match = regexp.search(wikitext)
    if match is None:
        return None
    content = match.group(2)

    # Opening and closing tag together form an empty XML element
    # from which the attributes can be read.
    gm = xml.etree.ElementTree.XML((match.group(1) + match.group(3)).encode('UTF8'))
    zoom = gm.get('zoom')
    center_lon = gm.get('lon')
    center_lat = gm.get('lat')
    if zoom is not None:
        zoom = int(zoom)
    if center_lon is not None:
        center_lon = float(center_lon)
    if center_lat is not None:
        center_lat = float(center_lat)
    center = (center_lon, center_lat)

    coords = []
    paths = []
    lines = [raw_line.strip() for raw_line in content.split("\n")]
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        # Skip whitespace
        if len(line) == 0:
            continue

        # Handle a path: the style line is followed by the coordinates of
        # the path. Other lines are ignored until the next path style line
        # (or the end of the content) is reached.
        if is_path(line):
            style = path_style.match(line).group(1)
            local_coords = []
            while i < len(lines) and not is_path(lines[i]):
                if is_coord(lines[i]):
                    local_coords.append(parse_coord(lines[i]))
                i += 1
            paths.append((style, local_coords))
            continue

        # Handle a coordinate: lines following it up to the next coordinate
        # or path are consumed, the first non-empty one becomes the title
        # if the coordinate line did not provide one.
        if is_coord(line):
            lon, lat, symbol, title = parse_coord(line)
            while i < len(lines) and not (is_path(lines[i]) or is_coord(lines[i])):
                if len(lines[i]) > 0 and title is None:
                    title = lines[i]
                i += 1
            coords.append((lon, lat, symbol, title))
            continue

        raise RuntimeError(u'Unknown line syntax: ' + line)
    return (center, zoom, coords, paths)