X-Git-Url: https://git.toastfreeware.priv.at/philipp/winterrodeln/wrpylib.git/blobdiff_plain/9a2e65bfb6528e0bfe9500b4f2cfe7e46f6491e6..ebd14c98b1238a008b6f38d0f00e3eb13b157cda:/wrpylib/mwmarkup.py
diff --git a/wrpylib/mwmarkup.py b/wrpylib/mwmarkup.py
index b430502..572b30c 100644
--- a/wrpylib/mwmarkup.py
+++ b/wrpylib/mwmarkup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python2.6
+#!/usr/bin/python3.4
# -*- coding: iso-8859-15 -*-
# $Id$
# $HeadURL$
@@ -11,6 +11,13 @@ that convinced me. However, here are the links:
"""
import re
import xml.etree.ElementTree
+import collections
+import formencode
+
+
+class ParseError(RuntimeError):
+ """Exception used by some of the functions"""
+ pass
def find_template(wikitext, template_title):
@@ -37,53 +44,107 @@ def find_template(wikitext, template_title):
(start, end) of the first occurence with start >= 0 and end > start.
(None, None) if the template is not found.
"""
- match = re.search(u"\{\{" + template_title + "\s*(\|[^\}]*)?\}\}", wikitext, re.DOTALL)
+ match = re.search("\{\{" + template_title + "\s*(\|[^\}]*)?\}\}", wikitext, re.DOTALL)
if match is None: return None, None
return match.start(), match.end()
+class TemplateValidator(formencode.FancyValidator):
+ def __init__(self, strip=True, as_table=False, as_table_keylen=None):
+ """Validates a MediaWiki template, e.g. {{Color|red}}
+ :param strip: If strip is True, the title, and the parameter keys and values are stripped in to_python.
+ :param as_table: formats the returned template in one row for each parameter
+ :param as_table_keylen: length of the key field for from_python. None for "automatic"."""
+ self.strip = (lambda s: s.strip()) if strip else (lambda s: s)
+ self.as_table = as_table
+ self.as_table_keylen = as_table_keylen
+
+ def to_python(self, value, state=None):
+ """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
+ (title, anonym_params, named_params) where title is the template title,
+ anonym_params is a list of anonymous parameters and named_params is an OrderedDict
+ of named parameters. Title, keys and values are stripped of whitespace only if strip is True."""
+ if not value.startswith('{{'):
+ raise formencode.Invalid('Template does not start with "{{"', value, state)
+ if not value.endswith('}}'):
+ raise formencode.Invalid('Template does not end with "}}"', value, state)
+ parts = value[2:-2].split('|')
+
+ # template name
+ title = self.strip(parts[0])
+ if len(title) == 0:
+ raise formencode.Invalid('Empty template tilte.', value, state)
+ del parts[0]
+
+ # anonymous parameters
+ anonym_params = []
+ while len(parts) > 0:
+ equalsign_pos = parts[0].find('=')
+ if equalsign_pos >= 0: break # named parameter
+ anonym_params.append(self.strip(parts[0]))
+ del parts[0]
+
+ # named or numbered parameters
+ named_params = collections.OrderedDict()
+ while len(parts) > 0:
+ equalsign_pos = parts[0].find('=')
+ if equalsign_pos < 0:
+ raise formencode.Invalid('Anonymous parameter after named parameter.', value, state)
+ key, sep, value = parts[0].partition('=')
+ key = self.strip(key)
+ if len(key) == 0:
+ raise formencode.Invalid('Empty key.', value, state)
+ if key in named_params:
+ raise formencode.Invalid('Duplicate key: "{0}"'.format(key), value, state)
+ named_params[key] = self.strip(value)
+ del parts[0]
+
+ return title, anonym_params, named_params
+
+ def from_python(self, value, state=None):
+ """Formats a MediaWiki template.
+ value is a tuple: (title, anonym_params, named_params)
+ where title is the template title, anonym_params is a list of anonymous parameters and
+ named_params is a dict or OrderedDict of named parameters."""
+ title, anonym_params, named_params = value
+ pipe_char, equal_char, end_char = ('\n| ', ' = ', '\n}}') if self.as_table else ('|', '=', '}}')
+ parts = ["{{" + title]
+ parts += anonym_params
+ as_table_keylen = self.as_table_keylen
+ if self.as_table and as_table_keylen is None:
+ as_table_keylen = max(list(map(len, iter(named_params.keys()))))
+ for k, v in named_params.items():
+ if self.as_table:
+ k = k.ljust(as_table_keylen)
+ parts.append((k + equal_char + v).rstrip())
+ else:
+ parts.append(k + equal_char + v)
+ return pipe_char.join(parts) + end_char
+
+
def split_template(template):
- """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
+ """Deprecated legacy function.
+
+ Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
(template_title, parameters) where parameters is a Python dictionary {u'1': u'red', u'text'=u'Any text'}.
Anonymous parameters get integer keys (converted to unicode) starting with 1
like in MediaWiki, named parameters are unicode strings.
Whitespace is stripped.
If an unexpected format is encountered, a ValueError is raised."""
- if not template.startswith(u'{{'): raise ValueError(u'Template does not start with "{{"')
- if not template.endswith(u'}}'): raise ValueError(u'Template does not end with "}}"')
- parts = template[2:-2].split(u'|')
-
- # template name
- template_title = parts[0].strip()
- if len(template_title) == 0: raise ValueError(u'Empty template tilte.')
- del parts[0]
-
- # anonymous parameters
- params = {} # result dictionary
- param_num = 1
- while len(parts) > 0:
- equalsign_pos = parts[0].find(u'=')
- if equalsign_pos >= 0: break # named parameter
- params[unicode(param_num)] = parts[0].strip()
- del parts[0]
- param_num += 1
-
- # named or numbered parameters
- while len(parts) > 0:
- equalsign_pos = parts[0].find(u'=')
- if equalsign_pos < 0: raise ValueError(u'Anonymous parameter after named parameter.')
- key, sep, value = parts[0].partition(u'=')
- key = key.strip()
- if len(key) == 0: raise ValueError(u'Empty key.')
- if params.has_key(key): raise ValueError(u'Duplicate key: "{0}"'.format(key))
- params[key] = value.strip()
- del parts[0]
-
- return template_title, params
+ try:
+ title, anonym_params, named_params = TemplateValidator().to_python(template)
+ parameters = dict(named_params)
+ for i in range(len(anonym_params)):
+ parameters[str(i+1)] = anonym_params[i]
+ except formencode.Invalid as e:
+ raise ValueError(e[0])
+ return title, parameters
def create_template(template_title, anonym_params=[], named_param_keys=[], named_param_values=[], as_table=False, as_table_keylen=None):
- """Formats a MediaWiki template.
+ """Deprecated legacy function.
+
+ Formats a MediaWiki template.
:param template_title: Unicode string with the template name
:param anonym_params: list with parameters without keys
:param named_param_keys: list with keys of named parameters
@@ -91,25 +152,57 @@ def create_template(template_title, anonym_params=[], named_param_keys=[], named
:param as_table: formats the returned template in one row for each parameter
:param as_table_keylen: length of the key field. None for "automatic".
:return: unicode template"""
- pipe_char, equal_char, end_char = (u'\n| ', u' = ', u'\n}}') if as_table else (u'|', u'=', u'}}')
- parts = [u"{{" + template_title]
- parts += anonym_params
- if as_table and as_table_keylen is None:
- as_table_keylen = max([len(k) for k in named_param_keys])
- for i in xrange(len(named_param_keys)):
- key = named_param_keys[i]
- if as_table:
- key = key.ljust(as_table_keylen)
- parts.append((key + equal_char + named_param_values[i]).rstrip())
- else:
- parts.append(key + equal_char + named_param_values[i])
- return pipe_char.join(parts) + end_char
+ named_params = collections.OrderedDict(list(zip(named_param_keys, named_param_values)))
+ return TemplateValidator(as_table=as_table, as_table_keylen=as_table_keylen).from_python((template_title, anonym_params, named_params))
+
+
+def find_tag(wikitext, tagname, pos=0):
+ """Returns position information of the first occurrence of the tag '<tagname ...>...</tagname>'
+ or '<tagname ... />'.
+ If you are sure that the wikitext contains the tag, the tag could be extracted like follows:
+
+ >>> wikitext = u'This is a <tag>mytag</tag> tag.'
+ >>> start, content, endtag, end = find_tag(wikitext, u'tag')
+ >>> print(wikitext[start:end])
+ <tag>mytag</tag>
+
+ :param wikitext: The text (preferably unicode) that has the tag in it.
+ :param tagname: Name of the tag, e.g. u'tag' for <tag>.
+ :param pos: position within wikitext to start searching the tag.
+ :return:
+ (start, content, endtag, end). start is the position of '<' of the tag,
+ content is the beginning of the content (after '>'), endtag is the
+ beginning of the end tag ('</tagname>') and end is one position after the end tag.
+ For single tags, (start, None, None, end) is returned.
+ If the tag is not found (or only the start tag is present),
+ (None, None, None, None) is returned.
+ """
+ # Find start tag
+ regexp_starttag = re.compile("<{0}.*?(/?)>".format(tagname), re.DOTALL)
+ match_starttag = regexp_starttag.search(wikitext, pos)
+ if match_starttag is None:
+ return None, None, None, None
+
+ # does the tag have content?
+ if len(match_starttag.group(1)) == 1: # group(1) is either '' or '/'.
+ # single tag
+ return match_starttag.start(), None, None, match_starttag.end()
+
+ # tag with content
+ regexp_endtag = re.compile('{0}>'.format(tagname), re.DOTALL)
+ match_endtag = regexp_endtag.search(wikitext, match_starttag.end())
+ if match_endtag is None:
+ # No closing tag - error in wikitext
+ return None, None, None, None
+ return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end()
def parse_googlemap(wikitext):
- """Parses the (unicode) u'content' of the googlemap extension
- out of a page. If wikitext does not contain the googlemaps extension text None is returned.
- If the googlemap contains invalid formatted lines, a RuntimeError is raised.
+ """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension.
+ If wikitext does not contain the <googlemap> tag or if the tag contains
+ invalidly formatted lines, a ParseError is raised.
+ Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary
+ wikitext before using this function.
:param wikitext: wikitext containing the template. Example:
@@ -127,12 +220,12 @@ def parse_googlemap(wikitext):
47.112408,11.271119
'''
- :returns: the tuple (center, zoom, coords, paths).
- center is the tuple (lon, lat) of the google maps or (None, None) if not provided
- zoom is the google zoom level as integer or None if not provided
+ :returns: The tuple (attributes, coords, paths) is returned.
+ attributes is a dict that contains the attributes that are present
+ (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or int.
coords is a list of (lon, lat, symbol, title) tuples.
paths is a list of (style, coords) tuples.
- coords is again a list of (lot, lat, symbol, title) tuples."""
+ coords is again a list of (lon, lat, symbol, title) tuples."""
def is_coord(line):
"""Returns True if the line contains a coordinate."""
@@ -146,32 +239,46 @@ def parse_googlemap(wikitext):
def parse_coord(line):
"""Returns (lon, lat, symbol, title). If symbol or text is not present, None is returned."""
- match = re.match(u'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+),(.*)', line)
+ match = re.match('\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), match.group(4))
- match = re.match(u'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
+ match = re.match('\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), None)
- match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+),(.*)', line)
+ match = re.match('([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
if not match is None: return (float(match.group(2)), float(match.group(1)), None, match.group(3))
- match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
+ match = re.match('([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
if not match is None: return (float(match.group(2)), float(match.group(1)), None, None)
- return RuntimeError(u'Could not parse line ' + line)
-
- regexp = re.compile(u"(]*>)(.*)()", re.DOTALL)
- match = regexp.search(wikitext)
- if match is None: return None
- content = match.group(2)
- gm = xml.etree.ElementTree.XML((match.group(1)+match.group(3)).encode('UTF8'))
- zoom = gm.get('zoom')
- lon = gm.get('lon')
- lat = gm.get('lat')
- if not zoom is None: zoom = int(zoom)
- if not lon is None: lon = float(lon)
- if not lat is None: lat = float(lat)
- center = (lon, lat)
+ return ParseError('Could not parse line ' + line)
+ start, content, endtag, end = find_tag(wikitext, 'googlemap')
+ if start is None:
+ raise ParseError(' tag not found.')
+ if content is None:
+ xml_only = wikitext[start:endtag]
+ else:
+ xml_only = wikitext[start:content]+wikitext[endtag:end]
+
+ try:
+ gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8'))
+ except xml.etree.ElementTree.ParseError as e:
+ row, column = e.position
+ raise ParseError("XML parse error in .")
+
+ # parse attributes
+ attributes = {}
+ try:
+ for key in ['lon', 'lat']:
+ if gm.get(key) is not None:
+ attributes[key] = float(gm.get(key))
+ for key in ['zoom', 'width', 'height']:
+ if gm.get(key) is not None:
+ attributes[key] = int(gm.get(key))
+ except ValueError as error:
+ raise ParseError('Error at parsing attribute {0} of : {1}'.format(key, str(error)))
+
+ # parse points and lines
coords = []
paths = []
- lines = content.split("\n")
+ lines = wikitext[content:endtag].split("\n")
i = 0
while i < len(lines):
line = lines[i].strip()
@@ -182,7 +289,7 @@ def parse_googlemap(wikitext):
# Handle a path
if is_path(line):
- match = re.match(u'([0-9]#[0-9a-fA-F]{8})', line)
+ match = re.match('([0-9]#[0-9a-fA-F]{8})', line)
style = match.group(1)
local_coords = []
while i < len(lines):
@@ -210,6 +317,7 @@ def parse_googlemap(wikitext):
coords.append((lon, lat, symbol, title))
continue
- raise RuntimeError(u'Unknown line syntax: ' + line)
- return (center, zoom, coords, paths)
+ raise ParseError('Unknown line syntax: ' + line)
+
+ return (attributes, coords, paths)