#!/usr/bin/python2.7
# -*- coding: iso-8859-15 -*-
# $Id$
# $HeadURL$
"""Helper functions for reading and writing MediaWiki markup: templates and
the <googlemap> extension tag.

NOTE(review): only the tail of the original module docstring was visible in
the diff this file was reconstructed from; the opening sentence that leads
into "... that convinced me" should be restored from the repository.

... that convinced me. However, here are the links:

* py-wikimarkup https://github.com/dcramer/py-wikimarkup
* mwlib http://code.pediapress.com/wiki/wiki
"""
import re
import xml.etree.ElementTree


# A latitude/longitude literal: one or two integer digits, a dot and at least
# one fractional digit. Signs and exponents are not accepted.
_NUMBER = r'[0-9]{1,2}\.[0-9]+'

# "lat, lon" anywhere within a line; used to classify a line as a coordinate.
_COORD_ANYWHERE_RE = re.compile(r'%(num)s, ?%(num)s' % {'num': _NUMBER})

# A complete coordinate line, anchored at the start of the line.
_COORD_LINE_RE = re.compile(
    r'(?:\(([^)]+)\) ?)?'       # group 1: optional "(symbol)"
    r'(%(num)s), ?(%(num)s)'    # groups 2 and 3: latitude, longitude
    r'(?:, ?(.*))?'             # group 4: optional ", title"
    % {'num': _NUMBER})

# A path style definition such as "6#FF014E9A" at the start of a line.
_PATH_STYLE_RE = re.compile(r'([0-9]#[0-9a-fA-F]{8})')

# Opening tag, content and closing tag of the googlemap extension.
_GOOGLEMAP_RE = re.compile(r'(<googlemap[^>]*>)(.*)(</googlemap>)', re.DOTALL)


def find_template(wikitext, template_title):
    """Returns the tuple (start, end) of the first occurrence of the template
    '{{template ...}}' within wikitext. (None, None) is returned if the
    template is not found.
    If you are sure that the wikitext contains the template, the template
    could be extracted like follows:

    >>> wikitext = u'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, u'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    or just:

    >>> print(wikitext[slice(*find_template(wikitext, u'Color'))])
    {{Color|red|red text}}

    The search is done with a regular expression. It gives wrong results when
    parsing a template whose parameters contain the character "}" (for
    example nested templates).

    :param wikitext: The text (preferably unicode) that has the template in it.
    :param template_title: The page title of the template with or without
        namespace (but as in the wikitext). It is matched literally.
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start.
        (None, None) if the template is not found.
    """
    # The title is user data, not a pattern: escape it so that titles like
    # "Foo (bar)" or "A.B" are matched literally.
    pattern = r'\{\{' + re.escape(template_title) + r'\s*(\|[^\}]*)?\}\}'
    match = re.search(pattern, wikitext, re.DOTALL)
    if match is None:
        return None, None
    return match.start(), match.end()


def split_template(template):
    """Takes a template, like u'{{Color|red|text=Any text}}' and translates it
    to a Python tuple (template_title, parameters) where parameters is a
    Python dictionary {u'1': u'red', u'text': u'Any text'}.
    Anonymous parameters get integer keys (converted to unicode) starting
    with 1 like in MediaWiki, named parameters are unicode strings.
    Whitespace is stripped.

    :param template: unicode string starting with '{{' and ending with '}}'.
    :return: tuple (template_title, parameters)
    :raises ValueError: if an unexpected format is encountered: missing
        braces, empty title, empty or duplicate key, or an anonymous
        parameter following a named one.
    """
    if not template.startswith(u'{{'):
        raise ValueError(u'Template does not start with "{{"')
    if not template.endswith(u'}}'):
        raise ValueError(u'Template does not end with "}}"')
    parts = template[2:-2].split(u'|')

    # template name
    template_title = parts[0].strip()
    if not template_title:
        # (sic) message text kept verbatim for callers that match on it.
        raise ValueError(u'Empty template tilte.')

    params = {}  # result dictionary
    position = 0  # number of anonymous parameters seen so far
    named_seen = False
    for part in parts[1:]:
        key, sep, value = part.partition(u'=')
        if not sep:
            # Anonymous parameters are only allowed before the first named one.
            if named_seen:
                raise ValueError(u'Anonymous parameter after named parameter.')
            position += 1
            # u'{0}'.format() yields a text string on Python 2 and Python 3.
            params[u'{0}'.format(position)] = part.strip()
            continue

        named_seen = True
        key = key.strip()
        if not key:
            raise ValueError(u'Empty key.')
        # Also catches a named key (e.g. "1") that collides with a positional one.
        if key in params:
            raise ValueError(u'Duplicate key: "{0}"'.format(key))
        params[key] = value.strip()

    return template_title, params


def create_template(template_title, anonym_params=(), named_param_keys=(), named_param_values=(), as_table=False, as_table_keylen=None):
    """Formats a MediaWiki template.

    :param template_title: Unicode string with the template name
    :param anonym_params: sequence with parameters without keys
    :param named_param_keys: sequence with keys of named parameters
    :param named_param_values: sequence with values of named parameters,
        corresponding to named_param_keys.
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic"
        (the length of the longest key).
    :return: unicode template
    :raises IndexError: if named_param_values is shorter than named_param_keys.
    """
    if as_table:
        pipe_char, equal_char, end_char = u'\n| ', u' = ', u'\n}}'
    else:
        pipe_char, equal_char, end_char = u'|', u'=', u'}}'

    parts = [u'{{' + template_title]
    parts.extend(anonym_params)

    if as_table and as_table_keylen is None:
        # "or [0]" keeps max() from failing when there are no named parameters.
        as_table_keylen = max([len(key) for key in named_param_keys] or [0])

    for i, key in enumerate(named_param_keys):
        value = named_param_values[i]
        if as_table:
            # Pad the key so the "=" signs line up; drop the trailing blank
            # that an empty value would otherwise leave behind.
            parts.append((key.ljust(as_table_keylen) + equal_char + value).rstrip())
        else:
            parts.append(key + equal_char + value)
    return pipe_char.join(parts) + end_char


def _is_coord(line):
    """Returns True if the line contains a coordinate."""
    return _COORD_ANYWHERE_RE.search(line) is not None


def _is_path(line):
    """Returns True if the line starts with a path style definition."""
    return _PATH_STYLE_RE.match(line) is not None


def _parse_coord(line):
    """Returns (lon, lat, symbol, title) for a coordinate line. symbol and
    title are None if they are not present. A RuntimeError is raised if the
    line does not start with a coordinate (optionally preceded by a symbol)."""
    match = _COORD_LINE_RE.match(line)
    if match is None:
        raise RuntimeError(u'Could not parse line ' + line)
    symbol, lat, lon, title = match.groups()
    # The markup stores "lat, lon"; the result uses (lon, lat) order.
    return float(lon), float(lat), symbol, title


def parse_googlemap(wikitext):
    """Parses the (unicode) u'<googlemap>content</googlemap>' of the googlemap
    extension out of a page. If wikitext does not contain the googlemaps
    extension text None is returned.
    If the googlemap contains invalid formatted lines, a RuntimeError is raised.

    :param wikitext: wikitext containing the googlemap tag. Example (the
        attribute values shown are illustrative):

    wikitext = '''
    <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
    (Parkplatz)47.114958,11.266026
    Parkplatz

    (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
    6#FF014E9A
    47.114715,11.266262
    47.114135,11.268381
    47.113421,11.269322
    47.11277,11.269979
    47.112408,11.271119
    </googlemap>
    '''
    :returns: the tuple (center, zoom, coords, paths).
        center is the tuple (lon, lat) of the google maps or (None, None) if not provided
        zoom is the google zoom level as integer or None if not provided
        coords is a list of (lon, lat, symbol, title) tuples.
        paths is a list of (style, coords) tuples.
        coords is again a list of (lon, lat, symbol, title) tuples.
    :raises RuntimeError: if a line inside the tag is neither empty, a path
        style, a coordinate nor the title line following a coordinate, or if
        a line contains a coordinate that is not at the start of the line.
    """
    match = _GOOGLEMAP_RE.search(wikitext)
    if match is None:
        return None
    content = match.group(2)

    # Let the XML parser read the attributes from the tag pair without content.
    gm = xml.etree.ElementTree.XML((match.group(1) + match.group(3)).encode('UTF8'))
    zoom = gm.get('zoom')
    center_lon = gm.get('lon')
    center_lat = gm.get('lat')
    if zoom is not None:
        zoom = int(zoom)
    if center_lon is not None:
        center_lon = float(center_lon)
    if center_lat is not None:
        center_lat = float(center_lat)
    center = (center_lon, center_lat)

    coords = []
    paths = []
    lines = [raw_line.strip() for raw_line in content.split('\n')]
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        # Skip whitespace
        if not line:
            continue

        # Handle a path: the style line followed by its coordinates. The path
        # extends to the next path style line (or the end of the content);
        # lines in between that are not coordinates are ignored.
        style_match = _PATH_STYLE_RE.match(line)
        if style_match is not None:
            path_coords = []
            while i < len(lines) and not _is_path(lines[i]):
                if _is_coord(lines[i]):
                    path_coords.append(_parse_coord(lines[i]))
                i += 1
            paths.append((style_match.group(1), path_coords))
            continue

        # Handle a coordinate: if it has no inline title, the first non-empty
        # line that follows (before the next coordinate or path) is the title.
        if _is_coord(line):
            lon, lat, symbol, title = _parse_coord(line)
            while i < len(lines):
                following = lines[i]
                if _is_path(following) or _is_coord(following):
                    break
                if following and title is None:
                    title = following
                i += 1
            coords.append((lon, lat, symbol, title))
            continue

        raise RuntimeError(u'Unknown line syntax: ' + line)
    return (center, zoom, coords, paths)