-#!/usr/bin/python2.7
-# -*- coding: iso-8859-15 -*-
+#!/usr/bin/python3.4
# $Id$
# $HeadURL$
-"""This module contains general functions that help parsing the mediawiki markup.
-I looked for an already existing MediaWiki parser in Python but I didn't find anything
-that convinced me. However, here are the links:
+"""For parsing MediaWiki text, we rely on the package mwparserfromhell (https://github.com/earwig/mwparserfromhell).
+This module just contains a few additional useful functions.
+Other Python MediaWiki parsers:
* py-wikimarkup https://github.com/dcramer/py-wikimarkup
* mwlib http://code.pediapress.com/wiki/wiki
+* https://www.mediawiki.org/wiki/Alternative_parsers
"""
-import re
-import xml.etree.ElementTree
-
-def find_template(wikitext, template_title):
- """Returns the tuple (start, end) of the first occurence of the template '{{template ...}} within wikitext'.
- (None, None) is returned if the template is not found.
- If you are sure that the wikitext contains the template, the template could be extracted like follows:
-
- >>> wikitext = u'This is a {{Color|red|red text}} template.'
- >>> start, end = find_template(wikitext, u'Color')
- >>> print wikitext[start:end]
- {{Color|red|red text}}
-
- or just:
-
- >>> print wikitext.__getslice__(*find_template(wikitext, u'Color'))
- {{Color|red|red text}}
-
- The search is done with regular expression. It gives wrong results when parsing a template
- containing the characters "}}"
-
- :param wikitext: The text (preferalbe unicode) that has the template in it.
- :param template_title: The page title of the template with or without namespace (but as in the wikitext).
- :return:
- (start, end) of the first occurence with start >= 0 and end > start.
- (None, None) if the template is not found.
- """
- match = re.search(u"\{\{" + template_title + "\s*(\|[^\}]*)?\}\}", wikitext, re.DOTALL)
- if match is None: return None, None
- return match.start(), match.end()
-
-
-def split_template(template):
- """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
- (template_title, parameters) where parameters is a Python dictionary {u'1': u'red', u'text'=u'Any text'}.
- Anonymous parameters get integer keys (converted to unicode) starting with 1
- like in MediaWiki, named parameters are unicode strings.
- Whitespace is stripped.
- If an unexpected format is encountered, a ValueError is raised."""
- if not template.startswith(u'{{'): raise ValueError(u'Template does not start with "{{"')
- if not template.endswith(u'}}'): raise ValueError(u'Template does not end with "}}"')
- parts = template[2:-2].split(u'|')
-
- # template name
- template_title = parts[0].strip()
- if len(template_title) == 0: raise ValueError(u'Empty template tilte.')
- del parts[0]
-
- # anonymous parameters
- params = {} # result dictionary
- param_num = 1
- while len(parts) > 0:
- equalsign_pos = parts[0].find(u'=')
- if equalsign_pos >= 0: break # named parameter
- params[unicode(param_num)] = parts[0].strip()
- del parts[0]
- param_num += 1
-
- # named or numbered parameters
- while len(parts) > 0:
- equalsign_pos = parts[0].find(u'=')
- if equalsign_pos < 0: raise ValueError(u'Anonymous parameter after named parameter.')
- key, sep, value = parts[0].partition(u'=')
- key = key.strip()
- if len(key) == 0: raise ValueError(u'Empty key.')
- if params.has_key(key): raise ValueError(u'Duplicate key: "{0}"'.format(key))
- params[key] = value.strip()
- del parts[0]
-
- return template_title, params
-
-
-def create_template(template_title, anonym_params=[], named_param_keys=[], named_param_values=[], as_table=False, as_table_keylen=None):
- """Formats a MediaWiki template.
- :param template_title: Unicode string with the template name
- :param anonym_params: list with parameters without keys
- :param named_param_keys: list with keys of named parameters
- :param named_param_values: list with values of named parameters, corresponding to named_param_keys.
- :param as_table: formats the returned template in one row for each parameter
- :param as_table_keylen: length of the key field. None for "automatic".
- :return: unicode template"""
- pipe_char, equal_char, end_char = (u'\n| ', u' = ', u'\n}}') if as_table else (u'|', u'=', u'}}')
- parts = [u"{{" + template_title]
- parts += anonym_params
- if as_table and as_table_keylen is None:
- as_table_keylen = max([len(k) for k in named_param_keys])
- for i in xrange(len(named_param_keys)):
- key = named_param_keys[i]
- if as_table:
- key = key.ljust(as_table_keylen)
- parts.append((key + equal_char + named_param_values[i]).rstrip())
+class ParseError(RuntimeError):
+    """Error type raised by some of the functions."""
+
+
+def format_template_table(template, keylen=None):
+ """Reformat the given template in place to be tabular (one parameter per line).
+
+ The template name and all parameter values are stripped and terminated with a newline
+ (an empty value becomes just the newline). The names of parameters that show their key
+ are stripped and padded to keylen characters. Nothing is returned; the template object
+ itself is modified.
+
+ >>> template
+ {{foo|bar|bazz=7}}
+ >>> format_template_table(template)
+ >>> template
+ {{foo
+ | bar
+ | bazz = 7
+ }}
+
+ :param template: template to reformat. Only its attributes name and params (each with
+ name, value and showkey) are used; presumably a mwparserfromhell template - TODO confirm.
+ :param keylen: length of the keys or None for automatic determination (length of the
+ longest stripped name among the parameters that show their key, 0 if there are none)
+ """
+ if keylen is None:
+ shown_keys = [len(param.name.strip()) for param in template.params if param.showkey]
+ keylen = max(shown_keys) if shown_keys else 0
+ template.name = '{}\n'.format(template.name.strip())
+ for param in template.params:
+ if param.showkey:
+ # The first format() builds the pattern ' {:<keylen>} ', the second one applies it to the stripped name.
+ param.name = ' {{:{}}} '.format(keylen).format(param.name.strip())
+ value = param.value.strip()
+ if len(value) > 0:
+ param.value = ' {}\n'.format(value)
else:
- parts.append(key + equal_char + named_param_values[i])
- return pipe_char.join(parts) + end_char
-
-
-def parse_googlemap(wikitext):
- """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension
- out of a page. If wikitext does not contain the googlemaps extension text None is returned.
- If the googlemap contains invalid formatted lines, a RuntimeError is raised.
-
- :param wikitext: wikitext containing the template. Example:
-
- wikitext = '''
- <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
- (Parkplatz)47.114958,11.266026
- Parkplatz
-
- (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
- 6#FF014E9A
- 47.114715,11.266262
- 47.114135,11.268381
- 47.113421,11.269322
- 47.11277,11.269979
- 47.112408,11.271119
- </googlemap>
- '''
- :returns: the tuple (center, zoom, coords, paths).
- center is the tuple (lon, lat) of the google maps or (None, None) if not provided
- zoom is the google zoom level as integer or None if not provided
- coords is a list of (lon, lat, symbol, title) tuples.
- paths is a list of (style, coords) tuples.
- coords is again a list of (lot, lat, symbol, title) tuples."""
-
- def is_coord(line):
- """Returns True if the line contains a coordinate."""
- match = re.search('[0-9]{1,2}\.[0-9]+, ?[0-9]{1,2}\.[0-9]+', line)
- return not match is None
-
- def is_path(line):
- """Returns True if the line contains a path style definition."""
- match = re.match('[0-9]#[0-9a-fA-F]{8}', line)
- return not match is None
-
- def parse_coord(line):
- """Returns (lon, lat, symbol, title). If symbol or text is not present, None is returned."""
- match = re.match(u'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
- if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), match.group(4))
- match = re.match(u'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
- if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), None)
- match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
- if not match is None: return (float(match.group(2)), float(match.group(1)), None, match.group(3))
- match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
- if not match is None: return (float(match.group(2)), float(match.group(1)), None, None)
- return RuntimeError(u'Could not parse line ' + line)
-
- regexp = re.compile(u"(<googlemap[^>]*>)(.*)(</googlemap>)", re.DOTALL)
- match = regexp.search(wikitext)
- if match is None: return None
- content = match.group(2)
- gm = xml.etree.ElementTree.XML((match.group(1)+match.group(3)).encode('UTF8'))
- zoom = gm.get('zoom')
- lon = gm.get('lon')
- lat = gm.get('lat')
- if not zoom is None: zoom = int(zoom)
- if not lon is None: lon = float(lon)
- if not lat is None: lat = float(lat)
- center = (lon, lat)
-
- coords = []
- paths = []
- lines = content.split("\n")
- i = 0
- while i < len(lines):
- line = lines[i].strip()
- i += 1
-
- # Skip whitespace
- if len(line) == 0: continue
-
- # Handle a path
- if is_path(line):
- match = re.match(u'([0-9]#[0-9a-fA-F]{8})', line)
- style = match.group(1)
- local_coords = []
- while i < len(lines):
- line = lines[i].strip()
- i += 1
- if is_path(line):
- i -= 1
- break
- if is_coord(line):
- lon, lat, symbol, title = parse_coord(line)
- local_coords.append((lon, lat, symbol, title))
- paths.append((style, local_coords))
- continue
-
- # Handle a coordinate
- if is_coord(line):
- lon, lat, symbol, title = parse_coord(line)
- while i < len(lines):
- line = lines[i].strip()
- i += 1
- if is_path(line) or is_coord(line):
- i -= 1
- break
- if len(line) > 0 and title is None: title = line
- coords.append((lon, lat, symbol, title))
- continue
-
- raise RuntimeError(u'Unknown line syntax: ' + line)
- return (center, zoom, coords, paths)
-
+ param.value = '\n'
+
+
+def format_template_oneline(template):
+    """Compact the given template in place to one line, like this: {{template_name|param| }}
+
+    Whitespace around the template name, around the names of parameters that show their key
+    and around all parameter values is stripped. A value that is empty after stripping is
+    replaced with one space. Nothing is returned; the template object itself is modified.
+
+    :param template: template whose name and params are rewritten
+    """
+    template.name = template.name.strip()
+    for parameter in template.params:
+        if parameter.showkey:
+            parameter.name = parameter.name.strip()
+        stripped = parameter.value.strip()
+        parameter.value = ' ' if stripped == '' else stripped
+
+
+def dbkey_to_title(value):
+    """Convert an article database key to the corresponding article title.
+
+    Every underscore is replaced with a space. This follows MediaWiki, where the private function
+    secureAndSplit() of the Title class on line 3316 of includes/Title.php says:
+    $this->mTextform = str_replace( '_', ' ', $this->mDbkeyform );
+
+    There is deliberately no check for None because a missing title is an error.
+
+    :param value: database key of the article
+    :return: title of the article
+    """
+    title = value.replace('_', ' ')
+    return title