#!/usr/bin/python3.4
# -*- coding: iso-8859-15 -*-
# $Id$
# $HeadURL$
"""This module contains general functions that help parsing the mediawiki markup.

I looked for an already existing MediaWiki parser in Python but I didn't find anything
that convinced me. However, here are the links:

* py-wikimarkup https://github.com/dcramer/py-wikimarkup
* mwlib http://code.pediapress.com/wiki/wiki
"""
import collections
import re
import xml.etree.ElementTree

import formencode


# A decimal number as used for coordinates inside <googlemap>, e.g. "47.114958".
_COORD_NUMBER = r'[0-9]{1,2}\.[0-9]+'

# Matches "number, number" anywhere in a line (used to classify a line).
_COORD_RE = re.compile(_COORD_NUMBER + r', ?' + _COORD_NUMBER)

# Matches a path style definition like "6#FF014E9A" at the start of a line.
_PATH_STYLE_RE = re.compile(r'[0-9]#[0-9a-fA-F]{8}')

# Matches a complete coordinate line:
#   [(symbol)] first_number, second_number[, title]
# group(1) = symbol, group(2) = first number, group(3) = second number,
# group(4) = title. Optional groups are None when absent.
_COORD_LINE_RE = re.compile(
    r'(?:\(([^)]+)\) ?)?'
    + r'(' + _COORD_NUMBER + r'), ?(' + _COORD_NUMBER + r')'
    + r'(?:, ?(.*))?'
)


class ParseError(RuntimeError):
    """Exception used by some of the functions"""
    pass


def find_template(wikitext, template_title):
    """Returns the tuple (start, end) of the first occurrence of the template
    '{{template ...}}' within wikitext. (None, None) is returned if the template
    is not found.

    If you are sure that the wikitext contains the template, the template could
    be extracted like follows:

    >>> wikitext = 'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, 'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    The search is done with a regular expression. It gives wrong results when
    parsing a template whose parameters contain the character "}" (for example
    nested templates).

    :param wikitext: The text that has the template in it.
    :param template_title: The page title of the template with or without
        namespace (but as in the wikitext). It is matched literally.
    :return: (start, end) of the first occurrence with start >= 0 and
        end > start. (None, None) if the template is not found.
    """
    # The title is escaped so that characters like "(", "." or "+" in a page
    # title are not interpreted as regular expression syntax.
    pattern = r'\{\{' + re.escape(template_title) + r'\s*(\|[^\}]*)?\}\}'
    match = re.search(pattern, wikitext)
    if match is None:
        return None, None
    return match.start(), match.end()


class TemplateValidator(formencode.FancyValidator):
    """Converts between MediaWiki template markup, e.g. '{{Color|red}}', and the
    Python tuple (title, anonym_params, named_params)."""

    def __init__(self, strip=True, as_table=False, as_table_keylen=None):
        """Validates a MediaWiki template, e.g. {{Color|red}}

        :param strip: If strip is True, the title, and the parameter keys and
            values are stripped in to_python.
        :param as_table: formats the returned template in one row for each
            parameter
        :param as_table_keylen: length of the key field for from_python.
            None for "automatic"."""
        # self.strip is stored as a callable so to_python can apply it
        # unconditionally.
        self.strip = (lambda s: s.strip()) if strip else (lambda s: s)
        self.as_table = as_table
        self.as_table_keylen = as_table_keylen

    def to_python(self, value, state=None):
        """Takes a template, like '{{Color|red|text=Any text}}' and translates
        it to a Python tuple (title, anonym_params, named_params) where title is
        the template title, anonym_params is a list of anonymous parameters and
        named_params is an OrderedDict of named parameters. Whitespace of the
        parameters is stripped (unless strip=False was given).

        The template is split at every '|', so parameter values that contain
        '|' themselves (nested templates, piped links) are not supported.

        :raises formencode.Invalid: if the value does not start with '{{' or end
            with '}}', if the title or a key is empty, if a key occurs twice or
            if an anonymous parameter follows a named one."""
        if not value.startswith('{{'):
            raise formencode.Invalid('Template does not start with "{{"', value, state)
        if not value.endswith('}}'):
            raise formencode.Invalid('Template does not end with "}}"', value, state)
        parts = value[2:-2].split('|')

        # template name
        title = self.strip(parts[0])
        if len(title) == 0:
            raise formencode.Invalid('Empty template tilte.', value, state)
        params = parts[1:]

        # anonymous parameters: the leading run of parameters without '='
        anonym_params = []
        first_named = len(params)
        for index, param in enumerate(params):
            if '=' in param:
                first_named = index
                break
            anonym_params.append(self.strip(param))

        # named or numbered parameters: everything from the first 'key=value' on
        named_params = collections.OrderedDict()
        for param in params[first_named:]:
            if '=' not in param:
                raise formencode.Invalid('Anonymous parameter after named parameter.', value, state)
            # Separate local names are used so that `value` keeps referring to
            # the whole template when an Invalid exception is raised.
            key, _, param_value = param.partition('=')
            key = self.strip(key)
            if len(key) == 0:
                raise formencode.Invalid('Empty key.', value, state)
            if key in named_params:
                raise formencode.Invalid('Duplicate key: "{0}"'.format(key), value, state)
            named_params[key] = self.strip(param_value)

        return title, anonym_params, named_params

    def from_python(self, value, state=None):
        """Formats a MediaWiki template.

        :param value: tuple (title, anonym_params, named_params) where title is
            the template title, anonym_params is a list of anonymous parameters
            and named_params is a dict or OrderedDict of named parameters.
        :return: the template as string. With as_table each parameter is put on
            its own line and the keys are padded to a common width."""
        title, anonym_params, named_params = value
        if self.as_table:
            pipe_char, equal_char, end_char = '\n| ', ' = ', '\n}}'
        else:
            pipe_char, equal_char, end_char = '|', '=', '}}'

        parts = ["{{" + title]
        parts += anonym_params

        as_table_keylen = self.as_table_keylen
        if self.as_table and as_table_keylen is None:
            # default=0 keeps templates without named parameters working
            as_table_keylen = max(map(len, named_params), default=0)

        for key, param_value in named_params.items():
            if self.as_table:
                # rstrip avoids trailing blanks after ' = ' for empty values
                parts.append((key.ljust(as_table_keylen) + equal_char + param_value).rstrip())
            else:
                parts.append(key + equal_char + param_value)
        return pipe_char.join(parts) + end_char


def split_template(template):
    """Deprecated legacy function.

    Takes a template, like '{{Color|red|text=Any text}}' and translates it to a
    Python tuple (template_title, parameters) where parameters is a Python
    dictionary {'1': 'red', 'text': 'Any text'}.
    Anonymous parameters get integer keys (converted to strings) starting with 1
    like in MediaWiki, named parameters keep their names. Whitespace is
    stripped.

    :raises ValueError: if an unexpected format is encountered."""
    try:
        title, anonym_params, named_params = TemplateValidator().to_python(template)
    except formencode.Invalid as e:
        # Exceptions are not subscriptable in Python 3; use the message attribute.
        raise ValueError(e.msg) from e
    parameters = dict(named_params)
    # Anonymous parameters are added last, so they win over equally named
    # numbered parameters.
    for number, param in enumerate(anonym_params, start=1):
        parameters[str(number)] = param
    return title, parameters


def create_template(template_title, anonym_params=(), named_param_keys=(), named_param_values=(),
                    as_table=False, as_table_keylen=None):
    """Deprecated legacy function.

    Formats a MediaWiki template.

    :param template_title: string with the template name
    :param anonym_params: list with parameters without keys
    :param named_param_keys: list with keys of named parameters
    :param named_param_values: list with values of named parameters,
        corresponding to named_param_keys.
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic".
    :return: the formatted template"""
    named_params = collections.OrderedDict(zip(named_param_keys, named_param_values))
    validator = TemplateValidator(as_table=as_table, as_table_keylen=as_table_keylen)
    return validator.from_python((template_title, anonym_params, named_params))


def find_tag(wikitext, tagname, pos=0):
    """Returns position information of the first occurrence of the tag
    '<tag ...>...</tag>' or '<tag ... />'.

    If you are sure that the wikitext contains the tag, the tag could be
    extracted like follows:

    >>> wikitext = 'This is a <tag>mytag</tag> tag.'
    >>> start, content, endtag, end = find_tag(wikitext, 'tag')
    >>> print(wikitext[start:end])
    <tag>mytag</tag>

    :param wikitext: The text that has the tag in it.
    :param tagname: Name of the tag, e.g. 'tag' for <tag>.
    :param pos: position within wikitext to start searching the tag.
    :return: (start, content, endtag, end). start is the position of '<' of the
        tag, content is the beginning of the content (after '>'), endtag is the
        beginning of the end tag ('</') and end is one position after the end
        tag. For single tags, (start, None, None, end) is returned.
        If the tag is not found (or only the start tag is present),
        (None, None, None, None) is returned.
    """
    escaped_tagname = re.escape(tagname)

    # Find start tag. group(1) is '/' for a single tag ('<tag ... />') and ''
    # for a tag that has content.
    regexp_starttag = re.compile('<' + escaped_tagname + '.*?(/?)>', re.DOTALL)
    match_starttag = regexp_starttag.search(wikitext, pos)
    if match_starttag is None:
        return None, None, None, None

    # does the tag have content?
    if match_starttag.group(1) == '/':
        # single tag
        return match_starttag.start(), None, None, match_starttag.end()

    # tag with content
    regexp_endtag = re.compile('</' + escaped_tagname + '>', re.DOTALL)
    match_endtag = regexp_endtag.search(wikitext, match_starttag.end())
    if match_endtag is None:
        # No closing tag - error in wikitext
        return None, None, None, None
    return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end()


def _is_coord(line):
    """Returns True if the line contains a coordinate."""
    return _COORD_RE.search(line) is not None


def _is_path(line):
    """Returns True if the line starts with a path style definition."""
    return _PATH_STYLE_RE.match(line) is not None


def _parse_coord(line):
    """Returns (lon, lat, symbol, title) of a coordinate line.

    The first number of the line is returned as lat, the second one as lon.
    If symbol or title is not present, None is returned for it.

    :raises ParseError: if the line does not start with a coordinate."""
    match = _COORD_LINE_RE.match(line)
    if match is None:
        raise ParseError('Could not parse line ' + line)
    symbol, first, second, title = match.groups()
    return float(second), float(first), symbol, title


def parse_googlemap(wikitext):
    """Parses the '<googlemap ...>content</googlemap>' of the googlemap extension.

    If wikitext does not contain the <googlemap> tag or if the <googlemap> tag
    contains invalid formatted lines, a ParseError is raised.
    Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an
    arbitrary wikitext before using this function.

    :param wikitext: wikitext containing the tag. Example:

        wikitext = '''
        <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
        (Parkplatz)47.114958,11.266026
        Parkplatz

        (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
        6#FF014E9A
        47.114715,11.266262
        47.114135,11.268381
        47.113421,11.269322
        47.11277,11.269979
        47.112408,11.271119
        </googlemap>
        '''

    :returns: The tuple (attributes, coords, paths) is returned.
        attributes is a dict that contains the attributes that are present
        (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or
        int.
        coords is a list of (lon, lat, symbol, title) tuples.
        paths is a list of (style, coords) tuples.
        coords is again a list of (lon, lat, symbol, title) tuples.
    :raises ParseError: if the tag is missing, its start tag is not valid XML,
        an attribute cannot be converted or a line has unknown syntax."""
    start, content, endtag, end = find_tag(wikitext, 'googlemap')
    if start is None:
        raise ParseError('<googlemap> tag not found.')

    # Build an XML snippet that consists of the tag without its content so the
    # attributes can be read with an XML parser.
    if content is None:
        # single tag '<googlemap ... />': content and endtag are both None
        xml_only = wikitext[start:end]
    else:
        xml_only = wikitext[start:content] + wikitext[endtag:end]
    try:
        gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8'))
    except xml.etree.ElementTree.ParseError as e:
        raise ParseError("XML parse error in <googlemap ...>.") from e

    # parse attributes
    attributes = {}
    conversions = ((('lon', 'lat'), float), (('zoom', 'width', 'height'), int))
    for keys, convert in conversions:
        for key in keys:
            raw = gm.get(key)
            if raw is None:
                continue
            try:
                attributes[key] = convert(raw)
            except ValueError as error:
                raise ParseError(
                    'Error at parsing attribute {0} of <googlemap>: {1}'.format(key, str(error))
                ) from error

    # parse points and lines
    coords = []
    paths = []
    if content is None:
        # a single tag has no content lines
        lines = []
    else:
        lines = [line.strip() for line in wikitext[content:endtag].split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        # Skip whitespace
        if len(line) == 0:
            continue

        # Handle a path: the style line is followed by the coordinates of the
        # path; the path ends at the next style line or at the end of input.
        style_match = _PATH_STYLE_RE.match(line)
        if style_match is not None:
            style = style_match.group(0)
            local_coords = []
            while i < len(lines) and not _is_path(lines[i]):
                # lines within a path that contain no coordinate are ignored
                if _is_coord(lines[i]):
                    local_coords.append(_parse_coord(lines[i]))
                i += 1
            paths.append((style, local_coords))
            continue

        # Handle a coordinate: if the coordinate line itself has no title, the
        # first following non-empty text line is taken as title.
        if _is_coord(line):
            lon, lat, symbol, title = _parse_coord(line)
            while i < len(lines) and not (_is_path(lines[i]) or _is_coord(lines[i])):
                if len(lines[i]) > 0 and title is None:
                    title = lines[i]
                i += 1
            coords.append((lon, lat, symbol, title))
            continue

        raise ParseError('Unknown line syntax: ' + line)

    return (attributes, coords, paths)