X-Git-Url: https://git.toastfreeware.priv.at/philipp/winterrodeln/wrpylib.git/blobdiff_plain/e554ec1b9d06c76888b42d2b4a6a78a22b0bdb69..ebd14c98b1238a008b6f38d0f00e3eb13b157cda:/wrpylib/mwmarkup.py diff --git a/wrpylib/mwmarkup.py b/wrpylib/mwmarkup.py index 46e11e4..572b30c 100644 --- a/wrpylib/mwmarkup.py +++ b/wrpylib/mwmarkup.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2.6 +#!/usr/bin/python3.4 # -*- coding: iso-8859-15 -*- # $Id$ # $HeadURL$ @@ -10,6 +10,14 @@ that convinced me. However, here are the links: * mwlib http://code.pediapress.com/wiki/wiki """ import re +import xml.etree.ElementTree +import collections +import formencode + + +class ParseError(RuntimeError): + """Exception used by some of the functions""" + pass def find_template(wikitext, template_title): @@ -27,7 +35,8 @@ def find_template(wikitext, template_title): >>> print wikitext.__getslice__(*find_template(wikitext, u'Color')) {{Color|red|red text}} - The search is done with regular expression. + The search is done with regular expression. It gives wrong results when parsing a template + containing the characters "}}" :param wikitext: The text (preferalbe unicode) that has the template in it. :param template_title: The page title of the template with or without namespace (but as in the wikitext). @@ -35,66 +44,280 @@ def find_template(wikitext, template_title): (start, end) of the first occurence with start >= 0 and end > start. (None, None) if the template is not found. """ - match = re.search(u"\{\{" + template_title + "[^\}]*\}\}", wikitext, re.DOTALL) + match = re.search("\{\{" + template_title + "\s*(\|[^\}]*)?\}\}", wikitext, re.DOTALL) if match is None: return None, None return match.start(), match.end() +class TemplateValidator(formencode.FancyValidator): + def __init__(self, strip=True, as_table=False, as_table_keylen=None): + """Validates a MediaWiki template, e.g. {{Color|red}} + :param stip: If strip is True, the title, and the parameter keys and values are stripped in to_python. + :param as_table: formats the returned template in one row for each parameter + :param as_table_keylen: length of the key field for from_python. None for "automatic".""" + self.strip = (lambda s: s.strip()) if strip else (lambda s: s) + self.as_table = as_table + self.as_table_keylen = as_table_keylen + + def to_python(self, value, state=None): + """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple + (title, anonym_params, named_params) where title is the template title, + anonym_params is a list of anonymous parameters and named_params is a OrderedDict + of named parameters. Whitespace of the parameters is stripped.""" + if not value.startswith('{{'): + raise formencode.Invalid('Template does not start with "{{"', value, state) + if not value.endswith('}}'): + raise formencode.Invalid('Template does not end with "}}"', value, state) + parts = value[2:-2].split('|') + + # template name + title = self.strip(parts[0]) + if len(title) == 0: + raise formencode.Invalid('Empty template tilte.', value, state) + del parts[0] + + # anonymous parameters + anonym_params = [] + while len(parts) > 0: + equalsign_pos = parts[0].find('=') + if equalsign_pos >= 0: break # named parameter + anonym_params.append(self.strip(parts[0])) + del parts[0] + + # named or numbered parameters + named_params = collections.OrderedDict() + while len(parts) > 0: + equalsign_pos = parts[0].find('=') + if equalsign_pos < 0: + raise formencode.Invalid('Anonymous parameter after named parameter.', value, state) + key, sep, value = parts[0].partition('=') + key = self.strip(key) + if len(key) == 0: + raise formencode.Invalid('Empty key.', value, state) + if key in named_params: + raise formencode.Invalid('Duplicate key: "{0}"'.format(key), value, state) + named_params[key] = self.strip(value) + del parts[0] + + return title, anonym_params, named_params + + def from_python(self, value, state=None): + """Formats a MediaWiki template. + value is a tuple: (title, anonym_params, named_params) + where title is the template title, anonym_params is a list of anonymous parameters and + named_params is a dict or OrderedDict of named parameters.""" + title, anonym_params, named_params = value + pipe_char, equal_char, end_char = ('\n| ', ' = ', '\n}}') if self.as_table else ('|', '=', '}}') + parts = ["{{" + title] + parts += anonym_params + as_table_keylen = self.as_table_keylen + if self.as_table and as_table_keylen is None: + as_table_keylen = max(list(map(len, iter(named_params.keys())))) + for k, v in named_params.items(): + if self.as_table: + k = k.ljust(as_table_keylen) + parts.append((k + equal_char + v).rstrip()) + else: + parts.append(k + equal_char + v) + return pipe_char.join(parts) + end_char + def split_template(template): - """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple - (template_title, parameters) where parameters is a Python dictionary {1: u'red', u'text'=u'Any text'}. + """Deprecated legacy function. + + Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple + (template_title, parameters) where parameters is a Python dictionary {u'1': u'red', u'text'=u'Any text'}. Anonymous parameters get integer keys (converted to unicode) starting with 1 like in MediaWiki, named parameters are unicode strings. Whitespace is stripped. If an unexpected format is encountered, a ValueError is raised.""" - if not template.startswith(u'{{'): raise ValueError(u'Template does not start with "{{"') - if not template.endswith(u'}}'): raise ValueError(u'Template does not end with "}}"') - parts = template[2:-2].split(u'|') - - # template name - template_title = parts[0].strip() - if len(template_title) == 0: raise ValueError(u'Empty template tilte.') - del parts[0] - - # anonymous parameters - params = {} # result dictionary - param_num = 1 - while len(parts) > 0: - equalsign_pos = parts[0].find(u'=') - if equalsign_pos >= 0: break # named parameter - params[unicode(param_num)] = parts[0].strip() - del parts[0] - param_num += 1 - - # named or numbered parameters - while len(parts) > 0: - equalsign_pos = parts[0].find(u'=') - if equalsign_pos < 0: raise ValueError(u'Anonymous parameter after named parameter.') - key, sep, value = parts[0].partition(u'=') - key = key.strip() - if len(key) == 0: raise ValueError(u'Empty key.') - if params.has_key(key): raise ValueError(u'Duplicate key: "{0}"'.format(key)) - params[key] = value.strip() - del parts[0] + try: + title, anonym_params, named_params = TemplateValidator().to_python(template) + parameters = dict(named_params) + for i in range(len(anonym_params)): + parameters[str(i+1)] = anonym_params[i] + except formencode.Invalid as e: + raise ValueError(e[0]) + return title, parameters - return template_title, params +def create_template(template_title, anonym_params=[], named_param_keys=[], named_param_values=[], as_table=False, as_table_keylen=None): + """Deprecated legacy function. -def create_template(template_title, anonym_params=[], named_param_keys=[], named_param_values=[], as_table=False): - """Formats a MediaWiki template. + Formats a MediaWiki template. :param template_title: Unicode string with the template name :param anonym_params: list with parameters without keys :param named_param_keys: list with keys of named parameters :param named_param_values: list with values of named parameters, corresponding to named_param_keys. + :param as_table: formats the returned template in one row for each parameter + :param as_table_keylen: length of the key field. None for "automatic". :return: unicode template""" - pipe_char, equal_char, end_char = (u'\n| ', u' = ', u'\n}}') if as_table else (u'|', u'=', u'}}') - parts = [u"{{" + template_title] - parts += anonym_params - if as_table: max_key_len = max([len(k) for k in named_param_keys]) - for i in xrange(len(named_param_keys)): - key = named_param_keys[i] - if as_table: key = key.ljust(max_key_len) - parts.append(key + equal_char + named_param_values[i]) - return pipe_char.join(parts) + end_char + named_params = collections.OrderedDict(list(zip(named_param_keys, named_param_values))) + return TemplateValidator(as_table=as_table, as_table_keylen=as_table_keylen).from_python((template_title, anonym_params, named_params)) + + +def find_tag(wikitext, tagname, pos=0): + """Returns position information of the first occurence of the tag '...' + or ''. + If you are sure that the wikitext contains the tag, the tag could be extracted like follows: + + >>> wikitext = u'This is a mytag tag.' + >>> start, content, endtag, end = find_template(wikitext, u'tag') + >>> print wikitext[start:end] + mytag + + :param wikitext: The text (preferalbe unicode) that has the template in it. + :param tagname: Name of the tag, e.g. u'tag' for . + :param pos: position within wikitext to start searching the tag. + :return: + (start, content, endtag, end). start is the position of '<' of the tag, + content is the beginning of the content (after '>'), enttag is the + beginning of the end tag ('".format(tagname), re.DOTALL) + match_starttag = regexp_starttag.search(wikitext, pos) + if match_starttag is None: + return None, None, None, None + + # does the tag have content? + if len(match_starttag.group(1)) == 1: # group(1) is either '' or '/'. + # single tag + return match_starttag.start(), None, None, match_starttag.end() + + # tag with content + regexp_endtag = re.compile(''.format(tagname), re.DOTALL) + match_endtag = regexp_endtag.search(wikitext, match_starttag.end()) + if match_endtag is None: + # No closing tag - error in wikitext + return None, None, None, None + return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end() + + +def parse_googlemap(wikitext): + """Parses the (unicode) u'content' of the googlemap extension. + If wikitext does not contain the tag or if the tag contains + invalid formatted lines, a ParseError is raised. + Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary + wikitext before using this function. + + :param wikitext: wikitext containing the template. Example: + + wikitext = ''' + + (Parkplatz)47.114958,11.266026 + Parkplatz + + (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus) + 6#FF014E9A + 47.114715,11.266262 + 47.114135,11.268381 + 47.113421,11.269322 + 47.11277,11.269979 + 47.112408,11.271119 + + ''' + :returns: The tuple (attributes, coords, paths) is returned. + attributes is a dict that contains the attribues that are present + (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or int. + coords is a list of (lon, lat, symbol, title) tuples. + paths is a list of (style, coords) tuples. + coords is again a list of (lon, lat, symbol, title) tuples.""" + + def is_coord(line): + """Returns True if the line contains a coordinate.""" + match = re.search('[0-9]{1,2}\.[0-9]+, ?[0-9]{1,2}\.[0-9]+', line) + return not match is None + + def is_path(line): + """Returns True if the line contains a path style definition.""" + match = re.match('[0-9]#[0-9a-fA-F]{8}', line) + return not match is None + + def parse_coord(line): + """Returns (lon, lat, symbol, title). If symbol or text is not present, None is returned.""" + match = re.match('\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line) + if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), match.group(4)) + match = re.match('\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line) + if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), None) + match = re.match('([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line) + if not match is None: return (float(match.group(2)), float(match.group(1)), None, match.group(3)) + match = re.match('([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line) + if not match is None: return (float(match.group(2)), float(match.group(1)), None, None) + return ParseError('Could not parse line ' + line) + + start, content, endtag, end = find_tag(wikitext, 'googlemap') + if start is None: + raise ParseError(' tag not found.') + if content is None: + xml_only = wikitext[start:endtag] + else: + xml_only = wikitext[start:content]+wikitext[endtag:end] + + try: + gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8')) + except xml.etree.ElementTree.ParseError as e: + row, column = e.position + raise ParseError("XML parse error in .") + + # parse attributes + attributes = {} + try: + for key in ['lon', 'lat']: + if gm.get(key) is not None: + attributes[key] = float(gm.get(key)) + for key in ['zoom', 'width', 'height']: + if gm.get(key) is not None: + attributes[key] = int(gm.get(key)) + except ValueError as error: + raise ParseError('Error at parsing attribute {0} of : {1}'.format(key, str(error))) + + # parse points and lines + coords = [] + paths = [] + lines = wikitext[content:endtag].split("\n") + i = 0 + while i < len(lines): + line = lines[i].strip() + i += 1 + + # Skip whitespace + if len(line) == 0: continue + + # Handle a path + if is_path(line): + match = re.match('([0-9]#[0-9a-fA-F]{8})', line) + style = match.group(1) + local_coords = [] + while i < len(lines): + line = lines[i].strip() + i += 1 + if is_path(line): + i -= 1 + break + if is_coord(line): + lon, lat, symbol, title = parse_coord(line) + local_coords.append((lon, lat, symbol, title)) + paths.append((style, local_coords)) + continue + + # Handle a coordinate + if is_coord(line): + lon, lat, symbol, title = parse_coord(line) + while i < len(lines): + line = lines[i].strip() + i += 1 + if is_path(line) or is_coord(line): + i -= 1 + break + if len(line) > 0 and title is None: title = line + coords.append((lon, lat, symbol, title)) + continue + + raise ParseError('Unknown line syntax: ' + line) + + return (attributes, coords, paths)