X-Git-Url: https://git.toastfreeware.priv.at/philipp/winterrodeln/wrpylib.git/blobdiff_plain/42b2a8c2664b5b1916d11b9043c47d87dc917aba..5ddd2a48a18a837e3283173cef8f5f8dcc192e7e:/wrpylib/mwmarkup.py diff --git a/wrpylib/mwmarkup.py b/wrpylib/mwmarkup.py index f377c82..221f1cb 100644 --- a/wrpylib/mwmarkup.py +++ b/wrpylib/mwmarkup.py @@ -1,264 +1,63 @@ -#!/usr/bin/python2.7 -# -*- coding: iso-8859-15 -*- +#!/usr/bin/python3.4 # $Id$ # $HeadURL$ -"""This module contains general functions that help parsing the mediawiki markup. -I looked for an already existing MediaWiki parser in Python but I didn't find anything -that convinced me. However, here are the links: +"""For parsing MediaWiki text, we rely on the package mwparserfromhell (https://github.com/earwig/mwparserfromhell). +This module just contains a few additional useful functions. +Other Python MediaWiki parsers: * py-wikimarkup https://github.com/dcramer/py-wikimarkup * mwlib http://code.pediapress.com/wiki/wiki +* https://www.mediawiki.org/wiki/Alternative_parsers """ -import re -import xml.etree.ElementTree +class ParseError(RuntimeError): + """Exception used by some of the functions""" + pass -def find_template(wikitext, template_title): - """Returns the tuple (start, end) of the first occurence of the template '{{template ...}} within wikitext'. - (None, None) is returned if the template is not found. - If you are sure that the wikitext contains the template, the template could be extracted like follows: - >>> wikitext = u'This is a {{Color|red|red text}} template.' - >>> start, end = find_template(wikitext, u'Color') - >>> print wikitext[start:end] - {{Color|red|red text}} +def format_template_table(template, keylen=None): + """Reformat the given template to be tabular. 
- or just: + >>> template + {{foo|bar|bazz=7}} + >>> format_template_table(template) + {{foo + | bar + | bazz = 7 + }} - >>> print wikitext.__getslice__(*find_template(wikitext, u'Color')) - {{Color|red|red text}} - - The search is done with regular expression. It gives wrong results when parsing a template - containing the characters "}}" - - :param wikitext: The text (preferalbe unicode) that has the template in it. - :param template_title: The page title of the template with or without namespace (but as in the wikitext). - :return: - (start, end) of the first occurence with start >= 0 and end > start. - (None, None) if the template is not found. - """ - match = re.search(u"\{\{" + template_title + "\s*(\|[^\}]*)?\}\}", wikitext, re.DOTALL) - if match is None: return None, None - return match.start(), match.end() - - -def split_template(template): - """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple - (template_title, parameters) where parameters is a Python dictionary {u'1': u'red', u'text'=u'Any text'}. - Anonymous parameters get integer keys (converted to unicode) starting with 1 - like in MediaWiki, named parameters are unicode strings. - Whitespace is stripped. 
- If an unexpected format is encountered, a ValueError is raised.""" - if not template.startswith(u'{{'): raise ValueError(u'Template does not start with "{{"') - if not template.endswith(u'}}'): raise ValueError(u'Template does not end with "}}"') - parts = template[2:-2].split(u'|') - - # template name - template_title = parts[0].strip() - if len(template_title) == 0: raise ValueError(u'Empty template tilte.') - del parts[0] - - # anonymous parameters - params = {} # result dictionary - param_num = 1 - while len(parts) > 0: - equalsign_pos = parts[0].find(u'=') - if equalsign_pos >= 0: break # named parameter - params[unicode(param_num)] = parts[0].strip() - del parts[0] - param_num += 1 - - # named or numbered parameters - while len(parts) > 0: - equalsign_pos = parts[0].find(u'=') - if equalsign_pos < 0: raise ValueError(u'Anonymous parameter after named parameter.') - key, sep, value = parts[0].partition(u'=') - key = key.strip() - if len(key) == 0: raise ValueError(u'Empty key.') - if params.has_key(key): raise ValueError(u'Duplicate key: "{0}"'.format(key)) - params[key] = value.strip() - del parts[0] - - return template_title, params - - -def create_template(template_title, anonym_params=[], named_param_keys=[], named_param_values=[], as_table=False, as_table_keylen=None): - """Formats a MediaWiki template. - :param template_title: Unicode string with the template name - :param anonym_params: list with parameters without keys - :param named_param_keys: list with keys of named parameters - :param named_param_values: list with values of named parameters, corresponding to named_param_keys. - :param as_table: formats the returned template in one row for each parameter - :param as_table_keylen: length of the key field. None for "automatic". 
- :return: unicode template""" - pipe_char, equal_char, end_char = (u'\n| ', u' = ', u'\n}}') if as_table else (u'|', u'=', u'}}') - parts = [u"{{" + template_title] - parts += anonym_params - if as_table and as_table_keylen is None: - as_table_keylen = max([len(k) for k in named_param_keys]) - for i in xrange(len(named_param_keys)): - key = named_param_keys[i] - if as_table: - key = key.ljust(as_table_keylen) - parts.append((key + equal_char + named_param_values[i]).rstrip()) - else: - parts.append(key + equal_char + named_param_values[i]) - return pipe_char.join(parts) + end_char - - -def find_tag(wikitext, tagname, pos=0): - """Returns the tuple (start, end) of the first occurence of the tag '...' - or ''. - (None, None) is returned if the tag is not found. - If you are sure that the wikitext contains the tag, the tag could be extracted like follows: - - >>> wikitext = u'This is a mytag tag.' - >>> start, end = find_template(wikitext, u'tag') - >>> print wikitext[start:end] - mytag - - :param wikitext: The text (preferalbe unicode) that has the template in it. - :param tagname: Name of the tag, e.g. u'tag' for . - :param pos: position within wikitext to start searching the tag. - :return: - (start, content, endtag, end). start is the position of '<' of the tag, - content is the beginning of the content (after '>'), enttag is the - beginning of the end tag ('".format(tagname), re.DOTALL) - match_starttag = regexp_starttag.search(wikitext, pos) - if match_starttag is None: - return None, None, None, None - - # does the tag have content? - if len(match_starttag.group(1)) == 1: # group(1) is either '' or '/'. 
- # single tag - return match_starttag.start(), None, None, match_starttag.end() - - # tag with content - regexp_endtag = re.compile(u''.format(tagname), re.DOTALL) - match_endtag = regexp_endtag.search(wikitext, match_starttag.end()) - if match_endtag is None: - # No closing tag - error in wikitext - return None, None, None, None - return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end() - - -def parse_googlemap(wikitext, detail=False): - """Parses the (unicode) u'content' of the googlemap extension - out of a page. If wikitext does not contain the googlemap extension text None is returned. - If the googlemap contains invalid formatted lines, a RuntimeError is raised. - - :param wikitext: wikitext containing the template. Example: - :param detail: bool. If True, start and end position of ... is - returned additionally. - - wikitext = ''' - - (Parkplatz)47.114958,11.266026 - Parkplatz - - (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus) - 6#FF014E9A - 47.114715,11.266262 - 47.114135,11.268381 - 47.113421,11.269322 - 47.11277,11.269979 - 47.112408,11.271119 - - ''' - :returns: the tuple (center, zoom, coords, paths). - center is the tuple (lon, lat) of the google maps or (None, None) if not provided - zoom is the google zoom level as integer or None if not provided - coords is a list of (lon, lat, symbol, title) tuples. - paths is a list of (style, coords) tuples. - coords is again a list of (lot, lat, symbol, title) tuples. - If detail is True, (center, zoom, coords, paths, start, end) is returned.""" - - def is_coord(line): - """Returns True if the line contains a coordinate.""" - match = re.search('[0-9]{1,2}\.[0-9]+, ?[0-9]{1,2}\.[0-9]+', line) - return not match is None - - def is_path(line): - """Returns True if the line contains a path style definition.""" - match = re.match('[0-9]#[0-9a-fA-F]{8}', line) - return not match is None - - def parse_coord(line): - """Returns (lon, lat, symbol, title). 
If symbol or text is not present, None is returned.""" - match = re.match(u'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line) - if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), match.group(4)) - match = re.match(u'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line) - if not match is None: return (float(match.group(3)), float(match.group(2)), match.group(1), None) - match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line) - if not match is None: return (float(match.group(2)), float(match.group(1)), None, match.group(3)) - match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line) - if not match is None: return (float(match.group(2)), float(match.group(1)), None, None) - return RuntimeError(u'Could not parse line ' + line) - - regexp = re.compile(u"(]*>)(.*?)()", re.DOTALL) - match = regexp.search(wikitext) - if match is None: return None - start = match.start() - end = match.end() - content = match.group(2) - gm = xml.etree.ElementTree.XML((match.group(1)+match.group(3)).encode('UTF8')) - zoom = gm.get('zoom') - lon = gm.get('lon') - lat = gm.get('lat') - if not zoom is None: zoom = int(zoom) - if not lon is None: lon = float(lon) - if not lat is None: lat = float(lat) - center = (lon, lat) - - coords = [] - paths = [] - lines = content.split("\n") - i = 0 - while i < len(lines): - line = lines[i].strip() - i += 1 - - # Skip whitespace - if len(line) == 0: continue - - # Handle a path - if is_path(line): - match = re.match(u'([0-9]#[0-9a-fA-F]{8})', line) - style = match.group(1) - local_coords = [] - while i < len(lines): - line = lines[i].strip() - i += 1 - if is_path(line): - i -= 1 - break - if is_coord(line): - lon, lat, symbol, title = parse_coord(line) - local_coords.append((lon, lat, symbol, title)) - paths.append((style, local_coords)) - continue - - # Handle a coordinate - if is_coord(line): - lon, lat, symbol, title = parse_coord(line) - while i 
< len(lines): - line = lines[i].strip() - i += 1 - if is_path(line) or is_coord(line): - i -= 1 - break - if len(line) > 0 and title is None: title = line - coords.append((lon, lat, symbol, title)) - continue - - raise RuntimeError(u'Unknown line syntax: ' + line) - if detail: - return (center, zoom, coords, paths, start, end) - return (center, zoom, coords, paths) - + if keylen is None: + shown_keys = [len(param.name.strip()) for param in template.params if param.showkey] + keylen = max(shown_keys) if shown_keys else 0 + template.name = '{}\n'.format(template.name.strip()) + for param in template.params: + if param.showkey: + param.name = ' {{:{}}} '.format(keylen).format(param.name.strip()) + value = param.value.strip() + if len(value) > 0: + param.value = ' {}\n'.format(value) + else: + param.value = '\n' + + +def format_template_oneline(template): + """Formats a template like this: {{template_name|param| }} + (whitespace is stripped and empty parameters are replaced with one space).""" + template.name = template.name.strip() + for param in template.params: + if param.showkey: + param.name = param.name.strip() + value = param.value.strip() + if value == '': + value = ' ' + param.value = value + + +def dbkey_to_title(value): + """Converts an article database key to an article title. Private function secureAndSplit() of the Title class + on line 3316 of includes/Title.php says: + $this->mTextform = str_replace( '_', ' ', $this->mDbkeyform ); + No check for None because a missing title is an error.""" + return value.replace('_', ' ')