2 # -*- coding: iso-8859-15 -*-
5 """This module contains general functions that help parsing the mediawiki markup.
6 I looked for an already existing MediaWiki parser in Python but I didn't find anything
7 that convinced me. However, here are the links:
9 * py-wikimarkup https://github.com/dcramer/py-wikimarkup
10 * mwlib http://code.pediapress.com/wiki/wiki
13 import xml.etree.ElementTree
def find_template(wikitext, template_title):
    """Locate the first occurrence of the template '{{template_title ...}}' within wikitext.

    If you are sure that the wikitext contains the template, it can be extracted like this:

    >>> wikitext = u'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, u'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    or in one step:

    >>> print(wikitext[slice(*find_template(wikitext, u'Color'))])
    {{Color|red|red text}}

    The search is done with a regular expression. The parameter part of the template must
    not contain the character "}", so a template that contains another template
    (and therefore "}}") is not matched correctly.

    :param wikitext: The text (preferably unicode) that has the template in it.
    :param template_title: The page title of the template with or without namespace
        (but as in the wikitext). It is matched literally, not as a regular expression.
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start.
        (None, None) if the template is not found.
    """
    # re.escape: titles such as u'C++' or u'Foo (bar)' contain regex metacharacters
    # and have to be matched literally.
    pattern = r"\{\{" + re.escape(template_title) + r"\s*(\|[^\}]*)?\}\}"
    match = re.search(pattern, wikitext, re.DOTALL)
    if match is None:
        return None, None
    return match.start(), match.end()
def split_template(template):
    """Split a template like u'{{Color|red|text=Any text}}' into its title and parameters.

    The result is the tuple (template_title, parameters) where parameters is a
    Python dictionary, e.g. {u'1': u'red', u'text': u'Any text'}.
    Anonymous parameters get integer keys (converted to unicode) starting with 1
    like in MediaWiki, named parameters are unicode strings.
    Whitespace around the title, the keys and the values is stripped.
    Only the first "=" of a parameter separates key and value.

    :param template: the complete template text including the surrounding '{{' and '}}'.
    :return: tuple (template_title, parameters)
    :raises ValueError: if the text does not start with '{{' or end with '}}',
        the title or a key is empty, a key occurs twice or an anonymous
        parameter follows a named one.
    """
    if not template.startswith(u'{{'):
        raise ValueError(u'Template does not start with "{{"')
    if not template.endswith(u'}}'):
        raise ValueError(u'Template does not end with "}}"')
    parts = template[2:-2].split(u'|')

    # template name
    template_title = parts[0].strip()
    if not template_title:
        raise ValueError(u'Empty template tilte.')

    params = {}  # result dictionary
    named_seen = False
    for position, part in enumerate(parts[1:], 1):
        key, sep, value = part.partition(u'=')
        if not sep:
            # Anonymous parameter: only allowed in front of all named ones, so
            # its position in the template is also its MediaWiki number.
            if named_seen:
                raise ValueError(u'Anonymous parameter after named parameter.')
            params[u'{0}'.format(position)] = part.strip()
            continue
        # named or numbered parameter
        named_seen = True
        key = key.strip()
        if not key:
            raise ValueError(u'Empty key.')
        if key in params:
            raise ValueError(u'Duplicate key: "{0}"'.format(key))
        params[key] = value.strip()

    return template_title, params
def create_template(template_title, anonym_params=(), named_param_keys=(), named_param_values=(), as_table=False, as_table_keylen=None):
    """Formats a MediaWiki template.

    :param template_title: Unicode string with the template name
    :param anonym_params: sequence with parameters without keys
    :param named_param_keys: sequence with keys of named parameters
    :param named_param_values: sequence with values of named parameters,
        corresponding to named_param_keys (an IndexError is raised if it is shorter).
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic"
        (length of the longest key).
    :return: unicode template"""
    if as_table:
        pipe_char, equal_char, end_char = u'\n| ', u' = ', u'\n}}'
    else:
        pipe_char, equal_char, end_char = u'|', u'=', u'}}'

    if as_table and as_table_keylen is None:
        # "or [0]": max() of an empty list raises ValueError, which made
        # table formatting of templates without named parameters fail.
        as_table_keylen = max([len(key) for key in named_param_keys] or [0])

    parts = [u'{{' + template_title]
    parts.extend(anonym_params)
    for index, key in enumerate(named_param_keys):
        value = named_param_values[index]
        if as_table:
            # Pad the key so the equal signs line up; rstrip avoids trailing
            # blanks on rows with an empty value.
            parts.append((key.ljust(as_table_keylen) + equal_char + value).rstrip())
        else:
            parts.append(key + equal_char + value)
    return pipe_char.join(parts) + end_char
def parse_googlemap(wikitext):
    """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension
    out of a page. If wikitext does not contain the googlemaps extension text None is returned.
    If the googlemap contains invalid formatted lines, a RuntimeError is raised.

    :param wikitext: wikitext containing the googlemap. Example:

        <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
        (Parkplatz)47.114958,11.266026
        Parkplatz

        (Gasthaus) 47.114715, 11.266262, Alt Baernbad (Gasthaus)
        6#FF014E9A
        47.114715,11.266262
        47.114135,11.268381
        </googlemap>

        Coordinate lines have the form "[(symbol)] lat, lon[, title]". If a
        coordinate has no title, the next non-empty line that is neither a
        coordinate nor a path style is used as its title. A line starting with
        a path style like "6#FF014E9A" starts a path; all following coordinates
        up to the next path style belong to that path.

    :returns: the tuple (center, zoom, coords, paths).
        center is the tuple (lon, lat) of the google maps or (None, None) if not provided
        zoom is the google zoom level as integer or None if not provided
        coords is a list of (lon, lat, symbol, title) tuples.
        paths is a list of (style, coords) tuples.
        coords is again a list of (lon, lat, symbol, title) tuples.
    :raises RuntimeError: if a line is neither empty, a path style nor a coordinate,
        or if a line contains a coordinate that is not at the expected position."""
    coord_search_re = re.compile(r'[0-9]{1,2}\.[0-9]+, ?[0-9]{1,2}\.[0-9]+')
    path_re = re.compile(r'([0-9]#[0-9a-fA-F]{8})')
    # groups: 1 = symbol (optional), 2 = lat, 3 = lon, 4 = title (optional)
    coord_parse_re = re.compile(
        r'(?:\(([^)]+)\) ?)?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)(?:, ?(.*))?')

    def is_coord(line):
        """Returns True if the line contains a coordinate (anywhere in the line)."""
        return coord_search_re.search(line) is not None

    def is_path(line):
        """Returns True if the line starts with a path style definition."""
        return path_re.match(line) is not None

    def parse_coord(line):
        """Returns (lon, lat, symbol, title). If symbol or title is not present, None is returned
        in its place. Raises a RuntimeError if the line does not start with a coordinate."""
        match = coord_parse_re.match(line)
        if match is None:
            raise RuntimeError(u'Could not parse line ' + line)
        symbol, lat, lon, title = match.groups()
        return (float(lon), float(lat), symbol, title)

    match = re.search(r'(<googlemap[^>]*>)(.*)(</googlemap>)', wikitext, re.DOTALL)
    if match is None:
        return None
    content = match.group(2)

    # Let the XML parser read the attributes of the (emptied) googlemap element.
    gm = xml.etree.ElementTree.XML((match.group(1) + match.group(3)).encode('UTF8'))
    zoom = gm.get('zoom')
    lon = gm.get('lon')
    lat = gm.get('lat')
    if zoom is not None:
        zoom = int(zoom)
    if lon is not None:
        lon = float(lon)
    if lat is not None:
        lat = float(lat)
    center = (lon, lat)

    coords = []
    paths = []
    lines = [raw_line.strip() for raw_line in content.split("\n")]
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        # Skip whitespace
        if not line:
            continue

        # Handle a path: collect the coordinates up to the next path style,
        # lines that are not coordinates are ignored.
        if is_path(line):
            style = path_re.match(line).group(1)
            local_coords = []
            while i < len(lines) and not is_path(lines[i]):
                if is_coord(lines[i]):
                    local_coords.append(parse_coord(lines[i]))
                i += 1
            paths.append((style, local_coords))
            continue

        # Handle a coordinate: the following free text lines may provide the title.
        if is_coord(line):
            lon, lat, symbol, title = parse_coord(line)
            while i < len(lines) and not (is_path(lines[i]) or is_coord(lines[i])):
                if lines[i] and title is None:
                    title = lines[i]
                i += 1
            coords.append((lon, lat, symbol, title))
            continue

        raise RuntimeError(u'Unknown line syntax: ' + line)
    return (center, zoom, coords, paths)