2 # -*- coding: iso-8859-15 -*-
5 """This module contains general functions that help parsing the mediawiki markup.
6 I looked for an already existing MediaWiki parser in Python but I didn't find anything
7 that convinced me. However, here are the links:
9 * py-wikimarkup https://github.com/dcramer/py-wikimarkup
10 * mwlib http://code.pediapress.com/wiki/wiki
13 import xml.etree.ElementTree
class ParseError(RuntimeError):
    """Error type raised by some of the parsing functions of this module."""

    pass
def find_template(wikitext, template_title):
    """Return the span (start, end) of the first '{{template_title ...}}' in wikitext.

    If you are sure that the wikitext contains the template, the template
    can be extracted as follows:

    >>> wikitext = u'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, u'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    The search is done with a regular expression. It gives wrong results
    when the template parameters contain the character "}" (for example a
    nested template).

    :param wikitext: The text (preferably unicode) that has the template in it.
    :param template_title: The page title of the template with or without
        namespace (but exactly as written in the wikitext). It is matched
        literally, so regular expression meta characters in the title
        (e.g. u'C++') are safe.
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start.
        (None, None) if the template is not found.
    """
    # The title is user data, not a pattern: escape it so that characters
    # like '+', '(' or '.' neither break the regex nor match unintended text.
    pattern = r'\{\{' + re.escape(template_title) + r'\s*(\|[^\}]*)?\}\}'
    match = re.search(pattern, wikitext, re.DOTALL)
    if match is None:
        return None, None
    return match.start(), match.end()
class TemplateValidator(formencode.FancyValidator):
    """Validator that converts the wikitext of one template to a Python tuple."""

    def to_python(self, value, state=None):
        """Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
        (title, anonym_params, named_params) where title is the template title,
        anonym_params is a list of anonymous parameters and named_params is an OrderedDict
        of named parameters. Whitespace of the parameters is stripped.

        The template is split at every '|'. All parameters up to (but not
        including) the first one containing '=' are anonymous; every
        parameter from there on has to be named.

        :param value: wikitext of the template, including the enclosing braces.
        :param state: passed through unchanged to formencode.Invalid.
        :raises formencode.Invalid: if the braces are missing, the title is
            empty, an anonymous parameter follows a named one, a key is empty
            or a key occurs twice. The exception always carries the complete
            template as its value.
        """
        if not value.startswith(u'{{'):
            raise formencode.Invalid(u'Template does not start with "{{"', value, state)
        if not value.endswith(u'}}'):
            raise formencode.Invalid(u'Template does not end with "}}"', value, state)
        parts = value[2:-2].split(u'|')

        # template name
        title = parts[0].strip()
        if not title:
            raise formencode.Invalid(u'Empty template tilte.', value, state)
        params = parts[1:]

        # anonymous parameters: the leading run of parameters without '='
        first_named = len(params)
        for index, part in enumerate(params):
            if u'=' in part:
                first_named = index
                break
        anonym_params = [part.strip() for part in params[:first_named]]

        # named or numbered parameters
        named_params = collections.OrderedDict()
        for part in params[first_named:]:
            # Use a separate name for the parameter value so that 'value'
            # keeps referring to the whole template in the error reports.
            key, sep, param_value = part.partition(u'=')
            if not sep:
                raise formencode.Invalid(u'Anonymous parameter after named parameter.', value, state)
            key = key.strip()
            if not key:
                raise formencode.Invalid(u'Empty key.', value, state)
            if key in named_params:
                raise formencode.Invalid(u'Duplicate key: "{0}"'.format(key), value, state)
            named_params[key] = param_value.strip()

        return title, anonym_params, named_params
def split_template(template):
    """Deprecated legacy function.

    Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
    (template_title, parameters) where parameters is a Python dictionary {u'1': u'red', u'text': u'Any text'}.
    Anonymous parameters get integer keys (converted to unicode) starting with 1
    like in MediaWiki, named parameters are unicode strings.
    Whitespace is stripped.
    If an unexpected format is encountered, a ValueError is raised.

    :param template: wikitext of the template, including the enclosing braces.
    :return: tuple (template_title, parameters).
    :raises ValueError: with the message of the underlying formencode.Invalid.
    """
    try:
        title, anonym_params, named_params = TemplateValidator().to_python(template)
    except formencode.Invalid as e:
        # e.args[0] is the message; indexing the exception itself (e[0])
        # only works on Python 2.
        raise ValueError(e.args[0])
    parameters = dict(named_params)
    # Positional keys are written last, so they win over a named parameter
    # that happens to use the same number as key.
    for position, param in enumerate(anonym_params, 1):
        parameters[u'{0}'.format(position)] = param
    return title, parameters
def create_template(template_title, anonym_params=(), named_param_keys=(), named_param_values=(), as_table=False, as_table_keylen=None):
    """Formats a MediaWiki template.

    :param template_title: Unicode string with the template name
    :param anonym_params: sequence with parameters without keys
    :param named_param_keys: sequence with keys of named parameters
    :param named_param_values: sequence with values of named parameters,
        corresponding to named_param_keys.
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic"
        (the length of the longest key).
    :return: unicode template
    :raises IndexError: if named_param_values is shorter than named_param_keys.
    """
    if as_table:
        pipe_char, equal_char, end_char = u'\n| ', u' = ', u'\n}}'
    else:
        pipe_char, equal_char, end_char = u'|', u'=', u'}}'

    parts = [u"{{" + template_title]
    parts.extend(anonym_params)

    if as_table and as_table_keylen is None:
        # 'or [0]' keeps max() from failing when there are no named parameters.
        as_table_keylen = max([len(key) for key in named_param_keys] or [0])

    for index, key in enumerate(named_param_keys):
        value = named_param_values[index]
        if as_table:
            # Pad the keys so the '=' signs line up; rstrip avoids trailing
            # whitespace on rows with an empty value.
            parts.append((key.ljust(as_table_keylen) + equal_char + value).rstrip())
        else:
            parts.append(key + equal_char + value)
    return pipe_char.join(parts) + end_char
def find_tag(wikitext, tagname, pos=0):
    """Returns position information of the first occurrence of the tag '<tag ...>...</tag>'
    or '<tag ... />'.

    If you are sure that the wikitext contains the tag, the tag can be extracted as follows:

    >>> wikitext = u'This is a <tag>mytag</tag> tag.'
    >>> start, content, endtag, end = find_tag(wikitext, u'tag')
    >>> print(wikitext[start:end])
    <tag>mytag</tag>

    :param wikitext: The text (preferably unicode) that has the tag in it.
    :param tagname: Name of the tag, e.g. u'tag' for <tag>. It is matched
        literally and as a whole name (u'tag' does not match '<tags>').
    :param pos: position within wikitext to start searching the tag.
    :return:
        (start, content, endtag, end). start is the position of '<' of the tag,
        content is the beginning of the content (after '>'), endtag is the
        beginning of the end tag ('</') and end is one position after the end tag.
        For single tags, (start, None, None, end) is returned.
        If the tag is not found (or only the start tag is present),
        (None, None, None, None) is returned.
    """
    escaped_name = re.escape(tagname)

    # The lookahead makes sure the name ends here, so that searching for
    # 'tag' does not pick up '<tags>'. Group 1 is '/' for a self-closing tag.
    regexp_starttag = re.compile(r'<' + escaped_name + r'(?=[\s/>]).*?(/?)>', re.DOTALL)
    match_starttag = regexp_starttag.search(wikitext, pos)
    if match_starttag is None:
        return None, None, None, None

    # does the tag have content?
    if match_starttag.group(1) == u'/':
        # single tag like <tag />
        return match_starttag.start(), None, None, match_starttag.end()

    # tag with content: look for the end tag after the start tag
    regexp_endtag = re.compile(r'</' + escaped_name + r'>', re.DOTALL)
    match_endtag = regexp_endtag.search(wikitext, match_starttag.end())
    if match_endtag is None:
        # No closing tag - error in wikitext
        return None, None, None, None
    return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end()
def parse_googlemap(wikitext):
    """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension.
    If wikitext does not contain the <googlemap> tag or if the <googlemap> tag contains
    invalid formatted lines, a ParseError is raised.
    Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary
    wikitext before using this function.

    :param wikitext: wikitext containing the tag. Example:

        <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
        (Parkplatz)47.114958,11.266026
        (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
        </googlemap>

    :returns: The tuple (attributes, coords, paths) is returned.
        attributes is a dict that contains the attributes that are present
        (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or int.
        coords is a list of (lon, lat, symbol, title) tuples.
        paths is a list of (style, coords) tuples.
        coords is again a list of (lon, lat, symbol, title) tuples.
    :raises ParseError: if the tag is missing, its start tag is not valid XML,
        an attribute cannot be converted or a content line cannot be parsed.
    """

    def is_coord(line):
        """Returns True if the line contains a coordinate."""
        return re.search(r'[0-9]{1,2}\.[0-9]+, ?[0-9]{1,2}\.[0-9]+', line) is not None

    def is_path(line):
        """Returns True if the line contains a path style definition."""
        return re.match(r'[0-9]#[0-9a-fA-F]{8}', line) is not None

    def parse_coord(line):
        """Returns (lon, lat, symbol, title). If symbol or title is not present, None is returned
        in its place. The line has the format '(symbol) lat, lon, title' where the
        '(symbol)' and ', title' parts are optional. Raises ParseError for other lines."""
        match = re.match(
            r'(?:\(([^)]+)\) ?)?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)(?:, ?(.*))?', line)
        if match is None:
            # is_coord() searches anywhere in the line while this pattern is
            # anchored at the start, so this can really happen.
            raise ParseError(u'Could not parse line ' + line)
        symbol, lat, lon, title = match.groups()
        return float(lon), float(lat), symbol, title

    start, content, endtag, end = find_tag(wikitext, 'googlemap')
    if start is None:
        raise ParseError(u'<googlemap> tag not found.')
    if content is None:
        # Self-closing tag <googlemap ... />: it is complete XML on its own
        # and has no content lines.
        xml_only = wikitext[start:end]
        body = u''
    else:
        # Keep only start and end tag; the content is not XML.
        xml_only = wikitext[start:content] + wikitext[endtag:end]
        body = wikitext[content:endtag]

    try:
        gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8'))
    except xml.etree.ElementTree.ParseError:
        raise ParseError(u"XML parse error in <googlemap ...>.")

    # parse attributes
    attributes = {}
    conversions = ((('lon', 'lat'), float), (('zoom', 'width', 'height'), int))
    for keys, convert in conversions:
        for key in keys:
            raw = gm.get(key)
            if raw is None:
                continue
            try:
                attributes[key] = convert(raw)
            except ValueError as error:
                raise ParseError(u'Error at parsing attribute {0} of <googlemap>: {1}'.format(key, error))

    # parse points and lines
    coords = []
    paths = []
    lines = [line.strip() for line in body.split("\n")]
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        # Skip whitespace
        if not line:
            continue

        # Handle a path: a style line followed by its coordinates. The path
        # ends at the next style line; other lines in between are ignored.
        if is_path(line):
            style = re.match(r'([0-9]#[0-9a-fA-F]{8})', line).group(1)
            local_coords = []
            while i < len(lines) and not is_path(lines[i]):
                if is_coord(lines[i]):
                    local_coords.append(parse_coord(lines[i]))
                i += 1
            paths.append((style, local_coords))
            continue

        # Handle a coordinate: if it has no title on its own line, the first
        # non-empty line that follows (and is no path or coordinate) is the title.
        if is_coord(line):
            lon, lat, symbol, title = parse_coord(line)
            while i < len(lines) and not (is_path(lines[i]) or is_coord(lines[i])):
                if lines[i] and title is None:
                    title = lines[i]
                i += 1
            coords.append((lon, lat, symbol, title))
            continue

        raise ParseError(u'Unknown line syntax: ' + line)

    return (attributes, coords, paths)