2 # -*- coding: iso-8859-15 -*-
"""This module contains general functions that help parsing the MediaWiki markup.

I looked for an already existing MediaWiki parser in Python but I didn't find anything
that convinced me. However, here are the links:

* py-wikimarkup https://github.com/dcramer/py-wikimarkup
* mwlib http://code.pediapress.com/wiki/wiki
"""
import collections
import re
import xml.etree.ElementTree
class ParseError(RuntimeError):
    """Exception raised by the parsing functions of this module when wikitext is malformed."""
def find_template(wikitext, template_title):
    """Return the span of the first occurrence of ``{{template_title ...}}`` within wikitext.

    If you are sure that the wikitext contains the template, the template can be
    extracted like this:

    >>> wikitext = 'This is a {{Color|red|red text}} template.'
    >>> start, end = find_template(wikitext, 'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    The search is done with a regular expression. It gives wrong results when the
    template parameters contain the character "}" (e.g. nested templates).

    :param wikitext: The text (preferably unicode) that has the template in it.
    :param template_title: The page title of the template with or without namespace
        (but as in the wikitext). The title is matched literally.
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start.
        (None, None) if the template is not found.
    """
    # Escape the title so characters like '(' or '.' are not interpreted as regex syntax.
    pattern = r"\{\{" + re.escape(template_title) + r"\s*(\|[^\}]*)?\}\}"
    match = re.search(pattern, wikitext, re.DOTALL)
    if match is None:
        return None, None
    return match.start(), match.end()
class TemplateValidator(formencode.FancyValidator):
    """Converts between MediaWiki template markup and a (title, anonym_params, named_params) tuple.

    The template is split at every '|', so parameter values that contain '|' themselves
    (nested templates, piped links) are not supported.
    """

    def __init__(self, strip=True, as_table=False, as_table_keylen=None):
        """Validates a MediaWiki template, e.g. {{Color|red}}

        :param strip: If strip is True, the title, and the parameter keys and values are stripped in to_python.
        :param as_table: formats the returned template in one row for each parameter
        :param as_table_keylen: length of the key field for from_python. None for "automatic"."""
        # NOTE(review): the base class __init__ is not called here; the original code
        # did not call it either - confirm whether formencode needs it.
        self.strip = (lambda s: s.strip()) if strip else (lambda s: s)
        self.as_table = as_table
        self.as_table_keylen = as_table_keylen

    def to_python(self, value, state=None):
        """Takes a template, like '{{Color|red|text=Any text}}' and translates it to a Python tuple
        (title, anonym_params, named_params) where title is the template title,
        anonym_params is a list of anonymous parameters and named_params is an OrderedDict
        of named parameters. Whitespace of the title, keys and values is stripped if the
        validator was created with strip=True.

        :raises formencode.Invalid: if the template does not start with '{{' or end with '}}',
            the title or a key is empty, a key occurs twice or an anonymous parameter
            follows a named one."""
        if not value.startswith('{{'):
            raise formencode.Invalid('Template does not start with "{{"', value, state)
        if not value.endswith('}}'):
            raise formencode.Invalid('Template does not end with "}}"', value, state)
        parts = value[2:-2].split('|')

        # template title
        title = self.strip(parts[0])
        if len(title) == 0:
            raise formencode.Invalid('Empty template tilte.', value, state)
        params = parts[1:]

        # anonymous parameters: the leading parameters that do not contain '='
        anonym_params = []
        index = 0
        while index < len(params) and '=' not in params[index]:
            anonym_params.append(self.strip(params[index]))
            index += 1

        # named or numbered parameters: everything after the first 'key=value' part
        named_params = collections.OrderedDict()
        for part in params[index:]:
            # Use a separate name for the parameter value so that `value` keeps
            # referring to the whole template in the error reports below.
            key, sep, param_value = part.partition('=')
            if not sep:
                raise formencode.Invalid('Anonymous parameter after named parameter.', value, state)
            key = self.strip(key)
            if len(key) == 0:
                raise formencode.Invalid('Empty key.', value, state)
            if key in named_params:
                raise formencode.Invalid('Duplicate key: "{0}"'.format(key), value, state)
            named_params[key] = self.strip(param_value)

        return title, anonym_params, named_params

    def from_python(self, value, state=None):
        """Formats a MediaWiki template.

        value is a tuple: (title, anonym_params, named_params)
        where title is the template title, anonym_params is a list of anonymous parameters and
        named_params is a dict or OrderedDict of named parameters.

        :return: the template markup; with as_table=True each parameter is placed on its own
            line and the keys are padded to as_table_keylen (or to the longest key)."""
        title, anonym_params, named_params = value
        if self.as_table:
            pipe_char, equal_char, end_char = '\n| ', ' = ', '\n}}'
        else:
            pipe_char, equal_char, end_char = '|', '=', '}}'
        parts = ["{{" + title]
        parts.extend(anonym_params)

        as_table_keylen = self.as_table_keylen
        if self.as_table and as_table_keylen is None:
            # "automatic": pad to the longest key; 0 avoids max() failing without named parameters
            as_table_keylen = max(len(key) for key in named_params) if named_params else 0

        for key, param_value in named_params.items():
            if self.as_table:
                # rstrip removes the trailing blank left behind by an empty value
                parts.append((key.ljust(as_table_keylen) + equal_char + param_value).rstrip())
            else:
                parts.append(key + equal_char + param_value)
        return pipe_char.join(parts) + end_char
def split_template(template):
    """Deprecated legacy function.

    Takes a template, like '{{Color|red|text=Any text}}' and translates it to a Python tuple
    (template_title, parameters) where parameters is a Python dictionary
    {'1': 'red', 'text': 'Any text'}.
    Anonymous parameters get integer keys (converted to strings) starting with 1
    like in MediaWiki, named parameters keep their key.
    Whitespace is stripped.
    If an unexpected format is encountered, a ValueError is raised."""
    try:
        title, anonym_params, named_params = TemplateValidator().to_python(template)
    except formencode.Invalid as e:
        # args[0] is the message passed to formencode.Invalid by to_python
        # (exceptions are no longer subscriptable in Python 3).
        raise ValueError(e.args[0])
    parameters = dict(named_params)
    # Anonymous parameters are added last, so they win over a named parameter with the same number.
    for number, param in enumerate(anonym_params, 1):
        parameters[str(number)] = param
    return title, parameters
def create_template(template_title, anonym_params=None, named_param_keys=None, named_param_values=None, as_table=False, as_table_keylen=None):
    """Deprecated legacy function.

    Formats a MediaWiki template.

    :param template_title: Unicode string with the template name
    :param anonym_params: list with parameters without keys (default: no parameters)
    :param named_param_keys: list with keys of named parameters (default: no keys)
    :param named_param_values: list with values of named parameters, corresponding to named_param_keys.
        If the two lists differ in length, the surplus entries of the longer one are ignored.
    :param as_table: formats the returned template in one row for each parameter
    :param as_table_keylen: length of the key field. None for "automatic".
    :return: unicode template"""
    # None replaces the former mutable list defaults; an omitted argument still means "empty".
    anonym_params = [] if anonym_params is None else anonym_params
    named_param_keys = [] if named_param_keys is None else named_param_keys
    named_param_values = [] if named_param_values is None else named_param_values
    named_params = collections.OrderedDict(zip(named_param_keys, named_param_values))
    validator = TemplateValidator(as_table=as_table, as_table_keylen=as_table_keylen)
    return validator.from_python((template_title, anonym_params, named_params))
def find_tag(wikitext, tagname, pos=0):
    """Returns position information of the first occurrence of the tag '<tag ...>...</tag>'
    or '<tag ... />'.

    If you are sure that the wikitext contains the tag, the tag can be extracted like this:

    >>> wikitext = 'This is a <tag>mytag</tag> tag.'
    >>> start, content, endtag, end = find_tag(wikitext, 'tag')
    >>> print(wikitext[start:end])
    <tag>mytag</tag>

    :param wikitext: The text (preferably unicode) that has the tag in it.
    :param tagname: Name of the tag, e.g. 'tag' for <tag>. The name is matched literally.
    :param pos: position within wikitext to start searching the tag.
    :return:
        (start, content, endtag, end). start is the position of '<' of the tag,
        content is the beginning of the content (after '>'), endtag is the
        beginning of the end tag ('</') and end is one position after the end tag.
        For single tags, (start, None, None, end) is returned.
        If the tag is not found (or only the start tag is present),
        (None, None, None, None) is returned.
    """
    # The lookahead requires the tag name to end right here, so that searching
    # for 'tag' does not match '<tags>'. Group 1 captures the '/' of a single tag.
    starttag_re = re.compile(r"<{0}(?=[\s/>]).*?(/?)>".format(re.escape(tagname)), re.DOTALL)
    match_starttag = starttag_re.search(wikitext, pos)
    if match_starttag is None:
        return None, None, None, None

    # does the tag have content?
    if match_starttag.group(1) == '/':
        # No: single tag like <tag />
        return match_starttag.start(), None, None, match_starttag.end()

    # Yes: look for the matching end tag after the start tag
    endtag = '</{0}>'.format(tagname)
    endtag_start = wikitext.find(endtag, match_starttag.end())
    if endtag_start < 0:
        # No closing tag - error in wikitext
        return None, None, None, None
    return match_starttag.start(), match_starttag.end(), endtag_start, endtag_start + len(endtag)
def parse_googlemap(wikitext):
    """Parses the (unicode) '<googlemap ...>content</googlemap>' of the googlemap extension.
    If wikitext does not contain the <googlemap> tag or if the <googlemap> tag contains
    invalid formatted lines, a ParseError is raised.
    Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary
    wikitext before using this function.

    :param wikitext: wikitext containing the tag. Example:

        <googlemap version="0.9" lat="47.113291" lon="11.272337" zoom="15">
        (Parkplatz)47.114958,11.266026
        Parkplatz

        (Gasthaus) 47.114715, 11.266262, Alt Bärnbad (Gasthaus)
        6#FF014E9A
        47.114715,11.266262
        47.114135,11.268939
        </googlemap>

    :returns: The tuple (attributes, coords, paths) is returned.
        attributes is a dict that contains the attributes that are present
        (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or int.
        coords is a list of (lon, lat, symbol, title) tuples.
        paths is a list of (style, coords) tuples.
        coords is again a list of (lon, lat, symbol, title) tuples.
    :raises ParseError: if the tag is missing, its XML or attributes are invalid or
        a content line is neither a coordinate nor a path style."""
    # A coordinate value has one or two digits before the decimal point.
    number = r'[0-9]{1,2}\.[0-9]+'
    coord_anywhere_re = re.compile(number + ', ?' + number)
    path_style_re = re.compile(r'([0-9]#[0-9a-fA-F]{8})')
    # Optional '(symbol)', then 'lat, lon', then an optional ', title'.
    coord_line_re = re.compile(
        r'(?:\(([^)]+)\) ?)?(' + number + '), ?(' + number + ')(?:, ?(.*))?')

    def is_coord(line):
        """Returns True if the line contains a coordinate."""
        return coord_anywhere_re.search(line) is not None

    def is_path(line):
        """Returns True if the line contains a path style definition."""
        return path_style_re.match(line) is not None

    def parse_coord(line):
        """Returns (lon, lat, symbol, title). If symbol or title is not present, None is returned
        in its place. Raises ParseError if the line does not start with a coordinate."""
        match = coord_line_re.match(line)
        if match is None:
            raise ParseError('Could not parse line ' + line)
        symbol, lat, lon, title = match.groups()
        return float(lon), float(lat), symbol, title

    start, content, endtag, end = find_tag(wikitext, 'googlemap')
    if start is None:
        raise ParseError('<googlemap> tag not found.')
    if content is None:
        # Single tag <googlemap ... />: the tag is the whole XML and there are no content lines.
        xml_only = wikitext[start:end]
        body = ''
    else:
        # Cut the content out so that only the start and end tag are handed to the XML parser.
        xml_only = wikitext[start:content] + wikitext[endtag:end]
        body = wikitext[content:endtag]

    try:
        gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8'))
    except xml.etree.ElementTree.ParseError:
        raise ParseError("XML parse error in <googlemap ...>.")

    # parse attributes
    attributes = {}
    for keys, convert in ((('lon', 'lat'), float), (('zoom', 'width', 'height'), int)):
        for key in keys:
            raw = gm.get(key)
            if raw is None:
                continue
            try:
                attributes[key] = convert(raw)
            except ValueError as error:
                raise ParseError('Error at parsing attribute {0} of <googlemap>: {1}'.format(key, str(error)))

    # parse points and lines
    coords = []
    paths = []
    lines = [raw_line.strip() for raw_line in body.split("\n")]
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        # Skip empty lines
        if len(line) == 0:
            continue

        # Handle a path: a style line followed by its coordinates up to the next style line
        if is_path(line):
            style = path_style_re.match(line).group(1)
            local_coords = []
            while i < len(lines) and not is_path(lines[i]):
                # Lines within a path that contain no coordinate are ignored.
                if is_coord(lines[i]):
                    local_coords.append(parse_coord(lines[i]))
                i += 1
            paths.append((style, local_coords))
            continue

        # Handle a coordinate
        if is_coord(line):
            lon, lat, symbol, title = parse_coord(line)
            # The first non-empty line after a coordinate without title is taken as its title;
            # all other lines up to the next coordinate or path are ignored.
            while i < len(lines) and not (is_path(lines[i]) or is_coord(lines[i])):
                if len(lines[i]) > 0 and title is None:
                    title = lines[i]
                i += 1
            coords.append((lon, lat, symbol, title))
            continue

        raise ParseError('Unknown line syntax: ' + line)

    return (attributes, coords, paths)