-#!/usr/bin/python2.6
+#!/usr/bin/python2.7
# -*- coding: iso-8859-15 -*-
# $Id$
# $HeadURL$
import xml.etree.ElementTree
class ParseError(RuntimeError):
    """Raised by the parsing functions of this module when the given
    wikitext does not have the expected form."""
+
+
def find_template(wikitext, template_title):
    """Return the span of the first '{{template_title ...}}' within wikitext.

    The search is done with a regular expression and is deliberately simple:
    the parameter part must not contain the character '}'. Templates whose
    parameters contain '}' (for example nested templates) are therefore
    either not found or cut short.

    >>> wikitext = u'A {{Color|red|red text}} example.'
    >>> start, end = find_template(wikitext, u'Color')
    >>> print(wikitext[start:end])
    {{Color|red|red text}}

    :param wikitext: The text (preferably unicode) that has the template in it.
    :param template_title: The page title of the template with or without
        namespace (but as in the wikitext). It is matched literally.
    :return:
        (start, end) of the first occurrence with start >= 0 and end > start.
        (None, None) if the template is not found.
    """
    # The title is user data, not a pattern: escape it so that characters
    # such as '+', '.', '(' or ')' are matched literally instead of being
    # interpreted (or rejected) by the regex engine.
    pattern = r"\{\{" + re.escape(template_title) + r"\s*(\|[^\}]*)?\}\}"
    match = re.search(pattern, wikitext)
    if match is None:
        return None, None
    return match.start(), match.end()
-
def split_template(template):
"""Takes a template, like u'{{Color|red|text=Any text}}' and translates it to a Python tuple
- (template_title, parameters) where parameters is a Python dictionary {1: u'red', u'text'=u'Any text'}.
+ (template_title, parameters) where parameters is a Python dictionary {u'1': u'red', u'text'=u'Any text'}.
Anonymous parameters get integer keys (converted to unicode) starting with 1
like in MediaWiki, named parameters are unicode strings.
Whitespace is stripped.
as_table_keylen = max([len(k) for k in named_param_keys])
for i in xrange(len(named_param_keys)):
key = named_param_keys[i]
- if as_table: key = key.ljust(as_table_keylen)
- parts.append(key + equal_char + named_param_values[i])
+ if as_table:
+ key = key.ljust(as_table_keylen)
+ parts.append((key + equal_char + named_param_values[i]).rstrip())
+ else:
+ parts.append(key + equal_char + named_param_values[i])
return pipe_char.join(parts) + end_char
def find_tag(wikitext, tagname, pos=0):
    """Locate the first occurrence of the tag '<tag ...>...</tag>' or
    '<tag ... />' within wikitext.

    If you are sure that the wikitext contains the tag, the tag could be
    extracted like follows:

    >>> wikitext = u'This is a <tag>mytag</tag> tag.'
    >>> start, content, endtag, end = find_tag(wikitext, u'tag')
    >>> print(wikitext[start:end])
    <tag>mytag</tag>
    >>> print(wikitext[content:endtag])
    mytag

    The search is purely textual: the start tag ends at the first '>' after
    its name and the first end tag found after it is used, so nested tags of
    the same name are not paired correctly.

    :param wikitext: The text (preferably unicode) that has the tag in it.
    :param tagname: Name of the tag, e.g. u'tag' for <tag>. Matched literally.
    :param pos: position within wikitext to start searching the tag.
    :return:
        (start, content, endtag, end). start is the position of '<' of the tag,
        content is the beginning of the content (after '>'), endtag is the
        beginning of the end tag ('</') and end is one position after the end tag.
        For single tags, (start, None, None, end) is returned.
        If the tag is not found (or only the start tag is present),
        (None, None, None, None) is returned.
    """
    not_found = (None, None, None, None)
    escaped_name = re.escape(tagname)

    # Find start tag. The lookahead requires the name to be followed by
    # whitespace, '/' or '>', so u'tag' does not match '<tagline>'.
    # group(1) is '/' for a self-closing tag and '' otherwise.
    regexp_starttag = re.compile(r"<" + escaped_name + r"(?=[\s/>])[^>]*?(/?)>")
    match_starttag = regexp_starttag.search(wikitext, pos)
    if match_starttag is None:
        return not_found

    # Single tag ('<tag ... />'): there is no content and no end tag.
    if match_starttag.group(1) == "/":
        return match_starttag.start(), None, None, match_starttag.end()

    # Tag with content: look for the end tag behind the start tag.
    regexp_endtag = re.compile(r"</" + escaped_name + r"\s*>")
    match_endtag = regexp_endtag.search(wikitext, match_starttag.end())
    if match_endtag is None:
        # No closing tag - error in wikitext
        return not_found
    return (match_starttag.start(), match_starttag.end(),
            match_endtag.start(), match_endtag.end())
+
+
def parse_googlemap(wikitext):
- """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension
- out of a page. If wikitext does not contain the googlemaps extension text None is returned.
- If the googlemap contains invalid formatted lines, a RuntimeError is raised.
+ """Parses the (unicode) u'<googlemap ...>content</googlemap>' of the googlemap extension.
+ If wikitext does not contain the <googlemap> tag or if the <googlemap> tag contains
+ invalid formatted lines, a ParseError is raised.
+ Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary
+ wikitext before using this function.
:param wikitext: wikitext containing the template. Example:
def parse_coord(line):
    """Parse one coordinate line of the googlemap content.

    The accepted forms are, tried in this order:
    '(symbol) lat, lon, title', '(symbol) lat, lon', 'lat, lon, title'
    and 'lat, lon'. Only the beginning of the line has to match.

    :param line: the (stripped) line to parse.
    :return: (lon, lat, symbol, title) with lon and lat as float.
        If symbol or title is not present, None is returned in its place.
    :raises ParseError: if the line has none of the accepted forms.
    """
    match = re.match(r'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
    if match is not None:
        return float(match.group(3)), float(match.group(2)), match.group(1), match.group(4)

    match = re.match(r'\(([^)]+)\) ?([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
    if match is not None:
        return float(match.group(3)), float(match.group(2)), match.group(1), None

    match = re.match(r'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+), ?(.*)', line)
    if match is not None:
        return float(match.group(2)), float(match.group(1)), None, match.group(3)

    match = re.match(r'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line)
    if match is not None:
        return float(match.group(2)), float(match.group(1)), None, None

    # Raise (not return) the error: callers unpack the result into four
    # values, which would otherwise fail with an unrelated TypeError.
    raise ParseError(u'Could not parse line ' + line)
- regexp = re.compile(u"(<googlemap[^>]*>)(.*)(</googlemap>)", re.DOTALL)
- match = regexp.search(wikitext)
- if match is None: return None
- content = match.group(2)
- gm = xml.etree.ElementTree.XML((match.group(1)+match.group(3)).encode('UTF8'))
+ start, content, endtag, end = find_tag(wikitext, 'googlemap')
+ if start is None:
+ raise ParseError('<googlemap> tag not found.')
+ if content is None:
+ xml_only = wikitext[start:endtag]
+ else:
+ xml_only = wikitext[start:content]+wikitext[endtag:end]
+
+ try:
+ gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8'))
+ except xml.etree.ElementTree.ParseError as e:
+ row, column = e.position
+ raise ParseError("XML parse error in <googlemap ...>.")
zoom = gm.get('zoom')
lon = gm.get('lon')
lat = gm.get('lat')
coords = []
paths = []
- lines = content.split("\n")
+ lines = wikitext[content:endtag].split("\n")
i = 0
while i < len(lines):
line = lines[i].strip()
# Handle a coordinate
if is_coord(line):
lon, lat, symbol, title = parse_coord(line)
- coords.append((lon, lat, symbol, title))
while i < len(lines):
line = lines[i].strip()
i += 1
if is_path(line) or is_coord(line):
i -= 1
break
+ if len(line) > 0 and title is None: title = line
+ coords.append((lon, lat, symbol, title))
continue
- raise RuntimeError(u'Unknown line syntax: ' + line)
+ raise ParseError(u'Unknown line syntax: ' + line)
return (center, zoom, coords, paths)