X-Git-Url: https://git.toastfreeware.priv.at/philipp/winterrodeln/wrpylib.git/blobdiff_plain/d188a36c9bf0d3705932c9b3d6f98758e2e7caf0..5d56f386cd826bedc7f2831b6657fdb87df1a529:/wrpylib/mwmarkup.py?ds=sidebyside diff --git a/wrpylib/mwmarkup.py b/wrpylib/mwmarkup.py index 1ec41c0..ace7362 100644 --- a/wrpylib/mwmarkup.py +++ b/wrpylib/mwmarkup.py @@ -13,6 +13,11 @@ import re import xml.etree.ElementTree +class ParseError(RuntimeError): + """Exception used by some of the functions""" + pass + + def find_template(wikitext, template_title): """Returns the tuple (start, end) of the first occurence of the template '{{template ...}} within wikitext'. (None, None) is returned if the template is not found. @@ -106,14 +111,55 @@ def create_template(template_title, anonym_params=[], named_param_keys=[], named return pipe_char.join(parts) + end_char -def parse_googlemap(wikitext, detail=False): - """Parses the (unicode) u'content' of the googlemap extension - out of a page. If wikitext does not contain the googlemaps extension text None is returned. - If the googlemap contains invalid formatted lines, a RuntimeError is raised. +def find_tag(wikitext, tagname, pos=0): + """Returns position information of the first occurence of the tag '...' + or ''. + If you are sure that the wikitext contains the tag, the tag could be extracted like follows: + + >>> wikitext = u'This is a mytag tag.' + >>> start, content, endtag, end = find_template(wikitext, u'tag') + >>> print wikitext[start:end] + mytag + + :param wikitext: The text (preferalbe unicode) that has the template in it. + :param tagname: Name of the tag, e.g. u'tag' for . + :param pos: position within wikitext to start searching the tag. + :return: + (start, content, endtag, end). start is the position of '<' of the tag, + content is the beginning of the content (after '>'), enttag is the + beginning of the end tag ('".format(tagname), re.DOTALL) + match_starttag = regexp_starttag.search(wikitext, pos) + if match_starttag is None: + return None, None, None, None + + # does the tag have content? + if len(match_starttag.group(1)) == 1: # group(1) is either '' or '/'. + # single tag + return match_starttag.start(), None, None, match_starttag.end() + + # tag with content + regexp_endtag = re.compile(u''.format(tagname), re.DOTALL) + match_endtag = regexp_endtag.search(wikitext, match_starttag.end()) + if match_endtag is None: + # No closing tag - error in wikitext + return None, None, None, None + return match_starttag.start(), match_starttag.end(), match_endtag.start(), match_endtag.end() + + +def parse_googlemap(wikitext): + """Parses the (unicode) u'content' of the googlemap extension. + If wikitext does not contain the tag or if the tag contains + invalid formatted lines, a ParseError is raised. + Use find_tag(wikitext, 'googlemap') to find the googlemap tag within an arbitrary + wikitext before using this function. :param wikitext: wikitext containing the template. Example: - :param detail: bool. If True, start and end position of ... is - returned additionally. wikitext = ''' @@ -129,13 +175,12 @@ def parse_googlemap(wikitext, detail=False): 47.112408,11.271119 ''' - :returns: the tuple (center, zoom, coords, paths). - center is the tuple (lon, lat) of the google maps or (None, None) if not provided - zoom is the google zoom level as integer or None if not provided + :returns: The tuple (attributes, coords, paths) is returned. + attributes is a dict that contains the attribues that are present + (e.g. lon, lat, zoom, width, height) converted to float (lon, lat) or int. coords is a list of (lon, lat, symbol, title) tuples. paths is a list of (style, coords) tuples. - coords is again a list of (lot, lat, symbol, title) tuples. - If detail is True, (center, zoom, coords, paths, start, end) is returned.""" + coords is again a list of (lon, lat, symbol, title) tuples.""" def is_coord(line): """Returns True if the line contains a coordinate.""" @@ -157,26 +202,38 @@ def parse_googlemap(wikitext, detail=False): if not match is None: return (float(match.group(2)), float(match.group(1)), None, match.group(3)) match = re.match(u'([0-9]{1,2}\.[0-9]+), ?([0-9]{1,2}\.[0-9]+)', line) if not match is None: return (float(match.group(2)), float(match.group(1)), None, None) - return RuntimeError(u'Could not parse line ' + line) - - regexp = re.compile(u"(]*>)(.*?)()", re.DOTALL) - match = regexp.search(wikitext) - if match is None: return None - start = match.start() - end = match.end() - content = match.group(2) - gm = xml.etree.ElementTree.XML((match.group(1)+match.group(3)).encode('UTF8')) - zoom = gm.get('zoom') - lon = gm.get('lon') - lat = gm.get('lat') - if not zoom is None: zoom = int(zoom) - if not lon is None: lon = float(lon) - if not lat is None: lat = float(lat) - center = (lon, lat) + return ParseError(u'Could not parse line ' + line) + start, content, endtag, end = find_tag(wikitext, 'googlemap') + if start is None: + raise ParseError(u' tag not found.') + if content is None: + xml_only = wikitext[start:endtag] + else: + xml_only = wikitext[start:content]+wikitext[endtag:end] + + try: + gm = xml.etree.ElementTree.XML(xml_only.encode('UTF8')) + except xml.etree.ElementTree.ParseError as e: + row, column = e.position + raise ParseError(u"XML parse error in .") + + # parse attributes + attributes = {} + try: + for key in ['lon', 'lat']: + if gm.get(key) is not None: + attributes[key] = float(gm.get(key)) + for key in ['zoom', 'width', 'height']: + if gm.get(key) is not None: + attributes[key] = int(gm.get(key)) + except ValueError as error: + raise ParseError(u'Error at parsing attribute {0} of : {1}'.format(key, unicode(error))) + + # parse points and lines coords = [] paths = [] - lines = content.split("\n") + lines = wikitext[content:endtag].split("\n") i = 0 while i < len(lines): line = lines[i].strip() @@ -215,8 +272,7 @@ def parse_googlemap(wikitext, detail=False): coords.append((lon, lat, symbol, title)) continue - raise RuntimeError(u'Unknown line syntax: ' + line) - if detail: - return (center, zoom, coords, paths, start, end) - return (center, zoom, coords, paths) + raise ParseError(u'Unknown line syntax: ' + line) + + return (attributes, coords, paths)