From: Philipp Spitzer Date: Sun, 9 Jan 2022 22:30:35 +0000 (+0100) Subject: Better parsing for car description. X-Git-Url: https://git.toastfreeware.priv.at/philipp/winterrodeln/wrpylib.git/commitdiff_plain/3c412c54364920d193dae43724998a02bf09cf91 Better parsing for car description. --- diff --git a/bots/sledrun_wikitext_to_json.py b/bots/sledrun_wikitext_to_json.py index 1e3cbfe..f71b19a 100644 --- a/bots/sledrun_wikitext_to_json.py +++ b/bots/sledrun_wikitext_to_json.py @@ -12,13 +12,14 @@ The following generators and filters are supported: import io import json import re +from itertools import takewhile, dropwhile from typing import Any, Optional import mwparserfromhell from mwparserfromhell.nodes.extras import Parameter import pywikibot -from mwparserfromhell.nodes import Tag, Text, ExternalLink, Template, Wikilink +from mwparserfromhell.nodes import Tag, Text, ExternalLink, Template, Wikilink, Heading from mwparserfromhell.wikicode import Wikicode from pywikibot import pagegenerators, Page from pywikibot.bot import ( @@ -271,34 +272,39 @@ class SledrunWikiTextToJsonBot( break def _car(): - for v in wikicode.get_sections(levels=[2], matches='Anreise mit dem Auto'): - for w in v.ifilter_text(recursive=False): - x = w.strip() - if x: - sledrun_json["car_description"] = str(x) - break - x = [] - for w in v.ifilter_templates(matches='Parkplatz'): - za = str_or_none(w.get(1, None)) - zb = str_or_none(w.get(2, None)) - z = lonlat_ele_to_json(opt_lonlat_from_str(za), opt_uint_from_str(zb)) - if len(z) > 0: - x.append({'position': z}) - if len(x) > 0: - sledrun_json['car_parking'] = x - - x = [] - for w in io.StringIO(str(v)): - match = re.match(r"\*\* von \'\'\'(.+)\'\'\'(.*): ([\d.,]+) km", w.rstrip()) - if match: - ya, yb, yc = match.groups() - yc = float(yc.replace(',', '.')) - x.append({ - 'km': yc, - 'route': (ya.strip() + ' ' + yb.strip()).strip(), - }) - if len(x) > 0: - sledrun_json['car_distances'] = x + car_section_list = wikicode.get_sections(levels=[2], matches='Anreise mit dem Auto') + if not car_section_list: + return + v = car_section_list[0] + + description_nodes = dropwhile(lambda w: isinstance(w, Heading), v.nodes) + description_nodes = takewhile(lambda w: not (isinstance(w, Tag) and w.wiki_markup == '*'), + description_nodes) + if description := str(Wikicode(list(description_nodes))).strip(): + sledrun_json["car_description"] = description + + x = [] + for w in v.ifilter_templates(matches='Parkplatz'): + za = str_or_none(w.get(1, None)) + zb = str_or_none(w.get(2, None)) + z = lonlat_ele_to_json(opt_lonlat_from_str(za), opt_uint_from_str(zb)) + if len(z) > 0: + x.append({'position': z}) + if len(x) > 0: + sledrun_json['car_parking'] = x + + x = [] + for w in io.StringIO(str(v)): + match = re.match(r"\*\* von \'\'\'(.+)\'\'\'(.*): ([\d.,]+) km", w.rstrip()) + if match: + ya, yb, yc = match.groups() + yc = float(yc.replace(',', '.')) + x.append({ + 'km': yc, + 'route': (ya.strip() + ' ' + yb.strip()).strip(), + }) + if len(x) > 0: + sledrun_json['car_distances'] = x _car() x = []