#!/usr/bin/python
# Provenance: this module is the new file
# bots/update_sledrun_json_from_wikitext_car_distances.py from commit
# 5d169c6e119cf7ae6d7d412720f6e87ed7bea11b ("Improve automatic parsing of
# car-distances.", Philipp Spitzer, Fri, 15 Jul 2022 18:18:23 +0200).
# The same commit also changed bots/sledrun_wikitext_to_json.py (near line 302)
# so that its car-distance regex starts with "[Vv]on" instead of "von".
"""
User script for pywikibot (https://gerrit.wikimedia.org/r/pywikibot/core.git), tested with version 6.6.1.
Put it in directory scripts/userscripts.

Update a sledrun JSON page from a detail in a sledrun wikitext page.

The following generators and filters are supported:

&params;
"""
import io
import json
import re
from itertools import takewhile, dropwhile
from typing import Optional

import jsonschema
import mwparserfromhell
from mwparserfromhell.nodes.extras import Parameter

import pywikibot
from mwparserfromhell.nodes import Tag, Text, ExternalLink, Template, Wikilink, Heading
from mwparserfromhell.wikicode import Wikicode
from pywikibot import pagegenerators, Page
from pywikibot.bot import (
    AutomaticTWSummaryBot,
    ConfigParserBot,
    ExistingPageBot,
    NoRedirectPageBot,
    SingleSiteBot,
)
from pywikibot.logging import warning
from pywikibot.site._namespace import BuiltinNamespace
from wrpylib.json_tools import order_json_keys

from wrpylib.wrmwmarkup import create_sledrun_wiki, lonlat_to_json, lonlat_ele_to_json, parse_wrmap
from wrpylib.wrvalidators import rodelbahnbox_from_template, tristate_german_to_str, difficulty_german_to_str, \
    avalanches_german_to_str, public_transport_german_to_str, opt_lonlat_from_str, \
    opt_uint_from_str
from wrpylib.lib_sledrun_wikitext_to_json import optional_set, get_sledrun_description

# The key must be spelled exactly like the placeholder in the module docstring,
# because the help text is produced by replacing that placeholder.
docuReplacements = {'&params;': pagegenerators.parameterHelp}

# "** von '''<place>'''<remark>: <number> km" -- place in bold wiki markup.
_CAR_DISTANCE_BOLD_RE = re.compile(r"\*\* [Vv]on \'\'\'(.+)\'\'\'(.*): ([\d.,]+) km")
# "** von <place>: <number> km" -- fallback without bold markup.
_CAR_DISTANCE_PLAIN_RE = re.compile(r"\*\* [Vv]on (.+): ([\d.,]+) km")


def _parse_car_distances(wikitext: str) -> list:
    """Extract the car distance entries from the wikitext of a sledrun page.

    Every line is tried against the bold pattern first and, only if that does
    not match, against the plain pattern. Lines matching neither are ignored.

    :param wikitext: Complete wikitext of the page.
    :return: List of dicts with the keys ``'km'`` (float) and ``'route'`` (str),
        in the order the lines appear in the wikitext.
    :raises ValueError: If a matching line carries a number that cannot be
        converted to float (the pattern also accepts e.g. ``1.234,5``).
    """
    car_distances = []
    for raw_line in wikitext.split('\n'):
        line = raw_line.rstrip()
        match = _CAR_DISTANCE_BOLD_RE.match(line)
        if match:
            place, remark, km_text = match.groups()
            route = (place.strip() + ' ' + remark.strip()).strip()
        else:
            match = _CAR_DISTANCE_PLAIN_RE.match(line)
            if not match:
                continue
            place, km_text = match.groups()
            route = place.strip()
        try:
            # The wikitext uses a decimal comma.
            km = float(km_text.replace(',', '.'))
        except ValueError as e:
            raise ValueError(f"Cannot parse distance '{km_text}' in line '{line}'") from e
        car_distances.append({
            'km': km,
            'route': route,
        })
    return car_distances


class UpdateSledrunJsonFromWikiText(
    SingleSiteBot,
    ConfigParserBot,
    ExistingPageBot,
    AutomaticTWSummaryBot,
):
    """Bot that copies the car distances of a sledrun wikitext page to its '/Rodelbahn.json' subpage."""

    def setup(self) -> None:
        """Load the sledrun JSON schema from the wiki and keep it in ``self.sledrun_schema``."""
        schema = Page(self.site, 'Winterrodeln:Datenschema/Rodelbahn/V1.json')
        assert schema.content_model == 'json'
        self.sledrun_schema = json.loads(schema.text)

    def treat_page(self) -> None:
        """Update 'car_distances' in the JSON subpage of the current wikitext page.

        Pages that are not wikitext, pages without a '/Rodelbahn.json' subpage
        and pages with an unparsable distance are skipped. The JSON subpage is
        only saved if the resulting JSON differs from the stored one.
        """
        wikitext_content_model = 'wikitext'
        if self.current_page.content_model != wikitext_content_model:
            warning(f"The content model of {self.current_page.title()} is {self.current_page.content_model} "
                    f"instead of {wikitext_content_model}.")
            return

        sledrun_json_page = Page(self.site, self.current_page.title() + '/Rodelbahn.json')
        if not sledrun_json_page.exists():
            return
        # Parse twice to get two independent objects: one to modify, one to compare against.
        sledrun_json = json.loads(sledrun_json_page.text)
        sledrun_json_orig = json.loads(sledrun_json_page.text)
        sledrun_json_orig_text = json.dumps(sledrun_json_orig, ensure_ascii=False, indent=4)

        try:
            car_distances = _parse_car_distances(self.current_page.text)
        except ValueError as e:
            # Do not write partially parsed data and do not abort the whole bot run.
            warning(f"Skipping {self.current_page.title()}: {e}")
            return
        if car_distances:
            sledrun_json['car_distances'] = car_distances

        jsonschema.validate(instance=sledrun_json, schema=self.sledrun_schema)
        sledrun_json_ordered = order_json_keys(sledrun_json, self.sledrun_schema)
        assert sledrun_json_ordered == sledrun_json
        if sledrun_json == sledrun_json_orig:
            return
        sledrun_json_text = json.dumps(sledrun_json_ordered, ensure_ascii=False, indent=4)
        summary = 'Entfernung mit dem Auto im Rodelbahn JSON aktualisiert vom Wikitext.'
        self.userPut(sledrun_json_page, sledrun_json_orig_text, sledrun_json_text, summary=summary, contentmodel='json')


def main(*args: str) -> None:
    """Handle the command line arguments and run the bot on the selected pages.

    :param args: Command line arguments; passed on to ``pywikibot.handle_args``.
    """
    local_args = pywikibot.handle_args(args)
    gen_factory = pagegenerators.GeneratorFactory()
    gen_factory.handle_args(local_args)
    gen = gen_factory.getCombinedGenerator(preload=True)
    if gen:
        bot = UpdateSledrunJsonFromWikiText(generator=gen)
        bot.run()
    else:
        pywikibot.bot.suggest_help(missing_generator=True)


if __name__ == '__main__':
    main()