From: Philipp Spitzer Date: Thu, 16 Jun 2022 15:18:34 +0000 (+0200) Subject: Fine-tune parsing of "see also". X-Git-Url: https://git.toastfreeware.priv.at/philipp/winterrodeln/wrpylib.git/commitdiff_plain/d02ca7270cb827f46f2e5af6202e144afa0732bb Fine-tune parsing of "see also". --- diff --git a/bots/update_sledrun_json_from_wikitext_see_also.py b/bots/update_sledrun_json_from_wikitext_see_also.py new file mode 100644 index 0000000..4b2fdea --- /dev/null +++ b/bots/update_sledrun_json_from_wikitext_see_also.py @@ -0,0 +1,143 @@ +#!/usr/bin/python +""" +User script for pywikibot (https://gerrit.wikimedia.org/r/pywikibot/core.git), tested with version 6.6.1. +Put it in directory scripts/userscripts. + +Update a sledrun JSON page from a detail in a sledrun wikitext page. + +The following generators and filters are supported: + +¶ms; +""" +import io +import itertools +import json +import re +from itertools import takewhile, dropwhile +from typing import Optional, List, Dict, Iterable + +import jsonschema +import mwparserfromhell +from mwparserfromhell.nodes.extras import Parameter + +import pywikibot +from mwparserfromhell.nodes import Tag, Text, ExternalLink, Template, Wikilink, Heading +from mwparserfromhell.wikicode import Wikicode +from pywikibot import pagegenerators, Page +from pywikibot.bot import ( + AutomaticTWSummaryBot, + ConfigParserBot, + ExistingPageBot, + NoRedirectPageBot, + SingleSiteBot, +) +from pywikibot.logging import warning +from pywikibot.site._namespace import BuiltinNamespace +from wrpylib.json_tools import order_json_keys + +from wrpylib.wrmwmarkup import create_sledrun_wiki, lonlat_to_json, lonlat_ele_to_json, parse_wrmap +from wrpylib.wrvalidators import rodelbahnbox_from_template, tristate_german_to_str, difficulty_german_to_str, \ + avalanches_german_to_str, public_transport_german_to_str, opt_lonlat_from_str, \ + opt_uint_from_str +from wrpylib.lib_sledrun_wikitext_to_json import optional_set, get_sledrun_description, wikilink_to_json, \ 
external_link_to_json  # tail of the backslash-continued wrpylib.lib_sledrun_wikitext_to_json import on the previous line

# NOTE(review): '¶ms;' looks like a mis-decoded '&params;'. The key has to stay identical to the
# placeholder used in the module docstring, so if it is corrected, change both places together.
docuReplacements = {'¶ms;': pagegenerators.parameterHelp}


def _strip_brackets(text: str) -> str:
    """Return ``text`` without its enclosing round brackets.

    The brackets are only removed when the whole string is bracketed, e.g. ``"(foo)"`` -> ``"foo"``.
    Anything else (no brackets, or text following the closing bracket) is returned unchanged so that
    no part of the input is silently dropped.
    """
    match = re.fullmatch(r'\((.+)\)', text)
    if match is None:
        return text
    return match.group(1)


def _parse_weblink(line: str) -> Optional[Dict]:
    """Convert one wikitext line to a ``{'url': ..., 'text': ...}`` dict.

    The first external link on the line provides the URL. The text following the link (with enclosing
    brackets removed) is appended to the link title in brackets or, if the link has no title, used as
    the title itself. The ``'text'`` key is omitted when neither a title nor trailing text exists.

    :param line: A single line of wikitext.
    :return: The weblink dict or ``None`` if the line contains no external link.
    """
    nodes = dropwhile(lambda node: not isinstance(node, ExternalLink), mwparserfromhell.parse(line).nodes)
    link = next(nodes, None)
    if link is None:
        return None
    # ``nodes`` has been advanced past the link, so it now holds only what follows the link.
    remaining = _strip_brackets(str(Wikicode(list(nodes))).strip()).strip()
    if link.title is None:
        title = remaining if remaining != '' else None
    else:
        title = str(link.title)
        if remaining != '':
            title = f'{title} ({remaining})'
    weblink = {'url': str(link.url)}
    if title is not None:
        weblink['text'] = title
    return weblink


def _see_also(wikicode: Wikicode) -> Iterable[Dict]:
    """Yield the weblinks listed below the ``'''Siehe auch'''`` line of the "Allgemeines" section.

    Only the uninterrupted run of lines starting with ``**`` directly after the "Siehe auch" line is
    considered. Lines of that run without an external link are skipped.

    :param wikicode: Parsed wikitext of the sledrun page.
    """
    section = next(iter(wikicode.get_sections(levels=[2], matches='Allgemeines')), None)
    if section is None:
        return  # page has no "Allgemeines" section, so there is nothing to yield
    lines = str(section).split('\n')
    lines = dropwhile(lambda line: "'''Siehe auch'''" not in line, lines)
    lines = itertools.islice(lines, 1, None)  # omit "Siehe auch" line
    lines = takewhile(lambda line: line.startswith('**'), lines)
    for line in lines:
        weblink = _parse_weblink(line)
        if weblink is not None:
            yield weblink


class UpdateSledrunJsonFromWikiText(
    SingleSiteBot,
    ConfigParserBot,
    ExistingPageBot,
    NoRedirectPageBot,
    AutomaticTWSummaryBot,
):
    """Bot that copies the "Siehe auch" weblinks of a sledrun wikitext page to its ``/Rodelbahn.json`` sub page."""

    def setup(self) -> None:
        """Load the sledrun JSON schema from the wiki once before pages are treated."""
        schema = Page(self.site, 'Winterrodeln:Datenschema/Rodelbahn/V1.json')
        assert schema.content_model == 'json'
        self.sledrun_schema = json.loads(schema.text)

    def treat_page(self) -> None:
        """Update the ``see_also`` key of the JSON sub page from the current wikitext page.

        Nothing is saved if the current page is not wikitext, if the JSON sub page does not exist or
        if the JSON content would not change. An existing ``see_also`` value is kept when the wikitext
        contains no "Siehe auch" weblinks.

        Raises whatever ``jsonschema.validate`` raises if the updated JSON violates the schema.
        """
        wikitext_content_model = 'wikitext'
        if self.current_page.content_model != wikitext_content_model:
            warning(f"The content model of {self.current_page.title()} is {self.current_page.content_model} "
                    f"instead of {wikitext_content_model}.")
            return

        sledrun_json_page = Page(self.site, self.current_page.title() + '/Rodelbahn.json')
        if not sledrun_json_page.exists():
            return
        # Parse the text twice to get two independent objects: one to modify, one to compare against.
        sledrun_json = json.loads(sledrun_json_page.text)
        sledrun_json_orig = json.loads(sledrun_json_page.text)
        sledrun_json_orig_text = json.dumps(sledrun_json_orig, ensure_ascii=False, indent=4)

        wikicode = mwparserfromhell.parse(self.current_page.text)
        see_also_list = list(_see_also(wikicode))
        if len(see_also_list) > 0:
            sledrun_json['see_also'] = see_also_list

        jsonschema.validate(instance=sledrun_json, schema=self.sledrun_schema)
        sledrun_json_ordered = order_json_keys(sledrun_json, self.sledrun_schema)
        # Sanity check: re-ordering the keys must not change the content.
        assert sledrun_json_ordered == sledrun_json
        if sledrun_json == sledrun_json_orig:
            return
        sledrun_json_text = json.dumps(sledrun_json_ordered, ensure_ascii=False, indent=4)
        summary = 'Information zu "Siehe auch" im Rodelbahn JSON aktualisiert vom Wikitext.'
        self.userPut(sledrun_json_page, sledrun_json_orig_text, sledrun_json_text, summary=summary, contentmodel='json')


def main(*args: str) -> None:
    """Run the bot on the pages selected by the pywikibot command line arguments.

    :param args: Command line arguments (global pywikibot options and page generator options).
    """
    local_args = pywikibot.handle_args(args)
    gen_factory = pagegenerators.GeneratorFactory()
    gen_factory.handle_args(local_args)
    gen = gen_factory.getCombinedGenerator(preload=True)
    if gen:
        bot = UpdateSledrunJsonFromWikiText(generator=gen)
        bot.run()
    else:
        pywikibot.bot.suggest_help(missing_generator=True)


if __name__ == '__main__':
    main()