#!/usr/bin/python
import argparse
import pickle
import sys
from copy import deepcopy
from typing import List, Iterable, Tuple, Dict

import jsonschema
from termcolor import cprint

from wrpylib.cli_tools import unified_diff, input_yes_no_quit, Choice
from wrpylib.json_tools import order_json_keys, format_json
from wrpylib.mwapi import WikiSite, page_json

# Links that report "Forbidden" to the crawler but are known to be valid.
VALID_FORBIDDEN = [
    'https://www.lizumerhof.at/',
    'https://www.madrisa.ch/madrisa-mia',
]


def remove_dead_links_in_dict_with_weblink(container: Dict, weblink_key: str, all_dead_links: set[str]):
    """Deletes container[weblink_key] if its URL is a dead link."""
    weblink = container.get(weblink_key)
    if weblink is not None and weblink['url'] in all_dead_links:
        del container[weblink_key]


def remove_dead_links_in_weblink_list(container: Dict, weblink_key: str, all_dead_links: set[str]):
    """Removes dead links from the list container[weblink_key]; deletes the key if the list becomes empty."""
    weblink_list = container.get(weblink_key)
    if weblink_list is None:
        return
    weblink_list[:] = filter(lambda wl: wl.get('url') not in all_dead_links, weblink_list)
    if len(weblink_list) == 0:
        del container[weblink_key]


def remove_dead_links_on_sledrun_title(site: WikiSite, title: str, all_dead_links: set[str]):
    sledrun_json_page = site.query_page(f'{title}/Rodelbahn.json')
    sledrun = page_json(sledrun_json_page)
    sledrun_orig = deepcopy(sledrun)
    if sledrun.get('official_url') in all_dead_links:
        del sledrun['official_url']
    for rental in sledrun.get('sled_rental', []):
        remove_dead_links_in_dict_with_weblink(rental, 'weblink', all_dead_links)
    for pt_line in sledrun.get('public_transport_lines', []):
        remove_dead_links_in_weblink_list(pt_line, 'timetable_links', all_dead_links)
    if 'gastronomy' in sledrun:
        for gastronomy in sledrun['gastronomy']:
            gastronomy_orig = gastronomy.copy()
            remove_dead_links_in_dict_with_weblink(gastronomy, 'weblink', all_dead_links)
            # If removing the weblink left the entry nameless, keep the weblink
            # text as the name so the entry stays identifiable.
            if 'weblink' in gastronomy_orig and 'weblink' not in gastronomy:
                if gastronomy.get('name') is None and gastronomy_orig.get('weblink', {}).get('text') is not None:
                    gastronomy['name'] = gastronomy_orig['weblink']['text']
        # Drop gastronomy entries that became empty, then the whole key if nothing is left.
        sledrun['gastronomy'][:] = [g for g in sledrun['gastronomy'] if g]
        if len(sledrun['gastronomy']) == 0:
            del sledrun['gastronomy']
    for key in ['info_web', 'videos', 'webcams', 'see_also', 'public_transport_links']:
        remove_dead_links_in_weblink_list(sledrun, key, all_dead_links)
    if sledrun == sledrun_orig:
        return
    jsonschema.validate(instance=sledrun, schema=site.sledrun_schema())
    sledrun_ordered = order_json_keys(sledrun, site.sledrun_schema())
    assert sledrun_ordered == sledrun
    sledrun_str = format_json(sledrun_ordered)
    sledrun_orig_str = format_json(sledrun_orig)
    cprint(title, 'green')
    unified_diff(sledrun_orig_str, sledrun_str)
    choice = input_yes_no_quit('Do you accept the changes [yes, no, quit]? ', None)
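    # "no" skips this sledrun, "quit" aborts the whole run; "yes" falls through to the edit.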
    if choice == Choice.no:
        return
    elif choice == Choice.quit:
        sys.exit(0)
    site(
        'edit',
        pageid=sledrun_json_page['pageid'],
        text=sledrun_str,
        summary='Ungültige Links entfernt.',  # German edit summary: "Removed invalid links."
        minor=0,
        bot=1,
        baserevid=sledrun_json_page['revisions'][0]['revid'],
        nocreate=1,
        token=site.token(),
    )


def get_all_sledrun_titles(site: WikiSite) -> Iterable[str]:
    for result in site.query(list='categorymembers', cmtitle='Kategorie:Rodelbahn', cmlimit='max'):
        for page in result['categorymembers']:
            yield page['title']


def get_all_titles_with_dead_links(dead_links: Dict[str, List[Tuple[str, int, str]]]) -> Iterable[str]:
    for title_list in dead_links.values():
        for title, time, reason in title_list:
            yield title


def print_forbidden(dead_links: Dict[str, List[Tuple[str, int, str]]]):
    for dead_link, page_list in dead_links.items():
        if len(page_list) >= 1 and page_list[-1][-1] == 'Forbidden' and dead_link not in VALID_FORBIDDEN:
            print(f'{dead_link} ({", ".join(p[0] for p in page_list)})')


def dead_link_filter(dead_link_info: Tuple[str, List[Tuple[str, int, str]]]) -> bool:
    """Keeps only links that failed on at least two pages and are not whitelisted."""
    dead_link, page_list = dead_link_info
    if len(page_list) < 2:
        return False
    if dead_link in VALID_FORBIDDEN:
        return False
    return True


def update_dead_links(ini_files: List[str], dead_link_file: str, only_print_forbidden: bool):
    with open(dead_link_file, 'rb') as fp:
        dead_links: Dict[str, List[Tuple[str, int, str]]] = pickle.load(fp)
    if only_print_forbidden:
        print_forbidden(dead_links)
        return
    dead_links = dict(filter(dead_link_filter, dead_links.items()))
    all_titles_with_dead_links = set(get_all_titles_with_dead_links(dead_links))
    all_dead_links = set(dead_links.keys())
    site = WikiSite(ini_files)
    all_sledrun_titles = set(get_all_sledrun_titles(site))
    sledrun_titles_with_dead_links = all_sledrun_titles.intersection(all_titles_with_dead_links)
    for sledrun_title in sledrun_titles_with_dead_links:
        remove_dead_links_on_sledrun_title(site, sledrun_title, all_dead_links)


def main():
    parser = argparse.ArgumentParser(description='Remove dead links.')
    parser.add_argument('--print-forbidden', action='store_true', help='Print forbidden pages and exit')
    parser.add_argument('dead_links_file', help='deadlinks-winterrodeln-de-formal.dat')
    parser.add_argument('ini_file', nargs='+', help='see: https://www.winterrodeln.org/trac/wiki/ConfigIni')
    args = parser.parse_args()
    update_dead_links(args.ini_file, args.dead_links_file, args.print_forbidden)


if __name__ == '__main__':
    main()
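
# Example invocation (file names are illustrative; see the argparse help above):
#   python remove_dead_links.py deadlinks-winterrodeln-de-formal.dat winterrodeln.ini
#   python remove_dead_links.py --print-forbidden deadlinks-winterrodeln-de-formal.dat winterrodeln.ini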