scripts/remove_dead_links.py

   1 #!/usr/bin/python
   2 import argparse
   3 import pickle
   4 import sys
   5 from copy import deepcopy
   6 from typing import List, Iterable, Tuple, Dict
   7
   8 import jsonschema
   9 from termcolor import cprint
  10
  11 from wrpylib.cli_tools import unified_diff, input_yes_no_quit, Choice
  12 from wrpylib.json_tools import order_json_keys, format_json
  13 from wrpylib.mwapi import WikiSite, page_json
  14
  15
# URLs that are exempt from the dead-link handling in this script:
# print_forbidden() does not report them and dead_link_filter() never selects
# them for removal.
# NOTE(review): presumably these sites answer the link checker with
# 'Forbidden' although they are fine for human visitors - confirm before
# extending the list.
VALID_FORBIDDEN = [
    'https://www.lizumerhof.at/',
    'https://www.madrisa.ch/madrisa-mia',
]
  20
  21
  22 def remove_dead_links_in_dict_with_weblink(container: Dict, weblink_key: str, all_dead_links: set[str]):
  23     weblink = container.get(weblink_key)
  24     if weblink is not None and weblink['url'] in all_dead_links:
  25         del container[weblink_key]
  26
  27 def remove_dead_links_in_weblink_list(container: Dict, weblink_key: str, all_dead_links: set[str]):
  28     weblink_list = container.get(weblink_key)
  29     if weblink_list is None:
  30         return
  31     weblink_list[:] = filter(lambda wl: wl.get('url') not in all_dead_links, weblink_list)
  32     if len(weblink_list) == 0:
  33         del container[weblink_key]
  34
  35
  36 def remove_dead_links_on_sledrun_title(site: WikiSite, title: str, all_dead_links: set[str]):
  37     sledrun_json_page = site.query_page(f'{title}/Rodelbahn.json')
  38     sledrun = page_json(sledrun_json_page)
  39     sledrun_orig = deepcopy(sledrun)
  40
  41     if sledrun.get('official_url') in all_dead_links:
  42         del sledrun['official_url']
  43
  44     for rental in sledrun.get('sled_rental', []):
  45         remove_dead_links_in_dict_with_weblink(rental, 'weblink', all_dead_links)
  46
  47     for pt_line in sledrun.get('public_transport_lines', []):
  48         remove_dead_links_in_weblink_list(pt_line, 'timetable_links', all_dead_links)
  49
  50     if 'gastronomy' in sledrun:
  51         for gastronomy in sledrun['gastronomy']:
  52             gastronomy_orig = gastronomy.copy()
  53             remove_dead_links_in_dict_with_weblink(gastronomy, 'weblink', all_dead_links)
  54             if 'weblink' in gastronomy_orig and 'weblink' not in gastronomy:
  55                 if gastronomy.get('name') is None and gastronomy_orig.get('weblink', {}).get('text') is not None:
  56                     gastronomy['name'] = gastronomy_orig['weblink']['text']
  57         sledrun['gastronomy'][:] = [g for g in sledrun['gastronomy'] if g]
  58         if len(sledrun['gastronomy']) == 0:
  59             del sledrun['gastronomy']
  60
  61     for key in ['info_web', 'videos', 'webcams', 'see_also', 'public_transport_links']:
  62         remove_dead_links_in_weblink_list(sledrun, key, all_dead_links)
  63
  64     if sledrun == sledrun_orig:
  65         return
  66
  67     jsonschema.validate(instance=sledrun, schema=site.sledrun_schema())
  68     sledrun_ordered = order_json_keys(sledrun, site.sledrun_schema())
  69     assert sledrun_ordered == sledrun
  70
  71     sledrun_str = format_json(sledrun_ordered)
  72     sledrun_orig_str = format_json(sledrun_orig)
  73
  74     cprint(title, 'green')
  75     unified_diff(sledrun_orig_str, sledrun_str)
  76     choice = input_yes_no_quit('Do you accept the changes [yes, no, quit]? ', None)
  77     if choice == Choice.no:
  78         return
  79     elif choice == Choice.quit:
  80         sys.exit(0)
  81
  82     site(
  83         'edit',
  84         pageid=sledrun_json_page['pageid'],
  85         text=sledrun_str,
  86         summary=f'Ungültige Links entfernt.',
  87         minor=0,
  88         bot=1,
  89         baserevid=sledrun_json_page['revisions'][0]['revid'],
  90         nocreate=1,
  91         token=site.token(),
  92     )
  93
  94
  95 def get_all_sledrun_titles(site: WikiSite) -> Iterable[str]:
  96     for result in site.query(list='categorymembers', cmtitle='Kategorie:Rodelbahn', cmlimit='max'):
  97         for page in result['categorymembers']:
  98             yield page['title']
  99
 100
 101 def get_all_titles_with_dead_links(dead_links: Dict[str, List[Tuple[str, int, str]]]):
 102     for title_list in dead_links.values():
 103         for title, time, reason in title_list:
 104             yield title
 105
 106
 107 def print_forbidden(dead_links: Dict[str, List[Tuple[str, int, str]]]):
 108     for dead_link, page_list in dead_links.items():
 109         if len(page_list) >= 1 and page_list[-1][-1] == 'Forbidden' and dead_link not in VALID_FORBIDDEN:
 110             print(f'{dead_link} ({", ".join(p[0] for p in page_list)})')
 111
 112
 113 def dead_link_filter(dead_link_info: Tuple[str, List[Tuple[str, int, str]]]) -> bool:
 114     dead_link, page_list = dead_link_info
 115     if len(page_list) < 2:
 116         return False
 117     if dead_link in VALID_FORBIDDEN:
 118         return False
 119     return True
 120
 121
 122
 123 def update_dead_links(ini_files: List[str], dead_link_file: str, only_print_forbidden: bool):
 124     with open(dead_link_file, 'rb') as fp:
 125         dead_links: Dict[str, List[Tuple[str, int, str]]] = pickle.load(fp)
 126
 127     if only_print_forbidden:
 128         print_forbidden(dead_links)
 129         return
 130
 131     dead_links = dict(filter(dead_link_filter, dead_links.items()))
 132     all_titles_with_dead_links = set(get_all_titles_with_dead_links(dead_links))
 133     all_dead_links = set(dead_links.keys())
 134
 135     site = WikiSite(ini_files)
 136     all_sledrun_titles = set(get_all_sledrun_titles(site))
 137
 138     sledrun_titles_with_dead_links = all_sledrun_titles.intersection(all_titles_with_dead_links)
 139
 140     for sledrun_title in sledrun_titles_with_dead_links:
 141         remove_dead_links_on_sledrun_title(site, sledrun_title, all_dead_links)
 142
 143
 144 def main():
 145     parser = argparse.ArgumentParser(description='Remove dead links.')
 146     parser.add_argument('--print-forbidden', action='store_true', help='Print forbidden pages and exit')
 147     parser.add_argument('dead_links_file', help='deadlinks-winterrodelncolleen-de-formal.dat')
 148     parser.add_argument('ini_file', nargs='+', help='see: https://www.winterrodeln.org/trac/wiki/ConfigIni')
 149     args = parser.parse_args()
 150     update_dead_links(args.ini_file, args.dead_links_file, args.print_forbidden)
 151
 152
# Run the command line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()