wrpylib/wrmwcache.py

   1 """Contains functions that maintain/update the cache tables."""
   2 import urllib.parse
   3 import urllib.request
   4 import json
   5 import mwparserfromhell
   6 import sqlalchemy.engine
   7 from sqlalchemy import schema, Table
   8 from sqlalchemy.engine import Connection
   9 from sqlalchemy.sql import select
  10 from sqlalchemy.sql.expression import func as sqlfunc
  11 from osgeo import ogr
  12 from wrpylib import mwdb, wrmwdb, wrmwmarkup, wrvalidators
  13
  14
  15 class UpdateCacheError(RuntimeError):
  16     pass
  17
  18
  19 def _get_mw_text(connection: Connection, text: Table, content_address: str) -> str:
  20     parts = content_address.split(':')  # e.g. 'tt:15664'
  21     if len(parts) != 2 or parts[0] != 'tt':
  22         raise ValueError('Content has unexpected format')
  23     old_id = int(parts[1])
  24     query = select([text], text.c.old_id == old_id)
  25     text_row = connection.execute(query).fetchone()
  26     return text_row.old_text
  27
  28
  29 def update_wrsledruncache(connection):
  30     """Updates the wrsledruncache table from the wiki. If convert errors occur, an UpdateCacheError exception
  31     is raised. No other exception type should be raised under normal circumstances.
  32
  33     >>> from sqlalchemy.engine import create_engine
  34     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
  35     >>> update_wrsledruncache(engine.connect())
  36     """
  37     metadata = schema.MetaData()
  38     wrsledruncache = wrmwdb.wrsledruncache_table(metadata)
  39     page = mwdb.page_table(metadata)
  40     categorylinks = mwdb.categorylinks_table(metadata)
  41     slots = mwdb.slots_table(metadata)
  42     content = mwdb.content_table(metadata)
  43     text = mwdb.text_table(metadata)
  44
  45     class Sledrun:
  46         pass
  47
  48     try:
  49         with connection.begin():
  50
  51             # Query all sled runs
  52             q = select(
  53                 [page, categorylinks, slots, content],
  54                 (page.c.page_latest == slots.c.slot_revision_id) & (slots.c.slot_content_id == content.c.content_id) &
  55                 (categorylinks.c.cl_from == page.c.page_id) & (categorylinks.c.cl_to == 'Rodelbahn'))
  56             sledrun_pages = connection.execute(q)
  57
  58             # Delete all existing entries in wrsledruncache
  59             # We rely on transactions MySQL InnoDB
  60             connection.execute(wrsledruncache.delete())
  61
  62             # Refill wrsledruncache table
  63             for sledrun_page in sledrun_pages:
  64                 old_text = _get_mw_text(connection, text, sledrun_page.content_address)
  65                 rodelbahnbox = wrvalidators.rodelbahnbox_from_str(old_text)
  66                 sledrun = wrmwmarkup.sledrun_from_rodelbahnbox(rodelbahnbox, Sledrun())
  67                 sledrun.page_id = sledrun_page.page_id
  68                 sledrun.page_title = sledrun_page.page_title
  69                 sledrun.name_url = wrvalidators.sledrun_page_title_to_pretty_url(sledrun_page.page_title)
  70                 sledrun.under_construction = connection.execute(select(
  71                     [sqlfunc.count()],
  72                     (categorylinks.c.cl_from == sledrun_page.page_id) &
  73                     (categorylinks.c.cl_to == 'In_Arbeit')).alias('x')).fetchone()[0] > 0
  74                 connection.execute(wrsledruncache.insert(sledrun.__dict__))
  75
  76     except ValueError as e:
  77         error_msg = f"Error at sled run '{sledrun_page.page_title}': {e}"
  78         raise UpdateCacheError(error_msg, sledrun_page.page_title, e)
  79
  80
  81 def update_wrinncache(connection):
  82     """Updates the wrinncache table from the wiki. If convert errors occur, an UpdateCacheError exception
  83     is raised. No other exception type should be raised under normal circumstances.
  84
  85     >>> from sqlalchemy.engine import create_engine
  86     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
  87     >>> update_wrinncache(engine.connect())
  88     """
  89     metadata = schema.MetaData()
  90     wrinncache = wrmwdb.wrinncache_table(metadata)
  91     page = mwdb.page_table(metadata)
  92     categorylinks = mwdb.categorylinks_table(metadata)
  93     slots = mwdb.slots_table(metadata)
  94     content = mwdb.content_table(metadata)
  95     text = mwdb.text_table(metadata)
  96
  97     class Inn:
  98         pass
  99
 100     try:
 101         with connection.begin():
 102
 103             # Query all inns
 104             q = select(
 105                 [page, categorylinks, slots, content],
 106                 (page.c.page_latest == slots.c.slot_revision_id) & (slots.c.slot_content_id == content.c.content_id) &
 107                 (categorylinks.c.cl_from == page.c.page_id) & (categorylinks.c.cl_to == 'Gasthaus'))
 108             inn_pages = connection.execute(q)
 109
 110             # Delete all existing entries in wrinncache
 111             # We rely on transactions MySQL InnoDB
 112             connection.execute(wrinncache.delete())
 113
 114             # Refill wrinncache table
 115             for inn_page in inn_pages:
 116                 old_text = _get_mw_text(connection, text, inn_page.content_address)
 117                 gasthausbox = wrvalidators.gasthausbox_from_str(old_text)
 118                 inn = wrmwmarkup.inn_from_gasthausbox(gasthausbox, Inn())
 119                 inn.page_id = inn_page.page_id
 120                 inn.page_title = inn_page.page_title
 121                 inn.under_construction = connection.execute(select(
 122                     [sqlfunc.count()],
 123                     (categorylinks.c.cl_from == inn_page.page_id) &
 124                     (categorylinks.c.cl_to == 'In_Arbeit')).alias('x')) \
 125                     .fetchone()[0] > 0  # it would be better to do this in the query above
 126                 connection.execute(wrinncache.insert(inn.__dict__))
 127
 128     except ValueError as e:
 129         error_msg = f"Error as inn '{inn_page.page_title}': {e}"
 130         raise UpdateCacheError(error_msg, inn_page.page_title, e)
 131
 132
 133 def update_wrreportcache(connection, page_id=None):
 134     """Updates the wrreportcache table.
 135     :param connection: sqlalchemy connection
 136     :param page_id: Updates only the reportcache table for the sledrun described on the Winterrodeln wiki page
 137         with the specified page_id. Use None for this parameter to update the whole table.
 138
 139     >>> from sqlalchemy.engine import create_engine
 140     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
 141     >>> update_wrreportcache(engine.connect())
 142     """
 143     metadata = schema.MetaData()
 144     wrreportcache = wrmwdb.wrreportcache_table(metadata)
 145     with connection.begin():
 146         # Delete the datasets we are going to update
 147         sql_del = wrreportcache.delete()
 148         if page_id is not None:
 149             sql_del = sql_del.where(wrreportcache.c.page_id == page_id)
 150         connection.execute(sql_del)
 151
 152         def insert_row(connection_, row_list_):
 153             if len(row_list_) == 0:
 154                 return
 155             # Insert the report
 156             row_ = dict(row_list_[0])
 157             connection_.execute(wrreportcache.insert(values=row_))
 158
 159         # Select the rows to update
 160         sql = 'select page_id, page_title, wrreport.id as report_id, date_report, `condition`, description, ' \
 161               'author_name, ' \
 162               'if(author_userid is null, null, author_username) as author_username from wrreport ' \
 163               'where {0}`condition` is not null and date_invalid > now() and delete_date is null ' \
 164               'order by page_id, date_report desc, date_entry desc' \
 165               .format('' if page_id is None else f'page_id={page_id} and ')
 166         cursor = connection.execute(sql)
 167         page_id = None
 168         row_list = []
 169         for row in cursor:
 170             if row.page_id != page_id:
 171                 insert_row(connection, row_list)
 172                 page_id = row.page_id
 173                 row_list = []
 174             row_list.append(row)
 175         insert_row(connection, row_list)
 176
 177
 178 def update_wrmapcache(connection):
 179     """Updates the wrmappointcache and wrmappathcache tables from the wiki. If convert errors occur,
 180     an UpdateCacheError exception is raised. No other exception type should be raised under normal circumstances.
 181
 182     >>> from sqlalchemy.engine import create_engine
 183     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
 184     >>> # or:
 185     >>> # engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4&passwd=XXX')
 186     >>> update_wrmapcache(engine.connect())
 187     """
 188     metadata = schema.MetaData()
 189     page = mwdb.page_table(metadata)
 190     categorylinks = mwdb.categorylinks_table(metadata)
 191     slots = mwdb.slots_table(metadata)
 192     content = mwdb.content_table(metadata)
 193     text = mwdb.text_table(metadata)
 194
 195     try:
 196         with connection.begin():
 197
 198             # Query all sledruns
 199             q = select(
 200                 [page, categorylinks, slots, content],
 201                 (page.c.page_latest == slots.c.slot_revision_id) & (slots.c.slot_content_id == content.c.content_id) &
 202                 (categorylinks.c.cl_from == page.c.page_id) & (categorylinks.c.cl_to == 'Rodelbahn'))
 203             sledrun_pages = connection.execute(q)
 204
 205             # Delete all existing entries in wrmappointcache
 206             # We rely on transactions MySQL InnoDB
 207             connection.execute('delete from wrmappointcache')
 208             connection.execute('delete from wrmappathcache')
 209
 210             # Refill wrmappointcache and wrmappathcache tables
 211             for sledrun_page in sledrun_pages:
 212                 old_text = _get_mw_text(connection, text, sledrun_page.content_address)
 213                 wikicode = mwparserfromhell.parse(old_text)
 214                 wrmap_list = wikicode.filter_tags(recursive=False, matches=lambda tag: tag.tag == 'wrmap')
 215                 if len(wrmap_list) == 0:
 216                     continue  # not wrmap in page
 217                 if len(wrmap_list) > 1:
 218                     raise UpdateCacheError(
 219                         f'{len(wrmap_list)} <wrmap ...> entries found in article "{sledrun_page.page_title}"')
 220                 wrmap = wrmap_list[0]
 221                 geojson = wrmwmarkup.parse_wrmap(str(wrmap))
 222
 223                 for feature in geojson['features']:
 224                     properties = feature['properties']
 225                     coordinates = feature['geometry']['coordinates']
 226
 227                     # Points
 228                     if properties['type'] in wrmwmarkup.WRMAP_POINT_TYPES:
 229                         lon, lat = coordinates
 230                         label = properties.get('name')
 231                         point_types = {
 232                             'gasthaus': 'hut',
 233                             'haltestelle': 'busstop',
 234                             'parkplatz': 'carpark',
 235                             'achtung': 'warning',
 236                             'foto': 'photo',
 237                             'verleih': 'rental',
 238                             'punkt': 'point'
 239                         }
 240                         point_type = point_types[properties['type']]
 241                         sql = 'insert into wrmappointcache (page_id, type, point, label) values (%s, %s, POINT(%s, %s), %s)'
 242                         connection.execute(sql, (sledrun_page.page_id, point_type, lon, lat, label))
 243
 244                     # Paths
 245                     elif properties['type'] in wrmwmarkup.WRMAP_LINE_TYPES:
 246                         path_types = {
 247                             'rodelbahn': 'sledrun',
 248                             'gehweg': 'walkup',
 249                             'alternative': 'alternative',
 250                             'lift': 'lift',
 251                             'anfahrt': 'recommendedcarroute',
 252                             'linie': 'line'}
 253                         path_type = path_types[properties['type']]
 254                         path = ", ".join([f"{lon} {lat}" for lon, lat in coordinates])
 255                         path = f'LineString({path})'
 256                         if path_type == 'recommendedcarroute':
 257                             continue
 258                         sql = 'insert into wrmappathcache (path, page_id, type) values (GeomFromText(%s), %s, %s)'
 259                         connection.execute(sql, (path, sledrun_page.page_id, path_type))
 260
 261                     else:
 262                         raise RuntimeError(f'Unknown feature type {properties["type"]}')
 263     except RuntimeError as e:
 264         error_msg = f"Error at sledrun '{sledrun_page.page_title}': {e}"
 265         raise UpdateCacheError(error_msg, sledrun_page.page_title, e)
 266
 267
 268 def update_wrregioncache(connection):
 269     """Updates the wrregioncache table from the wiki.
 270     It relays on the table wrsledruncache to be up-to-date.
 271     No exceptions should be raised under normal circumstances.
 272
 273     >>> from sqlalchemy.engine import create_engine
 274     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
 275     >>> # or:
 276     >>> # engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4&passwd=XXX')
 277     >>> update_wrregioncache(engine.connect())
 278     """
 279     metadata = schema.MetaData()
 280     wrregion = wrmwdb.wrregion_table(metadata)
 281     wrsledruncache = wrmwdb.wrsledruncache_table(metadata)
 282     wrregioncache = wrmwdb.wrregioncache_table(metadata)
 283
 284     with connection.begin():
 285
 286         # Delete all existing entries in wrregioncache
 287         # We rely on transactions MySQL InnoDB
 288         connection.execute(wrregioncache.delete())
 289
 290         # Query all combinations of sledruns and regions
 291         sel = select(
 292             [
 293                 wrregion.c.id.label('region_id'),
 294                 sqlfunc.AsWKB(wrregion.c.border).label('border'),
 295                 wrsledruncache.c.page_id,
 296                 wrsledruncache.c.position_longitude,
 297                 wrsledruncache.c.position_latitude
 298             ],
 299             sqlfunc.contains(
 300                 wrregion.c.border,
 301                 sqlfunc.point(wrsledruncache.c.position_longitude, wrsledruncache.c.position_latitude)
 302             )
 303         )
 304         ins = wrregioncache.insert()
 305
 306         # Refill wrregioncache
 307         point = ogr.Geometry(ogr.wkbPoint)
 308         result = connection.execute(sel)
 309         for row in result:
 310             point.SetPoint(0, row.position_longitude, row.position_latitude)
 311             if point.Within(ogr.CreateGeometryFromWkb(row.border)):
 312                 connection.execute(ins.values(region_id=row.region_id, page_id=row.page_id))
 313
 314
 315 def update_wrsledrunjsoncache(api_url: urllib.parse.ParseResult, connection: sqlalchemy.engine.Connection):
 316     """Updates the wrsledrunjsoncache from JSON files in the wiki.
 317
 318     :param api_url: URL to the API, e.g. https://www.winterrodeln.org/mediawiki/api.php
 319     :param connection: open sqlalchemy connection
 320     """
 321     request = api_url._replace(query='action=query&list=categorymembers&cmtitle=Kategorie:Rodelbahn'
 322                                      '&cmlimit=3&format=json')  # cmlimit=max (5000)
 323     response = urllib.request.urlopen(request.geturl())
 324     sledrun_list_response = json.load(response)
 325     rows_to_insert = sledrun_list_response['query']['categorymembers']
 326     for row in rows_to_insert:
 327         row['sledrun_title'] = f'{row["title"]}/Rodelbahn.json'
 328         row['map_title'] = f'{row["title"]}/Landkarte.json'
 329
 330     qs_titles = urllib.parse.urlencode({"titles": '|'.join(r['sledrun_title'] for r in rows_to_insert)})
 331     request = api_url._replace(query='action=query&prop=revisions&rvprop=content&format=json&rvslots=main&' + qs_titles)
 332     response = urllib.request.urlopen(request.geturl())
 333     sledrun_json_list_response = json.load(response)
 334     sledrun_json_dict = {s['title']: s for s in sledrun_json_list_response['query']['pages'].values()}
 335
 336     qs_titles = urllib.parse.urlencode({"titles": '|'.join(r['map_title'] for r in rows_to_insert)})
 337     request = api_url._replace(query='action=query&prop=revisions&rvprop=content&format=json&rvslots=main&' + qs_titles)
 338     response = urllib.request.urlopen(request.geturl())
 339     map_json_list_response = json.load(response)
 340     map_json_dict = {m['title']: m for m in map_json_list_response['query']['pages'].values()}
 341
 342     for row in rows_to_insert:
 343         sledrun_json = sledrun_json_dict.get(row['sledrun_title'])
 344         if sledrun_json is not None:
 345             row['sledrun_pageid'] = sledrun_json['pageid']
 346             row['sledrun'] = sledrun_json['revisions'][0]['slots']['main']['*']
 347         map_json = map_json_dict.get(row['map_title'])
 348         if map_json is not None:
 349             row['map_pageid'] = map_json['pageid']
 350             row['map'] = map_json['revisions'][0]['slots']['main']['*']
 351
 352     with connection.begin():
 353         # connection.execute('truncate table wrsledrunjsoncache')  # needs additional permissions
 354         connection.execute('delete from wrsledrunjsoncache')
 355         sql = 'insert into wrsledrunjsoncache ' \
 356               '(sledrun_page_id, sledrun_json_page_id, sledrun_json, map_json_page_id, map_json) ' \
 357               'values (%s, %s, %s, %s, %s)'
 358         for row in rows_to_insert:
 359             connection.execute(sql, (row['pageid'], row.get('sledrun_pageid'), row.get('sledrun'),
 360                                      row.get('map_pageid'), row.get('map')))