Now showing walk distance and walk time for bus stops.
[philipp/winterrodeln/wrpylib.git] / wrpylib / wrmwcache.py
1 """Contains functions that maintain/update the cache tables."""
2 import urllib.parse
3 import urllib.request
4 import json
5 import mwparserfromhell
6 import sqlalchemy.engine
7 from sqlalchemy import schema, Table
8 from sqlalchemy.engine import Connection
9 from sqlalchemy.sql import select
10 from sqlalchemy.sql.expression import func as sqlfunc, text
11 from osgeo import ogr
12 from wrpylib import mwdb, wrmwdb, wrmwmarkup, wrvalidators
13
14
15 class UpdateCacheError(RuntimeError):
16     pass
17
18
19 def _get_mw_text(connection: Connection, text: Table, content_address: str) -> str:
20     parts = content_address.split(':')  # e.g. 'tt:15664'
21     if len(parts) != 2 or parts[0] != 'tt':
22         raise ValueError('Content has unexpected format')
23     old_id = int(parts[1])
24     query = select([text], text.c.old_id == old_id)
25     text_row = connection.execute(query).fetchone()
26     return text_row.old_text
27
28
29 def update_wrsledruncache(connection):
30     """Updates the wrsledruncache table from the wiki. If convert errors occur, an UpdateCacheError exception
31     is raised. No other exception type should be raised under normal circumstances.
32
33     >>> from sqlalchemy.engine import create_engine
34     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
35     >>> update_wrsledruncache(engine.connect())
36     """
37     metadata = schema.MetaData()
38     wrsledruncache = wrmwdb.wrsledruncache_table(metadata)
39     page = mwdb.page_table(metadata)
40     categorylinks = mwdb.categorylinks_table(metadata)
41     slots = mwdb.slots_table(metadata)
42     content = mwdb.content_table(metadata)
43     text = mwdb.text_table(metadata)
44
45     class Sledrun:
46         pass
47
48     try:
49         with connection.begin():
50
51             # Query all sled runs
52             q = select(
53                 [page, categorylinks, slots, content],
54                 (page.c.page_latest == slots.c.slot_revision_id) & (slots.c.slot_content_id == content.c.content_id) &
55                 (categorylinks.c.cl_from == page.c.page_id) & (categorylinks.c.cl_to == 'Rodelbahn'))
56             sledrun_pages = connection.execute(q)
57
58             # Delete all existing entries in wrsledruncache
59             # We rely on transactions MySQL InnoDB
60             connection.execute(wrsledruncache.delete())
61
62             # Refill wrsledruncache table
63             for sledrun_page in sledrun_pages:
64                 old_text = _get_mw_text(connection, text, sledrun_page.content_address)
65                 rodelbahnbox = wrvalidators.rodelbahnbox_from_str(old_text)
66                 sledrun = wrmwmarkup.sledrun_from_rodelbahnbox(rodelbahnbox, Sledrun())
67                 sledrun.page_id = sledrun_page.page_id
68                 sledrun.page_title = sledrun_page.page_title
69                 sledrun.name_url = wrvalidators.sledrun_page_title_to_pretty_url(sledrun_page.page_title)
70                 sledrun.under_construction = connection.execute(select(
71                     [sqlfunc.count()],
72                     (categorylinks.c.cl_from == sledrun_page.page_id) &
73                     (categorylinks.c.cl_to == 'In_Arbeit')).alias('x')).fetchone()[0] > 0
74                 connection.execute(wrsledruncache.insert(sledrun.__dict__))
75
76     except ValueError as e:
77         error_msg = f"Error at sled run '{sledrun_page.page_title}': {e}"
78         raise UpdateCacheError(error_msg, sledrun_page.page_title, e)
79
80
81 def update_wrinncache(connection):
82     """Updates the wrinncache table from the wiki. If convert errors occur, an UpdateCacheError exception
83     is raised. No other exception type should be raised under normal circumstances.
84
85     >>> from sqlalchemy.engine import create_engine
86     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
87     >>> update_wrinncache(engine.connect())
88     """
89     metadata = schema.MetaData()
90     wrinncache = wrmwdb.wrinncache_table(metadata)
91     page = mwdb.page_table(metadata)
92     categorylinks = mwdb.categorylinks_table(metadata)
93     slots = mwdb.slots_table(metadata)
94     content = mwdb.content_table(metadata)
95     text = mwdb.text_table(metadata)
96
97     class Inn:
98         pass
99
100     try:
101         with connection.begin():
102
103             # Query all inns
104             q = select(
105                 [page, categorylinks, slots, content],
106                 (page.c.page_latest == slots.c.slot_revision_id) & (slots.c.slot_content_id == content.c.content_id) &
107                 (categorylinks.c.cl_from == page.c.page_id) & (categorylinks.c.cl_to == 'Gasthaus'))
108             inn_pages = connection.execute(q)
109
110             # Delete all existing entries in wrinncache
111             # We rely on transactions MySQL InnoDB
112             connection.execute(wrinncache.delete())
113
114             # Refill wrinncache table
115             for inn_page in inn_pages:
116                 old_text = _get_mw_text(connection, text, inn_page.content_address)
117                 gasthausbox = wrvalidators.gasthausbox_from_str(old_text)
118                 inn = wrmwmarkup.inn_from_gasthausbox(gasthausbox, Inn())
119                 inn.page_id = inn_page.page_id
120                 inn.page_title = inn_page.page_title
121                 inn.under_construction = connection.execute(select(
122                     [sqlfunc.count()],
123                     (categorylinks.c.cl_from == inn_page.page_id) &
124                     (categorylinks.c.cl_to == 'In_Arbeit')).alias('x')) \
125                     .fetchone()[0] > 0  # it would be better to do this in the query above
126                 connection.execute(wrinncache.insert(inn.__dict__))
127
128     except ValueError as e:
129         error_msg = f"Error as inn '{inn_page.page_title}': {e}"
130         raise UpdateCacheError(error_msg, inn_page.page_title, e)
131
132
133 def update_wrreportcache(connection, page_id=None):
134     """Updates the wrreportcache table.
135     :param connection: sqlalchemy connection
136     :param page_id: Updates only the reportcache table for the sledrun described on the Winterrodeln wiki page
137         with the specified page_id. Use None for this parameter to update the whole table.
138
139     >>> from sqlalchemy.engine import create_engine
140     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
141     >>> update_wrreportcache(engine.connect())
142     """
143     metadata = schema.MetaData()
144     wrreportcache = wrmwdb.wrreportcache_table(metadata)
145     with connection.begin():
146         # Delete the datasets we are going to update
147         sql_del = wrreportcache.delete()
148         if page_id is not None:
149             sql_del = sql_del.where(wrreportcache.c.page_id == page_id)
150         connection.execute(sql_del)
151
152         def insert_row(connection_, row_list_):
153             if len(row_list_) == 0:
154                 return
155             # Insert the report
156             row_ = dict(row_list_[0])
157             connection_.execute(wrreportcache.insert(values=row_))
158
159         # Select the rows to update
160         sql = 'select page_id, page_title, wrreport.id as report_id, date_report, `condition`, description, ' \
161               'author_name, ' \
162               'if(author_userid is null, null, author_username) as author_username from wrreport ' \
163               'where {0}`condition` is not null and date_invalid > now() and delete_date is null ' \
164               'order by page_id, date_report desc, date_entry desc' \
165               .format('' if page_id is None else f'page_id={page_id} and ')
166         cursor = connection.execute(sql)
167         page_id = None
168         row_list = []
169         for row in cursor:
170             if row.page_id != page_id:
171                 insert_row(connection, row_list)
172                 page_id = row.page_id
173                 row_list = []
174             row_list.append(row)
175         insert_row(connection, row_list)
176
177
178 def update_wrmapcache(connection):
179     """Updates the wrmappointcache and wrmappathcache tables from the wiki. If convert errors occur,
180     an UpdateCacheError exception is raised. No other exception type should be raised under normal circumstances.
181
182     >>> from sqlalchemy.engine import create_engine
183     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
184     >>> # or:
185     >>> # engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4&passwd=XXX')
186     >>> update_wrmapcache(engine.connect())
187     """
188     metadata = schema.MetaData()
189     page = mwdb.page_table(metadata)
190     categorylinks = mwdb.categorylinks_table(metadata)
191     slots = mwdb.slots_table(metadata)
192     content = mwdb.content_table(metadata)
193     text = mwdb.text_table(metadata)
194
195     try:
196         with connection.begin():
197
198             # Query all sledruns
199             q = select(
200                 [page, categorylinks, slots, content],
201                 (page.c.page_latest == slots.c.slot_revision_id) & (slots.c.slot_content_id == content.c.content_id) &
202                 (categorylinks.c.cl_from == page.c.page_id) & (categorylinks.c.cl_to == 'Rodelbahn'))
203             sledrun_pages = connection.execute(q)
204
205             # Delete all existing entries in wrmappointcache
206             # We rely on transactions MySQL InnoDB
207             connection.execute('delete from wrmappointcache')
208             connection.execute('delete from wrmappathcache')
209
210             # Refill wrmappointcache and wrmappathcache tables
211             for sledrun_page in sledrun_pages:
212                 old_text = _get_mw_text(connection, text, sledrun_page.content_address)
213                 wikicode = mwparserfromhell.parse(old_text)
214                 wrmap_list = wikicode.filter_tags(recursive=False, matches=lambda tag: tag.tag == 'wrmap')
215                 if len(wrmap_list) == 0:
216                     continue  # not wrmap in page
217                 if len(wrmap_list) > 1:
218                     raise UpdateCacheError(
219                         f'{len(wrmap_list)} <wrmap ...> entries found in article "{sledrun_page.page_title}"')
220                 wrmap = wrmap_list[0]
221                 geojson = wrmwmarkup.parse_wrmap(str(wrmap))
222
223                 for feature in geojson['features']:
224                     properties = feature['properties']
225                     coordinates = feature['geometry']['coordinates']
226
227                     # Points
228                     if properties['type'] in wrmwmarkup.WRMAP_POINT_TYPES:
229                         lon, lat = coordinates
230                         label = properties.get('name')
231                         point_types = {
232                             'gasthaus': 'hut',
233                             'haltestelle': 'busstop',
234                             'parkplatz': 'carpark',
235                             'achtung': 'warning',
236                             'foto': 'photo',
237                             'verleih': 'rental',
238                             'punkt': 'point'
239                         }
240                         point_type = point_types[properties['type']]
241                         sql = 'insert into wrmappointcache (page_id, type, point, label) values (%s, %s, POINT(%s, %s), %s)'
242                         connection.execute(sql, (sledrun_page.page_id, point_type, lon, lat, label))
243
244                     # Paths
245                     elif properties['type'] in wrmwmarkup.WRMAP_LINE_TYPES:
246                         path_types = {
247                             'rodelbahn': 'sledrun',
248                             'gehweg': 'walkup',
249                             'alternative': 'alternative',
250                             'lift': 'lift',
251                             'anfahrt': 'recommendedcarroute',
252                             'linie': 'line'}
253                         path_type = path_types[properties['type']]
254                         path = ", ".join([f"{lon} {lat}" for lon, lat in coordinates])
255                         path = f'LineString({path})'
256                         if path_type == 'recommendedcarroute':
257                             continue
258                         sql = 'insert into wrmappathcache (path, page_id, type) values (GeomFromText(%s), %s, %s)'
259                         connection.execute(sql, (path, sledrun_page.page_id, path_type))
260
261                     else:
262                         raise RuntimeError(f'Unknown feature type {properties["type"]}')
263     except RuntimeError as e:
264         error_msg = f"Error at sledrun '{sledrun_page.page_title}': {e}"
265         raise UpdateCacheError(error_msg, sledrun_page.page_title, e)
266
267
268 def update_wrregioncache(connection):
269     """Updates the wrregioncache table from the wiki.
270     It relays on the table wrsledruncache to be up-to-date.
271     No exceptions should be raised under normal circumstances.
272
273     >>> from sqlalchemy.engine import create_engine
274     >>> engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4')
275     >>> # or:
276     >>> # engine = create_engine('mysql://philipp@localhost:3306/philipp_winterrodeln_wiki?charset=utf8mb4&passwd=XXX')
277     >>> update_wrregioncache(engine.connect())
278     """
279     metadata = schema.MetaData()
280     wrregion = wrmwdb.wrregion_table(metadata)
281     wrsledruncache = wrmwdb.wrsledruncache_table(metadata)
282     wrregioncache = wrmwdb.wrregioncache_table(metadata)
283
284     with connection.begin():
285
286         # Delete all existing entries in wrregioncache
287         # We rely on transactions MySQL InnoDB
288         connection.execute(wrregioncache.delete())
289
290         # Query all combinations of sledruns and regions
291         sel = select(
292             [
293                 wrregion.c.id.label('region_id'),
294                 sqlfunc.AsWKB(wrregion.c.border).label('border'),
295                 wrsledruncache.c.page_id,
296                 wrsledruncache.c.position_longitude,
297                 wrsledruncache.c.position_latitude
298             ],
299             sqlfunc.contains(
300                 wrregion.c.border,
301                 sqlfunc.point(wrsledruncache.c.position_longitude, wrsledruncache.c.position_latitude)
302             )
303         )
304         ins = wrregioncache.insert()
305
306         # Refill wrregioncache
307         point = ogr.Geometry(ogr.wkbPoint)
308         result = connection.execute(sel)
309         for row in result:
310             point.SetPoint(0, row.position_longitude, row.position_latitude)
311             if point.Within(ogr.CreateGeometryFromWkb(row.border)):
312                 connection.execute(ins.values(region_id=row.region_id, page_id=row.page_id))
313
314
315 def update_wrsledrunjsoncache(api_url: urllib.parse.ParseResult, connection: sqlalchemy.engine.Connection):
316     """Updates the wrsledrunjsoncache from JSON files in the wiki.
317
318     :param api_url: URL to the API, e.g. https://www.winterrodeln.org/mediawiki/api.php
319     :param connection: open sqlalchemy connection
320     """
321     with connection.begin():
322         # connection.execute('truncate table wrsledrunjsoncache')  # needs additional permissions
323         connection.execute('delete from wrsledrunjsoncache')
324
325         cmcontinue = ''
326         while True:
327             request = api_url._replace(query='action=query&list=categorymembers&cmtitle=Kategorie:Rodelbahn'
328                                              f'&cmlimit=50&format=json&cmcontinue={cmcontinue}')  # cmlimit=max (5000)
329             response = urllib.request.urlopen(request.geturl())
330             sledrun_list_response = json.load(response)
331             rows_to_insert = sledrun_list_response['query']['categorymembers']
332             for row in rows_to_insert:
333                 row['sledrun_title'] = f'{row["title"]}/Rodelbahn.json'
334                 row['map_title'] = f'{row["title"]}/Landkarte.json'
335
336             qs_titles = urllib.parse.urlencode({"titles": '|'.join(r['sledrun_title'] for r in rows_to_insert)})
337             request = api_url._replace(query='action=query&prop=revisions&rvprop=content|ids&format=json&rvslots=main&'
338                                              + qs_titles)
339             response = urllib.request.urlopen(request.geturl())
340             sledrun_json_list_response = json.load(response)
341             sledrun_json_dict = {s['title']: s for s in sledrun_json_list_response['query']['pages'].values()}
342
343             qs_titles = urllib.parse.urlencode({"titles": '|'.join(r['map_title'] for r in rows_to_insert)})
344             request = api_url._replace(query='action=query&prop=revisions&rvprop=content|ids&format=json&rvslots=main&'
345                                              + qs_titles)
346             response = urllib.request.urlopen(request.geturl())
347             map_json_list_response = json.load(response)
348             map_json_dict = {m['title']: m for m in map_json_list_response['query']['pages'].values()}
349
350             for row in rows_to_insert:
351                 sledrun_json = sledrun_json_dict.get(row['sledrun_title'])
352                 if 'missing' not in sledrun_json:
353                     row['sledrun_pageid'] = sledrun_json['pageid']
354                     rev = sledrun_json['revisions'][0]
355                     row['sledrun_revid'] = rev['revid']
356                     row['sledrun'] = rev['slots']['main']['*']
357                 map_json = map_json_dict.get(row['map_title'])
358                 if 'missing' not in map_json:
359                     row['map_pageid'] = map_json['pageid']
360                     rev = map_json['revisions'][0]
361                     row['map_revid'] = rev['revid']
362                     row['map'] = rev['slots']['main']['*']
363
364             sql = 'insert into wrsledrunjsoncache ' \
365                   '(sledrun_page_id, ' \
366                   'sledrun_json_page_id, sledrun_json_rev_id, sledrun_json, ' \
367                   'map_json_page_id, map_json_rev_id, map_json) ' \
368                   'values (:sledrun_page_id, ' \
369                   ':sledrun_json_page_id, :sledrun_json_rev_id, :sledrun_json, ' \
370                   ':map_json_page_id, :map_json_rev_id, :map_json)'
371             for row in rows_to_insert:
372                 connection.execute(text(sql), [{
373                     'sledrun_page_id': row['pageid'],
374                     'sledrun_json_page_id': row['sledrun_pageid'],
375                     'sledrun_json_rev_id': row['sledrun_revid'],
376                     'sledrun_json': row.get('sledrun'),
377                     'map_json_page_id': row.get('map_pageid'),
378                     'map_json_rev_id': row.get('map_revid'),
379                     'map_json': row.get('map')
380                 }])
381
382             if 'continue' not in sledrun_list_response:
383                 break
384             cmcontinue = sledrun_list_response['continue']['cmcontinue']