Adapt to new design of bob homepage.
[toast/webscraper/bob.git] / bob_download.py
index 2e8d6997ababa0dfd7595d1773b14aae3c991321..97e38547ffcb42e1ffb67e41e5ce33ad46df4c80 100755 (executable)
@@ -17,7 +17,6 @@ optional arguments:
 """
 import os
 import re
-import time
 import argparse
 import warnings
 from urllib.parse import urljoin
@@ -28,6 +27,7 @@ from requests.packages.urllib3.exceptions import SubjectAltNameWarning
 # SubjectAltNameWarning's should go off 0 times per host
 warnings.simplefilter('ignore', SubjectAltNameWarning)
 
+
 def main(username, password, destdir, csv_format):
     session = requests.Session()
     session.headers.update({
@@ -52,50 +52,33 @@ def main(username, password, destdir, csv_format):
     assert response.ok
     assert 'invalid KKW response' not in response.text
     html = BeautifulSoup(response.text, 'html.parser')
-    assert html.find('a', title="ausloggen") is not None
+    assert html.find('a', title="logout") is not None
 
-    # Download PDFs
-    # Links look like:
-    # https://rechnung.bob.at/bill/download/pdf/sync/Rechnung_1703_523260091_1.pdf?ban=523260091&ben=1&bsn=79&original=true
-    # https://rechnung.bob.at/bill/download/pdf/sync/Rechnung_1702_523260091_1.pdf?ban=523260091&ben=1&bsn=78&original=true
-    # https://rechnung.bob.at/bill/download/pdf/sync/Rechnung_1701_523260091_1.pdf?ban=523260091&ben=1&bsn=77&original=true
-    regexp = re.compile(r'\/(Rechnung_.*)\?')
-    links = html.findAll('a', href=regexp)
+    # Download files
+    links = html.findAll('a', class_="table-bill__link--pdf")
     for link in links:
-        url = link['href']
-        filename = regexp.search(url).group(1)
-        assert filename.startswith('Rechnung_')
-        filepath = os.path.join(destdir, filename)
-        if not os.path.exists(filepath):
-            response = session.get(urljoin(response.url, url))
-            assert response.ok
-            with open(filepath, 'wb') as file:
-                file.write(response.content)
+        url_pdf = link['href']
+        date_range = link.parent.parent.parent.find(class_='table-bills__header').find(class_='text-copy').text  # 26.02.2019 - 25.03.2019
+        match = re.match(r'\d\d\.\d\d\.\d\d\d\d - (\d\d)\.(\d\d)\.(\d\d\d\d)', date_range)
 
-    # Download CSVs
-    # e.g. https://rechnung.bob.at/bill.ctn.cdr.obp?bsn=79
-    regexp = re.compile(r'\/bill.ctn.cdr.obp\?')
-    links = html.findAll('a', href=regexp)
-    for link in links:
-        url = link['href']
-        response = session.get(urljoin(response.url, url))  # e.g. 'https://rechnung.bob.at/bill.ctn.cdr.obp?bsn=79'
-        assert response.ok
-        html = BeautifulSoup(response.text, 'html.parser')
-        assert html.find('div', id='obp.calls.table')  # make sure we are on the right page
-
-        if not html.find('a', id='link_csv_download'):
-            filepath = os.path.join(destdir, 'debug_no_link_csv_download.txt')
-            with open(filepath, 'wb') as file:
+        # Download PDF
+        filename_pdf = '{}-{}-{}_Rechnung.pdf'.format(*match.groups()[::-1])  # '2019-03-25_Rechnung.pdf'
+        filepath_pdf = os.path.join(destdir, filename_pdf)
+        if not os.path.exists(filepath_pdf):
+            response = session.get(urljoin(response.url, url_pdf))
+            assert response.ok
+            with open(filepath_pdf, 'wb') as file:
                 file.write(response.content)
-            continue
 
-        response = session.get('https://rechnung.bob.at/obp/download.obp?fmt={}&table=obp.calls.table'.format(csv_format))
-        assert response.ok
-        filename = response.headers['Content-Disposition'].split('=')[1] # e.g. 'EVN_1509_523260091_1_069911934859.txt'
-        assert filename.startswith('EVN_')
-        filepath = os.path.join(destdir, filename)
-        if not os.path.exists(filepath):
-            with open(filepath, 'wb') as file:
+        # Download CSV
+        # https://ppp.bob.at/bobstart/invoiceDetailsCSV.sp?bsn=103
+        filename_csv = '{}-{}-{}_EVN.csv'.format(*match.groups()[::-1])  # '2019-03-25_EVN.csv'
+        filepath_csv = os.path.join(destdir, filename_csv)
+        if not os.path.exists(filepath_csv):
+            url_csv = url_pdf.replace('invoicePdf', 'invoiceDetailsCSV')
+            response = session.get(urljoin(response.url, url_csv))
+            assert response.ok
+            with open(filepath_csv, 'wb') as file:
                 file.write(response.content)