The script works :-) (provided you use the right phone number and password).
author Philipp Spitzer <philipp@spitzer.priv.at>
Thu, 1 Oct 2015 20:17:28 +0000 (22:17 +0200)
committer Philipp Spitzer <philipp@spitzer.priv.at>
Thu, 1 Oct 2015 20:17:28 +0000 (22:17 +0200)
bob_download.py [new file with mode: 0644]

diff --git a/bob_download.py b/bob_download.py
new file mode 100644 (file)
index 0000000..a06f00d
--- /dev/null
+++ b/bob_download.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python3
+import os
+import re
+import requests
+from bs4 import BeautifulSoup
+
+dest_dir = '/tmp/xx'
+
+session = requests.Session()
+additional_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'} # otherwise site with content '<HTML></HTML>' is returned
+# Accept-Encoding: identity # to get non-compressed things back
+
+# load login page
+main_url = 'https://rechnung.bob.at/'
+response = session.get(main_url, headers=additional_headers)
+html = BeautifulSoup(response.text, 'html.parser')
+
+# fill out login form (name='asmpform') with username=<phone number> and password
+form = html.find(attrs={'name': 'asmpform'})
+inputs = form.find_all('input', {'name': True}) if form is not None else [] # form is None when the page has no such form, e.g. the '<HTML></HTML>' page
+fields = {e['name']: e.get('value', '') for e in inputs if e['name'] != 'submit'} # every named input except 'submit' is sent back with its preset value
+if 'loginMsisdn' not in fields or 'kkw' not in fields: # explicit check instead of assert, which python -O would strip
+    raise RuntimeError("login form 'asmpform' with fields 'loginMsisdn' and 'kkw' not found at " + main_url)
+fields['loginMsisdn'] = '4369911111111' # user name is phone number (placeholder - use your own)
+fields['kkw'] = 'abcdefg' # password (placeholder - use your own)
+
+# load overview page
+response = session.post(form['action'], data=fields, headers=additional_headers)
+response.raise_for_status() # stop on an HTTP error status instead of silently continuing
+
+# reload overview page rechnung.bob.at - that makes the URLs in the page much prettier
+# previously: https://rechnung.bob.at/bill/pdf/;BOBOBPSESSIONID=B7DB9938A3B9541E3D0EB6CD728F54C0.obpBobCustomer4Rechnungskopie_1509_523260091_1.pdf?bsn=61
+# same after reload: '/bill/pdf/Rechnungskopie_1509_523260091_1.pdf?bsn=61'
+response = session.get(main_url, headers=additional_headers)
+html = BeautifulSoup(response.text, 'html.parser')
+
+# Download PDFs
+# Links look like '/bill/pdf/Rechnungskopie_1509_523260091_1.pdf?bsn=61'
+regexp = re.compile(r'/(Rechnungskopie_[^/\\?]*)\?') # file name may not contain '/', '\' or '?', so it stays inside dest_dir
+links = html.find_all('a', href=regexp)
+for link in links:
+    url = link['href']
+    filename = regexp.search(url).group(1) # cannot fail: the links were selected with this very regexp
+    filepath = os.path.join(dest_dir, filename)
+    if not os.path.exists(filepath): # bills that were already saved are not fetched again
+        response = session.get(main_url[:-1] + url) # main_url ends with '/', url is expected to start with '/'
+        if response.status_code != 200: # explicit check: an assert would be skipped by python -O
+            raise RuntimeError('download of {} failed with HTTP status {}'.format(url, response.status_code))
+        with open(filepath, 'wb') as file:
+            file.write(response.content)
+
+# Download CSVs (links look like '/bill.set.obp?bsn=61')
+import time # imported once here instead of inside the loop body
+regexp = re.compile(r'/bill\.set\.obp\?') # dots escaped: they have to match a literal '.'
+for link in html.find_all('a', href=regexp):
+    response = session.get(main_url[:-1] + link['href']) # request the bill first; the two requests below carry no bill id, so presumably the session remembers it
+    if response.status_code != 200 or response.text == '':
+        raise RuntimeError('could not open bill page ' + link['href'])
+    time.sleep(3) # pause before the follow-up requests (reason not documented - presumably the server needs it, TODO confirm)
+    response = session.get(main_url + 'bill.ctn.cdr.set.obp')
+    detail_html = BeautifulSoup(response.text, 'html.parser') # own name, so 'html' keeps holding the overview page
+    if detail_html.find('a', id='link_csv_download') is None:
+        raise RuntimeError('no CSV download link (id=link_csv_download) found for bill ' + link['href'])
+    response = session.get(main_url + 'download.table.obp?fmt=TAB&table=obp.calls.table', headers=additional_headers)
+    if response.status_code != 200:
+        raise RuntimeError('CSV download failed with HTTP status {}'.format(response.status_code))
+    match = re.search(r'=\s*"?(EVN_[^";/\\]*)', response.headers.get('Content-Disposition', '')) # file name, e.g. 'EVN_1509_523260091_1_069911934859.txt', without quotes, ';' or path separators
+    if match is None:
+        raise RuntimeError('unexpected Content-Disposition header: {!r}'.format(response.headers.get('Content-Disposition')))
+    filepath = os.path.join(dest_dir, match.group(1))
+    if not os.path.exists(filepath):
+        with open(filepath, 'wb') as file:
+            file.write(response.content)
+