Additional headers can be set once on the session instead of being passed to every request - that makes the code shorter.
[toast/webscraper/bob.git] / bob_download.py
index c35524a702d2c7c71bcd880503b4f67f54d11bbc..9896b7a39f1bdefad64e027628a43655ac1c16d6 100644 (file)
@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 import os
 import re
+import time
 import requests
 from bs4 import BeautifulSoup
 
@@ -10,11 +11,13 @@ dest_dir = '/tmp/bob'
 
 
 session = requests.Session()
-additional_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'} # otherwise site with content '<HTML></HTML>' is returned
+session.headers.update({
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # otherwise site with content '<HTML></HTML>' is returned
+    })
 
 # load login page
 main_url = 'https://rechnung.bob.at/'
-response = session.get(main_url, headers=additional_headers)
+response = session.get(main_url)
 html = BeautifulSoup(response.text, 'html.parser')
 
 # fill out login form (name='asmpform') with username=<phone number> and password
@@ -26,14 +29,14 @@ assert 'kkw' in fields # password
 fields['kkw'] = password
 
 # load overview page
-response = session.post(form['action'], data=fields, headers=additional_headers)
+response = session.post(form['action'], data=fields)
 
 # reload overview page rechnung.bob.at - that makes the URLs in the page much prettier
 # previously:
 # https://rechnung.bob.at/bill/pdf/;BOBOBPSESSIONID=B7DB9938A3B9541E3D0EB6CD728F54C0.obpBobCustomer4Rechnungskopie_1509_523260091_1.pdf?bsn=61
 # same after reload:
 # '/bill/pdf/Rechnungskopie_1509_523260091_1.pdf?bsn=61'
-response = session.get(main_url, headers=additional_headers)
+response = session.get(main_url)
 html = BeautifulSoup(response.text, 'html.parser')
 
 # Download PDFs
@@ -59,13 +62,13 @@ for link in links:
     url = link['href']
     response = session.get(main_url[:-1] + url)
     assert response.status_code == 200
-    assert response.text != ''
-    import time
-    time.sleep(3)
+    assert 'OBP.utils.reloadAfterDelay("/bill.ctn.cdr.set.obp",5);' in response.text
+    time.sleep(5) # OBP.utils.reloadAfterDelay("/bill.ctn.cdr.set.obp",5);
     response = session.get(main_url + 'bill.ctn.cdr.set.obp')
+    assert 'OBP.utils.reloadAfterDelay("/bill.ctn.cdr.set.obp",5);' not in response.text
     html = BeautifulSoup(response.text, 'html.parser')
     assert html.find('a', id='link_csv_download') is not None
-    response = session.get('https://rechnung.bob.at/download.table.obp?fmt=TAB&table=obp.calls.table', headers=additional_headers)
+    response = session.get('https://rechnung.bob.at/download.table.obp?fmt=TAB&table=obp.calls.table')
     assert response.status_code == 200
     filename = response.headers['Content-Disposition'].split('=')[1] # e.g. 'EVN_1509_523260091_1_069911934859.txt'
     assert filename.startswith('EVN_')