X-Git-Url: https://git.toastfreeware.priv.at/toast/webscraper/bob.git/blobdiff_plain/d6e6997a4f49f1cebd744e692ecd30f297a1dd70..bd17d02882a5713026e9a5d51ed311303ebe5a1a:/bob_download.py?ds=sidebyside diff --git a/bob_download.py b/bob_download.py index 2b1a272..9896b7a 100644 --- a/bob_download.py +++ b/bob_download.py @@ -11,11 +11,13 @@ dest_dir = '/tmp/bob' session = requests.Session() -additional_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'} # otherwise site with content '' is returned +session.headers.update({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # otherwise site with content '' is returned + }) # load login page main_url = 'https://rechnung.bob.at/' -response = session.get(main_url, headers=additional_headers) +response = session.get(main_url) html = BeautifulSoup(response.text, 'html.parser') # fill out login form (name='asmpform') with username= and password @@ -27,14 +29,14 @@ assert 'kkw' in fields # password fields['kkw'] = password # load overview page -response = session.post(form['action'], data=fields, headers=additional_headers) +response = session.post(form['action'], data=fields) # reload overview page rechnung.bob.at - that makes the URLs in the page much prettier # previously: # https://rechnung.bob.at/bill/pdf/;BOBOBPSESSIONID=B7DB9938A3B9541E3D0EB6CD728F54C0.obpBobCustomer4Rechnungskopie_1509_523260091_1.pdf?bsn=61 # same after reload: # '/bill/pdf/Rechnungskopie_1509_523260091_1.pdf?bsn=61' -response = session.get(main_url, headers=additional_headers) +response = session.get(main_url) html = BeautifulSoup(response.text, 'html.parser') # Download PDFs @@ -66,7 +68,7 @@ for link in links: assert 'OBP.utils.reloadAfterDelay("/bill.ctn.cdr.set.obp",5);' not in response.text html = BeautifulSoup(response.text, 'html.parser') assert html.find('a', id='link_csv_download') is not None - response = session.get('https://rechnung.bob.at/download.table.obp?fmt=TAB&table=obp.calls.table', headers=additional_headers) + response = session.get('https://rechnung.bob.at/download.table.obp?fmt=TAB&table=obp.calls.table') assert response.status_code == 200 filename = response.headers['Content-Disposition'].split('=')[1] # e.g. 'EVN_1509_523260091_1_069911934859.txt' assert filename.startswith('EVN_')