#!/usr/bin/python3
"""Log in to rechnung.bob.at and download bill copies (PDF part).

Credentials and destination directory are configured below; the CSV
(call-detail) download follows in the second half of the script.
"""
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

user_name = '4369911111111'  # user name is phone number
password = 'abcdefg'  # login password
dest_dir = '/tmp/bob'

session = requests.Session()
session.headers.update({
    # Without an explicit Accept header the site returns empty content ''.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
})

# Load the login page.
response = session.get('https://rechnung.bob.at/')
response.raise_for_status()
html = BeautifulSoup(response.text, 'html.parser')

# Fill out the login form (name='asmpform') with username and password.
# Plain `raise` instead of `assert`: asserts vanish under `python -O`.
form = html.find(attrs={'name': 'asmpform'})
if form is None:
    raise RuntimeError('login form "asmpform" not found')
fields = {e['name']: e.get('value', '')
          for e in form.find_all('input', {'name': True})
          if e['name'] != 'submit'}
if 'loginMsisdn' not in fields:  # user name field
    raise RuntimeError('login form has no "loginMsisdn" input')
fields['loginMsisdn'] = user_name  # e.g. '4369911111111'
if 'kkw' not in fields:  # password field
    raise RuntimeError('login form has no "kkw" input')
fields['kkw'] = password

# Submit the form; this loads the overview page. The form action may be
# relative, so resolve it against the page we got it from.
response = session.post(urljoin(response.url, form['action']), data=fields)
response.raise_for_status()

# Reload overview page rechnung.bob.at - that makes the URLs in the page
# much prettier.
# previously:
#   https://rechnung.bob.at/bill/pdf/;BOBOBPSESSIONID=B7DB9938A3B9541E3D0EB6CD728F54C0.obpBobCustomer4Rechnungskopie_1509_523260091_1.pdf?bsn=61
# same after reload:
#   '/bill/pdf/Rechnungskopie_1509_523260091_1.pdf?bsn=61'
response = session.get(response.url)
response.raise_for_status()
html = BeautifulSoup(response.text, 'html.parser')

# Make sure the download directory exists before writing into it.
os.makedirs(dest_dir, exist_ok=True)

# Download PDFs.
# Links look like '/bill/pdf/Rechnungskopie_1509_523260091_1.pdf?bsn=61'
regexp = re.compile(r'/(Rechnungskopie_.*)\?')
links = html.find_all('a', href=regexp)
for link in links:
    url = link['href']
    filename = regexp.search(url).group(1)
    if not filename.startswith('Rechnungskopie_'):
        raise RuntimeError('unexpected PDF filename: %r' % filename)
    filepath = os.path.join(dest_dir, filename)
    if not os.path.exists(filepath):  # skip bills we already have
        response = session.get(urljoin(response.url, url))
        response.raise_for_status()
        with open(filepath, 'wb') as file:
            file.write(response.content)

# Download CSVs
# Links look like '/bill.set.obp?bsn=61'
# Find the per-bill links; dots escaped so the pattern matches only the
# literal path '/bill.set.obp?' (the original unescaped dots matched any
# character).
regexp = re.compile(r'/bill\.set\.obp\?')
links = html.find_all('a', href=regexp)
for link in links:
    url = link['href']
    response = session.get(urljoin(response.url, url))
    response.raise_for_status()
    # The site builds the call-detail table asynchronously and tells the
    # browser to poll via:
    #   OBP.utils.reloadAfterDelay("/bill.ctn.cdr.set.obp",5);
    marker = 'OBP.utils.reloadAfterDelay("/bill.ctn.cdr.set.obp",5);'
    if marker not in response.text:
        raise RuntimeError('expected reload marker not found on %s' % response.url)
    # Poll until the table is ready instead of assuming a single 5 s wait
    # is always enough; give up after ~60 s.
    for _ in range(12):
        time.sleep(5)
        response = session.get(urljoin(response.url, 'bill.ctn.cdr.set.obp'))
        response.raise_for_status()
        if marker not in response.text:
            break  # table generated
    else:
        raise RuntimeError('call-detail table still not ready after polling')
    html = BeautifulSoup(response.text, 'html.parser')
    if html.find('a', id='link_csv_download') is None:
        raise RuntimeError('CSV download link not found')
    response = session.get('https://rechnung.bob.at/download.table.obp?fmt=TAB&table=obp.calls.table')
    response.raise_for_status()
    # e.g. 'EVN_1509_523260091_1_069911934859.txt'; the filename parameter
    # may be quoted, so strip optional surrounding quotes.
    filename = response.headers['Content-Disposition'].split('=')[1].strip('"')
    if not filename.startswith('EVN_'):
        raise RuntimeError('unexpected CSV filename: %r' % filename)
    os.makedirs(dest_dir, exist_ok=True)
    filepath = os.path.join(dest_dir, filename)
    if not os.path.exists(filepath):  # skip files we already have
        with open(filepath, 'wb') as file:
            file.write(response.content)