class Database:
    """Encapsulates a DVR database stored in an sqlite3 file.

    The schema has three tables, all keyed by the dataset ID:

    * ``dvrtable``      -- maps an ID to its DVR number (NULL if none exists),
    * ``registrations`` -- name and address registered for an ID,
    * ``purposes``      -- numbered purposes (text, date, status) of an ID.
    """

    def __init__(self, fn):
        """Open (or create) the database file ``fn`` and ensure the schema."""
        self.fn = fn
        self.conn = sqlite3.connect(fn, timeout=60)
        # sqlite3 does not enforce foreign keys unless asked to; the
        # ON DELETE CASCADE clauses of the schema depend on this pragma.
        self.conn.execute("PRAGMA foreign_keys = ON")
        self.createSchema()

    def createSchema(self):
        """Create the database schema if it does not exist yet."""
        # The connection context manager commits on success and rolls back
        # if any statement fails, so no half-created schema is left behind.
        with self.conn:
            c = self.conn.cursor()
            try:
                # Take the exclusive lock up front so that concurrently
                # started processes do not race on creating the tables.
                c.execute("BEGIN EXCLUSIVE")

                c.execute("CREATE TABLE IF NOT EXISTS dvrtable (\
                    id INTEGER PRIMARY KEY, \
                    dvr INTEGER UNIQUE \
                    )")

                c.execute("CREATE TABLE IF NOT EXISTS registrations (\
                    id INTEGER PRIMARY KEY \
                        REFERENCES dvrtable(id) ON DELETE CASCADE, \
                    name TEXT NOT NULL, \
                    address TEXT NOT NULL \
                    )")

                c.execute("CREATE TABLE IF NOT EXISTS purposes (\
                    id INTEGER REFERENCES dvrtable(id) ON DELETE CASCADE, \
                    num INTEGER, \
                    purpose TEXT NOT NULL, \
                    date TEXT NOT NULL, \
                    status TEXT NOT NULL, \
                    PRIMARY KEY(id, num) \
                    )")
            finally:
                c.close()

    def _select(self, sql, params, fetchall=False):
        """Run a SELECT and return one row (or all rows); closes the cursor."""
        c = self.conn.cursor()
        try:
            c.execute(sql, params)
            return c.fetchall() if fetchall else c.fetchone()
        finally:
            c.close()

    def _insert(self, sql, params):
        """Run an INSERT; commit on success, roll back on failure."""
        with self.conn:
            c = self.conn.cursor()
            try:
                c.execute(sql, params)
            finally:
                c.close()

    def contains_id(self, id):
        """Return whether the given ID is contained in DB."""
        row = self._select("SELECT count(id) FROM dvrtable WHERE id=?", (id,))
        return row[0] == 1

    def get_dvr(self, id):
        """Get DVR from dataset with given ID.

        Returns None if the ID has no DVR or if the ID is not in the
        database at all; use contains_id() to tell the two cases apart.
        """
        row = self._select("SELECT dvr FROM dvrtable WHERE id=?", (id,))
        return None if row is None else row[0]

    def add_dvr(self, id, dvr=None):
        """Add a DVR with given ID to the dataset.

        Raises sqlite3.IntegrityError if the ID or the DVR already exists.
        """
        self._insert("INSERT INTO dvrtable VALUES (?, ?)", (id, dvr))

    def get_registration(self, id):
        """Return the registration row (id, name, address) of given ID, or
        None if there is none."""
        return self._select("SELECT * FROM registrations WHERE id=?", (id,))

    def add_registration(self, id, name, address):
        """Add a registration for the given ID.

        Raises sqlite3.IntegrityError if the ID is unknown to dvrtable or
        already has a registration.
        """
        self._insert("INSERT INTO registrations VALUES (?, ?, ?)",
                     (id, name, address))

    def get_purposes(self, id):
        """Return all known purposes of given ID as a list of rows
        (id, num, purpose, date, status); the list is empty if none exist."""
        return self._select("SELECT * FROM purposes WHERE id=?", (id,),
                            fetchall=True)

    def add_purpose(self, id, num, purpose, date, status):
        """Add a purpose for a given ID.

        Raises sqlite3.IntegrityError if the ID is unknown to dvrtable or
        the (id, num) pair already exists.
        """
        self._insert("INSERT INTO purposes VALUES (?, ?, ?, ?, ?)",
                     (id, num, purpose, date, status))
def queryLocation(address):
    """Look up the geographical position of the given postal address.

    Returns a ``(lng, lat)`` tuple -- longitude first -- as reported by the
    geocoding web service, or None if the service cannot be reached or its
    answer contains no usable result. Failures are reported on stderr.
    """

    urlparam = urllib.parse.urlencode({'address': address, 'sensor': "false"})
    url = "http://maps.googleapis.com/maps/api/geocode/json?" + urlparam

    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as response:
            data = json.loads(response.read().decode('utf-8'))
        loc = data['results'][0]['geometry']['location']
        return loc['lng'], loc['lat']
    except urllib.error.URLError as e:
        print(e, file=sys.stderr)
        return None
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # An address the service does not know yields an empty result list;
        # a broken reply may not even be valid JSON. Neither should abort
        # the query that asked for the location.
        print("Could not extract location from response:", repr(e),
              file=sys.stderr)
        return None


def processQuery(db, id, showLocation=False):
    """Print everything the database knows about the given ID.

    db           -- database object offering contains_id(), get_dvr(),
                    get_registration() and get_purposes()
    id           -- the dataset ID to look up
    showLocation -- if true, also look up and print the coordinates of the
                    registered address via queryLocation()

    Returns True if the ID is in the database and has a DVR, and False
    otherwise. The caller turns a false result into exit status 1.
    """

    if not db.contains_id(id):
        print("No such ID in the database.")
        return False
    print("ID %d found in the database." % id)

    dvr = db.get_dvr(id)
    if dvr is None:
        print(" No DVR exists for this ID.")
        # Explicit False rather than a bare return, so that every path
        # yields a bool; the truthiness seen by the caller is unchanged.
        return False

    reg = db.get_registration(id)
    if reg is None:
        print(" No registration exists.")
    else:
        # Registration rows are (id, name, address).
        name = reg[1]
        address = reg[2]

        print(" DVR: ", dvr)
        print(" Name: ", name)
        print(" Address: ", address)

        if showLocation:
            loc = queryLocation(address)
            print(" Coordinates:", loc)

    purposes = db.get_purposes(id)
    if not purposes:
        print(" No purposes known.")
    else:
        # Purpose rows are (id, num, purpose, date, status).
        for purp in purposes:
            print(" Purpose %d:" % purp[1])
            print(" Text: ", purp[2])
            print(" Date: ", purp[3])
            print(" Status:", purp[4])
    return True
def parseArgs(argv):
    """Parse the command-line arguments (without the program name).

    Returns a dict with the keys 'add', 'dbfn', 'query', 'showLocation'
    and 'help'. Parsing stops at the first -h, so options following it are
    not validated.

    Raises getopt.GetoptError for unknown options, missing option arguments
    and for a -q argument that is not an integer.
    """
    options = {
        "add": False,
        "dbfn": None,
        "query": None,
        "showLocation": False,
        "help": False,
    }

    opts, _args = getopt.getopt(argv, "ad:hgq:")
    for opt, arg in opts:
        if opt == "-a":
            options["add"] = True
        elif opt == "-d":
            options["dbfn"] = arg
        elif opt == "-h":
            options["help"] = True
            return options
        elif opt == "-g":
            options["showLocation"] = True
        elif opt == "-q":
            try:
                options["query"] = int(arg)
            except ValueError:
                # Report a malformed ID like any other usage error instead
                # of dying with a traceback.
                raise getopt.GetoptError(
                    "option -q requires an integer ID, got '%s'" % arg,
                    opt) from None
    return options


def main(argv):
    """Run the tool with the given arguments and return the exit status.

    The status is os.EX_USAGE for bad arguments or a missing database,
    1 if a query (-q) did not succeed, and os.EX_OK otherwise.
    """
    try:
        options = parseArgs(argv)
    except getopt.GetoptError as e:
        print("Error parsing arguments:", e)
        usage()
        return os.EX_USAGE

    if options["help"]:
        usage()
        return os.EX_OK

    if options["dbfn"] is None:
        print("No database given.")
        return os.EX_USAGE

    db = Database(options["dbfn"])
    try:
        if options["add"]:
            processAdd(db)

        if options["query"] is not None:
            success = processQuery(db, options["query"],
                                   options["showLocation"])
            if not success:
                return 1
    finally:
        db.conn.close()

    return os.EX_OK


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
#!/bin/sh

# Feed every gzipped page below dump/ into the database all.sqlite, running
# dvr-db.py once per file.
#
# The file name is handed to the inner shell as positional parameter $1
# instead of being pasted into the command string: that keeps names with
# spaces or shell metacharacters from being interpreted as code, and it
# does not rely on find substituting a {} that is only part of an argument.
find dump/ -name "*.gz" -exec sh -c 'zcat "$1" | ./dvr-db.py -d all.sqlite -a' sh {} \;