git.sthu.org Git - dvrdb.git/commitdiff
Initial commit
author Stefan Huber <shuber@sthu.org>
Wed, 15 Jan 2014 10:23:16 +0000 (11:23 +0100)
committer Stefan Huber <shuber@sthu.org>
Wed, 15 Jan 2014 10:23:16 +0000 (11:23 +0100)
dump.sh [new file with mode: 0755]
dvr-db.py [new file with mode: 0755]
filldb.sh [new file with mode: 0755]

diff --git a/dump.sh b/dump.sh
new file mode 100755 (executable)
index 0000000..ff2e496
--- /dev/null
+++ b/dump.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+#
+# Download the detail pages of the DVR register for all IDs in the range
+# FROM..UNTIL into dump/<ID>.html.gz and print the client name found on each
+# page. Pages that have been downloaded already are not fetched again.
+
+FROM=77000
+#FROM=190000
+
+UNTIL=130000
+#UNTIL=200000
+
+# The redirections below fail if the target directory is missing.
+mkdir -p dump || exit 1
+
+for id in $(seq ${FROM} ${UNTIL}); do
+
+    # printf instead of "echo -n", whose behavior differs between shells.
+    printf '### ID %s' "${id}"
+
+    FN="dump/${id}.html.gz"
+    if ! [ -e "${FN}" ]; then
+        printf ' Fetching...'
+
+        # Download into temporary files and move the result into place only
+        # if both curl and gzip succeeded. Otherwise a failed transfer would
+        # leave an empty ${FN} behind, which is never fetched again. The
+        # temporary names do not end in .gz, so filldb.sh does not pick up
+        # leftovers.
+        # NOTE(review): the session cookie is hard-coded -- presumably it
+        # expires at some point; verify before running.
+        RAW="${FN}.raw"
+        TMP="${FN}.tmp"
+        if curl -s -b ASP.NET_SessionId=xtnfwfeplogtuyrna0a5e355 \
+            "https://dvr.dsk.gv.at/at.gv.bka.dvr.public/AuftraggeberDetail.aspx?Id=${id}" > "${RAW}" \
+            && gzip -c "${RAW}" > "${TMP}"; then
+            mv "${TMP}" "${FN}"
+        else
+            printf ' failed.'
+            rm -f "${TMP}"
+        fi
+        rm -f "${RAW}"
+
+        # Do not stress server too much
+        sleep 0.2
+    fi
+    echo ""
+
+    # Nothing to inspect if the download failed.
+    [ -e "${FN}" ] || continue
+
+    client=$( zgrep "span.*ContentHolder_AuftraggeberDaten_tempBez" "${FN}" | sed "s_.*<span.*>\(.*\)</span>.*_\1_" )
+    if [ -n "${client}" ]; then
+        echo "  client: $client"
+    fi
+done
diff --git a/dvr-db.py b/dvr-db.py
new file mode 100755 (executable)
index 0000000..1a2b5d3
--- /dev/null
+++ b/dvr-db.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+
+
+import bs4
+import getopt
+import os
+import re
+import sqlite3
+import sys
+import urllib.request
+import urllib.parse
+import urllib.error
+import json
+
+
+class Database:
+    """Encapsulates a DVR database stored in an sqlite3 file.
+
+    The database consists of three tables, all keyed by the ID of a dataset:
+
+    * dvrtable maps an ID to its DVR number (NULL if the ID has none),
+    * registrations holds the name and address registered for an ID,
+    * purposes holds the numbered purposes registered for an ID.
+    """
+
+    def __init__(self, fn):
+        """Open the database in file `fn` and create the schema if needed."""
+        self.fn = fn
+        self.conn = sqlite3.connect(fn, timeout=60)
+        # SQLite enforces foreign keys only if enabled for the connection.
+        self.conn.execute("PRAGMA foreign_keys = ON")
+        self.createSchema()
+
+    def createSchema(self):
+        """Create the database schema, unless it exists already.
+
+        All tables are created within one exclusive transaction, which is
+        rolled back if any of the statements fails.
+        """
+        conn = self.conn
+        c = conn.cursor()
+        try:
+            c.execute("BEGIN EXCLUSIVE")
+
+            c.execute("""CREATE TABLE IF NOT EXISTS dvrtable (
+                         id INTEGER PRIMARY KEY,
+                         dvr INTEGER UNIQUE
+                         )""")
+
+            c.execute("""CREATE TABLE IF NOT EXISTS registrations (
+                         id INTEGER PRIMARY KEY
+                            REFERENCES dvrtable(id) ON DELETE CASCADE,
+                         name TEXT NOT NULL,
+                         address TEXT NOT NULL
+                         )""")
+
+            c.execute("""CREATE TABLE IF NOT EXISTS purposes (
+                         id INTEGER REFERENCES dvrtable(id) ON DELETE CASCADE,
+                         num INTEGER,
+                         purpose TEXT NOT NULL,
+                         date TEXT NOT NULL,
+                         status TEXT NOT NULL,
+                         PRIMARY KEY(id, num)
+                         )""")
+            conn.commit()
+        except sqlite3.Error:
+            conn.rollback()
+            raise
+        finally:
+            c.close()
+
+    def _insert(self, sql, params):
+        """Execute a single INSERT and commit it; roll back if it fails."""
+        # The connection as context manager commits on success and rolls
+        # back on an exception, so no transaction is left open.
+        with self.conn:
+            self.conn.execute(sql, params)
+
+    def contains_id(self, id):
+        """Return whether the given ID is contained in DB."""
+        row = self.conn.execute(
+            "SELECT count(id) FROM dvrtable WHERE id=?", (id,)).fetchone()
+        return row[0] == 1
+
+    def get_dvr(self, id):
+        """Get DVR from dataset with given ID.
+
+        Returns None if the dataset has no DVR or if the ID is unknown; use
+        contains_id() to tell these two cases apart.
+        """
+        row = self.conn.execute(
+            "SELECT dvr FROM dvrtable WHERE id=?", (id,)).fetchone()
+        return None if row is None else row[0]
+
+    def add_dvr(self, id, dvr=None):
+        """Add a DVR with given ID to the dataset.
+
+        Raises sqlite3.IntegrityError if the ID or the DVR exists already.
+        """
+        self._insert("INSERT INTO dvrtable VALUES (?, ?)", (id, dvr))
+
+    def get_registration(self, id):
+        """Return the registration (id, name, address) of given ID, or None."""
+        return self.conn.execute(
+            "SELECT * FROM registrations WHERE id=?", (id,)).fetchone()
+
+    def add_registration(self, id, name, address):
+        """Add a registration for the given ID.
+
+        The ID has to be known to dvrtable already; otherwise the foreign
+        key constraint raises sqlite3.IntegrityError.
+        """
+        self._insert("INSERT INTO registrations VALUES (?, ?, ?)",
+                     (id, name, address))
+
+    def get_purposes(self, id):
+        """Return a list of all known purposes of given ID.
+
+        Each entry is a tuple (id, num, purpose, date, status). The list is
+        empty if no purposes are known.
+        """
+        return self.conn.execute(
+            "SELECT * FROM purposes WHERE id=?", (id,)).fetchall()
+
+    def add_purpose(self, id, num, purpose, date, status):
+        """Add a purpose with number `num` for a given ID.
+
+        The ID has to be known to dvrtable already; otherwise the foreign
+        key constraint raises sqlite3.IntegrityError.
+        """
+        self._insert("INSERT INTO purposes VALUES (?, ?, ?, ?, ?)",
+                     (id, num, purpose, date, status))
+
+
+def usage():
+    """Print help text to stdout."""
+
+    # The text is a template: format() replaces {0} by the name the program
+    # was invoked with. Without it, a literal "{0}" would be printed.
+    print("""
+USAGE:
+  {0} -d FILE [OPTIONS]
+  {0} -h
+
+OPTIONS:
+  -a          Read webservers response from stdin and add data to the database.
+  -d FILE     Use given sqlite3 database.
+  -h          Print this help text.
+  -q ID       Query dataset for dataset of given ID. Exit status is 1 in case
+              that no such dataset exists and 0 on success.
+  -g          Query geographical location for address field of registration.
+""".format(sys.argv[0]))
+
+
+def queryLocation(address):
+    """Get the (lng, lat) pair of given postal address.
+
+    The address is looked up using the Google geocoding web service and the
+    location of the first result is used.
+
+    Returns the tuple (longitude, latitude) -- in this order -- or None if
+    the request failed or the response contains no usable result. In the
+    latter cases a message is printed to stderr.
+    """
+
+    urlargs = {'address': address, 'sensor': "false"}
+    urlparam = urllib.parse.urlencode(urlargs)
+
+    url = "https://maps.googleapis.com/maps/api/geocode/json?" + urlparam
+    try:
+        with urllib.request.urlopen(url) as conn:
+            raw = conn.read()
+    except urllib.error.URLError as e:
+        print(e, file=sys.stderr)
+        return None
+
+    try:
+        data = json.loads(raw.decode('utf-8'))
+        # The list of results is empty if the service found nothing or
+        # denied the request; indexing it raises IndexError then.
+        loc = data['results'][0]['geometry']['location']
+        return loc['lng'], loc['lat']
+    except (ValueError, LookupError, TypeError) as e:
+        print("Unexpected geocoding response:", repr(e), file=sys.stderr)
+        return None
+
+def processQuery(db, id, showLocation=False):
+    """Process query for given ID and print the result to stdout.
+
+    db is the Database to query. If showLocation is true, the coordinates of
+    the registered address are looked up and printed, too.
+
+    Returns False if the ID is not in the database and True otherwise.
+    """
+
+    if not db.contains_id(id):
+        print("No such ID in the database.")
+        return False
+    print("ID %d found in the database." % id)
+
+    dvr = db.get_dvr(id)
+    if dvr is None:
+        print("  No DVR exists for this ID.")
+        # The dataset exists, it merely has no DVR. A bare return would
+        # yield None, which the caller treats like a failed query.
+        return True
+
+    reg = db.get_registration(id)
+    if reg is None:
+        print("  No registration exists.")
+        return True
+
+    # A registration is the tuple (id, name, address).
+    name = reg[1]
+    address = reg[2]
+
+    print("  DVR:        ", dvr)
+    print("  Name:       ", name)
+    print("  Address:    ", address)
+
+    if showLocation:
+        loc = queryLocation(address)
+        print("  Coordinates:", loc)
+
+    purposes = db.get_purposes(id)
+    if not purposes:
+        print("  No purposes known.")
+        return True
+
+    # A purpose is the tuple (id, num, purpose, date, status).
+    for purp in purposes:
+        print("  Purpose %d:" % purp[1])
+        print("    Text:  ", purp[2])
+        print("    Date:  ", purp[3])
+        print("    Status:", purp[4])
+    return True
+
+def processAdd(db):
+    """Read data from stdin, parse data, and add to database.
+
+    The data is expected to be the HTML page of a single dataset. Its ID is
+    taken from the action attribute of the first form of the page. If the
+    page contains a DVR number then the DVR, the registration (name and
+    address) and all purposes are added to db. Otherwise the ID is added
+    without a DVR.
+
+    Nothing is added if no ID can be found or if the ID is in the database
+    already. The whole page is parsed before anything is written, so a page
+    that cannot be parsed raises an exception without leaving a partial
+    dataset behind.
+    """
+
+    htmldata = sys.stdin.read()
+    # Name the parser explicitly. Otherwise bs4 picks whatever parser is
+    # installed, so the result may differ between machines.
+    bs = bs4.BeautifulSoup(htmldata, "html.parser")
+    b = bs.body
+
+    #bs.body.find('form', attrs = {'id' : 'aspnetForm'}).
+    form = b.find('form') if b is not None else None
+    idattr = form.get('action') if form is not None else None
+    # Require at least one digit: an empty match cannot be converted to int.
+    idmatch = re.match(r'.*Id=([0-9]+)', idattr or "")
+    if not idmatch:
+        print("Could not find ID.")
+        return
+    ident = int(idmatch.group(1))
+
+    # Adding an ID twice violates the primary key of dvrtable. Skip known
+    # IDs such that a dump can be processed repeatedly.
+    if db.contains_id(ident):
+        print("ID %d is already in the database." % (ident,))
+        return
+
+    dvrtagid = 'ctl00_ContentHolder_AuftraggeberDaten_tempDVRNr_labelDVRNr'
+    dvrtag = b.find(id=dvrtagid)
+    if dvrtag is None:
+        print("No DVR for ID %d." % (ident,))
+        db.add_dvr(ident)
+        return
+    dvr = int(dvrtag.text)
+
+    nameid = 'ctl00_ContentHolder_AuftraggeberDaten_tempBez_labelBez'
+    name = b.find(id=nameid).text
+    addressid = 'ctl00_ContentHolder_AuftraggeberDaten_tempAdr_labelAdr'
+    address = b.find(id=addressid).text
+
+    purposes = []
+    ptableid = 'ctl00_ContentHolder_Datenanwendungen_GridDatenanwendungen'
+    ptable = b.find(id=ptableid)
+    if ptable is not None:
+        for row in ptable.find_all('tr'):
+            # Rows without a class attribute are skipped -- presumably
+            # these are the header rows; verify against the page.
+            if row.get('class') is None:
+                continue
+            cols = row.find_all('td')
+            # The second column has the form "<prefix>/<num>".
+            num = int(cols[1].text.split('/')[1])
+            purposes.append((num, cols[2].text, cols[3].text, cols[4].text))
+
+    # Parsing succeeded; now write the dataset.
+    print("DVR is %d for ID %d." % (dvr, ident))
+    db.add_dvr(ident, dvr)
+    db.add_registration(ident, name, address)
+    for num, purpose, date, status in purposes:
+        db.add_purpose(ident, num, purpose, date, status)
+
+
+def main(argv):
+    """Parse the command line arguments argv and run the requested actions.
+
+    argv is the list of arguments without the program name. Returns the
+    exit status of the program: os.EX_USAGE for invalid arguments, 1 if a
+    query (-q) failed and os.EX_OK otherwise.
+    """
+
+    add = False
+    dbfn = None
+    query = None
+    showLocation = False
+
+    try:
+        opts, args = getopt.getopt(argv, "ad:hgq:")
+    except getopt.GetoptError as e:
+        print("Error parsing arguments:", e)
+        usage()
+        return os.EX_USAGE
+
+    # getopt only returns options that are part of the option string, so
+    # there is no need for a branch for unknown options.
+    for opt, arg in opts:
+        if opt == "-a":
+            add = True
+        elif opt == "-d":
+            dbfn = arg
+        elif opt == "-h":
+            usage()
+            return os.EX_OK
+        elif opt == "-g":
+            showLocation = True
+        elif opt == "-q":
+            try:
+                query = int(arg)
+            except ValueError:
+                print("Invalid ID '%s'." % (arg,))
+                return os.EX_USAGE
+
+    if dbfn is None:
+        print("No database given.")
+        return os.EX_USAGE
+
+    db = Database(dbfn)
+
+    if add:
+        processAdd(db)
+
+    if query is not None:
+        success = processQuery(db, query, showLocation)
+        if not success:
+            return 1
+
+    return os.EX_OK
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/filldb.sh b/filldb.sh
new file mode 100755 (executable)
index 0000000..a2be6c4
--- /dev/null
+++ b/filldb.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+#
+# Feed every downloaded page in dump/ to dvr-db.py, which adds it to the
+# database all.sqlite.
+
+# The file name is passed to the inner shell as the argument $1 instead of
+# being pasted into the command string. Hence, names with special
+# characters cannot be misread as shell code, and it does not matter whether
+# find replaces {} inside of a larger argument.
+find dump/ -name "*.gz" -exec sh -c 'gzip -dc "$1" | ./dvr-db.py -d all.sqlite -a' sh {} \;
+