From f1483df40ba6c8f9f03825419a75fd55aa8dbe67 Mon Sep 17 00:00:00 2001 From: Stefan Huber Date: Wed, 15 Jan 2014 14:15:40 +0100 Subject: [PATCH] Add support for reading files and directories. --- dvr-db.py | 50 +++++++++++++++++++++++++++++++++++++++++--------- filldb.sh | 2 +- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/dvr-db.py b/dvr-db.py index 21f8a18..c46aec1 100755 --- a/dvr-db.py +++ b/dvr-db.py @@ -11,6 +11,7 @@ import urllib.request import urllib.parse import urllib.error import json +import gzip class Database: @@ -111,7 +112,10 @@ USAGE: {0} -h OPTIONS: - -a Read webservers response from stdin and add data to the database. + -a PATH Read webservers response and add data to the database. If PATH is + "-" then read from stdin. If PATH is a file, read from file. If + file ends with ".gz" then transparently decompress it. If PATH is + a directory, read all files ending with ".html" or ".html.gz". -d FILE Use given sqlite3 database. -h Print this help text. -q ID Query dataset for dataset of given ID. Exit status is 1 in case @@ -177,10 +181,38 @@ def processQuery(db, id, showLocation=False): print(" Status:", purp[4]) return True -def processAdd(db): - """Read data from stdin, parse data, and add to database.""" - htmldata = sys.stdin.read() +def processAdd(db, source): + """Read data from source, parse data, and add to database.""" + + if source == "-": + htmldata = sys.stdin.read() + parseAndInsert(db, htmldata) + else: + + if os.path.isfile(source): + print("Reading file '%s'..." % source) + if source.endswith(".gz"): + with gzip.GzipFile(source, "r") as f: + parseAndInsert(db, f.read()) + else: + with open(source, "r") as f: + parseAndInsert(db, f.read()) + + elif os.path.isdir(source): + for (dirpath, dirnames, filenames) in os.walk(source): + for f in filenames: + if f.endswith(".html") or f.endswith(".html.gz"): + p = os.path.join(dirpath,f) + processAdd(db, p) + else: + print("What the hell is '%s'?" 
% source, file=sys.stderr) + sys.exit(os.EX_IOERR) + + +def parseAndInsert(db, htmldata): + """Parse html data and insert dataset.""" + bs = bs4.BeautifulSoup(htmldata) b = bs.body @@ -234,17 +266,17 @@ def processAdd(db): if __name__ == "__main__": - add = False + add = None dbfn = None query = None showLocation = False try: - opts, args = getopt.getopt(sys.argv[1:], "ad:hgq:") + opts, args = getopt.getopt(sys.argv[1:], "a:d:hgq:") for opt, arg in opts: if opt == "-a": - add = True + add = arg elif opt == "-d": dbfn = arg elif opt == "-h": @@ -269,8 +301,8 @@ if __name__ == "__main__": db = Database(dbfn) - if add: - processAdd(db) + if add is not None: + processAdd(db, add) if query is not None: success = processQuery(db, query, showLocation) diff --git a/filldb.sh b/filldb.sh index a231e8c..22bb6ea 100755 --- a/filldb.sh +++ b/filldb.sh @@ -3,4 +3,4 @@ rm -f failed.txt rm -f all.sqlite -find dump/ -name "*.gz" -exec sh -c 'echo "Adding file {}"; zcat {} | ./dvr-db.py -d all.sqlite -a || echo {} >> failed.txt' \; +find dump/ -name "*.gz" -exec sh -c './dvr-db.py -d all.sqlite -a {} || echo {} >> failed.txt' \; -- 2.39.5