import urllib.parse
import urllib.error
import json
+import gzip
class Database:
{0} -h
OPTIONS:
- -a Read webservers response from stdin and add data to the database.
+ -a PATH Read webservers response and add data to the database. If PATH is
+ "-" then read from stdin. If PATH is a file, read from file. If
+ file ends with ".gz" then transparently decompress it. If PATH is
+ a directory, read all files ending with ".html" or ".html.gz".
-d FILE Use given sqlite3 database.
-h Print this help text.
-q ID Query dataset for dataset of given ID. Exit status is 1 in case
print(" Status:", purp[4])
return True
-def processAdd(db):
- """Read data from stdin, parse data, and add to database."""
- htmldata = sys.stdin.read()
+def processAdd(db, source):
+ """Read data from source, parse data, and add to database."""
+
+ if source == "-":
+ htmldata = sys.stdin.read()
+ parseAndInsert(db, htmldata)
+ else:
+
+ if os.path.isfile(source):
+ print("Reading file '%s'..." % source)
+ if source.endswith(".gz"):
+ with gzip.GzipFile(source, "r") as f:
+ parseAndInsert(db, f.read())
+ else:
+ with open(source, "r") as f:
+ parseAndInsert(db, f.read())
+
+ elif os.path.isdir(source):
+ for (dirpath, dirnames, filenames) in os.walk(source):
+ for f in filenames:
+ if f.endswith(".html") or f.endswith(".html.gz"):
+ p = os.path.join(dirpath,f)
+ processAdd(db, p)
+ else:
+ print("What the hell is '%s'?" % source, file=sys.stderr)
+ sys.exit(os.EX_IOERR)
+
+
+def parseAndInsert(db, htmldata):
+ """Parse html data and insert dataset."""
+
bs = bs4.BeautifulSoup(htmldata)
b = bs.body
if __name__ == "__main__":
- add = False
+ add = None
dbfn = None
query = None
showLocation = False
try:
- opts, args = getopt.getopt(sys.argv[1:], "ad:hgq:")
+ opts, args = getopt.getopt(sys.argv[1:], "a:d:hgq:")
for opt, arg in opts:
if opt == "-a":
- add = True
+ add = arg
elif opt == "-d":
dbfn = arg
elif opt == "-h":
db = Database(dbfn)
- if add:
- processAdd(db)
+ if add is not None:
+ processAdd(db, add)
if query is not None:
success = processQuery(db, query, showLocation)