#!/usr/bin/env python
"""Correct stale ``content-length`` values stored in a crawl database.

The ``responses`` table keeps a copy of each HTTP response header next to
the response body (``data``).  When new data is uploaded the stored header
is not refreshed, so its ``content-length`` can disagree with the body that
is actually held.  This script walks the table and rewrites every header
whose advertised length differs from ``LENGTH(data)``.

Usage: ``update_content_length.py [-d crawl_db.db]``.  Positional
arguments are accepted but ignored.

Provenance: added as host-tools/mirror/update_content_length.py by
Holger Hans Peter Freyther on 2009-11-24, commit e2c80e5
("[mirror] Add script to update the http header").
"""

import optparse
import re
import sqlite3
import sys

# One "content-length" header line.  Group 1 is the field name plus the
# separator exactly as stored, group 2 is the value up to (not including)
# the line terminator.  Anchoring at the start of a line keeps fields such
# as "x-content-length" from matching; HTTP field names are
# case-insensitive, hence IGNORECASE.
_CONTENT_LENGTH_RE = re.compile(
    r"^(content-length:[ \t]*)([^\r\n]*)", re.IGNORECASE | re.MULTILINE
)


def parse(argv=None):
    """Parse the command line.

    argv -- argument list including the program name; defaults to
            ``sys.argv``.

    Returns the ``(options, positional_args)`` pair produced by optparse.
    ``options.db_name`` is the database path ("crawl_db.db" by default).
    """
    parser = optparse.OptionParser(version="Update the HTTP headers",
                                   usage="%prog [options] URL")
    parser.add_option("-d", "--db", help="The crawl database to be used",
                      action="store", dest="db_name", default="crawl_db.db")
    return parser.parse_args(sys.argv if argv is None else argv)


def to_num(length):
    """Return *length* as an int, mapping NULL/empty values to 0.

    ``LENGTH(data)`` is NULL when ``data`` is NULL, which sqlite3 reports
    as ``None``.
    """
    if not length:
        return 0
    return int(length)


def fix_header(header, body_length):
    """Return *header* with its content-length value set to *body_length*.

    header      -- the stored response header as text.
    body_length -- the real size of the stored body.

    Returns ``None`` when nothing has to be written: either the header has
    no content-length line at all, or the advertised value already equals
    *body_length*.  A value that cannot be parsed as an integer is treated
    as stale and replaced.  Only the value is rewritten; the field name and
    its spacing are kept as stored.
    """
    match = _CONTENT_LENGTH_RE.search(header)
    if match is None:
        return None

    try:
        advertised = int(match.group(2))
    except ValueError:
        advertised = None
    if advertised == body_length:
        return None

    return header[:match.start(2)] + str(body_length) + header[match.end(2):]


def update_content_lengths(connection):
    """Rewrite every stale content-length header in ``responses``.

    connection -- an open sqlite3 connection to the crawl database.

    The corrections are committed before returning.  Returns the number of
    rows whose header needed a rewrite.
    """
    # NOTE(review): LENGTH() counts characters, not bytes, when data is
    # stored as TEXT -- confirm the column holds BLOBs.
    cursor = connection.execute(
        "SELECT url, header, LENGTH(data) from responses")

    # Collect first and write afterwards so the table is not modified while
    # the SELECT above is still being stepped through.
    pending = []
    for url, header, data_length in cursor:
        # NOTE(review): str() mirrors the original script; under Python 3 a
        # BLOB header arrives as bytes and would need decoding instead --
        # confirm the column type before running there.
        new_header = fix_header(str(header), to_num(data_length))
        if new_header is not None:
            pending.append((new_header, url))

    # Match the URL exactly: with LIKE, "%" and "_" inside a URL act as
    # wildcards and would overwrite the headers of unrelated rows.
    connection.executemany(
        "UPDATE responses SET header = ? WHERE url = ?", pending)
    connection.commit()
    return len(pending)


def main(argv=None):
    """Script entry point: open the database and fix its headers."""
    opts, _ = parse(argv)
    connection = sqlite3.connect(opts.db_name)
    try:
        update_content_lengths(connection)
    finally:
        connection.close()


if __name__ == "__main__":
    main()