From: Joerg Jaspert Date: Sat, 16 Aug 2008 02:15:16 +0000 (+0200) Subject: Merge commit 'mhy/checksums' X-Git-Url: https://git.decadent.org.uk/gitweb/?a=commitdiff_plain;h=cce84a5c676e658c49f4d1fa754e272e26f979ae;hp=a7040b0e856c5d07f6e8d38e8559356f6fd9fe79;p=dak.git Merge commit 'mhy/checksums' * commit 'mhy/checksums': expand check_archive.py to cope with extra checksums add changelog to keep Ganneff happy add sha1sum and sha256sum columns fix typos make process accepted not die with old .dak files an attempt to move the functionality into a module let's store the known_hashes information centrally first attempt at bodging in support for sha1sum and sha256sum --- diff --git a/ChangeLog b/ChangeLog index d948974e..770fe5ff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2008-08-15 Mark Hymers + + * setup/init_pool.sql, dak/check_archive.py, dak/decode_dot_dak.py, + dak/process_accepted.py, dak/process_unchecked.py, daklib/database.py, + daklib/queue.py, daklib/utils.py: Attempt to add sha1sum and + sha256sums into the database. The complication is that we have to + keep backwards compatibility with the .dak files already in existence. + Note that import_archive hasn't been hacked to deal with this yet. + 2008-08-14 Joerg Jaspert * config/debian/cron.dinstall: Added the i18n retrieval of package @@ -61,7 +70,6 @@ 2008-08-07 Stephen Gran * Drop use of exec to eval variable interpolation - 2008-08-07 Joerg Jaspert * dak/process_accepted.py (install): Error out with the new diff --git a/dak/check_archive.py b/dak/check_archive.py index 2d9321d6..93cc832c 100755 --- a/dak/check_archive.py +++ b/dak/check_archive.py @@ -52,7 +52,7 @@ Run various sanity checks of the archive and/or database. 
The following MODEs are available: - md5sums - validate the md5sums stored in the database + checksums - validate the checksums stored in the database files - check files in the database against what's in the archive dsc-syntax - validate the syntax of .dsc files in the archive missing-overrides - check for missing overrides @@ -194,16 +194,18 @@ SELECT l.path, f.filename FROM files f, dsc_files df, location l WHERE df.source ################################################################################ -def check_md5sums(): +def check_checksums(): print "Getting file information from database..." - q = projectB.query("SELECT l.path, f.filename, f.md5sum, f.size FROM files f, location l WHERE f.location = l.id") + q = projectB.query("SELECT l.path, f.filename, f.md5sum, f.sha1sum, f.sha256sum, f.size FROM files f, location l WHERE f.location = l.id") ql = q.getresult() - print "Checking file md5sums & sizes..." + print "Checking file checksums & sizes..." for i in ql: filename = os.path.abspath(i[0] + i[1]) db_md5sum = i[2] - db_size = int(i[3]) + db_sha1sum = i[3] + db_sha256sum = i[4] + db_size = int(i[5]) try: f = utils.open_file(filename) except: @@ -215,6 +217,18 @@ def check_md5sums(): utils.warn("**WARNING** md5sum mismatch for '%s' ('%s' [current] vs. '%s' [db])." % (filename, md5sum, db_md5sum)) if size != db_size: utils.warn("**WARNING** size mismatch for '%s' ('%s' [current] vs. '%s' [db])." % (filename, size, db_size)) + # Until the main database is filled, we need to not spit 500,000 warnings + # every time we scan the archive. Yet another hack (TM) which can go away + # once this is all working + if db_sha1sum is not None and db_sha1sum != '': + sha1sum = apt_pkg.sha1sum(f) + if sha1sum != db_sha1sum: + utils.warn("**WARNING** sha1sum mismatch for '%s' ('%s' [current] vs. '%s' [db])." 
% (filename, sha1sum, db_sha1sum)) + + if db_sha256sum is not None and db_sha256sum != '': + sha256sum = apt_pkg.sha256sum(f) + if sha256sum != db_sha256sum: + utils.warn("**WARNING** sha256sum mismatch for '%s' ('%s' [current] vs. '%s' [db])." % (filename, sha256sum, db_sha256sum)) print "Done." @@ -425,8 +439,8 @@ def main (): projectB = pg.connect(Cnf["DB::Name"], Cnf["DB::Host"], int(Cnf["DB::Port"])) database.init(Cnf, projectB) - if mode == "md5sums": - check_md5sums() + if mode == "checksums": + check_checksums() elif mode == "files": check_files() elif mode == "dsc-syntax": diff --git a/dak/decode_dot_dak.py b/dak/decode_dot_dak.py index 00bf8e7d..7ea342bd 100644 --- a/dak/decode_dot_dak.py +++ b/dak/decode_dot_dak.py @@ -101,9 +101,9 @@ def main(): for f in files.keys(): print " %s:" % (f) for i in [ "package", "version", "architecture", "type", "size", - "md5sum", "component", "location id", "source package", - "source version", "maintainer", "dbtype", "files id", - "new", "section", "priority", "pool name" ]: + "md5sum", "sha1sum", "sha256sum", "component", "location id", + "source package", "source version", "maintainer", "dbtype", + "files id", "new", "section", "priority", "pool name" ]: if files[f].has_key(i): print " %s: %s" % (i.capitalize(), files[f][i]) del files[f][i] diff --git a/dak/process_accepted.py b/dak/process_accepted.py index 94c2ea30..b28d9f9b 100755 --- a/dak/process_accepted.py +++ b/dak/process_accepted.py @@ -274,6 +274,10 @@ def install (): # Begin a transaction; if we bomb out anywhere between here and the COMMIT WORK below, the DB will not be changed. 
projectB.query("BEGIN WORK") + # Check the hashes are all present: HACK: Can go away once all dak files + # are known to be newer than the shasum changes + utils.ensure_hashes(Upload) + # Add the .dsc file to the DB for file in files.keys(): if files[file]["type"] == "dsc": @@ -291,7 +295,7 @@ def install (): dsc_component = files[file]["component"] dsc_location_id = files[file]["location id"] if not files[file].has_key("files id") or not files[file]["files id"]: - files[file]["files id"] = database.set_files_id (filename, files[file]["size"], files[file]["md5sum"], dsc_location_id) + files[file]["files id"] = database.set_files_id (filename, files[file]["size"], files[file]["md5sum"], files[file]["sha1sum"], files[file]["sha256sum"], dsc_location_id) projectB.query("INSERT INTO source (source, version, maintainer, changedby, file, install_date, sig_fpr) VALUES ('%s', '%s', %d, %d, %d, '%s', %s)" % (package, version, maintainer_id, changedby_id, files[file]["files id"], install_date, fingerprint_id)) @@ -307,10 +311,10 @@ def install (): # files id is stored in dsc_files by check_dsc(). 
files_id = dsc_files[dsc_file].get("files id", None) if files_id == None: - files_id = database.get_files_id(filename, dsc_files[dsc_file]["size"], dsc_files[dsc_file]["md5sum"], dsc_location_id) + files_id = database.get_files_id(filename, dsc_files[dsc_file]["size"], dsc_files[dsc_file]["md5sum"], files[file]["sha1sum"], files[file]["sha256sum"], dsc_location_id) # FIXME: needs to check for -1/-2 and or handle exception if files_id == None: - files_id = database.set_files_id (filename, dsc_files[dsc_file]["size"], dsc_files[dsc_file]["md5sum"], dsc_location_id) + files_id = database.set_files_id (filename, dsc_files[dsc_file]["size"], dsc_files[dsc_file]["md5sum"], files[file]["sha1sum"], files[file]["sha256sum"], dsc_location_id) projectB.query("INSERT INTO dsc_files (source, file) VALUES (currval('source_id_seq'), %d)" % (files_id)) # Add the src_uploaders to the DB @@ -388,16 +392,18 @@ def install (): # if changes["architecture"].has_key("source") and orig_tar_id and \ orig_tar_location != "legacy" and orig_tar_location != dsc_location_id: - q = projectB.query("SELECT l.path, f.filename, f.size, f.md5sum FROM files f, location l WHERE f.id = %s AND f.location = l.id" % (orig_tar_id)) + q = projectB.query("SELECT l.path, f.filename, f.size, f.md5sum, f.sha1sum, f.sha256sum FROM files f, location l WHERE f.id = %s AND f.location = l.id" % (orig_tar_id)) ql = q.getresult()[0] old_filename = ql[0] + ql[1] file_size = ql[2] file_md5sum = ql[3] + file_sha1sum = ql[4] + file_sha256sum = ql[5] new_filename = utils.poolify(changes["source"], dsc_component) + os.path.basename(old_filename) new_files_id = database.get_files_id(new_filename, file_size, file_md5sum, dsc_location_id) if new_files_id == None: utils.copy(old_filename, Cnf["Dir::Pool"] + new_filename) - new_files_id = database.set_files_id(new_filename, file_size, file_md5sum, dsc_location_id) + new_files_id = database.set_files_id(new_filename, file_size, file_md5sum, file_sha1sum, file_sha256sum, 
dsc_location_id) projectB.query("UPDATE dsc_files SET file = %s WHERE source = %s AND file = %s" % (new_files_id, source_id, orig_tar_id)) # Install the files into the pool diff --git a/dak/process_unchecked.py b/dak/process_unchecked.py index f2efe8c0..3354a577 100755 --- a/dak/process_unchecked.py +++ b/dak/process_unchecked.py @@ -630,11 +630,11 @@ def check_files(): # Check the md5sum & size against existing files (if any) files[f]["pool name"] = utils.poolify (changes["source"], files[f]["component"]) - files_id = database.get_files_id(files[f]["pool name"] + f, files[f]["size"], files[f]["md5sum"], files[f]["location id"]) + files_id = database.get_files_id(files[f]["pool name"] + f, files[f]["size"], files[f]["md5sum"], files[f]["sha1sum"], files[f]["sha256sum"], files[f]["location id"]) if files_id == -1: reject("INTERNAL ERROR, get_files_id() returned multiple matches for %s." % (f)) elif files_id == -2: - reject("md5sum and/or size mismatch on existing copy of %s." % (f)) + reject("md5sum, sha1sum, sha256sum and/or size mismatch on existing copy of %s." 
% (f)) files[f]["files id"] = files_id # Check for packages that have moved from one component to another @@ -777,6 +777,8 @@ def check_dsc(): files[orig_tar_gz] = {} files[orig_tar_gz]["size"] = os.stat(orig_tar_gz)[stat.ST_SIZE] files[orig_tar_gz]["md5sum"] = dsc_files[orig_tar_gz]["md5sum"] + files[orig_tar_gz]["sha1sum"] = dsc_files[orig_tar_gz]["sha1sum"] + files[orig_tar_gz]["sha256sum"] = dsc_files[orig_tar_gz]["sha256sum"] files[orig_tar_gz]["section"] = files[dsc_filename]["section"] files[orig_tar_gz]["priority"] = files[dsc_filename]["priority"] files[orig_tar_gz]["component"] = files[dsc_filename]["component"] @@ -924,77 +926,13 @@ def check_hashes (): else: format = int(float(format[0])), 0 - check_hash(".changes", files, "md5sum", apt_pkg.md5sum) - check_hash(".dsc", dsc_files, "md5sum", apt_pkg.md5sum) + utils.check_hash(".changes", files, "md5sum", apt_pkg.md5sum) + utils.check_hash(".dsc", dsc_files, "md5sum", apt_pkg.md5sum) - if format >= (1,8): - hashes = [("sha1", apt_pkg.sha1sum), - ("sha256", apt_pkg.sha256sum)] - else: - hashes = [] - - for x in changes: - if x.startswith("checksum-"): - h = x.split("-",1)[1] - if h not in dict(hashes): - reject("Unsupported checksum field in .changes" % (h)) - - for x in dsc: - if x.startswith("checksum-"): - h = x.split("-",1)[1] - if h not in dict(hashes): - reject("Unsupported checksum field in .dsc" % (h)) - - for h,f in hashes: - try: - fs = utils.build_file_list(changes, 0, "checksums-%s" % h, h) - check_hash(".changes %s" % (h), fs, h, f, files) - except NoFilesFieldError: - reject("No Checksums-%s: field in .changes" % (h)) - except UnknownFormatError, format: - reject("%s: unknown format of .changes" % (format)) - except ParseChangesError, line: - reject("parse error for Checksums-%s in .changes, can't grok: %s." 
% (h, line)) - - if "source" not in changes["architecture"]: continue - - try: - fs = utils.build_file_list(dsc, 1, "checksums-%s" % h, h) - check_hash(".dsc %s" % (h), fs, h, f, dsc_files) - except UnknownFormatError, format: - reject("%s: unknown format of .dsc" % (format)) - except NoFilesFieldError: - reject("No Checksums-%s: field in .dsc" % (h)) - except ParseChangesError, line: - reject("parse error for Checksums-%s in .dsc, can't grok: %s." % (h, line)) - -################################################################################ - -def check_hash (where, lfiles, key, testfn, basedict = None): - if basedict: - for f in basedict.keys(): - if f not in lfiles: - reject("%s: no %s checksum" % (f, key)) - - for f in lfiles.keys(): - if basedict and f not in basedict: - reject("%s: extraneous entry in %s checksums" % (f, key)) - - try: - file_handle = utils.open_file(f) - except CantOpenError: - continue - - # Check hash - if testfn(file_handle) != lfiles[f][key]: - reject("%s: %s check failed." 
% (f, key)) - file_handle.close() - # Check size - actual_size = os.stat(f)[stat.ST_SIZE] - size = int(lfiles[f]["size"]) - if size != actual_size: - reject("%s: actual file size (%s) does not match size (%s) in %s" - % (f, actual_size, size, where)) + # This is stupid API, but it'll have to do for now until + # we actually have proper abstraction + for m in utils.ensure_hashes(Upload): + reject(m) ################################################################################ diff --git a/daklib/database.py b/daklib/database.py index 5c362604..b2b55a78 100755 --- a/daklib/database.py +++ b/daklib/database.py @@ -317,7 +317,7 @@ def get_or_set_fingerprint_id (fingerprint): ################################################################################ -def get_files_id (filename, size, md5sum, location_id): +def get_files_id (filename, size, md5sum, sha1sum, sha256sum, location_id): global files_id_cache cache_key = "%s_%d" % (filename, location_id) @@ -326,7 +326,7 @@ def get_files_id (filename, size, md5sum, location_id): return files_id_cache[cache_key] size = int(size) - q = projectB.query("SELECT id, size, md5sum FROM files WHERE filename = '%s' AND location = %d" % (filename, location_id)) + q = projectB.query("SELECT id, size, md5sum, sha1sum, sha256sum FROM files WHERE filename = '%s' AND location = %d" % (filename, location_id)) ql = q.getresult() if ql: if len(ql) != 1: @@ -334,7 +334,9 @@ def get_files_id (filename, size, md5sum, location_id): ql = ql[0] orig_size = int(ql[1]) orig_md5sum = ql[2] - if orig_size != size or orig_md5sum != md5sum: + orig_sha1sum = ql[3] + orig_sha256sum = ql[4] + if orig_size != size or orig_md5sum != md5sum or orig_sha1sum != sha1sum or orig_sha256sum != sha256sum: return -2 files_id_cache[cache_key] = ql[0] return files_id_cache[cache_key] @@ -360,12 +362,12 @@ def get_or_set_queue_id (queue): ################################################################################ -def set_files_id (filename, size, md5sum, 
location_id): +def set_files_id (filename, size, md5sum, sha1sum, sha256sum, location_id): global files_id_cache - projectB.query("INSERT INTO files (filename, size, md5sum, location) VALUES ('%s', %d, '%s', %d)" % (filename, long(size), md5sum, location_id)) + projectB.query("INSERT INTO files (filename, size, md5sum, sha1sum, sha256sum, location) VALUES ('%s', %d, '%s', %d)" % (filename, long(size), md5sum, sha1sum, sha256sum, location_id)) - return get_files_id (filename, size, md5sum, location_id) + return get_files_id (filename, size, md5sum, sha1sum, sha256sum, location_id) ### currval has issues with postgresql 7.1.3 when the table is big ### it was taking ~3 seconds to return on auric which is very Not diff --git a/daklib/queue.py b/daklib/queue.py index 08b8b5c6..40960b90 100755 --- a/daklib/queue.py +++ b/daklib/queue.py @@ -236,9 +236,10 @@ class Upload: for file_entry in files.keys(): d_files[file_entry] = {} for i in [ "package", "version", "architecture", "type", "size", - "md5sum", "component", "location id", "source package", - "source version", "maintainer", "dbtype", "files id", - "new", "section", "priority", "othercomponents", + "md5sum", "sha1sum", "sha256sum", "component", + "location id", "source package", "source version", + "maintainer", "dbtype", "files id", "new", + "section", "priority", "othercomponents", "pool name", "original component" ]: if files[file_entry].has_key(i): d_files[file_entry][i] = files[file_entry][i] diff --git a/daklib/utils.py b/daklib/utils.py index ec82782f..34154ce1 100755 --- a/daklib/utils.py +++ b/daklib/utils.py @@ -55,6 +55,10 @@ default_apt_config = "/etc/dak/apt.conf" alias_cache = None key_uid_email_cache = {} +# (hashname, function, earliest_changes_version) +known_hashes = [("sha1", apt_pkg.sha1sum, (1, 8)), + ("sha256", apt_pkg.sha256sum, (1, 8))] + ################################################################################ def open_file(filename, mode='r'): @@ -207,6 +211,109 @@ The rules for 
(signing_rules == 1)-mode are: ################################################################################ +def create_hash (lfiles, key, testfn, basedict = None): + rejmsg = [] + for f in lfiles.keys(): + try: + file_handle = open_file(f) + except CantOpenError: + rejmsg.append("Could not open file %s for checksumming" % (f)) + + # Check hash + basedict[f]['%ssum' % key] = testfn(file_handle) + file_handle.close() + + return rejmsg + +################################################################################ + +def check_hash (where, lfiles, key, testfn, basedict = None): + rejmsg = [] + if basedict: + for f in basedict.keys(): + if f not in lfiles: + rejmsg.append("%s: no %s checksum" % (f, key)) + + for f in lfiles.keys(): + if basedict and f not in basedict: + rejmsg.append("%s: extraneous entry in %s checksums" % (f, key)) + + try: + file_handle = open_file(f) + except CantOpenError: + continue + + # Check hash + if testfn(file_handle) != lfiles[f][key]: + rejmsg.append("%s: %s check failed." 
% (f, key)) + file_handle.close() + # Store the hashes for later use + basedict[f]['%ssum' % key] = lfiles[f][key] + # Check size + actual_size = os.stat(f)[stat.ST_SIZE] + size = int(lfiles[f]["size"]) + if size != actual_size: + rejmsg.append("%s: actual file size (%s) does not match size (%s) in %s" + % (f, actual_size, size, where)) + + return rejmsg + +################################################################################ + +def ensure_hashes(Upload): + rejmsg = [] + for x in Upload.changes: + if x.startswith("checksum-"): + h = x.split("-",1)[1] + if h not in dict(known_hashes): + rejmsg.append("Unsupported checksum field in .changes" % (h)) + + for x in Upload.dsc: + if x.startswith("checksum-"): + h = x.split("-",1)[1] + if h not in dict(known_hashes): + rejmsg.append("Unsupported checksum field in .dsc" % (h)) + + # We have to calculate the hash if we have an earlier changes version than + # the hash appears in rather than require it exist in the changes file + # I hate backwards compatibility + for h,f,v in known_hashes: + try: + fs = build_file_list(Upload.changes, 0, "checksums-%s" % h, h) + if format < v: + for m in create_hash(fs, h, f, Upload.files): + rejmsg.append(m) + else: + for m in check_hash(".changes %s" % (h), fs, h, f, Upload.files): + rejmsg.append(m) + except NoFilesFieldError: + rejmsg.append("No Checksums-%s: field in .changes" % (h)) + except UnknownFormatError, format: + rejmsg.append("%s: unknown format of .changes" % (format)) + except ParseChangesError, line: + rejmsg.append("parse error for Checksums-%s in .changes, can't grok: %s." 
% (h, line)) + + if "source" not in Upload.changes["architecture"]: continue + + try: + fs = build_file_list(Upload.dsc, 1, "checksums-%s" % h, h) + if format < v: + for m in create_hash(fs, h, f, Upload.dsc_files): + rejmsg.append(m) + else: + for m in check_hash(".dsc %s" % (h), fs, h, f, Upload.dsc_files): + rejmsg.append(m) + except UnknownFormatError, format: + rejmsg.append("%s: unknown format of .dsc" % (format)) + except NoFilesFieldError: + rejmsg.append("No Checksums-%s: field in .dsc" % (h)) + except ParseChangesError, line: + rejmsg.append("parse error for Checksums-%s in .dsc, can't grok: %s." % (h, line)) + + return rejmsg + +################################################################################ + # Dropped support for 1.4 and ``buggy dchanges 3.4'' (?!) compared to di.pl def build_file_list(changes, is_a_dsc=0, field="files", hashname="md5sum"): diff --git a/setup/init_pool.sql b/setup/init_pool.sql index 7a6e2a49..1e363940 100644 --- a/setup/init_pool.sql +++ b/setup/init_pool.sql @@ -70,6 +70,8 @@ CREATE TABLE files ( md5sum TEXT NOT NULL, location INT4 NOT NULL, -- REFERENCES location last_used TIMESTAMP, + sha1sum TEXT NOT NULL, + sha256sum TEXT NOT NULL, unique (filename, location) );