#!/usr/bin/env python

# Populate the DB
# Copyright (C) 2000 James Troup
# $Id: neve,v 1.1.1.1 2000-11-24 00:20:09 troup Exp $

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

################################################################################

# 04:36| elmo: you're making me waste 5 seconds per architecture!!!!!! YOU BASTARD!!!!!

################################################################################

# This code is a horrible mess for two reasons:

#   (o) For Debian's usage, it's doing something like 160k INSERTs,
#       even on auric, that makes the program unusable unless we get
#       involved in all sorts of silly optimization games (local dicts
#       to avoid redundant SELECTS, using COPY FROM rather than INSERTS
#       etc.)

#   (o) It's very site specific, because I don't expect to use this
#       script again in a hurry, and I don't want to spend any more time
#       on it than absolutely necessary.

################################################################################

import commands
import os
import re
import string
import sys
import tempfile

import apt_pkg
import pg

import db_access
import utils

################################################################################

# Matches the "binary-<arch>" component of a path
re_arch_from_filename = re.compile(r"binary-[^/]+")

################################################################################

# Configuration and database handles; both are set up by main()
Cnf = None
projectB = None

# In-memory caches, used to avoid writing the same row twice
files_id_cache = {}
source_cache = {}
arch_all_cache = {}
binary_cache = {}

# Serial counters and output files for the source-side tables; the
# output files are opened by main()
files_id_serial = 0
source_id_serial = 0
src_associations_id_serial = 0
dsc_files_id_serial = 0
files_query_cache = None
source_query_cache = None
src_associations_query_cache = None
dsc_files_query_cache = None
orig_tar_gz_cache = {}

# Serial counters and output files for the binary-side tables
binaries_id_serial = 0
binaries_query_cache = None
bin_associations_id_serial = 0
bin_associations_query_cache = None

# Maps "<source package>~<version>" to the source id, so binaries can
# be tied back to their source
source_cache_for_binaries = {}

################################################################################

# Prepares a filename or directory (s) to be file.filename by stripping any part of the location (sub) from it.
def poolify (s, sub):
    """Strip from the start of `s' whatever it shares with the end of `sub'.

    The longest suffix of `sub' that is also a prefix of `s' is removed
    from `s'.  If there is no such overlap, `s' is returned unchanged.
    """
    for i in range(len(sub)):
        tail = sub[i:]
        if s.startswith(tail):
            return s[len(tail):]
    return s

def exit_code_from_status (status):
    """Turn a wait()-style status into a non-zero process exit code.

    commands.getstatusoutput() reports the child's exit code in the high
    byte of the status, so passing the raw status to sys.exit() can be
    truncated to 0 (e.g. 256 -> 0).  Returns the child's exit code, or 1
    when that code is 0 (for instance because the child died on a signal).
    Only meant to be called with a status that indicates failure.
    """
    return ((status >> 8) & 0xff) or 1

# NOTE: the update_*() functions below build their SQL by plain string
# interpolation of values from the configuration file; a value that
# contains a single quote will break the statement.

def update_archives ():
    """Replace the contents of the `archive' table with the Archive
    entries of the configuration."""
    projectB.query("DELETE FROM archive")
    for archive in Cnf.SubTree("Archive").List():
        SubSec = Cnf.SubTree("Archive::%s" % (archive))
        projectB.query("INSERT INTO archive (name, origin_server, description) VALUES ('%s', '%s', '%s')"
                       % (archive, SubSec["OriginServer"], SubSec["Description"]))

def update_components ():
    """Replace the contents of the `component' table with the Component
    entries of the configuration."""
    projectB.query("DELETE FROM component")
    for component in Cnf.SubTree("Component").List():
        SubSec = Cnf.SubTree("Component::%s" % (component))
        projectB.query("INSERT INTO component (name, description, meets_dfsg) VALUES ('%s', '%s', '%s')"
                       % (component, SubSec["Description"], SubSec["MeetsDFSG"]))

def update_locations ():
    """Replace the contents of the `location' table with the Location
    entries of the configuration.

    A "legacy-mixed" location gets a single row without a component; any
    other location gets one row per configured component.
    """
    projectB.query("DELETE FROM location")
    for location in Cnf.SubTree("Location").List():
        SubSec = Cnf.SubTree("Location::%s" % (location))
        archive_id = db_access.get_archive_id(SubSec["archive"])
        location_type = SubSec.Find("type")
        if location_type == "legacy-mixed":
            projectB.query("INSERT INTO location (path, archive, type) VALUES ('%s', %d, '%s')"
                           % (location, archive_id, SubSec["type"]))
        else:
            for component in Cnf.SubTree("Component").List():
                component_id = db_access.get_component_id(component)
                projectB.query("INSERT INTO location (path, component, archive, type) VALUES ('%s', %d, %d, '%s')"
                               % (location, component_id, archive_id, SubSec["type"]))

def update_architectures ():
    """Replace the contents of the `architecture' table with the
    Architectures entries of the configuration."""
    projectB.query("DELETE FROM architecture")
    for arch in Cnf.SubTree("Architectures").List():
        projectB.query("INSERT INTO architecture (arch_string, description) VALUES ('%s', '%s')"
                       % (arch, Cnf["Architectures::%s" % (arch)]))

def update_suites ():
    """Replace the contents of the `suite' table with the Suite entries
    of the configuration and record each suite's architectures in
    `suite_architectures'."""
    projectB.query("DELETE FROM suite")
    for suite in Cnf.SubTree("Suite").List():
        SubSec = Cnf.SubTree("Suite::%s" % (suite))
        projectB.query("INSERT INTO suite (suite_name, version, origin, description) VALUES ('%s', '%s', '%s', '%s')"
                       % (suite.lower(), SubSec["Version"], SubSec["Origin"], SubSec["Description"]))
        for architecture in Cnf.SubTree("Suite::%s::Architectures" % (suite)).List():
            architecture_id = db_access.get_architecture_id (architecture)
            # currval() refers to the suite row inserted just above
            projectB.query("INSERT INTO suite_architectures (suite, architecture) VALUES (currval('suite_id_seq'), %d)"
                           % (architecture_id))

################################################################################

def get_or_set_files_id (filename, size, md5sum, location_id):
    """Return the id of the `files' row for the given file, allocating it
    if it hasn't been seen before.

    A new file gets the next serial number and a tab-separated row (id,
    filename, size, md5sum, location id) is appended to
    files_query_cache.  Files are identified by the combination of all
    four arguments; `size' and `md5sum' are expected to be strings.
    """
    global files_id_cache, files_id_serial, files_query_cache

    cache_key = '~'.join((filename, size, md5sum, repr(location_id)))
    if cache_key not in files_id_cache:
        files_id_serial = files_id_serial + 1
        files_query_cache.write("%d\t%s\t%s\t%s\t%d\n" % (files_id_serial, filename, size, md5sum, location_id))
        files_id_cache[cache_key] = files_id_serial

    return files_id_cache[cache_key]

################################################################################

def process_sources (location, filename, suite, component, archive):
    """Read the (uncompressed) Sources file `filename' and write rows for
    the `files', `source', `dsc_files' and `src_associations' tables to
    the corresponding *_query_cache files.

    Every source in `stable' is additionally associated with `testing'.
    If `filename' can't be opened a warning is printed and nothing else
    happens.  A stanza for a not-yet-seen source that lists no .dsc file
    is skipped with a warning.
    """
    global source_cache, source_query_cache, src_associations_query_cache, dsc_files_query_cache, source_id_serial, src_associations_id_serial, dsc_files_id_serial, source_cache_for_binaries, orig_tar_gz_cache

    suite = suite.lower()
    suite_id = db_access.get_suite_id(suite)
    if suite == 'stable':
        testing_id = db_access.get_suite_id("testing")
    try:
        sources_file = utils.open_file (filename, "r")
    except utils.cant_open_exc:
        print("WARNING: can't open '%s'" % (filename))
        return
    try:
        Scanner = apt_pkg.ParseTagFile(sources_file)
        while Scanner.Step() != 0:
            package = Scanner.Section["package"]
            version = Scanner.Section["version"]
            maintainer = Scanner.Section["maintainer"].replace("'", "\\'")
            maintainer_id = db_access.get_or_set_maintainer_id(maintainer)
            directory = Scanner.Section["directory"]
            location_id = db_access.get_location_id (location, component, archive)
            # Make the directory relative to the location, with a
            # trailing slash unless it ends up empty
            if directory[-1:] != "/":
                directory = directory + '/'
            directory = poolify (directory, location)
            if directory != "" and directory[-1:] != "/":
                directory = directory + '/'

            # Add all files referenced by the .dsc to the files table
            ids = []
            dsc_files_id = None
            for line in Scanner.Section["files"].split('\n'):
                (md5sum, size, name) = line.strip().split()
                if name.endswith(".orig.tar.gz"):
                    # Don't duplicate .orig.tar.gz's
                    cache_key = "%s~%s~%s" % (name, size, md5sum)
                    if cache_key in orig_tar_gz_cache:
                        file_id = orig_tar_gz_cache[cache_key]
                    else:
                        file_id = get_or_set_files_id (directory + name, size, md5sum, location_id)
                        orig_tar_gz_cache[cache_key] = file_id
                else:
                    file_id = get_or_set_files_id (directory + name, size, md5sum, location_id)
                ids.append(file_id)
                # If this is the .dsc itself; save the ID for later.
                if name.endswith(".dsc"):
                    dsc_files_id = file_id

            cache_key = "%s~%s" % (package, version)
            if cache_key not in source_cache:
                if dsc_files_id is None:
                    # Without this check the id of the previous
                    # stanza's .dsc would be recorded for this source.
                    print("WARNING: no .dsc listed for source '%s' version '%s'; skipping" % (package, version))
                    continue
                source_id_serial = source_id_serial + 1
                if cache_key not in source_cache_for_binaries:
                    source_cache_for_binaries[cache_key] = source_id_serial
                tmp_source_id = source_id_serial
                source_cache[cache_key] = source_id_serial
                source_query_cache.write("%d\t%s\t%s\t%d\t%d\n" % (source_id_serial, package, version, maintainer_id, dsc_files_id))
                for file_id in ids:
                    dsc_files_id_serial = dsc_files_id_serial + 1
                    dsc_files_query_cache.write("%d\t%d\t%d\n" % (dsc_files_id_serial, tmp_source_id, file_id))
            else:
                tmp_source_id = source_cache[cache_key]

            src_associations_id_serial = src_associations_id_serial + 1
            src_associations_query_cache.write("%d\t%d\t%d\n" % (src_associations_id_serial, suite_id, tmp_source_id))
            # populate 'testing' with a mirror of 'stable'
            if suite == "stable":
                src_associations_id_serial = src_associations_id_serial + 1
                src_associations_query_cache.write("%d\t%d\t%d\n" % (src_associations_id_serial, testing_id, tmp_source_id))
    finally:
        # Close the file even if a stanza turns out to be malformed
        sources_file.close()

################################################################################

def process_packages (location, filename, suite, component, archive):
    """Read the Packages file `filename' and write rows for the `files',
    `binaries' and `bin_associations' tables to the corresponding
    *_query_cache files.

    Every binary in `stable' is additionally associated with `testing'.
    Binaries whose source can't be found in source_cache_for_binaries
    get a NULL source and are counted; a summary is printed at the end.
    If `filename' can't be opened a warning is printed and nothing else
    happens.
    """
    global arch_all_cache, binary_cache, binaries_id_serial, binaries_query_cache, bin_associations_id_serial, bin_associations_query_cache

    count_total = 0
    count_bad = 0
    suite = suite.lower()
    suite_id = db_access.get_suite_id(suite)
    if suite == "stable":
        testing_id = db_access.get_suite_id("testing")
    try:
        packages_file = utils.open_file (filename, "r")
    except utils.cant_open_exc:
        print("WARNING: can't open '%s'" % (filename))
        return
    try:
        Scanner = apt_pkg.ParseTagFile(packages_file)
        while Scanner.Step() != 0:
            package = Scanner.Section["package"]
            version = Scanner.Section["version"]
            maintainer = Scanner.Section["maintainer"].replace("'", "\\'")
            maintainer_id = db_access.get_or_set_maintainer_id(maintainer)
            architecture = Scanner.Section["architecture"]
            architecture_id = db_access.get_architecture_id (architecture)

            # Work out the source package and version; the Source field
            # may carry an explicit version in parentheses
            if not Scanner.Section.has_key("source"):
                source = package
            else:
                source = Scanner.Section["source"]
            source_version = ""
            if source.find("(") != -1:
                m = utils.re_extract_src_version.match(source)
                source = m.group(1)
                source_version = m.group(2)
            if not source_version:
                source_version = version

            deb_filename = Scanner.Section["filename"]
            location_id = db_access.get_location_id (location, component, archive)
            deb_filename = poolify (deb_filename, location)
            if architecture == "all":
                deb_filename = re_arch_from_filename.sub("binary-all", deb_filename)
            cache_key = "%s~%s" % (source, source_version)
            source_id = source_cache_for_binaries.get(cache_key, None)
            size = Scanner.Section["size"]
            md5sum = Scanner.Section["md5sum"]
            files_id = get_or_set_files_id (deb_filename, size, md5sum, location_id)
            binary_type = "deb" # FIXME

            cache_key = "%s~%s~%s~%d~%d~%d" % (package, version, repr(source_id), architecture_id, location_id, files_id)
            if cache_key in arch_all_cache:
                # Already handled this exact file
                continue
            arch_all_cache[cache_key] = 1

            cache_key = "%s~%s~%s~%d" % (package, version, repr(source_id), architecture_id)
            if cache_key not in binary_cache:
                if not source_id:
                    # Written to the export file as backslash-N in the
                    # source column
                    source_id = "\\N"
                    count_bad = count_bad + 1
                else:
                    source_id = repr(source_id)
                binaries_id_serial = binaries_id_serial + 1
                binaries_query_cache.write("%d\t%s\t%s\t%d\t%s\t%d\t%d\t%s\n" % (binaries_id_serial, package, version, maintainer_id, source_id, architecture_id, files_id, binary_type))
                binary_cache[cache_key] = binaries_id_serial
                tmp_binaries_id = binaries_id_serial
            else:
                tmp_binaries_id = binary_cache[cache_key]

            bin_associations_id_serial = bin_associations_id_serial + 1
            bin_associations_query_cache.write("%d\t%d\t%d\n" % (bin_associations_id_serial, suite_id, tmp_binaries_id))
            # populate 'testing' with a mirror of 'stable'
            if suite == "stable":
                bin_associations_id_serial = bin_associations_id_serial + 1
                bin_associations_query_cache.write("%d\t%d\t%d\n" % (bin_associations_id_serial, testing_id, tmp_binaries_id))
            count_total = count_total + 1
    finally:
        # Close the file even if a stanza turns out to be malformed
        packages_file.close()

    if count_bad != 0:
        print("%d binary packages processed; %d with no source match which is %.2f%%" % (count_total, count_bad, (float(count_bad)/count_total)*100))
    else:
        print("%d binary packages processed; 0 with no source match which is 0%%" % (count_total))

################################################################################

def do_sources(location, prefix, suite, component, server):
    """Decompress <location><prefix>Sources.gz into a temporary file and
    hand that file to process_sources().

    Exits the program with a non-zero status if gunzip fails.  The
    temporary file is removed in all cases.
    """
    (fd, temp_filename) = tempfile.mkstemp()
    os.close(fd)
    try:
        sources = location + prefix + 'Sources.gz'
        # NOTE: the paths are interpolated into a shell command unquoted
        (result, output) = commands.getstatusoutput("gunzip -c %s > %s" % (sources, temp_filename))
        if result != 0:
            sys.stderr.write("Gunzip invocation failed!\n%s\n" % (output))
            # `result' is a wait()-style status; exiting with it directly
            # could be truncated to a "successful" 0
            sys.exit(exit_code_from_status(result))
        print('Processing '+sources+'...')
        process_sources (location, temp_filename, suite, component, server)
    finally:
        os.unlink(temp_filename)

################################################################################

def main ():
    """Re-create the projectb database and populate it from the archive.

    Steps, in order: read the configuration; run init_pool.sql through
    psql; fill the static tables from the configuration; process the
    Sources and Packages files of every "legacy" and "legacy-mixed"
    location into per-table export files under Neve::ExportDir; load
    those files with COPY; and finally run add_constraints.sql.  Exits
    with a non-zero status if a psql invocation fails or a location has
    an unknown type.
    """
    global Cnf, projectB, files_query_cache, source_query_cache, src_associations_query_cache, dsc_files_query_cache, bin_associations_query_cache, binaries_query_cache

    apt_pkg.init()

    Cnf = apt_pkg.newConfiguration()
    apt_pkg.ReadConfigFileISC(Cnf, utils.which_conf_file())

    print("Re-Creating DB...")
    (result, output) = commands.getstatusoutput("psql -f init_pool.sql")
    if result != 0:
        # Show why psql failed instead of exiting silently
        sys.stderr.write("psql invocation failed!\n%s\n" % (output))
        sys.exit(2)
    print(output)

    projectB = pg.connect('projectb', 'localhost', -1, None, None, 'postgres')

    db_access.init (Cnf, projectB)

    print("Adding static tables from conf file...")
    projectB.query("BEGIN WORK")
    update_architectures()
    update_components()
    update_archives()
    update_locations()
    update_suites()
    projectB.query("COMMIT WORK")

    # The process_*() functions write to these module-level files
    export_dir = Cnf["Neve::ExportDir"]
    files_query_cache = utils.open_file(export_dir+"files","w")
    source_query_cache = utils.open_file(export_dir+"source","w")
    src_associations_query_cache = utils.open_file(export_dir+"src_associations","w")
    dsc_files_query_cache = utils.open_file(export_dir+"dsc_files","w")
    binaries_query_cache = utils.open_file(export_dir+"binaries","w")
    bin_associations_query_cache = utils.open_file(export_dir+"bin_associations","w")

    projectB.query("BEGIN WORK")

    # Process Sources files to populate `source' and friends
    for location in Cnf.SubTree("Location").List():
        SubSec = Cnf.SubTree("Location::%s" % (location))
        server = SubSec["Archive"]
        location_type = Cnf.Find("Location::%s::Type" % (location))
        if location_type == "legacy-mixed":
            prefix = ''
            suite = Cnf.Find("Location::%s::Suite" % (location))
            do_sources(location, prefix, suite, "", server)
        elif location_type == "legacy":
            for suite in Cnf.SubTree("Location::%s::Suites" % (location)).List():
                for component in Cnf.SubTree("Component").List():
                    prefix = Cnf.Find("Suite::%s::CodeName" % (suite)) + '/' + component + '/source/'
                    do_sources(location, prefix, suite, component, server)
        elif location_type == "pool":
            # TODO: pool locations are not imported yet.  The original
            # sketch iterated over the components with
            # prefix = component + '/'.
            continue
        else:
            sys.stderr.write("Unknown location type ('%s').\n" % (location_type))
            sys.exit(2)

    # Process Packages files to populate `binaries' and friends
    for location in Cnf.SubTree("Location").List():
        SubSec = Cnf.SubTree("Location::%s" % (location))
        server = SubSec["Archive"]
        location_type = Cnf.Find("Location::%s::Type" % (location))
        if location_type == "legacy-mixed":
            packages = location + 'Packages'
            suite = Cnf.Find("Location::%s::Suite" % (location))
            print('Processing '+location+'...')
            process_packages (location, packages, suite, "", server)
        elif location_type == "legacy":
            for suite in Cnf.SubTree("Location::%s::Suites" % (location)).List():
                for component in Cnf.SubTree("Component").List():
                    for architecture in Cnf.SubTree("Suite::%s::Architectures" % (suite)).List():
                        if architecture == "source" or architecture == "all":
                            continue
                        packages = location + Cnf.Find("Suite::%s::CodeName" % (suite)) + '/' + component + '/binary-' + architecture + '/Packages'
                        print('Processing '+packages+'...')
                        process_packages (location, packages, suite, component, server)
        elif location_type == "pool":
            continue

    # Everything must be flushed to disk before the server reads it
    files_query_cache.close()
    source_query_cache.close()
    src_associations_query_cache.close()
    dsc_files_query_cache.close()
    binaries_query_cache.close()
    bin_associations_query_cache.close()

    # Each export file is named after the table it is loaded into; the
    # tables are loaded in this order.
    for table in ("files", "source", "src_associations", "dsc_files", "binaries", "bin_associations"):
        print("Writing data to `%s' table..." % (table))
        projectB.query("COPY %s FROM '%s'" % (table, export_dir+table))
    print("Committing...")
    projectB.query("COMMIT WORK")

    # Add the constraints and otherwise generally clean up the database.
    # See add_constraints.sql for more details...

    print("Running add_constraints.sql...")
    (result, output) = commands.getstatusoutput("psql projectb < add_constraints.sql")
    print(output)
    if result != 0:
        sys.stderr.write("psql invocation failed!\n")
        # `result' is a wait()-style status with the exit code in the
        # high byte; passing it to sys.exit() unchanged could be
        # truncated to a "successful" 0.
        sys.exit(((result >> 8) & 0xff) or 1)

    return

if __name__ == '__main__':
    main()