X-Git-Url: https://git.decadent.org.uk/gitweb/?a=blobdiff_plain;f=dak%2Fimport_contents.py;h=b3fd2019c219d33572dab40b491fd016c0fd1ce8;hb=f71ac27c75a8ab5185508491e97bc6f237772aa6;hp=06691957489d6d591544626783410971bbb06652;hpb=9c7be82789039fb58aaf8667652ca4b74bfd6d46;p=dak.git

diff --git a/dak/import_contents.py b/dak/import_contents.py
index 06691957..b3fd2019 100755
--- a/dak/import_contents.py
+++ b/dak/import_contents.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2.4
 # Import contents files

 # Copyright (C) 2008, 2009 Michael Casadevall
@@ -33,9 +33,12 @@
 Cnf = None
 projectB = None
 out = None
 AptCnf = None
-content_path_id_cache = {}
-content_file_id_cache = {}
-insert_contents_file_cache = {}
+has_opened_temp_file_lists = False
+content_path_file = ""
+content_name_file = ""
+content_file_cache = set([])
+content_name_cache = set([])
+content_path_cache = set([])

 ################################################################################
@@ -50,46 +53,27 @@ Import Contents files
 ################################################################################

-
-def set_contents_file_id(file):
-    global content_file_id_cache
-
-    if not content_file_id_cache.has_key(file):
-        # since this can be called within a transaction, we can't use currval
-        q = projectB.query("INSERT INTO content_file_names VALUES (DEFAULT, '%s') RETURNING id" % (file))
-        content_file_id_cache[file] = int(q.getresult()[0][0])
-    return content_file_id_cache[file]
-
-################################################################################
-
-def set_contents_path_id(path):
-    global content_path_id_cache
-
-    if not content_path_id_cache.has_key(path):
-        q = projectB.query("INSERT INTO content_file_paths VALUES (DEFAULT, '%s') RETURNING id" % (path))
-        content_path_id_cache[path] = int(q.getresult()[0][0])
-    return content_path_id_cache[path]
-
-################################################################################
-
-def insert_content_path(bin_id, fullpath):
-    global insert_contents_file_cache
-    cache_key = "%s_%s" % (bin_id, fullpath)
+def cache_content_path(fullpath):
+    global content_file_cache, contents_name_cache, content_path_cache

     # have we seen this contents before?
-    # probably only revelant during package import
-    if insert_contents_file_cache.has_key(cache_key):
+    if fullpath in content_file_cache:
         return

+    # Add the new key to the cache
+    content_file_cache.add(fullpath)
+
     # split the path into basename, and pathname
     (path, file) = os.path.split(fullpath)

-    # Get the necessary IDs ...
-    file_id = set_contents_file_id(file)
-    path_id = set_contents_path_id(path)
+    # Due to performance reasons, we need to get the entire filelists table
+    # sorted first before we can do assiocation tables.
+    if path not in content_path_cache:
+        content_path_cache.add(path)
+
+    if file not in content_name_cache:
+        content_name_cache.add(file)

-    # Put them into content_assiocations
-    projectB.query("INSERT INTO content_associations VALUES (DEFAULT, '%d', '%d', '%d')" % (bin_id, path_id, file_id))
     return

 ################################################################################
@@ -103,6 +87,10 @@ def import_contents(suites):
     # Needed to make sure postgreSQL doesn't freak out on some of the data
     projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")

+    # Precache everything
+    #print "Precaching binary information, this will take a few moments ..."
+    #database.preload_binary_id_cache()
+
     # Prep regexs
     line_regex = re.compile(r'^(.+?)\s+(\S+)$')
     pkg_regex = re.compile(r'(\S+)/(\S+)$')
@@ -112,19 +100,24 @@ def import_contents(suites):
     for s in suites:
         suite_id = database.get_suite_id(s)

-        q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
-
         arch_list = [ ]
-        for r in q.getresult():
-            if r[1] != "source" and r[1] != "all":
-                arch_list.append((r[0], r[1]))
+        for r in Cnf.ValueList("Suite::%s::Architectures" % (s)):
+            if r != "source" and r != "all":
+                arch_list.append(r)

         arch_all_id = database.get_architecture_id("all")

         for arch in arch_list:
-            print "Processing %s/%s" % (s, arch[1])
-            arch_id = database.get_architecture_id(arch[1])
-            f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+            print "Processing %s/%s" % (s, arch)
+            arch_id = database.get_architecture_id(arch)
+
+            try:
+                f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch), "r")
+
+            except:
+                print "Unable to open dists/%s/Contents-%s.gz" % (s, arch)
+                print "Skipping ..."
+                continue

             # Get line count
             lines = f.readlines()
@@ -137,7 +130,7 @@ def import_contents(suites):
             for line in lines:
                 if found_header == False:
                     if not line:
-                        print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
+                        print "Unable to find end of Contents-%s.gz header!" % (arch)
                         sys.exit(255)

                     lines_processed += 1
@@ -157,29 +150,44 @@ def import_contents(suites):
                 filename = matchs[0][0]
                 packages = matchs[0][1].split(',')

+                cache_content_path(filename)
+
                 # Iterate through each file's packages
-                for package in packages:
-                    matchs = pkg_regex.findall(package)
+                #for package in packages:
+                #    matchs = pkg_regex.findall(package)

                     # Needed since the DB is unicode, and these files
                     # are ASCII
-                    section_name = matchs[0][0]
-                    package_name = matchs[0][1]
+                #    section_name = matchs[0][0]
+                #    package_name = matchs[0][1]

-                    section_id = database.get_section_id(section_name)
-                    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+                #    section_id = database.get_section_id(section_name)
+                #    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)

-                    if package_id == None:
-                        # Likely got an arch all package
-                        package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+                #    if package_id == None:
+                        # This can happen if the Contents file refers to a non-existant package
+                        # it seems Contents sometimes can be stale due to use of caches (i.e., hurd-i386)
+                        # hurd-i386 was removed from the archive, but its Contents file still exists
+                        # and is seemingly still updated. The sane thing to do is skip it and continue
+                #        continue

-                    insert_content_path(package_id, filename)
                 lines_processed += 1
+
+            print "" # newline since the Progress bar doesn't print one
             f.close()

         # Commit work
+        print "Committing to database ..."
+        projectB.query("COPY content_file_names (file) FROM STDIN")
+
+        for line in content_name_cache:
+            projectB.putline("%s\n" % (line))
+
+        projectB.endcopy()
+        projectB.query("COMMIT")

 ################################################################################
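
The substantive change above is the switch from per-row INSERT ... RETURNING round-trips (the old set_contents_file_id()/set_contents_path_id() helpers) to in-memory set() caches that are streamed into PostgreSQL in one shot with COPY ... FROM STDIN. Note that the final hunk only streams content_name_cache into content_file_names; loading content_file_paths and rebuilding the association table are not shown in this diff. Below is a minimal standalone sketch of that bulk-load pattern, assuming PyGreSQL's classic pg module (the same putline()/endcopy() API the patch calls on projectB); the connection parameters, database name, and sample data are illustrative, not taken from dak's configuration.

#!/usr/bin/env python
# Sketch of the COPY-based bulk load the patch switches to, assuming
# PyGreSQL's classic `pg` module (which provides putline()/endcopy()).
# The dbname and sample data below are hypothetical placeholders.

import pg

def bulk_load_file_names(conn, names):
    # One COPY statement replaces an INSERT round-trip per file name,
    # which is the performance win the patch is after.
    conn.query("COPY content_file_names (file) FROM STDIN")
    for name in names:
        conn.putline("%s\n" % name)
    conn.endcopy()

if __name__ == "__main__":
    conn = pg.connect(dbname="projectb")   # hypothetical database name
    conn.query("BEGIN")
    bulk_load_file_names(conn, set(["ls", "cp", "mv"]))
    conn.query("COMMIT")

Because COPY is a single statement, wrapping it in an explicit BEGIN/COMMIT (as both the patch and the sketch do) keeps the whole load atomic: either every cached name lands in content_file_names or none do.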