-#!/usr/bin/env python
+#!/usr/bin/env python2.4
# Import contents files
# Copyright (C) 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
projectB = None
out = None
AptCnf = None
-content_path_id_cache = {}
-content_file_id_cache = {}
-insert_contents_file_cache = {}
+has_opened_temp_file_lists = False
+content_path_file = ""
+content_name_file = ""
+content_file_cache = set([])
+content_name_cache = set([])
+content_path_cache = set([])
################################################################################
################################################################################
-
-def set_contents_file_id(file):
- global content_file_id_cache
-
- if not content_file_id_cache.has_key(file):
- # since this can be called within a transaction, we can't use currval
- q = projectB.query("INSERT INTO content_file_names VALUES (DEFAULT, '%s') RETURNING id" % (file))
- content_file_id_cache[file] = int(q.getresult()[0][0])
- return content_file_id_cache[file]
-
-################################################################################
-
-def set_contents_path_id(path):
- global content_path_id_cache
-
- if not content_path_id_cache.has_key(path):
- q = projectB.query("INSERT INTO content_file_paths VALUES (DEFAULT, '%s') RETURNING id" % (path))
- content_path_id_cache[path] = int(q.getresult()[0][0])
- return content_path_id_cache[path]
-
-################################################################################
-
-def insert_content_path(bin_id, fullpath):
- global insert_contents_file_cache
- cache_key = "%s_%s" % (bin_id, fullpath)
+def cache_content_path(fullpath):
+ global content_file_cache, content_name_cache, content_path_cache
# have we seen this contents before?
- # probably only revelant during package import
- if insert_contents_file_cache.has_key(cache_key):
+ if fullpath in content_file_cache:
return
+ # Add the new key to the cache
+ content_file_cache.add(fullpath)
+
# split the path into basename, and pathname
(path, file) = os.path.split(fullpath)
- # Get the necessary IDs ...
- file_id = set_contents_file_id(file)
- path_id = set_contents_path_id(path)
+ # For performance reasons, we need to load the entire filelists table
+ # sorted first before we can build the association tables.
+ if path not in content_path_cache:
+ content_path_cache.add(path)
+
+ if file not in content_name_cache:
+ content_name_cache.add(file)
- # Put them into content_assiocations
- projectB.query("INSERT INTO content_associations VALUES (DEFAULT, '%d', '%d', '%d')" % (bin_id, path_id, file_id))
return
################################################################################
# Needed to make sure postgreSQL doesn't freak out on some of the data
projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
+ # Precache everything
+ #print "Precaching binary information, this will take a few moments ..."
+ #database.preload_binary_id_cache()
+
# Prep regexs
line_regex = re.compile(r'^(.+?)\s+(\S+)$')
pkg_regex = re.compile(r'(\S+)/(\S+)$')
for s in suites:
suite_id = database.get_suite_id(s)
- q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
-
arch_list = [ ]
- for r in q.getresult():
- if r[1] != "source" and r[1] != "all":
- arch_list.append((r[0], r[1]))
+ for r in Cnf.ValueList("Suite::%s::Architectures" % (s)):
+ if r != "source" and r != "all":
+ arch_list.append(r)
arch_all_id = database.get_architecture_id("all")
for arch in arch_list:
- print "Processing %s/%s" % (s, arch[1])
- arch_id = database.get_architecture_id(arch[1])
- f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+ print "Processing %s/%s" % (s, arch)
+ arch_id = database.get_architecture_id(arch)
+
+ try:
+ f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch), "r")
+
+ except IOError:
+ print "Unable to open dists/%s/Contents-%s.gz" % (s, arch)
+ print "Skipping ..."
+ continue
# Get line count
lines = f.readlines()
for line in lines:
if found_header == False:
if not line:
- print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
+ print "Unable to find end of Contents-%s.gz header!" % (arch)
sys.exit(255)
lines_processed += 1
filename = matchs[0][0]
packages = matchs[0][1].split(',')
+
+ cache_content_path(filename)
+
# Iterate through each file's packages
- for package in packages:
- matchs = pkg_regex.findall(package)
+ #for package in packages:
+ # matchs = pkg_regex.findall(package)
# Needed since the DB is unicode, and these files
# are ASCII
- section_name = matchs[0][0]
- package_name = matchs[0][1]
+ # section_name = matchs[0][0]
+ # package_name = matchs[0][1]
- section_id = database.get_section_id(section_name)
- package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+ #section_id = database.get_section_id(section_name)
+ #package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
- if package_id == None:
- # Likely got an arch all package
- package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+ # if package_id == None:
+ # This can happen if the Contents file refers to a non-existant package
+ # it seems Contents sometimes can be stale due to use of caches (i.e., hurd-i386)
+ # hurd-i386 was removed from the archive, but its Contents file still exists
+ # and is seemingly still updated. The sane thing to do is skip it and continue
+ # continue
- insert_content_path(package_id, filename)
lines_processed += 1
+
+ print "" # newline since the Progress bar doesn't print one
f.close()
# Commit work
+
print "Committing to database ..."
+ projectB.query("COPY content_file_names (file) FROM STDIN")
+
+ for line in content_name_cache:
+ projectB.putline("%s\n" % (line))
+
+ projectB.endcopy()
+
projectB.query("COMMIT")
################################################################################