-#!/usr/bin/env python
+#!/usr/bin/env python2.4
# Import contents files
# Copyright (C) 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
projectB = None
out = None
AptCnf = None
+has_opened_temp_file_lists = False
+content_path_file = ""
+content_name_file = ""
+content_file_cache = set([])
+content_name_cache = set([])
+content_path_cache = set([])
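+# Caches of every Contents path seen so far; names and paths are
+# deduplicated here so they can be bulk-loaded at commit time.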
################################################################################
################################################################################
+def cache_content_path(fullpath):
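+    """Cache a fullpath from a Contents file, splitting it into its
+    path and filename components for later bulk loading."""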
+    global content_file_cache, content_name_cache, content_path_cache
+
+    # have we seen this contents path before?
+    if fullpath in content_file_cache:
+        return
+
+    # Add the new key to the cache
+    content_file_cache.add(fullpath)
+
+    # split the full path into its path (directory) and filename components
+    (path, filename) = os.path.split(fullpath)
+
+    # For performance reasons, we need the entire filelists table loaded
+    # and sorted before we can fill in the association tables.
+    if path not in content_path_cache:
+        content_path_cache.add(path)
+
+    if filename not in content_name_cache:
+        content_name_cache.add(filename)
+
+    return
+
+################################################################################
+
def import_contents(suites):
    global projectB, Cnf

    # Needed to make sure PostgreSQL doesn't freak out on some of the data
    projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
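+    # (Contents files are not guaranteed to be valid UTF-8; LATIN1 maps
+    # every possible byte, so the raw data passes through unmangled.)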
+    # Precache everything
+    #print "Precaching binary information, this will take a few moments ..."
+    #database.preload_binary_id_cache()
+
+    # Prep regexes
+    line_regex = re.compile(r'^(.+?)\s+(\S+)$')
+    pkg_regex = re.compile(r'(\S+)/(\S+)$')
+    file_regex = re.compile('^FILE')
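+    # A Contents entry is "filename<whitespace>section/pkg1,section/pkg2,...";
+    # line_regex splits the filename from the package list, pkg_regex splits
+    # each list entry into section and package, and file_regex finds the
+    # "FILE LOCATION" line that ends the header.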
+
    # Get our suites, and the architectures
    for s in suites:
        suite_id = database.get_suite_id(s)
-        q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
-
        arch_list = [ ]
-        for r in q.getresult():
-            if r[1] != "source" and r[1] != "all":
-                arch_list.append((r[0], r[1]))
+        for r in Cnf.ValueList("Suite::%s::Architectures" % (s)):
+            if r != "source" and r != "all":
+                arch_list.append(r)
        arch_all_id = database.get_architecture_id("all")
        for arch in arch_list:
-            print "Processing %s/%s" % (s, arch[1])
-            arch_id = database.get_architecture_id(arch[1])
-            f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+            print "Processing %s/%s" % (s, arch)
+            arch_id = database.get_architecture_id(arch)
+
+            try:
+                f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch), "r")
+
+            except IOError:
+                print "Unable to open dists/%s/Contents-%s.gz" % (s, arch)
+                print "Skipping ..."
+                continue
            # Get line count
            lines = f.readlines()
            for line in lines:
                if found_header == False:
                    if not line:
-                        print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
+                        print "Unable to find end of Contents-%s.gz header!" % (arch)
                        sys.exit(255)
                    lines_processed += 1
-                    p = re.compile('^FILE')
-                    if p.match(line):
+                    if file_regex.match(line):
                        found_header = True
                    continue
                # split the line into its two bits: filename and package list
                # Print out progress bar
-                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, (float(lines_processed)/num_of_lines)),
+                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, ((float(lines_processed)/num_of_lines)*100)),
                # regex lifted from packages.d.o code
-                p = re.compile('^(.+?)\s+(\S+)$')
-                matchs = p.findall(line)
+                matchs = line_regex.findall(line)
                filename = matchs[0][0]
                packages = matchs[0][1].split(',')
+
+                cache_content_path(filename)
+
                # Iterate through each file's packages
-                for package in packages:
-                    p = re.compile('(\S+)/(\S+)$')
-                    matchs = p.findall(package)
+                #for package in packages:
+                #    matchs = pkg_regex.findall(package)
                    # Needed since the DB is unicode, and these files
                    # are ASCII
-                    section_name = matchs[0][0]
-                    package_name = matchs[0][1]
+                #    section_name = matchs[0][0]
+                #    package_name = matchs[0][1]
-                    section_id = database.get_section_id(section_name)
-                    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+                #    section_id = database.get_section_id(section_name)
+                #    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
-                    if package_id == None:
-                        # Likely got an arch all package
-                        package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+                #    if package_id == None:
+                #        # This can happen when the Contents file refers to a non-existent
+                #        # package: Contents files can be stale due to caching. For example,
+                #        # hurd-i386 was removed from the archive, but its Contents file
+                #        # still exists and is seemingly still updated. The sane thing to
+                #        # do is skip it and continue.
+                #        continue
-                    database.insert_content_path(package_id, filename)
                lines_processed += 1
+
+            print "" # newline, since the progress bar doesn't print one
            f.close()
    # Commit work
+
    print "Committing to database ..."
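+    # Bulk-load the cached file names with COPY ... FROM STDIN through
+    # pg's putline()/endcopy(); far faster than one INSERT per name.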
+    projectB.query("COPY content_file_names (file) FROM STDIN")
+
+    for line in content_name_cache:
+        projectB.putline("%s\n" % (line))
+
+    projectB.endcopy()
+
    projectB.query("COMMIT")
################################################################################