Added initial COPY statement
diff --git a/dak/import_contents.py b/dak/import_contents.py
index 945b9ea6124578535a3f03e566754a41ceeeed89..b3fd2019c219d33572dab40b491fd016c0fd1ce8 100755
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2.4
 # Import contents files
 
 # Copyright (C) 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
@@ -33,6 +33,12 @@ Cnf = None
 projectB = None
 out = None
 AptCnf = None
+has_opened_temp_file_lists = False
+content_path_file = ""
+content_name_file = ""
+content_file_cache = set([])
+content_name_cache = set([])
+content_path_cache = set([])
 
 ################################################################################
 
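The six module-level additions above set up an in-memory dedup layer: every path seen in a Contents file is split into its directory and basename parts, and each part is remembered in a set so it only has to be written to the database once, in bulk, at the end of the run. A minimal sketch of that pattern (the data and names are illustrative, not from this patch):

    seen = set([])
    for fullpath in ["usr/bin/dak", "usr/bin/dak", "usr/share/doc/dak/README"]:
        if fullpath in seen:
            continue              # set membership is O(1); a list would be O(n)
        seen.add(fullpath)
    # 'seen' now holds each path exactly once, ready for one bulk COPY
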
@@ -47,6 +53,31 @@ Import Contents files
 
 ################################################################################
 
+def cache_content_path(fullpath):
+    global content_file_cache, content_name_cache, content_path_cache
+
+    # have we seen this contents before?
+    if fullpath in content_file_cache:
+        return
+
+    # Add the new key to the cache
+    content_file_cache.add(fullpath)
+
+    # split the path into basename, and pathname
+    (path, file) = os.path.split(fullpath)
+
+    # For performance reasons, we need to get the entire filelists table
+    # sorted first before we can fill in the association tables.
+    if path not in content_path_cache:
+        content_path_cache.add(path)
+
+    if file not in content_name_cache:
+        content_name_cache.add(file)
+
+    return
+
+################################################################################
+
 def import_contents(suites):
     global projectB, Cnf
 
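cache_content_path() relies on os.path.split() to separate the directory from the basename; for a typical Contents entry it behaves like this (interpreter session for illustration):

    >>> import os
    >>> os.path.split("usr/share/doc/dak/README")
    ('usr/share/doc/dak', 'README')
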
@@ -56,23 +87,37 @@ def import_contents(suites):
     # Needed to make sure PostgreSQL doesn't freak out on some of the data
     projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
 
+    # Precache everything
+    #print "Precaching binary information, this will take a few moments ..."
+    #database.preload_binary_id_cache()
+
+    # Prep regexes
+    line_regex = re.compile(r'^(.+?)\s+(\S+)$')
+    pkg_regex = re.compile(r'(\S+)/(\S+)$')
+    file_regex = re.compile('^FILE')
+
     # Get our suites, and the architectures
     for s in suites:
         suite_id = database.get_suite_id(s)
 
-        q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
-
         arch_list = [ ]
-        for r in q.getresult():
-            if r[1] != "source" and r[1] != "all":
-                arch_list.append((r[0], r[1]))
+        for r in Cnf.ValueList("Suite::%s::Architectures" % (s)):
+            if r != "source" and r != "all":
+                arch_list.append(r)
 
         arch_all_id = database.get_architecture_id("all")
 
         for arch in arch_list:
-            print "Processing %s/%s" % (s, arch[1])
-            arch_id = database.get_architecture_id(arch[1])
-            f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+            print "Processing %s/%s" % (s, arch)
+            arch_id = database.get_architecture_id(arch)
+
+            try:
+                f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch), "r")
+
+            except IOError:
+                print "Unable to open dists/%s/Contents-%s.gz" % (s, arch)
+                print "Skipping ..."
+                continue
 
             # Get line count
             lines = f.readlines()
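
The open is now wrapped so that a missing Contents-<arch>.gz skips that suite/architecture instead of aborting the whole import. The same idiom in isolation (the path is hypothetical; gzip.open() raises IOError when the file cannot be opened):

    import gzip

    try:
        f = gzip.open("/srv/ftp/dists/sid/Contents-i386.gz", "r")
    except IOError:
        print "Unable to open Contents file, skipping ..."
    else:
        lines = f.readlines()
        f.close()
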
@@ -85,12 +130,11 @@ def import_contents(suites):
             for line in lines:
                 if found_header == False:
                     if not line:
-                        print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
+                        print "Unable to find end of Contents-%s.gz header!" % (arch)
                         sys.exit(255)
 
                     lines_processed += 1
-                    p = re.compile('^FILE')
-                    if p.match(line):
+                    if file_regex.match(line):
                         found_header = True
                     continue
 
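For context: a Contents file starts with a free-form preamble that ends at the FILE/LOCATION column header, which is what file_regex detects; every later line is a path followed by a comma-separated package list, which line_regex in the next hunk splits apart. A small illustrative parse (the sample line is made up):

    import re

    line_regex = re.compile(r'^(.+?)\s+(\S+)$')

    sample = "usr/bin/dak                    devel/dak,devel/dak-doc"
    (filename, pkglist) = line_regex.findall(sample)[0]
    packages = pkglist.split(',')
    # filename == 'usr/bin/dak'
    # packages == ['devel/dak', 'devel/dak-doc']
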
@@ -99,38 +143,51 @@ def import_contents(suites):
                 # to split the two bits
 
                 # Print out progress bar
-                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, (float(lines_processed)/num_of_lines)),
+                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, ((float(lines_processed)/num_of_lines)*100)),
 
                 # regex lifted from packages.d.o code
-                p = re.compile('^(.+?)\s+(\S+)$')
-                matchs = p.findall(line)
+                matchs = line_regex.findall(line)
                 filename = matchs[0][0]
                 packages = matchs[0][1].split(',')
 
+
+                cache_content_path(filename)
+
                 # Iterate through each file's packages
-                for package in packages:
-                    p = re.compile('(\S+)/(\S+)$')
-                    matchs = p.findall(package)
+                #for package in packages:
+                #    matchs = pkg_regex.findall(package)
 
                     # Needed since the DB is unicode, and these files
                     # are ASCII
-                    section_name = matchs[0][0]
-                    package_name = matchs[0][1]
+                #    section_name = matchs[0][0]
+                #    package_name = matchs[0][1]
 
-                    section_id = database.get_section_id(section_name)
-                    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+                #    section_id = database.get_section_id(section_name)
+                #    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
 
-                    if package_id == None:
-                        # Likely got an arch all package
-                        package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+                #    if package_id == None:
+                        # This can happen when the Contents file refers to a non-existent package.
+                        # Contents files can be stale because of caching: hurd-i386, for example, was
+                        # removed from the archive, but its Contents file still exists and is seemingly
+                        # still updated. The sane thing to do is skip such entries and continue.
+                #        continue
 
-                    database.insert_content_path(package_id, filename)
 
                 lines_processed += 1
+
+            print "" # newline since the Progress bar doesn't print one
             f.close()
 
     # Commit work
+
     print "Committing to database ..."
+    projectB.query("COPY content_file_names (file) FROM STDIN")
+
+    for line in content_name_cache:
+        projectB.putline("%s\n" % (line))
+
+    projectB.endcopy()
+
     projectB.query("COMMIT")
 
 ################################################################################
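
The "initial COPY statement" of the commit message is the PyGreSQL bulk-load idiom used in the final hunk: COPY ... FROM STDIN puts the connection into copy mode, putline() streams one row per call, and endcopy() terminates the stream. A self-contained sketch, assuming a reachable projectb database and the content_file_names (file) table from above (the connection parameters are hypothetical):

    import pg

    projectB = pg.connect("projectb", "localhost")    # hypothetical host/db

    names = set(["README", "changelog.Debian.gz", "copyright"])

    projectB.query("COPY content_file_names (file) FROM STDIN")
    for name in names:
        projectB.putline("%s\n" % (name))             # one row per line
    projectB.endcopy()                                # terminate the COPY stream

    projectB.query("COMMIT")

Streaming rows through COPY avoids one INSERT statement (and one server round trip) per filename, which matters at the scale of a full Contents file.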