Added initial COPY statement
diff --git a/dak/import_contents.py b/dak/import_contents.py
index 945b9ea6124578535a3f03e566754a41ceeeed89..b3fd2019c219d33572dab40b491fd016c0fd1ce8 100755
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2.4
 # Import contents files
 
 # Copyright (C) 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
@@ -33,6 +33,12 @@ Cnf = None
 projectB = None
 out = None
 AptCnf = None
+has_opened_temp_file_lists = False
+content_path_file = ""
+content_name_file = ""
+content_file_cache = set([])
+content_name_cache = set([])
+content_path_cache = set([])
 
 ################################################################################
 
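The six module-level additions above set up an in-memory dedup layer: every path seen in a Contents file is split into its directory and basename parts, and each part is remembered in a set so it only has to be written to the database once, in bulk, at the end of the run. A minimal sketch of that pattern (the data and names are illustrative, not from this patch):

    seen = set([])
    for fullpath in ["usr/bin/dak", "usr/bin/dak", "usr/share/doc/dak/README"]:
        if fullpath in seen:
            continue              # set membership is O(1); a list would be O(n)
        seen.add(fullpath)
    # 'seen' now holds each path exactly once, ready for one bulk COPY
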
@@ -47,6 +53,31 @@ Import Contents files
 
 ################################################################################
 
+def cache_content_path(fullpath):
+    global content_file_cache, content_name_cache, content_path_cache
+
+    # have we seen this contents before?
+    if fullpath in content_file_cache:
+        return
+
+    # Add the new key to the cache
+    content_file_cache.add(fullpath)
+
+    # split the path into basename, and pathname
+    (path, file) = os.path.split(fullpath)
+
+    # For performance reasons, we need to get the entire filelists table
+    # sorted first before we can fill in the association tables.
+    if path not in content_path_cache:
+        content_path_cache.add(path)
+
+    if file not in content_name_cache:
+        content_name_cache.add(file)
+
+    return
+
+################################################################################
+
 def import_contents(suites):
     global projectB, Cnf
 
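cache_content_path() relies on os.path.split() to separate the directory from the basename; for a typical Contents entry it behaves like this (interpreter session for illustration):

    >>> import os
    >>> os.path.split("usr/share/doc/dak/README")
    ('usr/share/doc/dak', 'README')
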
@@ -56,23 +87,37 @@ def import_contents(suites):
     # Needed to make sure PostgreSQL doesn't freak out on some of the data
     projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
 
+    # Precache everything
+    #print "Precaching binary information, this will take a few moments ..."
+    #database.preload_binary_id_cache()
+
+    # Prep regexes
+    line_regex = re.compile(r'^(.+?)\s+(\S+)$')
+    pkg_regex = re.compile(r'(\S+)/(\S+)$')
+    file_regex = re.compile('^FILE')
+
     # Get our suites, and the architectures
     for s in suites:
         suite_id = database.get_suite_id(s)
 
-        q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
-
         arch_list = [ ]
-        for r in q.getresult():
-            if r[1] != "source" and r[1] != "all":
-                arch_list.append((r[0], r[1]))
+        for r in Cnf.ValueList("Suite::%s::Architectures" % (s)):
+            if r != "source" and r != "all":
+                arch_list.append(r)
 
         arch_all_id = database.get_architecture_id("all")
 
         for arch in arch_list:
-            print "Processing %s/%s" % (s, arch[1])
-            arch_id = database.get_architecture_id(arch[1])
-            f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+            print "Processing %s/%s" % (s, arch)
+            arch_id = database.get_architecture_id(arch)
+
+            try:
+                f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch), "r")
+
+            except IOError:
+                print "Unable to open dists/%s/Contents-%s.gz" % (s, arch)
+                print "Skipping ..."
+                continue
 
             # Get line count
             lines = f.readlines()
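
The open is now wrapped so that a missing Contents-<arch>.gz skips that suite/architecture instead of aborting the whole import. The same idiom in isolation (the path is hypothetical; gzip.open() raises IOError when the file cannot be opened):

    import gzip

    try:
        f = gzip.open("/srv/ftp/dists/sid/Contents-i386.gz", "r")
    except IOError:
        print "Unable to open Contents file, skipping ..."
    else:
        lines = f.readlines()
        f.close()
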
@@ -85,12 +130,11 @@ def import_contents(suites):
             for line in lines:
                 if found_header == False:
                     if not line:
-                        print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
+                        print "Unable to find end of Contents-%s.gz header!" % (arch)
                         sys.exit(255)
 
                     lines_processed += 1
-                    p = re.compile('^FILE')
-                    if p.match(line):
+                    if file_regex.match(line):
                         found_header = True
                     continue
 
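For context: a Contents file starts with a free-form preamble that ends at the FILE/LOCATION column header, which is what file_regex detects; every later line is a path followed by a comma-separated package list, which line_regex in the next hunk splits apart. A small illustrative parse (the sample line is made up):

    import re

    line_regex = re.compile(r'^(.+?)\s+(\S+)$')

    sample = "usr/bin/dak                    devel/dak,devel/dak-doc"
    (filename, pkglist) = line_regex.findall(sample)[0]
    packages = pkglist.split(',')
    # filename == 'usr/bin/dak'
    # packages == ['devel/dak', 'devel/dak-doc']
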
@@ -99,38 +143,51 @@ def import_contents(suites):
                 # to split the two bits
 
                 # Print out progress bar
-                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, (float(lines_processed)/num_of_lines)),
+                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, ((float(lines_processed)/num_of_lines)*100)),
 
                 # regex lifted from packages.d.o code
-                p = re.compile('^(.+?)\s+(\S+)$')
-                matchs = p.findall(line)
+                matchs = line_regex.findall(line)
                 filename = matchs[0][0]
                 packages = matchs[0][1].split(',')
 
+
+                cache_content_path(filename)
+
                 # Iterate through each file's packages
-                for package in packages:
-                    p = re.compile('(\S+)/(\S+)$')
-                    matchs = p.findall(package)
+                #for package in packages:
+                #    matchs = pkg_regex.findall(package)
 
                     # Needed since the DB is unicode, and these files
                     # are ASCII
-                    section_name = matchs[0][0]
-                    package_name = matchs[0][1]
+                #    section_name = matchs[0][0]
+                #    package_name = matchs[0][1]
 
-                    section_id = database.get_section_id(section_name)
-                    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+                #    section_id = database.get_section_id(section_name)
+                #    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
 
-                    if package_id == None:
-                        # Likely got an arch all package
-                        package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+                #    if package_id == None:
+                        # This can happen when the Contents file refers to a non-existent package.
+                        # Contents files can be stale because of caching: hurd-i386, for example, was
+                        # removed from the archive, but its Contents file still exists and is seemingly
+                        # still updated. The sane thing to do is skip such entries and continue.
+                #        continue
 
-                    database.insert_content_path(package_id, filename)
 
                 lines_processed += 1
+
+            print "" # newline since the Progress bar doesn't print one
             f.close()
 
     # Commit work
+
     print "Committing to database ..."
+    projectB.query("COPY content_file_names (file) FROM STDIN")
+
+    for line in content_name_cache:
+        projectB.putline("%s\n" % (line))
+
+    projectB.endcopy()
+
     projectB.query("COMMIT")
 
 ################################################################################
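
The "initial COPY statement" of the commit message is the PyGreSQL bulk-load idiom used in the final hunk: COPY ... FROM STDIN puts the connection into copy mode, putline() streams one row per call, and endcopy() terminates the stream. A self-contained sketch, assuming a reachable projectb database and the content_file_names (file) table from above (the connection parameters are hypothetical):

    import pg

    projectB = pg.connect("projectb", "localhost")    # hypothetical host/db

    names = set(["README", "changelog.Debian.gz", "copyright"])

    projectB.query("COPY content_file_names (file) FROM STDIN")
    for name in names:
        projectB.putline("%s\n" % (name))             # one row per line
    projectB.endcopy()                                # terminate the COPY stream

    projectB.query("COMMIT")

Streaming rows through COPY avoids one INSERT statement (and one server round trip) per filename, which matters at the scale of a full Contents file.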