dak/import_contents.py

   1 #!/usr/bin/env python2.4
   2 # Import contents files
   3
   4 # Copyright (C) 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
   5
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19
  20 ################################################################################
  21 ################################################################################
  22
  23 ################################################################################
  24
  25 import sys, os, popen2, tempfile, stat, time, pg
  26 import re, gzip, apt_pkg
  27 from daklib import database, utils
  28 from daklib.dak_exceptions import *
  29
  30 ################################################################################
  31
  32 Cnf = None
  33 projectB = None
  34 out = None
  35 AptCnf = None
  36 has_opened_temp_file_lists = False
  37 content_path_file = ""
  38 content_name_file = ""
  39 content_file_cache = set([])
  40 content_name_cache = set([])
  41 content_path_cache = set([])
  42
  43 ################################################################################
  44
  45 def usage (exit_code=0):
  46     print """Usage: dak import-contents
  47 Import Contents files
  48
  49  -h, --help                 show this help and exit
  50  -s, --suite=SUITE         only write file lists for this suite
  51 """
  52     sys.exit(exit_code)
  53
  54 ################################################################################
  55
  56 def cache_content_path(fullpath):
  57     global content_file_cache, contents_name_cache, content_path_cache
  58
  59     # have we seen this contents before?
  60     if fullpath in content_file_cache:
  61         return
  62
  63     # Add the new key to the cache
  64     content_file_cache.add(fullpath)
  65
  66     # split the path into basename, and pathname
  67     (path, file)  = os.path.split(fullpath)
  68
  69     # Due to performance reasons, we need to get the entire filelists table
  70     # sorted first before we can do assiocation tables.
  71     if path not in content_path_cache:
  72         content_path_cache.add(path)
  73
  74     if file not in content_name_cache:
  75         content_name_cache.add(file)
  76
  77     return
  78
  79 ################################################################################
  80
  81 def import_contents(suites):
  82     global projectB, Cnf
  83
  84     # Start transaction
  85     projectB.query("BEGIN WORK")
  86
  87     # Needed to make sure postgreSQL doesn't freak out on some of the data
  88     projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
  89
  90     # Precache everything
  91     #print "Precaching binary information, this will take a few moments ..."
  92     #database.preload_binary_id_cache()
  93
  94     # Prep regexs
  95     line_regex = re.compile(r'^(.+?)\s+(\S+)$')
  96     pkg_regex = re.compile(r'(\S+)/(\S+)$')
  97     file_regex = re.compile('^FILE')
  98
  99     # Get our suites, and the architectures
 100     for s in suites:
 101         suite_id = database.get_suite_id(s)
 102
 103         arch_list = [ ]
 104         for r in Cnf.ValueList("Suite::%s::Architectures" % (s)):
 105             if r != "source" and r != "all":
 106                 arch_list.append(r)
 107
 108         arch_all_id = database.get_architecture_id("all")
 109
 110         for arch in arch_list:
 111             print "Processing %s/%s" % (s, arch)
 112             arch_id = database.get_architecture_id(arch)
 113
 114             try:
 115                 f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch), "r")
 116
 117             except:
 118                 print "Unable to open dists/%s/Contents-%s.gz" % (s, arch)
 119                 print "Skipping ..."
 120                 continue
 121
 122             # Get line count
 123             lines = f.readlines()
 124             num_of_lines = len(lines)
 125
 126             # Ok, the file cursor is at the first entry, now comes the fun 'lets parse' bit
 127             lines_processed = 0
 128             found_header = False
 129
 130             for line in lines:
 131                 if found_header == False:
 132                     if not line:
 133                         print "Unable to find end of Contents-%s.gz header!" % (arch)
 134                         sys.exit(255)
 135
 136                     lines_processed += 1
 137                     if file_regex.match(line):
 138                         found_header = True
 139                     continue
 140
 141                 # The format is simple enough, *filename*, *section/package1,section/package2,etc*
 142                 # Each file appears once per Contents file, so first, use some regex match
 143                 # to split the two bits
 144
 145                 # Print out progress bar
 146                 print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, ((float(lines_processed)/num_of_lines)*100)),
 147
 148                 # regex lifted from packages.d.o code
 149                 matchs = line_regex.findall(line)
 150                 filename = matchs[0][0]
 151                 packages = matchs[0][1].split(',')
 152
 153
 154                 cache_content_path(filename)
 155
 156                 # Iterate through each file's packages
 157                 #for package in packages:
 158                 #    matchs = pkg_regex.findall(package)
 159
 160                     # Needed since the DB is unicode, and these files
 161                     # are ASCII
 162                 #    section_name = matchs[0][0]
 163                 #    package_name = matchs[0][1]
 164
 165                     #section_id = database.get_section_id(section_name)
 166                     #package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
 167
 168                #     if package_id == None:
 169                         # This can happen if the Contents file refers to a non-existant package
 170                         # it seems Contents sometimes can be stale due to use of caches (i.e., hurd-i386)
 171                         # hurd-i386 was removed from the archive, but its Contents file still exists
 172                         # and is seemingly still updated. The sane thing to do is skip it and continue
 173                #         continue
 174
 175
 176                 lines_processed += 1
 177
 178             print "" # newline since the Progress bar doesn't print one
 179             f.close()
 180
 181     # Commit work
 182
 183     print "Committing to database ..."
 184     projectB.query("COPY content_file_names (file) FROM STDIN")
 185
 186     for line in content_name_cache:
 187         projectB.putline("%s\n" % (line))
 188
 189     projectB.endcopy()
 190
 191     projectB.query("COMMIT")
 192
 193 ################################################################################
 194
 195 def main ():
 196     global Cnf, projectB, out
 197     out = sys.stdout
 198
 199     Cnf = utils.get_conf()
 200
 201     Arguments = [('h',"help","Import-Contents::Options::Help"),
 202                  ('s',"suite","Import-Contents::Options::Suite","HasArg"),
 203                 ]
 204
 205     for i in [ "help", "suite" ]:
 206         if not Cnf.has_key("Import-Contents::Options::%s" % (i)):
 207             Cnf["Import-Contents::Options::%s" % (i)] = ""
 208
 209     suites = apt_pkg.ParseCommandLine(Cnf,Arguments,sys.argv)
 210     Options = Cnf.SubTree("Import-Contents::Options")
 211
 212     if Options["Help"]:
 213         usage()
 214
 215     if Options["Suite"]:
 216         suites = utils.split_args(Options["Suite"])
 217     else:
 218         suites = Cnf.SubTree("Suite").List()
 219
 220     projectB = pg.connect(Cnf["DB::Name"], Cnf["DB::Host"], int(Cnf["DB::Port"]))
 221     database.init(Cnf, projectB)
 222
 223     import_contents(suites)
 224
 225 #######################################################################################
 226
 227 if __name__ == '__main__':
 228     main()