dak/import_contents.py

   1 #!/usr/bin/env python
   2 # Import contents files
   3
   4 # Copyright (C) 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
   5
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19
  20 ################################################################################
  21 ################################################################################
  22
  23 ################################################################################
  24
  25 import sys, os, popen2, tempfile, stat, time, pg
  26 import re, gzip, apt_pkg
  27 from daklib import database, utils
  28 from daklib.dak_exceptions import *
  29
  30 ################################################################################
  31
  32 Cnf = None
  33 projectB = None
  34 out = None
  35 AptCnf = None
  36 has_opened_temp_file_lists = False
  37 content_path_file = ""
  38 content_name_file = ""
  39 content_file_cache = {}
  40 content_name_cache = {}
  41 content_path_cache = {}
  42
  43 ################################################################################
  44
  45 def usage (exit_code=0):
  46     print """Usage: dak import-contents
  47 Import Contents files
  48
  49  -h, --help                 show this help and exit
  50  -s, --suite=SUITE         only write file lists for this suite
  51 """
  52     sys.exit(exit_code)
  53
  54 ################################################################################
  55
  56 def cache_content_path(fullpath):
  57     global content_file_cache, contents_name_cache, content_path_cache
  58     global content_path_file, content_name_file, has_opened_temp_file_lists
  59
  60     # have we seen this contents before?
  61     if content_file_cache.has_key(fullpath):
  62         return
  63
  64     # split the path into basename, and pathname
  65     (path, file)  = os.path.split(fullpath)
  66
  67     # Due to performance reasons, we need to get the entire filelists table
  68     # sorted first before we can do assiocation tables.
  69     if has_opened_temp_file_lists == False:
  70         content_path_file = open("/tmp/content_file_path.tmp", "w")
  71         content_name_file = open("/tmp/content_name_path.tmp", "w")
  72         has_opened_temp_file_lists = True
  73
  74     if not content_path_cache.has_key(path):
  75         content_path_file.write("DEFAULT %s\n" % (path))
  76         content_path_cache[path] = 1
  77
  78     if not content_name_cache.has_key(file):
  79         content_name_file.write("DEFAULT %s\n" % (file))
  80         content_name_cache[file] = 1
  81     return
  82
  83 ################################################################################
  84
  85 def import_contents(suites):
  86     global projectB, Cnf
  87
  88     # Start transaction
  89     projectB.query("BEGIN WORK")
  90
  91     # Needed to make sure postgreSQL doesn't freak out on some of the data
  92     projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
  93
  94     # Precache everything
  95     #print "Precaching binary information, this will take a few moments ..."
  96     #database.preload_binary_id_cache()
  97
  98     # Prep regexs
  99     line_regex = re.compile(r'^(.+?)\s+(\S+)$')
 100     pkg_regex = re.compile(r'(\S+)/(\S+)$')
 101     file_regex = re.compile('^FILE')
 102
 103     # Get our suites, and the architectures
 104     for s in suites:
 105         suite_id = database.get_suite_id(s)
 106
 107         q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
 108
 109         arch_list = [ ]
 110         for r in q.getresult():
 111             if r[1] != "source" and r[1] != "all":
 112                 arch_list.append((r[0], r[1]))
 113
 114         arch_all_id = database.get_architecture_id("all")
 115
 116         for arch in arch_list:
 117             print "Processing %s/%s" % (s, arch[1])
 118             arch_id = database.get_architecture_id(arch[1])
 119
 120             try:
 121                 f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
 122
 123             except:
 124                 print "Unable to open dists/%s/Contents-%s.gz" % (s, arch[1])
 125                 print "Skipping ..."
 126                 continue
 127
 128             # Get line count
 129             lines = f.readlines()
 130             num_of_lines = len(lines)
 131
 132             # Ok, the file cursor is at the first entry, now comes the fun 'lets parse' bit
 133             lines_processed = 0
 134             found_header = False
 135
 136             for line in lines:
 137                 if found_header == False:
 138                     if not line:
 139                         print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
 140                         sys.exit(255)
 141
 142                     lines_processed += 1
 143                     if file_regex.match(line):
 144                         found_header = True
 145                     continue
 146
 147                 # The format is simple enough, *filename*, *section/package1,section/package2,etc*
 148                 # Each file appears once per Contents file, so first, use some regex match
 149                 # to split the two bits
 150
 151                 # Print out progress bar
 152                 print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, ((float(lines_processed)/num_of_lines)*100)),
 153
 154                 # regex lifted from packages.d.o code
 155                 matchs = line_regex.findall(line)
 156                 filename = matchs[0][0]
 157                 packages = matchs[0][1].split(',')
 158
 159
 160                 cache_content_path(filename)
 161
 162                 # Iterate through each file's packages
 163                 #for package in packages:
 164                 #    matchs = pkg_regex.findall(package)
 165
 166                     # Needed since the DB is unicode, and these files
 167                     # are ASCII
 168                 #    section_name = matchs[0][0]
 169                 #    package_name = matchs[0][1]
 170
 171                     #section_id = database.get_section_id(section_name)
 172                     #package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
 173
 174                #     if package_id == None:
 175                         # This can happen if the Contents file refers to a non-existant package
 176                         # it seems Contents sometimes can be stale due to use of caches (i.e., hurd-i386)
 177                         # hurd-i386 was removed from the archive, but its Contents file still exists
 178                         # and is seemingly still updated. The sane thing to do is skip it and continue
 179                #         continue
 180
 181
 182                 lines_processed += 1
 183
 184             print "" # newline since the Progress bar doesn't print one
 185             f.close()
 186
 187     # Commit work
 188
 189     content_name_file.close()
 190     content_path_file.close()
 191
 192     print "Committing to database ..."
 193     projectB.query("COMMIT")
 194
 195 ################################################################################
 196
 197 def main ():
 198     global Cnf, projectB, out
 199     out = sys.stdout
 200
 201     Cnf = utils.get_conf()
 202
 203     Arguments = [('h',"help","Import-Contents::Options::Help"),
 204                  ('s',"suite","Import-Contents::Options::Suite","HasArg"),
 205                 ]
 206
 207     for i in [ "help", "suite" ]:
 208         if not Cnf.has_key("Import-Contents::Options::%s" % (i)):
 209             Cnf["Import-Contents::Options::%s" % (i)] = ""
 210
 211     suites = apt_pkg.ParseCommandLine(Cnf,Arguments,sys.argv)
 212     Options = Cnf.SubTree("Import-Contents::Options")
 213
 214     if Options["Help"]:
 215         usage()
 216
 217     if Options["Suite"]:
 218         suites = utils.split_args(Options["Suite"])
 219     else:
 220         suites = Cnf.SubTree("Suite").List()
 221
 222     projectB = pg.connect(Cnf["DB::Name"], Cnf["DB::Host"], int(Cnf["DB::Port"]))
 223     database.init(Cnf, projectB)
 224
 225     import_contents(suites)
 226
 227 #######################################################################################
 228
 229 if __name__ == '__main__':
 230     main()