"""
Helper code for contents generation.

@contact: Debian FTPMaster <ftpmaster@debian.org>
@copyright: 2011 Torsten Werner <twerner@debian.org>
@license: GNU General Public License version 2 or later
"""

################################################################################

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

################################################################################

from daklib.dbconn import *
from daklib.config import Config
from daklib.filewriter import BinaryContentsFileWriter, SourceContentsFileWriter

from multiprocessing import Pool
from shutil import rmtree
from subprocess import Popen, PIPE, check_call
from tempfile import mkdtemp

import os.path
import signal

class BinaryContentsWriter(object):
    '''
    BinaryContentsWriter writes the Contents-$arch.gz files.
    '''
    def __init__(self, suite, architecture, overridetype, component):
        self.suite = suite
        self.architecture = architecture
        self.overridetype = overridetype
        self.component = component
        self.session = suite.session()

    def query(self):
        '''
        Returns a query object that does most of the work.
        '''
        overridesuite = self.suite
        if self.suite.overridesuite is not None:
            overridesuite = get_suite(self.suite.overridesuite, self.session)
        params = {
            'suite':         self.suite.suite_id,
            'overridesuite': overridesuite.suite_id,
            'component':     self.component.component_id,
            'arch_all':      get_architecture('all', self.session).arch_id,
            'arch':          self.architecture.arch_id,
            'type_id':       self.overridetype.overridetype_id,
            'type':          self.overridetype.overridetype,
        }
        sql = '''
create temp table newest_binaries (
    id integer primary key,
    package text);

create index newest_binaries_by_package on newest_binaries (package);

insert into newest_binaries (id, package)
    select distinct on (package) id, package from binaries
        where type = :type and
            (architecture = :arch_all or architecture = :arch) and
            id in (select bin from bin_associations where suite = :suite)
        order by package, version desc;

with unique_override as
    (select o.package, s.section
        from override o, section s
        where o.suite = :overridesuite and o.type = :type_id and o.section = s.id and
        o.component = :component)

select bc.file, string_agg(o.section || '/' || b.package, ',' order by b.package) as pkglist
    from newest_binaries b, bin_contents bc, unique_override o
    where b.id = bc.binary_id and o.package = b.package
    group by bc.file'''
        return self.session.query("file", "pkglist").from_statement(sql). \
            params(params)

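    # Each row returned by query() pairs a path with a comma separated list of
    # 'section/package' entries for the newest binaries owning it. An
    # illustrative row (values made up):
    #
    #   ('usr/bin/gzip', 'utils/gzip')
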
    def formatline(self, filename, package_list):
        '''
        Returns a formatted string for the filename argument.
        '''
        return "%-55s %s\n" % (filename, package_list)

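    # Worked example (illustrative values): formatline('usr/bin/gzip', 'utils/gzip')
    # returns 'usr/bin/gzip' left-justified in a 55 character wide column,
    # followed by a space, 'utils/gzip' and a newline.
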
    def fetch(self):
        '''
        Yields a new line of the Contents-$arch.gz file in filename order.
        '''
        for filename, package_list in self.query().yield_per(100):
            yield self.formatline(filename, package_list)
        # end transaction to return connection to pool
        self.session.rollback()

    def get_list(self):
        '''
        Returns a list of lines for the Contents-$arch.gz file.
        '''
        return [item for item in self.fetch()]

    def writer(self):
        '''
        Returns a writer object.
        '''
        values = {
            'suite':        self.suite.suite_name,
            'component':    self.component.component_name,
            'debtype':      self.overridetype.overridetype,
            'architecture': self.architecture.arch_string,
        }
        return BinaryContentsFileWriter(**values)

    def get_header(self):
        '''
        Returns the header for the Contents files as a string.
        '''
        filename = os.path.join(Config()['Dir::Templates'], 'contents')
        with open(filename) as header_file:
            return header_file.read()

    def write_file(self):
        '''
        Write the output file.
        '''
        writer = self.writer()
        file = writer.open()
        file.write(self.get_header())
        for item in self.fetch():
            file.write(item)
        writer.close()

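# Illustrative usage sketch (not executed by this module; the suite,
# architecture and component names are placeholders, and the get_component()
# helper from daklib.dbconn is assumed to be available alongside the getters
# already used above):
#
#   session = DBConn().session()
#   suite = get_suite('unstable', session)
#   architecture = get_architecture('amd64', session)
#   overridetype = get_override_type('deb', session)
#   component = get_component('main', session)
#   BinaryContentsWriter(suite, architecture, overridetype, component).write_file()
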
class SourceContentsWriter(object):
    '''
    SourceContentsWriter writes the Contents-source.gz files.
    '''
    def __init__(self, suite, component):
        self.suite = suite
        self.component = component
        self.session = suite.session()

    def query(self):
        '''
        Returns a query object that does most of the work.
        '''
        params = {
            'suite_id':     self.suite.suite_id,
            'component_id': self.component.component_id,
        }
        sql = '''
create temp table newest_sources (
    id integer primary key,
    source text);

create index sources_binaries_by_source on newest_sources (source);

insert into newest_sources (id, source)
    select distinct on (source) s.id, s.source from source s
        join files f on f.id = s.file
        join location l on l.id = f.location
        where s.id in (select source from src_associations where suite = :suite_id)
            and l.component = :component_id
        order by source, version desc;

select sc.file, string_agg(s.source, ',' order by s.source) as pkglist
    from newest_sources s, src_contents sc
    where s.id = sc.source_id group by sc.file'''
        return self.session.query("file", "pkglist").from_statement(sql). \
            params(params)

    def formatline(self, filename, package_list):
        '''
        Returns a formatted string for the filename argument.
        '''
        return "%s\t%s\n" % (filename, package_list)

    def fetch(self):
        '''
        Yields a new line of the Contents-source.gz file in filename order.
        '''
        for filename, package_list in self.query().yield_per(100):
            yield self.formatline(filename, package_list)
        # end transaction to return connection to pool
        self.session.rollback()

    def get_list(self):
        '''
        Returns a list of lines for the Contents-source.gz file.
        '''
        return [item for item in self.fetch()]

    def writer(self):
        '''
        Returns a writer object.
        '''
        values = {
            'suite':     self.suite.suite_name,
            'component': self.component.component_name
        }
        return SourceContentsFileWriter(**values)

    def write_file(self):
        '''
        Write the output file.
        '''
        writer = self.writer()
        file = writer.open()
        for item in self.fetch():
            file.write(item)
        writer.close()

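# Illustrative usage sketch (same assumptions as the BinaryContentsWriter
# example above):
#
#   session = DBConn().session()
#   suite = get_suite('unstable', session)
#   component = get_component('main', session)
#   SourceContentsWriter(suite, component).write_file()
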
def binary_helper(suite_id, arch_id, overridetype_id, component_id):
    '''
    This function is called in a new subprocess and multiprocessing wants a top
    level function.
    '''
    session = DBConn().session(work_mem = 1000)
    suite = Suite.get(suite_id, session)
    architecture = Architecture.get(arch_id, session)
    overridetype = OverrideType.get(overridetype_id, session)
    component = Component.get(component_id, session)
    log_message = [suite.suite_name, architecture.arch_string, \
        overridetype.overridetype, component.component_name]
    contents_writer = BinaryContentsWriter(suite, architecture, overridetype, component)
    contents_writer.write_file()
    session.close()
    return log_message

def source_helper(suite_id, component_id):
    '''
    This function is called in a new subprocess and multiprocessing wants a top
    level function.
    '''
    session = DBConn().session(work_mem = 1000)
    suite = Suite.get(suite_id, session)
    component = Component.get(component_id, session)
    log_message = [suite.suite_name, 'source', component.component_name]
    contents_writer = SourceContentsWriter(suite, component)
    contents_writer.write_file()
    session.close()
    return log_message

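# Both helpers are plain top level functions taking integer ids so that
# multiprocessing can pickle them for apply_async(). An illustrative dispatch,
# equivalent to what ContentsWriter.write_all() does below:
#
#   pool = Pool()
#   pool.apply_async(source_helper, (suite_id, component_id))
#   pool.apply_async(binary_helper, (suite_id, arch_id, deb_id, component_id))
#   pool.close()
#   pool.join()
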
class ContentsWriter(object):
    '''
    Loop over all suites, architectures, overridetypes, and components to write
    all contents files.
    '''
    @classmethod
    def log_result(class_, result):
        '''
        Writes a result message to the logfile.
        '''
        class_.logger.log(result)

    @classmethod
    def write_all(class_, logger, suite_names = [], component_names = [], force = False):
        '''
        Writes all Contents files for suites in list suite_names which defaults
        to all 'touchable' suites if not specified explicitly. Untouchable
        suites will be included if the force argument is set to True.
        '''
        class_.logger = logger
        session = DBConn().session()
        suite_query = session.query(Suite)
        if len(suite_names) > 0:
            suite_query = suite_query.filter(Suite.suite_name.in_(suite_names))
        component_query = session.query(Component)
        if len(component_names) > 0:
            component_query = component_query.filter(Component.component_name.in_(component_names))
        if not force:
            suite_query = suite_query.filter_by(untouchable = False)
        deb_id = get_override_type('deb', session).overridetype_id
        udeb_id = get_override_type('udeb', session).overridetype_id
        pool = Pool()
        for suite in suite_query:
            suite_id = suite.suite_id
            for component in component_query:
                component_id = component.component_id
                # handle source packages
                pool.apply_async(source_helper, (suite_id, component_id),
                    callback = class_.log_result)
                for architecture in suite.get_architectures(skipsrc = True, skipall = True):
                    arch_id = architecture.arch_id
                    # handle 'deb' packages
                    pool.apply_async(binary_helper, (suite_id, arch_id, deb_id, component_id), \
                        callback = class_.log_result)
                    # handle 'udeb' packages
                    pool.apply_async(binary_helper, (suite_id, arch_id, udeb_id, component_id), \
                        callback = class_.log_result)
        pool.close()
        pool.join()
        session.close()

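# Illustrative invocation sketch; `logger` can be any object that exposes a
# log(message) method (dak itself passes a daklib.daklog logger), and the
# suite/component names are placeholders:
#
#   ContentsWriter.write_all(logger, suite_names = ['unstable'],
#                            component_names = ['main'], force = False)
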
class BinaryContentsScanner(object):
    '''
    BinaryContentsScanner provides a threadsafe method scan() to scan the
    contents of a DBBinary object.
    '''
    def __init__(self, binary_id):
        '''
        The argument binary_id is the id of the DBBinary object that
        shall be scanned.
        '''
        self.binary_id = binary_id

    def scan(self, dummy_arg = None):
        '''
        This method does the actual scan and fills in the associated BinContents
        property. It commits any changes to the database. The argument dummy_arg
        is ignored but needed by our threadpool implementation.
        '''
        session = DBConn().session()
        binary = session.query(DBBinary).get(self.binary_id)
        fileset = set(binary.scan_contents())
        if len(fileset) == 0:
            fileset.add('EMPTY_PACKAGE')
        for filename in fileset:
            binary.contents.append(BinContents(file = filename))
        session.commit()
        session.close()

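    # Illustrative single-package use (binary_id is a placeholder primary key
    # of a row in the binaries table):
    #
    #   BinaryContentsScanner(binary_id).scan()
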
    @classmethod
    def scan_all(class_, limit = None):
        '''
        The class method scan_all() scans all binaries using multiple threads.
        The number of binaries to be scanned can be limited with the limit
        argument. Returns the number of processed and remaining packages as a
        dict.
        '''
        session = DBConn().session()
        query = session.query(DBBinary).filter(DBBinary.contents == None)
        # keep the bound count method; the remaining count is taken after the
        # pool has finished
        remaining = query.count
        if limit is not None:
            query = query.limit(limit)
        processed = query.count()
        pool = Pool()
        for binary in query.yield_per(100):
            pool.apply_async(binary_scan_helper, (binary.binary_id, ))
        pool.close()
        pool.join()
        remaining = remaining()
        session.close()
        return { 'processed': processed, 'remaining': remaining }

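# Illustrative call (numbers made up): BinaryContentsScanner.scan_all(limit = 100)
# might return {'processed': 100, 'remaining': 4223} if 4323 binaries still
# lacked contents before the run.
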
def binary_scan_helper(binary_id):
    '''
    This function runs in a subprocess.
    '''
    scanner = BinaryContentsScanner(binary_id)
    scanner.scan()

def subprocess_setup():
    # Python installs a SIGPIPE handler by default. This is usually not what
    # non-Python subprocesses expect.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

class UnpackedSource(object):
    '''
    UnpackedSource extracts a source package into a temporary location and
    gives you some convenient functions for accessing it.
    '''
    def __init__(self, dscfilename):
        '''
        The dscfilename is a name of a DSC file that will be extracted.
        '''
        temp_directory = mkdtemp(dir = Config()['Dir::TempPath'])
        self.root_directory = os.path.join(temp_directory, 'root')
        command = ('dpkg-source', '--no-copy', '--no-check', '-q', '-x',
            dscfilename, self.root_directory)
        check_call(command, preexec_fn = subprocess_setup)

    def get_root_directory(self):
        '''
        Returns the name of the package's root directory which is the directory
        where the debian subdirectory is located.
        '''
        return self.root_directory

    def get_changelog_file(self):
        '''
        Returns a file object for debian/changelog or None if no such file exists.
        '''
        changelog_name = os.path.join(self.root_directory, 'debian', 'changelog')
        try:
            return open(changelog_name)
        except IOError:
            return None

    def get_all_filenames(self):
        '''
        Returns an iterator over all filenames. The filenames will be relative
        to the root directory.
        '''
        skip = len(self.root_directory) + 1
        for root, _, files in os.walk(self.root_directory):
            for name in files:
                yield os.path.join(root[skip:], name)

    def cleanup(self):
        '''
        Removes all temporary files.
        '''
        if self.root_directory is None:
            return
        parent_directory = os.path.dirname(self.root_directory)
        rmtree(parent_directory)
        self.root_directory = None

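# Illustrative usage sketch ('/path/to/package.dsc' is a placeholder):
#
#   unpacked = UnpackedSource('/path/to/package.dsc')
#   changelog = unpacked.get_changelog_file()
#   for name in unpacked.get_all_filenames():
#       print name
#   unpacked.cleanup()
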
class SourceContentsScanner(object):
    '''
    SourceContentsScanner provides a method scan() to scan the contents of a
    DBSource object.
    '''
    def __init__(self, source_id):
        '''
        The argument source_id is the id of the DBSource object that
        shall be scanned.
        '''
        self.source_id = source_id

    def scan(self):
        '''
        This method does the actual scan and fills in the associated SrcContents
        property. It commits any changes to the database.
        '''
        session = DBConn().session()
        source = session.query(DBSource).get(self.source_id)
        fileset = set(source.scan_contents())
        for filename in fileset:
            source.contents.append(SrcContents(file = filename))
        session.commit()
        session.close()

    @classmethod
    def scan_all(class_, limit = None):
        '''
        The class method scan_all() scans all sources using multiple processes.
        The number of sources to be scanned can be limited with the limit
        argument. Returns the number of processed and remaining packages as a
        dict.
        '''
        session = DBConn().session()
        query = session.query(DBSource).filter(DBSource.contents == None)
        # keep the bound count method; the remaining count is taken after the
        # pool has finished
        remaining = query.count
        if limit is not None:
            query = query.limit(limit)
        processed = query.count()
        pool = Pool()
        for source in query.yield_per(100):
            pool.apply_async(source_scan_helper, (source.source_id, ))
        pool.close()
        pool.join()
        remaining = remaining()
        session.close()
        return { 'processed': processed, 'remaining': remaining }

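# Illustrative calls (numbers made up): SourceContentsScanner(source_id).scan()
# scans a single source package, and SourceContentsScanner.scan_all(limit = 50)
# might return {'processed': 50, 'remaining': 812}.
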
def source_scan_helper(source_id):
    '''
    This function runs in a subprocess.
    '''
    scanner = SourceContentsScanner(source_id)
    scanner.scan()