daklib/contents.py

   1 #!/usr/bin/env python
   2 """
   3 Helper code for contents generation.
   4
   5 @contact: Debian FTPMaster <ftpmaster@debian.org>
   6 @copyright: 2011 Torsten Werner <twerner@debian.org>
   7 @license: GNU General Public License version 2 or later
   8 """
   9
  10 ################################################################################
  11
  12 # This program is free software; you can redistribute it and/or modify
  13 # it under the terms of the GNU General Public License as published by
  14 # the Free Software Foundation; either version 2 of the License, or
  15 # (at your option) any later version.
  16
  17 # This program is distributed in the hope that it will be useful,
  18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 # GNU General Public License for more details.
  21
  22 # You should have received a copy of the GNU General Public License
  23 # along with this program; if not, write to the Free Software
  24 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  25
  26 ################################################################################
  27
  28 from daklib.dbconn import *
  29 from daklib.config import Config
  30 from daklib.filewriter import BinaryContentsFileWriter, SourceContentsFileWriter
  31
  32 from multiprocessing import Pool
  33 from shutil import rmtree
  34 from tempfile import mkdtemp
  35
  36 import daklib.daksubprocess
  37 import os.path
  38
  39 class BinaryContentsWriter(object):
  40     '''
  41     BinaryContentsWriter writes the Contents-$arch.gz files.
  42     '''
  43     def __init__(self, suite, architecture, overridetype, component):
  44         self.suite = suite
  45         self.architecture = architecture
  46         self.overridetype = overridetype
  47         self.component = component
  48         self.session = suite.session()
  49
  50     def query(self):
  51         '''
  52         Returns a query object that is doing most of the work.
  53         '''
  54         overridesuite = self.suite
  55         if self.suite.overridesuite is not None:
  56             overridesuite = get_suite(self.suite.overridesuite, self.session)
  57         params = {
  58             'suite':         self.suite.suite_id,
  59             'overridesuite': overridesuite.suite_id,
  60             'component':     self.component.component_id,
  61             'arch_all':      get_architecture('all', self.session).arch_id,
  62             'arch':          self.architecture.arch_id,
  63             'type_id':       self.overridetype.overridetype_id,
  64             'type':          self.overridetype.overridetype,
  65         }
  66
  67         sql_create_temp = '''
  68 create temp table newest_binaries (
  69     id integer primary key,
  70     package text);
  71
  72 create index newest_binaries_by_package on newest_binaries (package);
  73
  74 insert into newest_binaries (id, package)
  75     select distinct on (package) id, package from binaries
  76         where type = :type and
  77             (architecture = :arch_all or architecture = :arch) and
  78             id in (select bin from bin_associations where suite = :suite)
  79         order by package, version desc;'''
  80         self.session.execute(sql_create_temp, params=params)
  81
  82         sql = '''
  83 with
  84
  85 unique_override as
  86     (select o.package, s.section
  87         from override o, section s
  88         where o.suite = :overridesuite and o.type = :type_id and o.section = s.id and
  89         o.component = :component)
  90
  91 select bc.file, string_agg(o.section || '/' || b.package, ',' order by b.package) as pkglist
  92     from newest_binaries b, bin_contents bc, unique_override o
  93     where b.id = bc.binary_id and o.package = b.package
  94     group by bc.file'''
  95
  96         return self.session.query("file", "pkglist").from_statement(sql). \
  97             params(params)
  98
  99     def formatline(self, filename, package_list):
 100         '''
 101         Returns a formatted string for the filename argument.
 102         '''
 103         return "%-55s %s\n" % (filename, package_list)
 104
 105     def fetch(self):
 106         '''
 107         Yields a new line of the Contents-$arch.gz file in filename order.
 108         '''
 109         for filename, package_list in self.query().yield_per(100):
 110             yield self.formatline(filename, package_list)
 111         # end transaction to return connection to pool
 112         self.session.rollback()
 113
 114     def get_list(self):
 115         '''
 116         Returns a list of lines for the Contents-$arch.gz file.
 117         '''
 118         return [item for item in self.fetch()]
 119
 120     def writer(self):
 121         '''
 122         Returns a writer object.
 123         '''
 124         values = {
 125             'archive':      self.suite.archive.path,
 126             'suite':        self.suite.suite_name,
 127             'component':    self.component.component_name,
 128             'debtype':      self.overridetype.overridetype,
 129             'architecture': self.architecture.arch_string,
 130         }
 131         return BinaryContentsFileWriter(**values)
 132
 133     def get_header(self):
 134         '''
 135         Returns the header for the Contents files as a string.
 136         '''
 137         filename = os.path.join(Config()['Dir::Templates'], 'contents')
 138         with open(filename) as header_file:
 139             return header_file.read()
 140
 141     def write_file(self):
 142         '''
 143         Write the output file.
 144         '''
 145         writer = self.writer()
 146         file = writer.open()
 147         file.write(self.get_header())
 148         for item in self.fetch():
 149             file.write(item)
 150         writer.close()
 151
 152
 153 class SourceContentsWriter(object):
 154     '''
 155     SourceContentsWriter writes the Contents-source.gz files.
 156     '''
 157     def __init__(self, suite, component):
 158         self.suite = suite
 159         self.component = component
 160         self.session = suite.session()
 161
 162     def query(self):
 163         '''
 164         Returns a query object that is doing most of the work.
 165         '''
 166         params = {
 167             'suite_id':     self.suite.suite_id,
 168             'component_id': self.component.component_id,
 169         }
 170
 171         sql_create_temp = '''
 172 create temp table newest_sources (
 173     id integer primary key,
 174     source text);
 175
 176 create index sources_binaries_by_source on newest_sources (source);
 177
 178 insert into newest_sources (id, source)
 179     select distinct on (source) s.id, s.source from source s
 180         join files_archive_map af on s.file = af.file_id
 181         where s.id in (select source from src_associations where suite = :suite_id)
 182             and af.component_id = :component_id
 183         order by source, version desc;'''
 184         self.session.execute(sql_create_temp, params=params)
 185
 186         sql = '''
 187 select sc.file, string_agg(s.source, ',' order by s.source) as pkglist
 188     from newest_sources s, src_contents sc
 189     where s.id = sc.source_id group by sc.file'''
 190
 191         return self.session.query("file", "pkglist").from_statement(sql). \
 192             params(params)
 193
 194     def formatline(self, filename, package_list):
 195         '''
 196         Returns a formatted string for the filename argument.
 197         '''
 198         return "%s\t%s\n" % (filename, package_list)
 199
 200     def fetch(self):
 201         '''
 202         Yields a new line of the Contents-source.gz file in filename order.
 203         '''
 204         for filename, package_list in self.query().yield_per(100):
 205             yield self.formatline(filename, package_list)
 206         # end transaction to return connection to pool
 207         self.session.rollback()
 208
 209     def get_list(self):
 210         '''
 211         Returns a list of lines for the Contents-source.gz file.
 212         '''
 213         return [item for item in self.fetch()]
 214
 215     def writer(self):
 216         '''
 217         Returns a writer object.
 218         '''
 219         values = {
 220             'archive':   self.suite.archive.path,
 221             'suite':     self.suite.suite_name,
 222             'component': self.component.component_name
 223         }
 224         return SourceContentsFileWriter(**values)
 225
 226     def write_file(self):
 227         '''
 228         Write the output file.
 229         '''
 230         writer = self.writer()
 231         file = writer.open()
 232         for item in self.fetch():
 233             file.write(item)
 234         writer.close()
 235
 236
 237 def binary_helper(suite_id, arch_id, overridetype_id, component_id):
 238     '''
 239     This function is called in a new subprocess and multiprocessing wants a top
 240     level function.
 241     '''
 242     session = DBConn().session(work_mem = 1000)
 243     suite = Suite.get(suite_id, session)
 244     architecture = Architecture.get(arch_id, session)
 245     overridetype = OverrideType.get(overridetype_id, session)
 246     component = Component.get(component_id, session)
 247     log_message = [suite.suite_name, architecture.arch_string, \
 248         overridetype.overridetype, component.component_name]
 249     contents_writer = BinaryContentsWriter(suite, architecture, overridetype, component)
 250     contents_writer.write_file()
 251     session.close()
 252     return log_message
 253
 254 def source_helper(suite_id, component_id):
 255     '''
 256     This function is called in a new subprocess and multiprocessing wants a top
 257     level function.
 258     '''
 259     session = DBConn().session(work_mem = 1000)
 260     suite = Suite.get(suite_id, session)
 261     component = Component.get(component_id, session)
 262     log_message = [suite.suite_name, 'source', component.component_name]
 263     contents_writer = SourceContentsWriter(suite, component)
 264     contents_writer.write_file()
 265     session.close()
 266     return log_message
 267
 268 class ContentsWriter(object):
 269     '''
 270     Loop over all suites, architectures, overridetypes, and components to write
 271     all contents files.
 272     '''
 273     @classmethod
 274     def log_result(class_, result):
 275         '''
 276         Writes a result message to the logfile.
 277         '''
 278         class_.logger.log(result)
 279
 280     @classmethod
 281     def write_all(class_, logger, archive_names = [], suite_names = [], component_names = [], force = False):
 282         '''
 283         Writes all Contents files for suites in list suite_names which defaults
 284         to all 'touchable' suites if not specified explicitely. Untouchable
 285         suites will be included if the force argument is set to True.
 286         '''
 287         class_.logger = logger
 288         session = DBConn().session()
 289         suite_query = session.query(Suite)
 290         if len(archive_names) > 0:
 291             suite_query = suite_query.join(Suite.archive).filter(Archive.archive_name.in_(archive_names))
 292         if len(suite_names) > 0:
 293             suite_query = suite_query.filter(Suite.suite_name.in_(suite_names))
 294         component_query = session.query(Component)
 295         if len(component_names) > 0:
 296             component_query = component_query.filter(Component.component_name.in_(component_names))
 297         if not force:
 298             suite_query = suite_query.filter(Suite.untouchable == False)
 299         deb_id = get_override_type('deb', session).overridetype_id
 300         udeb_id = get_override_type('udeb', session).overridetype_id
 301         pool = Pool()
 302         for suite in suite_query:
 303             suite_id = suite.suite_id
 304             for component in component_query:
 305                 component_id = component.component_id
 306                 # handle source packages
 307                 pool.apply_async(source_helper, (suite_id, component_id),
 308                     callback = class_.log_result)
 309                 for architecture in suite.get_architectures(skipsrc = True, skipall = True):
 310                     arch_id = architecture.arch_id
 311                     # handle 'deb' packages
 312                     pool.apply_async(binary_helper, (suite_id, arch_id, deb_id, component_id), \
 313                         callback = class_.log_result)
 314                     # handle 'udeb' packages
 315                     pool.apply_async(binary_helper, (suite_id, arch_id, udeb_id, component_id), \
 316                         callback = class_.log_result)
 317         pool.close()
 318         pool.join()
 319         session.close()
 320
 321
 322 class BinaryContentsScanner(object):
 323     '''
 324     BinaryContentsScanner provides a threadsafe method scan() to scan the
 325     contents of a DBBinary object.
 326     '''
 327     def __init__(self, binary_id):
 328         '''
 329         The argument binary_id is the id of the DBBinary object that
 330         should be scanned.
 331         '''
 332         self.binary_id = binary_id
 333
 334     def scan(self, dummy_arg = None):
 335         '''
 336         This method does the actual scan and fills in the associated BinContents
 337         property. It commits any changes to the database. The argument dummy_arg
 338         is ignored but needed by our threadpool implementation.
 339         '''
 340         session = DBConn().session()
 341         binary = session.query(DBBinary).get(self.binary_id)
 342         fileset = set(binary.scan_contents())
 343         if len(fileset) == 0:
 344             fileset.add('EMPTY_PACKAGE')
 345         for filename in fileset:
 346             binary.contents.append(BinContents(file = filename))
 347         session.commit()
 348         session.close()
 349
 350     @classmethod
 351     def scan_all(class_, limit = None):
 352         '''
 353         The class method scan_all() scans all binaries using multiple threads.
 354         The number of binaries to be scanned can be limited with the limit
 355         argument. Returns the number of processed and remaining packages as a
 356         dict.
 357         '''
 358         session = DBConn().session()
 359         query = session.query(DBBinary).filter(DBBinary.contents == None)
 360         remaining = query.count
 361         if limit is not None:
 362             query = query.limit(limit)
 363         processed = query.count()
 364         pool = Pool()
 365         for binary in query.yield_per(100):
 366             pool.apply_async(binary_scan_helper, (binary.binary_id, ))
 367         pool.close()
 368         pool.join()
 369         remaining = remaining()
 370         session.close()
 371         return { 'processed': processed, 'remaining': remaining }
 372
 373 def binary_scan_helper(binary_id):
 374     '''
 375     This function runs in a subprocess.
 376     '''
 377     scanner = BinaryContentsScanner(binary_id)
 378     scanner.scan()
 379
 380 class UnpackedSource(object):
 381     '''
 382     UnpackedSource extracts a source package into a temporary location and
 383     gives you some convinient function for accessing it.
 384     '''
 385     def __init__(self, dscfilename, tmpbasedir=None):
 386         '''
 387         The dscfilename is a name of a DSC file that will be extracted.
 388         '''
 389         basedir = tmpbasedir if tmpbasedir else Config()['Dir::TempPath']
 390         temp_directory = mkdtemp(dir = basedir)
 391         self.root_directory = os.path.join(temp_directory, 'root')
 392         command = ('dpkg-source', '--no-copy', '--no-check', '-q', '-x',
 393             dscfilename, self.root_directory)
 394         daklib.daksubprocess.check_call(command)
 395
 396     def get_root_directory(self):
 397         '''
 398         Returns the name of the package's root directory which is the directory
 399         where the debian subdirectory is located.
 400         '''
 401         return self.root_directory
 402
 403     def get_changelog_file(self):
 404         '''
 405         Returns a file object for debian/changelog or None if no such file exists.
 406         '''
 407         changelog_name = os.path.join(self.root_directory, 'debian', 'changelog')
 408         try:
 409             return open(changelog_name)
 410         except IOError:
 411             return None
 412
 413     def get_all_filenames(self):
 414         '''
 415         Returns an iterator over all filenames. The filenames will be relative
 416         to the root directory.
 417         '''
 418         skip = len(self.root_directory) + 1
 419         for root, _, files in os.walk(self.root_directory):
 420             for name in files:
 421                 yield os.path.join(root[skip:], name)
 422
 423     def cleanup(self):
 424         '''
 425         Removes all temporary files.
 426         '''
 427         if self.root_directory is None:
 428             return
 429         parent_directory = os.path.dirname(self.root_directory)
 430         rmtree(parent_directory)
 431         self.root_directory = None
 432
 433     def __del__(self):
 434         '''
 435         Enforce cleanup.
 436         '''
 437         self.cleanup()
 438
 439
 440 class SourceContentsScanner(object):
 441     '''
 442     SourceContentsScanner provides a method scan() to scan the contents of a
 443     DBSource object.
 444     '''
 445     def __init__(self, source_id):
 446         '''
 447         The argument source_id is the id of the DBSource object that
 448         should be scanned.
 449         '''
 450         self.source_id = source_id
 451
 452     def scan(self):
 453         '''
 454         This method does the actual scan and fills in the associated SrcContents
 455         property. It commits any changes to the database.
 456         '''
 457         session = DBConn().session()
 458         source = session.query(DBSource).get(self.source_id)
 459         fileset = set(source.scan_contents())
 460         for filename in fileset:
 461             source.contents.append(SrcContents(file = filename))
 462         session.commit()
 463         session.close()
 464
 465     @classmethod
 466     def scan_all(class_, limit = None):
 467         '''
 468         The class method scan_all() scans all source using multiple processes.
 469         The number of sources to be scanned can be limited with the limit
 470         argument. Returns the number of processed and remaining packages as a
 471         dict.
 472         '''
 473         session = DBConn().session()
 474         query = session.query(DBSource).filter(DBSource.contents == None)
 475         remaining = query.count
 476         if limit is not None:
 477             query = query.limit(limit)
 478         processed = query.count()
 479         pool = Pool()
 480         for source in query.yield_per(100):
 481             pool.apply_async(source_scan_helper, (source.source_id, ))
 482         pool.close()
 483         pool.join()
 484         remaining = remaining()
 485         session.close()
 486         return { 'processed': processed, 'remaining': remaining }
 487
 488 def source_scan_helper(source_id):
 489     '''
 490     This function runs in a subprocess.
 491     '''
 492     try:
 493         scanner = SourceContentsScanner(source_id)
 494         scanner.scan()
 495     except Exception as e:
 496         print e