daklib/contents.py

   1 #!/usr/bin/env python
   2 """
   3 Helper code for contents generation.
   4
   5 @contact: Debian FTPMaster <ftpmaster@debian.org>
   6 @copyright: 2011 Torsten Werner <twerner@debian.org>
   7 @license: GNU General Public License version 2 or later
   8 """
   9
  10 ################################################################################
  11
  12 # This program is free software; you can redistribute it and/or modify
  13 # it under the terms of the GNU General Public License as published by
  14 # the Free Software Foundation; either version 2 of the License, or
  15 # (at your option) any later version.
  16
  17 # This program is distributed in the hope that it will be useful,
  18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 # GNU General Public License for more details.
  21
  22 # You should have received a copy of the GNU General Public License
  23 # along with this program; if not, write to the Free Software
  24 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  25
  26 ################################################################################
  27
  28 from daklib.dbconn import *
  29 from daklib.config import Config
  30 from daklib.filewriter import BinaryContentsFileWriter, SourceContentsFileWriter
  31
  32 from multiprocessing import Pool
  33 from shutil import rmtree
  34 from subprocess import Popen, PIPE, check_call
  35 from tempfile import mkdtemp
  36
  37 import os.path
  38 import signal
  39
  40 class BinaryContentsWriter(object):
  41     '''
  42     BinaryContentsWriter writes the Contents-$arch.gz files.
  43     '''
  44     def __init__(self, suite, architecture, overridetype, component):
  45         self.suite = suite
  46         self.architecture = architecture
  47         self.overridetype = overridetype
  48         self.component = component
  49         self.session = suite.session()
  50
  51     def query(self):
  52         '''
  53         Returns a query object that is doing most of the work.
  54         '''
  55         overridesuite = self.suite
  56         if self.suite.overridesuite is not None:
  57             overridesuite = get_suite(self.suite.overridesuite, self.session)
  58         params = {
  59             'suite':         self.suite.suite_id,
  60             'overridesuite': overridesuite.suite_id,
  61             'component':     self.component.component_id,
  62             'arch_all':      get_architecture('all', self.session).arch_id,
  63             'arch':          self.architecture.arch_id,
  64             'type_id':       self.overridetype.overridetype_id,
  65             'type':          self.overridetype.overridetype,
  66         }
  67
  68         sql_create_temp = '''
  69 create temp table newest_binaries (
  70     id integer primary key,
  71     package text);
  72
  73 create index newest_binaries_by_package on newest_binaries (package);
  74
  75 insert into newest_binaries (id, package)
  76     select distinct on (package) id, package from binaries
  77         where type = :type and
  78             (architecture = :arch_all or architecture = :arch) and
  79             id in (select bin from bin_associations where suite = :suite)
  80         order by package, version desc;'''
  81         self.session.execute(sql_create_temp, params=params)
  82
  83         sql = '''
  84 with
  85
  86 unique_override as
  87     (select o.package, s.section
  88         from override o, section s
  89         where o.suite = :overridesuite and o.type = :type_id and o.section = s.id and
  90         o.component = :component)
  91
  92 select bc.file, string_agg(o.section || '/' || b.package, ',' order by b.package) as pkglist
  93     from newest_binaries b, bin_contents bc, unique_override o
  94     where b.id = bc.binary_id and o.package = b.package
  95     group by bc.file'''
  96
  97         return self.session.query("file", "pkglist").from_statement(sql). \
  98             params(params)
  99
 100     def formatline(self, filename, package_list):
 101         '''
 102         Returns a formatted string for the filename argument.
 103         '''
 104         return "%-55s %s\n" % (filename, package_list)
 105
 106     def fetch(self):
 107         '''
 108         Yields a new line of the Contents-$arch.gz file in filename order.
 109         '''
 110         for filename, package_list in self.query().yield_per(100):
 111             yield self.formatline(filename, package_list)
 112         # end transaction to return connection to pool
 113         self.session.rollback()
 114
 115     def get_list(self):
 116         '''
 117         Returns a list of lines for the Contents-$arch.gz file.
 118         '''
 119         return [item for item in self.fetch()]
 120
 121     def writer(self):
 122         '''
 123         Returns a writer object.
 124         '''
 125         values = {
 126             'archive':      self.suite.archive.path,
 127             'suite':        self.suite.suite_name,
 128             'component':    self.component.component_name,
 129             'debtype':      self.overridetype.overridetype,
 130             'architecture': self.architecture.arch_string,
 131         }
 132         return BinaryContentsFileWriter(**values)
 133
 134     def get_header(self):
 135         '''
 136         Returns the header for the Contents files as a string.
 137         '''
 138         header_file = None
 139         try:
 140             filename = os.path.join(Config()['Dir::Templates'], 'contents')
 141             header_file = open(filename)
 142             return header_file.read()
 143         finally:
 144             if header_file:
 145                 header_file.close()
 146
 147     def write_file(self):
 148         '''
 149         Write the output file.
 150         '''
 151         writer = self.writer()
 152         file = writer.open()
 153         file.write(self.get_header())
 154         for item in self.fetch():
 155             file.write(item)
 156         writer.close()
 157
 158
 159 class SourceContentsWriter(object):
 160     '''
 161     SourceContentsWriter writes the Contents-source.gz files.
 162     '''
 163     def __init__(self, suite, component):
 164         self.suite = suite
 165         self.component = component
 166         self.session = suite.session()
 167
 168     def query(self):
 169         '''
 170         Returns a query object that is doing most of the work.
 171         '''
 172         params = {
 173             'suite_id':     self.suite.suite_id,
 174             'component_id': self.component.component_id,
 175         }
 176
 177         sql_create_temp = '''
 178 create temp table newest_sources (
 179     id integer primary key,
 180     source text);
 181
 182 create index sources_binaries_by_source on newest_sources (source);
 183
 184 insert into newest_sources (id, source)
 185     select distinct on (source) s.id, s.source from source s
 186         join files f on f.id = s.file
 187         join location l on l.id = f.location
 188         where s.id in (select source from src_associations where suite = :suite_id)
 189             and l.component = :component_id
 190         order by source, version desc;'''
 191         self.session.execute(sql_create_temp, params=params)
 192
 193         sql = '''
 194 select sc.file, string_agg(s.source, ',' order by s.source) as pkglist
 195     from newest_sources s, src_contents sc
 196     where s.id = sc.source_id group by sc.file'''
 197
 198         return self.session.query("file", "pkglist").from_statement(sql). \
 199             params(params)
 200
 201     def formatline(self, filename, package_list):
 202         '''
 203         Returns a formatted string for the filename argument.
 204         '''
 205         return "%s\t%s\n" % (filename, package_list)
 206
 207     def fetch(self):
 208         '''
 209         Yields a new line of the Contents-source.gz file in filename order.
 210         '''
 211         for filename, package_list in self.query().yield_per(100):
 212             yield self.formatline(filename, package_list)
 213         # end transaction to return connection to pool
 214         self.session.rollback()
 215
 216     def get_list(self):
 217         '''
 218         Returns a list of lines for the Contents-source.gz file.
 219         '''
 220         return [item for item in self.fetch()]
 221
 222     def writer(self):
 223         '''
 224         Returns a writer object.
 225         '''
 226         values = {
 227             'archive':   self.suite.archive.path,
 228             'suite':     self.suite.suite_name,
 229             'component': self.component.component_name
 230         }
 231         return SourceContentsFileWriter(**values)
 232
 233     def write_file(self):
 234         '''
 235         Write the output file.
 236         '''
 237         writer = self.writer()
 238         file = writer.open()
 239         for item in self.fetch():
 240             file.write(item)
 241         writer.close()
 242
 243
 244 def binary_helper(suite_id, arch_id, overridetype_id, component_id):
 245     '''
 246     This function is called in a new subprocess and multiprocessing wants a top
 247     level function.
 248     '''
 249     session = DBConn().session(work_mem = 1000)
 250     suite = Suite.get(suite_id, session)
 251     architecture = Architecture.get(arch_id, session)
 252     overridetype = OverrideType.get(overridetype_id, session)
 253     component = Component.get(component_id, session)
 254     log_message = [suite.suite_name, architecture.arch_string, \
 255         overridetype.overridetype, component.component_name]
 256     contents_writer = BinaryContentsWriter(suite, architecture, overridetype, component)
 257     contents_writer.write_file()
 258     session.close()
 259     return log_message
 260
 261 def source_helper(suite_id, component_id):
 262     '''
 263     This function is called in a new subprocess and multiprocessing wants a top
 264     level function.
 265     '''
 266     session = DBConn().session(work_mem = 1000)
 267     suite = Suite.get(suite_id, session)
 268     component = Component.get(component_id, session)
 269     log_message = [suite.suite_name, 'source', component.component_name]
 270     contents_writer = SourceContentsWriter(suite, component)
 271     contents_writer.write_file()
 272     session.close()
 273     return log_message
 274
 275 class ContentsWriter(object):
 276     '''
 277     Loop over all suites, architectures, overridetypes, and components to write
 278     all contents files.
 279     '''
 280     @classmethod
 281     def log_result(class_, result):
 282         '''
 283         Writes a result message to the logfile.
 284         '''
 285         class_.logger.log(result)
 286
 287     @classmethod
 288     def write_all(class_, logger, suite_names = [], component_names = [], force = False):
 289         '''
 290         Writes all Contents files for suites in list suite_names which defaults
 291         to all 'touchable' suites if not specified explicitely. Untouchable
 292         suites will be included if the force argument is set to True.
 293         '''
 294         class_.logger = logger
 295         session = DBConn().session()
 296         suite_query = session.query(Suite)
 297         if len(suite_names) > 0:
 298             suite_query = suite_query.filter(Suite.suite_name.in_(suite_names))
 299         component_query = session.query(Component)
 300         if len(component_names) > 0:
 301             component_query = component_query.filter(Component.component_name.in_(component_names))
 302         if not force:
 303             suite_query = suite_query.filter_by(untouchable = False)
 304         deb_id = get_override_type('deb', session).overridetype_id
 305         udeb_id = get_override_type('udeb', session).overridetype_id
 306         pool = Pool()
 307         for suite in suite_query:
 308             suite_id = suite.suite_id
 309             for component in component_query:
 310                 component_id = component.component_id
 311                 # handle source packages
 312                 pool.apply_async(source_helper, (suite_id, component_id),
 313                     callback = class_.log_result)
 314                 for architecture in suite.get_architectures(skipsrc = True, skipall = True):
 315                     arch_id = architecture.arch_id
 316                     # handle 'deb' packages
 317                     pool.apply_async(binary_helper, (suite_id, arch_id, deb_id, component_id), \
 318                         callback = class_.log_result)
 319                     # handle 'udeb' packages
 320                     pool.apply_async(binary_helper, (suite_id, arch_id, udeb_id, component_id), \
 321                         callback = class_.log_result)
 322         pool.close()
 323         pool.join()
 324         session.close()
 325
 326
 327 class BinaryContentsScanner(object):
 328     '''
 329     BinaryContentsScanner provides a threadsafe method scan() to scan the
 330     contents of a DBBinary object.
 331     '''
 332     def __init__(self, binary_id):
 333         '''
 334         The argument binary_id is the id of the DBBinary object that
 335         should be scanned.
 336         '''
 337         self.binary_id = binary_id
 338
 339     def scan(self, dummy_arg = None):
 340         '''
 341         This method does the actual scan and fills in the associated BinContents
 342         property. It commits any changes to the database. The argument dummy_arg
 343         is ignored but needed by our threadpool implementation.
 344         '''
 345         session = DBConn().session()
 346         binary = session.query(DBBinary).get(self.binary_id)
 347         fileset = set(binary.scan_contents())
 348         if len(fileset) == 0:
 349             fileset.add('EMPTY_PACKAGE')
 350         for filename in fileset:
 351             binary.contents.append(BinContents(file = filename))
 352         session.commit()
 353         session.close()
 354
 355     @classmethod
 356     def scan_all(class_, limit = None):
 357         '''
 358         The class method scan_all() scans all binaries using multiple threads.
 359         The number of binaries to be scanned can be limited with the limit
 360         argument. Returns the number of processed and remaining packages as a
 361         dict.
 362         '''
 363         session = DBConn().session()
 364         query = session.query(DBBinary).filter(DBBinary.contents == None)
 365         remaining = query.count
 366         if limit is not None:
 367             query = query.limit(limit)
 368         processed = query.count()
 369         pool = Pool()
 370         for binary in query.yield_per(100):
 371             pool.apply_async(binary_scan_helper, (binary.binary_id, ))
 372         pool.close()
 373         pool.join()
 374         remaining = remaining()
 375         session.close()
 376         return { 'processed': processed, 'remaining': remaining }
 377
 378 def binary_scan_helper(binary_id):
 379     '''
 380     This function runs in a subprocess.
 381     '''
 382     scanner = BinaryContentsScanner(binary_id)
 383     scanner.scan()
 384
 385
 386 def subprocess_setup():
 387     # Python installs a SIGPIPE handler by default. This is usually not what
 388     # non-Python subprocesses expect.
 389     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
 390
 391 class UnpackedSource(object):
 392     '''
 393     UnpackedSource extracts a source package into a temporary location and
 394     gives you some convinient function for accessing it.
 395     '''
 396     def __init__(self, dscfilename):
 397         '''
 398         The dscfilename is a name of a DSC file that will be extracted.
 399         '''
 400         temp_directory = mkdtemp(dir = Config()['Dir::TempPath'])
 401         self.root_directory = os.path.join(temp_directory, 'root')
 402         command = ('dpkg-source', '--no-copy', '--no-check', '-q', '-x',
 403             dscfilename, self.root_directory)
 404         check_call(command, preexec_fn = subprocess_setup)
 405
 406     def get_root_directory(self):
 407         '''
 408         Returns the name of the package's root directory which is the directory
 409         where the debian subdirectory is located.
 410         '''
 411         return self.root_directory
 412
 413     def get_changelog_file(self):
 414         '''
 415         Returns a file object for debian/changelog or None if no such file exists.
 416         '''
 417         changelog_name = os.path.join(self.root_directory, 'debian', 'changelog')
 418         try:
 419             return open(changelog_name)
 420         except IOError:
 421             return None
 422
 423     def get_all_filenames(self):
 424         '''
 425         Returns an iterator over all filenames. The filenames will be relative
 426         to the root directory.
 427         '''
 428         skip = len(self.root_directory) + 1
 429         for root, _, files in os.walk(self.root_directory):
 430             for name in files:
 431                 yield os.path.join(root[skip:], name)
 432
 433     def cleanup(self):
 434         '''
 435         Removes all temporary files.
 436         '''
 437         if self.root_directory is None:
 438             return
 439         parent_directory = os.path.dirname(self.root_directory)
 440         rmtree(parent_directory)
 441         self.root_directory = None
 442
 443     def __del__(self):
 444         '''
 445         Enforce cleanup.
 446         '''
 447         self.cleanup()
 448
 449
 450 class SourceContentsScanner(object):
 451     '''
 452     SourceContentsScanner provides a method scan() to scan the contents of a
 453     DBSource object.
 454     '''
 455     def __init__(self, source_id):
 456         '''
 457         The argument source_id is the id of the DBSource object that
 458         should be scanned.
 459         '''
 460         self.source_id = source_id
 461
 462     def scan(self):
 463         '''
 464         This method does the actual scan and fills in the associated SrcContents
 465         property. It commits any changes to the database.
 466         '''
 467         session = DBConn().session()
 468         source = session.query(DBSource).get(self.source_id)
 469         fileset = set(source.scan_contents())
 470         for filename in fileset:
 471             source.contents.append(SrcContents(file = filename))
 472         session.commit()
 473         session.close()
 474
 475     @classmethod
 476     def scan_all(class_, limit = None):
 477         '''
 478         The class method scan_all() scans all source using multiple processes.
 479         The number of sources to be scanned can be limited with the limit
 480         argument. Returns the number of processed and remaining packages as a
 481         dict.
 482         '''
 483         session = DBConn().session()
 484         query = session.query(DBSource).filter(DBSource.contents == None)
 485         remaining = query.count
 486         if limit is not None:
 487             query = query.limit(limit)
 488         processed = query.count()
 489         pool = Pool()
 490         for source in query.yield_per(100):
 491             pool.apply_async(source_scan_helper, (source.source_id, ))
 492         pool.close()
 493         pool.join()
 494         remaining = remaining()
 495         session.close()
 496         return { 'processed': processed, 'remaining': remaining }
 497
 498 def source_scan_helper(source_id):
 499     '''
 500     This function runs in a subprocess.
 501     '''
 502     try:
 503         scanner = SourceContentsScanner(source_id)
 504         scanner.scan()
 505     except Exception as e:
 506         print e
 507