Add class SourceContentsScanner.

[dak.git] / daklib / contents.py
diff --git a/daklib/contents.py b/daklib/contents.py

index bb81589394f20fb77068eb8a04fd071ed25b6aa4..dffb425e3cc2e5f8c5245de9e209fdaaa7a1f454 100755 (executable)
--- a/daklib/contents.py
+++ b/daklib/contents.py
@@ -27,12 +27,11 @@ Helper code for contents generation.
  
  from daklib.dbconn import *
  from daklib.config import Config
-from daklib.threadpool import ThreadPool
-from multiprocessing import Pool
  
-from sqlalchemy import desc, or_
-from sqlalchemy.exc import IntegrityError
-from subprocess import Popen, PIPE
+from multiprocessing import Pool
+from shutil import rmtree
+from subprocess import Popen, PIPE, check_call
+from tempfile import mkdtemp
  
  import os.path
  
@@ -244,7 +243,6 @@ def generate_helper(suite_id, arch_id, overridetype_id, component_id = None):
      '''
      This function is called in a new subprocess.
      '''
-    DBConn().reset()
      session = DBConn().session()
      suite = Suite.get(suite_id, session)
      architecture = Architecture.get(arch_id, session)
@@ -260,17 +258,17 @@ def generate_helper(suite_id, arch_id, overridetype_id, component_id = None):
      return log_message
  
  
-class ContentsScanner(object):
+class BinaryContentsScanner(object):
      '''
-    ContentsScanner provides a threadsafe method scan() to scan the contents of
-    a DBBinary object.
+    BinaryContentsScanner provides a threadsafe method scan() to scan the
+    contents of a DBBinary object.
      '''
-    def __init__(self, binary):
+    def __init__(self, binary_id):
          '''
-        The argument binary is the actual DBBinary object that should be
-        scanned.
+        The argument binary_id is the id of the DBBinary object that
+        should be scanned.
          '''
-        self.binary_id = binary.binary_id
+        self.binary_id = binary_id
  
      def scan(self, dummy_arg = None):
          '''
@@ -302,10 +300,136 @@ class ContentsScanner(object):
          if limit is not None:
              query = query.limit(limit)
          processed = query.count()
-        threadpool = ThreadPool()
+        pool = Pool()
          for binary in query.yield_per(100):
-            threadpool.queueTask(ContentsScanner(binary).scan)
-        threadpool.joinAll()
+            pool.apply_async(binary_scan_helper, (binary.binary_id, ))
+        pool.close()
+        pool.join()
+        remaining = remaining()
+        session.close()
+        return { 'processed': processed, 'remaining': remaining }
+
+def binary_scan_helper(binary_id):
+    '''
+    This function runs in a subprocess.
+    '''
+    scanner = BinaryContentsScanner(binary_id)
+    scanner.scan()
+
+
+class UnpackedSource(object):
+    '''
+    UnpackedSource extracts a source package into a temporary location and
+    gives you some convinient function for accessing it.
+    '''
+    def __init__(self, dscfilename):
+        '''
+        The dscfilename is a name of a DSC file that will be extracted.
+        '''
+        self.root_directory = os.path.join(mkdtemp(), 'root')
+        command = ('dpkg-source', '--no-copy', '--no-check', '-x', dscfilename,
+            self.root_directory)
+        # dpkg-source does not have a --quiet option
+        devnull = open(os.devnull, 'w')
+        check_call(command, stdout = devnull, stderr = devnull)
+        devnull.close()
+
+    def get_root_directory(self):
+        '''
+        Returns the name of the package's root directory which is the directory
+        where the debian subdirectory is located.
+        '''
+        return self.root_directory
+
+    def get_changelog_file(self):
+        '''
+        Returns a file object for debian/changelog or None if no such file exists.
+        '''
+        changelog_name = os.path.join(self.root_directory, 'debian', 'changelog')
+        try:
+            return open(changelog_name)
+        except IOError:
+            return None
+
+    def get_all_filenames(self):
+        '''
+        Returns an iterator over all filenames. The filenames will be relative
+        to the root directory.
+        '''
+        skip = len(self.root_directory) + 1
+        for root, _, files in os.walk(self.root_directory):
+            for name in files:
+                yield os.path.join(root[skip:], name)
+
+    def cleanup(self):
+        '''
+        Removes all temporary files.
+        '''
+        if self.root_directory is None:
+            return
+        parent_directory = os.path.dirname(self.root_directory)
+        rmtree(parent_directory)
+        self.root_directory = None
+
+    def __del__(self):
+        '''
+        Enforce cleanup.
+        '''
+        self.cleanup()
+
+
+class SourceContentsScanner(object):
+    '''
+    SourceContentsScanner provides a method scan() to scan the contents of a
+    DBSource object.
+    '''
+    def __init__(self, source_id):
+        '''
+        The argument source_id is the id of the DBSource object that
+        should be scanned.
+        '''
+        self.source_id = source_id
+
+    def scan(self):
+        '''
+        This method does the actual scan and fills in the associated SrcContents
+        property. It commits any changes to the database.
+        '''
+        session = DBConn().session()
+        source = session.query(DBSource).get(self.source_id)
+        fileset = set(source.scan_contents())
+        for filename in fileset:
+            source.contents.append(SrcContents(file = filename))
+        session.commit()
+        session.close()
+
+    @classmethod
+    def scan_all(class_, limit = None):
+        '''
+        The class method scan_all() scans all source using multiple processes.
+        The number of sources to be scanned can be limited with the limit
+        argument. Returns the number of processed and remaining packages as a
+        dict.
+        '''
+        session = DBConn().session()
+        query = session.query(DBSource).filter(DBSource.contents == None)
+        remaining = query.count
+        if limit is not None:
+            query = query.limit(limit)
+        processed = query.count()
+        pool = Pool()
+        for source in query.yield_per(100):
+            pool.apply_async(source_scan_helper, (source.source_id, ))
+        pool.close()
+        pool.join()
          remaining = remaining()
          session.close()
          return { 'processed': processed, 'remaining': remaining }
+
+def source_scan_helper(binary_id):
+    '''
+    This function runs in a subprocess.
+    '''
+    scanner = SourceContentsScanner(source_id)
+    scanner.scan()
+