From 14ee11299852e43c6d061fc2ed1cc3bb0fc1f9f3 Mon Sep 17 00:00:00 2001 From: Torsten Werner Date: Wed, 23 Mar 2011 20:03:33 +0100 Subject: [PATCH] Add class SourceContentsScanner. Signed-off-by: Torsten Werner --- daklib/contents.py | 61 ++++++++++++++++++++++++++++++++++++++-- daklib/dbconn.py | 38 +++++++++++++++++++++++++ tests/dbtest_contents.py | 23 +++++++++++---- 3 files changed, 115 insertions(+), 7 deletions(-) diff --git a/daklib/contents.py b/daklib/contents.py index df571e10..dffb425e 100755 --- a/daklib/contents.py +++ b/daklib/contents.py @@ -302,14 +302,14 @@ class BinaryContentsScanner(object): processed = query.count() pool = Pool() for binary in query.yield_per(100): - pool.apply_async(scan_helper, (binary.binary_id, )) + pool.apply_async(binary_scan_helper, (binary.binary_id, )) pool.close() pool.join() remaining = remaining() session.close() return { 'processed': processed, 'remaining': remaining } -def scan_helper(binary_id): +def binary_scan_helper(binary_id): ''' This function runs in a subprocess. ''' @@ -376,3 +376,60 @@ class UnpackedSource(object): Enforce cleanup. ''' self.cleanup() + + +class SourceContentsScanner(object): + ''' + SourceContentsScanner provides a method scan() to scan the contents of a + DBSource object. + ''' + def __init__(self, source_id): + ''' + The argument source_id is the id of the DBSource object that + should be scanned. + ''' + self.source_id = source_id + + def scan(self): + ''' + This method does the actual scan and fills in the associated SrcContents + property. It commits any changes to the database. + ''' + session = DBConn().session() + source = session.query(DBSource).get(self.source_id) + fileset = set(source.scan_contents()) + for filename in fileset: + source.contents.append(SrcContents(file = filename)) + session.commit() + session.close() + + @classmethod + def scan_all(class_, limit = None): + ''' + The class method scan_all() scans all source using multiple processes. + The number of sources to be scanned can be limited with the limit + argument. Returns the number of processed and remaining packages as a + dict. + ''' + session = DBConn().session() + query = session.query(DBSource).filter(DBSource.contents == None) + remaining = query.count + if limit is not None: + query = query.limit(limit) + processed = query.count() + pool = Pool() + for source in query.yield_per(100): + pool.apply_async(source_scan_helper, (source.source_id, )) + pool.close() + pool.join() + remaining = remaining() + session.close() + return { 'processed': processed, 'remaining': remaining } + +def source_scan_helper(binary_id): + ''' + This function runs in a subprocess. + ''' + scanner = SourceContentsScanner(source_id) + scanner.scan() + diff --git a/daklib/dbconn.py b/daklib/dbconn.py index 6782c081..014f1082 100755 --- a/daklib/dbconn.py +++ b/daklib/dbconn.py @@ -2157,6 +2157,18 @@ __all__.append('get_sections') ################################################################################ +class SrcContents(ORMObject): + def __init__(self, file = None, source = None): + self.file = file + self.source = source + + def properties(self): + return ['file', 'source'] + +__all__.append('SrcContents') + +################################################################################ + class DBSource(ORMObject): def __init__(self, source = None, version = None, maintainer = None, \ changedby = None, poolfile = None, install_date = None): @@ -2178,6 +2190,25 @@ class DBSource(ORMObject): metadata = association_proxy('key', 'value') + def scan_contents(self): + ''' + Returns a set of names for non directories. The path names are + normalized after converting them from either utf-8 or iso8859-1 + encoding. + ''' + fullpath = self.poolfile.fullpath + from daklib.contents import UnpackedSource + unpacked = UnpackedSource(fullpath) + fileset = set() + for name in unpacked.get_all_filenames(): + # enforce proper utf-8 encoding + try: + name.decode('utf-8') + except UnicodeDecodeError: + name = name.decode('iso8859-1').encode('utf-8') + fileset.add(name) + return fileset + __all__.append('DBSource') @session_wrapper @@ -2910,6 +2941,7 @@ class DBConn(object): 'source_acl', 'source_metadata', 'src_associations', + 'src_contents', 'src_format', 'src_uploaders', 'suite', @@ -3213,6 +3245,12 @@ class DBConn(object): backref=backref('contents', lazy='dynamic', cascade='all')), file = self.tbl_bin_contents.c.file)) + mapper(SrcContents, self.tbl_src_contents, + properties = dict( + source = relation(DBSource, + backref=backref('contents', lazy='dynamic', cascade='all')), + file = self.tbl_src_contents.c.file)) + mapper(MetadataKey, self.tbl_metadata_keys, properties = dict( key_id = self.tbl_metadata_keys.c.key_id, diff --git a/tests/dbtest_contents.py b/tests/dbtest_contents.py index 957307ba..e3128161 100755 --- a/tests/dbtest_contents.py +++ b/tests/dbtest_contents.py @@ -3,7 +3,8 @@ from db_test import DBDakTestCase, fixture from daklib.dbconn import * -from daklib.contents import ContentsWriter, BinaryContentsScanner, UnpackedSource +from daklib.contents import ContentsWriter, BinaryContentsScanner, \ + UnpackedSource, SourceContentsScanner from os.path import normpath from sqlalchemy.exc import FlushError, IntegrityError @@ -161,7 +162,10 @@ class ContentsTestCase(DBDakTestCase): self.session.delete(self.binary['hello_2.2-1_i386']) self.session.commit() - def test_scan_contents(self): + def test_binary_scan_contents(self): + ''' + Tests the BinaryContentsScanner. + ''' self.setup_binaries() filelist = [f for f in self.binary['hello_2.2-1_i386'].scan_contents()] self.assertEqual(['usr/bin/hello', 'usr/share/doc/hello/copyright'], @@ -175,10 +179,11 @@ class ContentsTestCase(DBDakTestCase): def test_unpack(self): ''' - Tests the UnpackedSource class. + Tests the UnpackedSource class and the SourceContentsScanner. ''' - self.setup_poolfiles() - dscfilename = fixture('ftp/pool/' + self.file['hello_2.2-1.dsc'].filename) + self.setup_sources() + source = self.source['hello_2.2-1'] + dscfilename = fixture('ftp/pool/' + source.poolfile.filename) unpacked = UnpackedSource(dscfilename) self.assertTrue(len(unpacked.get_root_directory()) > 0) self.assertEqual('hello (2.2-1) unstable; urgency=low\n', @@ -186,7 +191,15 @@ class ContentsTestCase(DBDakTestCase): all_filenames = set(unpacked.get_all_filenames()) self.assertEqual(8, len(all_filenames)) self.assertTrue('debian/rules' in all_filenames) + # method scan_contents() + self.assertEqual(all_filenames, source.scan_contents()) + # exception with invalid files self.assertRaises(CalledProcessError, lambda: UnpackedSource('invalidname')) + # SourceContentsScanner + self.session.commit() + self.assertTrue(source.contents.count() == 0) + SourceContentsScanner(source.source_id).scan() + self.assertTrue(source.contents.count() > 0) def classes_to_clean(self): return [Override, Suite, BinContents, DBBinary, DBSource, Architecture, Section, \ -- 2.39.2