From: Mark Hymers Date: Wed, 23 Mar 2011 18:34:24 +0000 (+0000) Subject: metadata generation work X-Git-Url: https://git.decadent.org.uk/gitweb/?a=commitdiff_plain;h=f6b62be0ac52b3bc05ec48ef2c458d2fd83625b6;p=dak.git metadata generation work Signed-off-by: Mark Hymers --- diff --git a/dak/dak.py b/dak/dak.py index 5a659d8c..ad99a5a0 100755 --- a/dak/dak.py +++ b/dak/dak.py @@ -86,6 +86,8 @@ def init(): "Generate Packages/Sources files"), ("contents", "Generate content files"), + ("metadata", + "Load data for packages/sources files"), ("generate-index-diffs", "Generate .diff/Index files"), ("clean-suites", diff --git a/dak/metadata.py b/dak/metadata.py new file mode 100755 index 00000000..f40c9431 --- /dev/null +++ b/dak/metadata.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +""" +Import data for Package/Sources files from .deb and .dsc files +@copyright: 2011 Torsten Werner +@copyright: 2011 Mark Hymers +@license: GNU General Public License version 2 or later +""" + +################################################################################ + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +################################################################################ + +# < mvo> that screams for consolidation in libapt at least (that then in turn can +# use libdpkg ... 
) - I guess the "d" means delayed ;) + +# (whilst discussing adding xz support to dak, and therefore python-apt, and +# therefore libapt-pkg) + +################################################################################ + +import sys +import apt_pkg + +from daklib.config import Config +from daklib.dbconn import * +from daklib.metadata import MetadataScanner +from daklib import daklog +from daklib import utils + +################################################################################ + +def usage (exit_code=0): + print """Usage: dak metadata [options] subcommand + +SUBCOMMANDS + scan-source + scan the dsc files in the existing pool and load metadata into the database + + scan-binary + scan the deb files in the existing pool and load metadata into the database + +OPTIONS + -h, --help + show this help and exit + +OPTIONS for scan + -l, --limit=NUMBER + maximum number of items to scan +""" + sys.exit(exit_code) + +################################################################################ + +def scan_all(cnf, mode, limit): + Logger = daklog.Logger(cnf.Cnf, 'metadata scan (%s)' % mode) + result = MetadataScanner.scan_all(mode, limit) + processed = '%(processed)d %(type)s processed' % result + remaining = '%(remaining)d %(type)s remaining' % result + Logger.log([processed, remaining]) + Logger.close() + +################################################################################ + +def main(): + cnf = Config() + cnf['Metadata::Options::Help'] = '' + cnf['Metadata::Options::Suite'] = '' + cnf['Metadata::Options::Limit'] = '' + cnf['Metadata::Options::Force'] = '' + arguments = [('h', "help", 'Metadata::Options::Help'), + ('s', "suite", 'Metadata::Options::Suite', "HasArg"), + ('l', "limit", 'Metadata::Options::Limit', "HasArg"), + ('f', "force", 'Metadata::Options::Force'), + ] + args = apt_pkg.ParseCommandLine(cnf.Cnf, arguments, sys.argv) + options = cnf.SubTree('Metadata::Options') + + if (len(args) != 1) or options['Help']: + usage() + + limit = None + if len(options['Limit']) > 0: + limit = int(options['Limit']) + + if args[0] == 'scan-source': + scan_all(cnf, 'source', limit) + return + elif args[0] == 'scan-binary': + scan_all(cnf, 'binary', limit) + return + + suite_names = utils.split_args(options['Suite']) + + force = bool(options['Force']) + + if args[0] == 'generate': + raise NotImplementError + + usage() + + +if __name__ == '__main__': + main() diff --git a/dak/packagescan.py b/dak/packagescan.py deleted file mode 100755 index 2d2bab00..00000000 --- a/dak/packagescan.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -""" -Import data for Packages files from .deb files - -@contact: Debian FTPMaster -@copyright: 2008, 2009 Michael Casadevall -@copyright: 2009 Mike O'Connor -@copyright: 2011 Torsten Werner -@copyright: 2011 Mark Hymers -@license: GNU General Public License version 2 or later -""" - -################################################################################ - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
- -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -################################################################################ - -# < mvo> that screams for consolidation in libapt at least (that then in turn can -# use libdpkg ... ) - I guess the "d" means delayed ;) - -# (whilst discussing adding xz support to dak, and therefore python-apt, and -# therefore libapt-pkg) - -################################################################################ - -import sys -import apt_pkg - -from daklib.config import Config -from daklib.dbconn import * -from daklib.packages import PackagesScanner -from daklib import daklog -from daklib import utils - -################################################################################ - -def usage (exit_code=0): - print """Usage: dak packagescan [options] subcommand - -SUBCOMMANDS - scan - scan the debs in the existing pool and load metadata into the database - -OPTIONS - -h, --help - show this help and exit - -OPTIONS for scan - -l, --limit=NUMBER - maximum number of packages to scan -""" - sys.exit(exit_code) - -################################################################################ - -def scan_all(cnf, limit): - Logger = daklog.Logger(cnf.Cnf, 'packages scan') - result = PackagesScanner.scan_all(limit) - processed = '%(processed)d packages processed' % result - remaining = '%(remaining)d packages remaining' % result - Logger.log([processed, remaining]) - Logger.close() - -################################################################################ - -def main(): - cnf = Config() - cnf['Packages::Options::Help'] = '' - cnf['Packages::Options::Suite'] = '' - cnf['Packages::Options::Limit'] = '' - cnf['Packages::Options::Force'] = '' - arguments = [('h', "help", 'Packages::Options::Help'), - ('s', "suite", 'Packages::Options::Suite', "HasArg"), - ('l', "limit", 'Packages::Options::Limit', "HasArg"), - ('f', "force", 'Packages::Options::Force'), - ] - args = apt_pkg.ParseCommandLine(cnf.Cnf, arguments, sys.argv) - options = cnf.SubTree('Packages::Options') - - if (len(args) != 1) or options['Help']: - usage() - - limit = None - if len(options['Limit']) > 0: - limit = int(options['Limit']) - - if args[0] == 'scan': - scan_all(cnf, limit) - return - - suite_names = utils.split_args(options['Suite']) - - force = bool(options['Force']) - - if args[0] == 'generate': - raise NotImplementError - - usage() - - -if __name__ == '__main__': - main() diff --git a/daklib/dbconn.py b/daklib/dbconn.py index 4d30e663..98b6c7d5 100755 --- a/daklib/dbconn.py +++ b/daklib/dbconn.py @@ -492,6 +492,10 @@ class DBBinary(ORMObject): self.poolfile = poolfile self.binarytype = binarytype + @property + def pkid(self): + return self.binary_id + def properties(self): return ['package', 'version', 'maintainer', 'source', 'architecture', \ 'poolfile', 'binarytype', 'fingerprint', 'install_date', \ @@ -533,20 +537,28 @@ class DBBinary(ORMObject): ''' Reads the control information from a binary. - @rtype: tuple - @return: (stanza, controldict) stanza is the text of the control - section. controldict is the information in a dictionary - form + @rtype: text + @return: stanza text of the control section. 
''' - import apt_inst, apt_pk + import apt_inst fullpath = self.poolfile.fullpath deb_file = open(fullpath, 'r') - stanza = apt_inst.debExtractControl(deb_file).rstrip() - control = dict(apt_pkg.TagSection(stanza)) + stanza = apt_inst.debExtractControl(deb_file) deb_file.close() - return stanza, control + return stanza + + def read_control_fields(self): + ''' + Reads the control information from a binary and return + as a dictionary. + @rtype: dict + @return: fields of the control section as a dictionary. + ''' + import apt_pkg + stanza = self.read_control() + return apt_pkg.TagSection(stanza) __all__.append('DBBinary') @@ -2176,6 +2188,60 @@ __all__.append('get_sections') ################################################################################ +from debian.debfile import Deb822 + +# Temporary Deb822 subclass to fix bugs with : handling; see #597249 +class Dak822(Deb822): + def _internal_parser(self, sequence, fields=None): + # The key is non-whitespace, non-colon characters before any colon. + key_part = r"^(?P[^: \t\n\r\f\v]+)\s*:\s*" + single = re.compile(key_part + r"(?P\S.*?)\s*$") + multi = re.compile(key_part + r"$") + multidata = re.compile(r"^\s(?P.+?)\s*$") + + wanted_field = lambda f: fields is None or f in fields + + if isinstance(sequence, basestring): + sequence = sequence.splitlines() + + curkey = None + content = "" + for line in self.gpg_stripped_paragraph(sequence): + m = single.match(line) + if m: + if curkey: + self[curkey] = content + + if not wanted_field(m.group('key')): + curkey = None + continue + + curkey = m.group('key') + content = m.group('data') + continue + + m = multi.match(line) + if m: + if curkey: + self[curkey] = content + + if not wanted_field(m.group('key')): + curkey = None + continue + + curkey = m.group('key') + content = "" + continue + + m = multidata.match(line) + if m: + content += '\n' + line # XXX not m.group('data')? + continue + + if curkey: + self[curkey] = content + + class DBSource(ORMObject): def __init__(self, source = None, version = None, maintainer = None, \ changedby = None, poolfile = None, install_date = None): @@ -2186,6 +2252,10 @@ class DBSource(ORMObject): self.poolfile = poolfile self.install_date = install_date + @property + def pkid(self): + return self.source_id + def properties(self): return ['source', 'source_id', 'maintainer', 'changedby', \ 'fingerprint', 'poolfile', 'version', 'suites_count', \ @@ -2195,18 +2265,15 @@ class DBSource(ORMObject): return ['source', 'version', 'install_date', 'maintainer', \ 'changedby', 'poolfile', 'install_date'] - def read_control(self): + def read_control_fields(self): ''' Reads the control information from a dsc @rtype: tuple - @return: (stanza, controldict) stanza is the text of the control - section. 
controldict is the information in a dictionary - form + @return: fields is the dsc information in a dictionary form ''' - from debian.debfile import Deb822 fullpath = self.poolfile.fullpath - fields = Deb822(open(self.poolfile.fullpath, 'r')) + fields = Dak822(open(self.poolfile.fullpath, 'r')) return fields metadata = association_proxy('key', 'value') @@ -2354,6 +2421,34 @@ def get_source_in_suite(source, suite, session=None): __all__.append('get_source_in_suite') +@session_wrapper +def import_metadata_into_db(obj, session=None): + """ + This routine works on either DBBinary or DBSource objects and imports + their metadata into the database + """ + fields = obj.read_control_fields() + for k in fields.keys(): + try: + # Try raw ASCII + val = str(fields[k]) + except UnicodeEncodeError: + # Fall back to UTF-8 + try: + val = fields[k].encode('utf-8') + except UnicodeEncodeError: + # Finally try iso8859-1 + val = fields[k].encode('iso8859-1') + # Otherwise we allow the exception to percolate up and we cause + # a reject as someone is playing silly buggers + + obj.metadata[get_or_set_metadatakey(k, session)] = val + + session.commit_or_flush() + +__all__.append('import_metadata_into_db') + + ################################################################################ @session_wrapper @@ -2530,7 +2625,7 @@ def add_deb_to_db(u, filename, session=None): # session.rollback() # raise MissingContents, "No contents stored for package %s, and couldn't determine contents of %s" % (bin.package, filename) - return poolfile + return bin, poolfile __all__.append('add_deb_to_db') @@ -2853,6 +2948,38 @@ class MetadataKey(ORMObject): __all__.append('MetadataKey') +@session_wrapper +def get_or_set_metadatakey(keyname, session=None): + """ + Returns MetadataKey object for given uidname. + + If no matching keyname is found, a row is inserted. + + @type uidname: string + @param uidname: The keyname to add + + @type session: SQLAlchemy + @param session: Optional SQL session object (a temporary one will be + generated if not supplied). If not passed, a commit will be performed at + the end of the function, otherwise the caller is responsible for commiting. + + @rtype: MetadataKey + @return: the metadatakey object for the given keyname + """ + + q = session.query(MetadataKey).filter_by(key=keyname) + + try: + ret = q.one() + except NoResultFound: + ret = MetadataKey(keyname) + session.add(ret) + session.commit_or_flush() + + return ret + +__all__.append('get_or_set_metadatakey') + ################################################################################ class BinaryMetadata(ORMObject): diff --git a/daklib/metadata.py b/daklib/metadata.py new file mode 100755 index 00000000..d88cf4fa --- /dev/null +++ b/daklib/metadata.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +""" +Helper code for packages and sources generation. + +@contact: Debian FTPMaster +@copyright: 2011 Torsten Werner +@copyright: 2011 Mark Hymers +@license: GNU General Public License version 2 or later +""" + +################################################################################ + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +################################################################################ + +from daklib.dbconn import * +from daklib.config import Config + +from multiprocessing import Pool +from subprocess import Popen, PIPE + +import os.path + +class MetadataScanner(object): + ''' + MetadataScanner provides a threadsafe method scan() to scan the metadata of + a DBSource or DBBinary object depending on what is passed as dbclass''' + + def __init__(self, dbclass, pkid, verbose=True): + ''' + The argument binary_id is the id of the DBBinary object that + + should be scanned. + ''' + self.verbose = True + self.dbclass = dbclass + self.pkid = pkid + + def scan(self, dummy_arg = None): + ''' + This method does the actual scan and fills in the associated metadata + property. It commits any changes to the database. The argument dummy_arg + is ignored but needed by our threadpool implementation. + ''' + obj = None + fullpath = 'UNKNOWN PATH' + + session = DBConn().session() + try: + obj = session.query(self.dbclass).get(self.pkid) + fullpath = obj.poolfile.fullpath + import_metadata_into_db(obj, session=session) + if self.verbose: + print "Imported %s (%s)" % (self.pkid, fullpath) + session.commit() + except Exception, e: + print "Failed to import %s [id=%s; fullpath=%s]" % (self.dbclass.__name__, self.pkid, fullpath) + print "Exception: ", e + session.rollback() + + session.close() + + @classmethod + def scan_all(class_, scantype='source', limit = None): + ''' + The class method scan_all() scans all sources using multiple threads. + The number of sources to be scanned can be limited with the limit + argument. Returns the number of processed and remaining files as a + dict. + ''' + session = DBConn().session() + if scantype == 'source': + dbclass = DBSource + query = session.query(DBSource).filter(~DBSource.source_id.in_(session.query(SourceMetadata.source_id.distinct()))) + t = 'sources' + else: + # Otherwise binary + dbclass = DBBinary + query = session.query(DBBinary).filter(~DBBinary.binary_id.in_(session.query(BinaryMetadata.binary_id.distinct()))) + t = 'binaries' + + remaining = query.count + if limit is not None: + query = query.limit(limit) + processed = query.count() + pool = Pool(processes=10) + for obj in query.yield_per(100): + pool.apply_async(scan_helper, (dbclass, obj.pkid, )) + pool.close() + pool.join() + remaining = remaining() + session.close() + return { 'processed': processed, 'remaining': remaining , 'type': t} + +def scan_helper(dbclass, source_id): + ''' + This function runs in a subprocess. + ''' + scanner = MetadataScanner(dbclass, source_id) + scanner.scan() diff --git a/daklib/packages.py b/daklib/packages.py deleted file mode 100755 index 27b6d287..00000000 --- a/daklib/packages.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python -""" -Helper code for packages generation. 
- -@contact: Debian FTPMaster -@copyright: 2011 Torsten Werner -@copyright: 2011 Mark Hymers -@license: GNU General Public License version 2 or later -""" - -################################################################################ - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -################################################################################ - -from daklib.dbconn import * -from daklib.config import Config - -from multiprocessing import Pool -from subprocess import Popen, PIPE - -import os.path - -class PackagesScanner(object): - ''' - PackagesScanner provides a threadsafe method scan() to scan the metadata of - a DBBinary object. - ''' - def __init__(self, binary_id): - ''' - The argument binary_id is the id of the DBBinary object that - should be scanned. - ''' - self.binary_id = binary_id - - def scan(self, dummy_arg = None): - ''' - This method does the actual scan and fills in the associated metadata - property. It commits any changes to the database. The argument dummy_arg - is ignored but needed by our threadpool implementation. - ''' - session = DBConn().session() - binary = session.query(DBBinary).get(self.binary_id) - fileset = set(binary.read_control()) - print fileset - #if len(fileset) == 0: - # fileset.add('EMPTY_PACKAGE') - #for filename in fileset: - # binary.contents.append(BinContents(file = filename)) - #session.commit() - session.close() - - @classmethod - def scan_all(class_, limit = None): - ''' - The class method scan_all() scans all binaries using multiple threads. - The number of binaries to be scanned can be limited with the limit - argument. Returns the number of processed and remaining packages as a - dict. - ''' - session = DBConn().session() - query = session.query(DBBinary).filter(DBBinary.contents == None) - remaining = query.count - if limit is not None: - query = query.limit(limit) - processed = query.count() - pool = Pool() - for binary in query.yield_per(100): - pool.apply_async(scan_helper, (binary.binary_id, )) - pool.close() - pool.join() - remaining = remaining() - session.close() - return { 'processed': processed, 'remaining': remaining } - -def scan_helper(binary_id): - ''' - This function runs in a subprocess. - ''' - scanner = PackagesScanner(binary_id) - scanner.scan() diff --git a/daklib/queue.py b/daklib/queue.py index b4c62d38..52483cca 100755 --- a/daklib/queue.py +++ b/daklib/queue.py @@ -2025,6 +2025,7 @@ distribution.""" print "Installing." 
self.logger.log(["installing changes", self.pkg.changes_file]) + binaries = [] poolfiles = [] # Add the .dsc file to the DB first @@ -2037,7 +2038,9 @@ distribution.""" # Add .deb / .udeb files to the DB (type is always deb, dbtype is udeb/deb) for newfile, entry in self.pkg.files.items(): if entry["type"] == "deb": - poolfiles.append(add_deb_to_db(self, newfile, session)) + b, pf = add_deb_to_db(self, newfile, session) + binaries.append(b) + poolfiles.append(pf) # If this is a sourceful diff only upload that is moving # cross-component we need to copy the .orig files into the new @@ -2122,6 +2125,18 @@ distribution.""" # Our SQL session will automatically start a new transaction after # the last commit + # Now ensure that the metadata has been added + # This has to be done after we copy the files into the pool + # For source if we have it: + if self.pkg.changes["architecture"].has_key("source"): + import_metadata_into_db(source, session) + + # Now for any of our binaries + for b in binaries: + import_metadata_into_db(b, session) + + session.commit() + # Move the .changes into the 'done' directory utils.move(self.pkg.changes_file, os.path.join(cnf["Dir::Queue::Done"], os.path.basename(self.pkg.changes_file))) diff --git a/tests/dbtest_packages.py b/tests/dbtest_packages.py index 2b179053..f2587709 100755 --- a/tests/dbtest_packages.py +++ b/tests/dbtest_packages.py @@ -328,7 +328,7 @@ class PackageTestCase(DBDakTestCase): 'sha1sum': 'deadbeef', 'sha256sum': 'deadbeef'} upload = Upload(pkg) - poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session) + bin, poolfile = add_deb_to_db(upload, 'hello_2.2-2_i386.deb', self.session) self.session.refresh(poolfile) self.session.refresh(poolfile.binary) self.assertEqual('main/h/hello/hello_2.2-2_i386.deb', poolfile.filename)
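For illustration only (not part of the patch): the new read_control()/read_control_fields()
split on DBBinary comes down to the following use of python-apt. The pool path is a made-up
example and the helper names are invented for this sketch.

import apt_inst
import apt_pkg

def control_stanza(deb_path):
    # Raw text of the control section, as DBBinary.read_control() now returns it.
    deb_file = open(deb_path, 'r')
    try:
        return apt_inst.debExtractControl(deb_file)
    finally:
        deb_file.close()

def control_fields(deb_path):
    # Parsed field mapping, as DBBinary.read_control_fields() now returns it.
    return apt_pkg.TagSection(control_stanza(deb_path))

# Hypothetical pool path:
fields = control_fields('/srv/ftp/pool/main/h/hello/hello_2.2-2_i386.deb')
print fields['Package'], fields['Version']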
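On the source side, the Dak822 class added to dbconn.py is a stop-gap for a Deb822
colon-handling bug (#597249, as noted in the comment above), and DBSource.read_control_fields()
uses it in place of Deb822. Used directly it would look like the following sketch; the .dsc
path is made up, and importing the class by name is an assumption since it is not in dbconn's
__all__.

from daklib.dbconn import Dak822

dsc_fields = Dak822(open('/srv/ftp/pool/main/h/hello/hello_2.2-2.dsc', 'r'))
print dsc_fields['Source'], dsc_fields['Version'], dsc_fields['Binary']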
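The value handling inside import_metadata_into_db(), pulled out on its own for clarity.
The helper name is invented for this sketch; the fallback order (plain ASCII, then UTF-8,
then ISO-8859-1) is the one the patch uses, and anything that still fails is left to
propagate so the upload is rejected.

def coerce_metadata_value(value):
    # Mirror import_metadata_into_db(): try raw ASCII first ...
    try:
        return str(value)
    except UnicodeEncodeError:
        # ... fall back to UTF-8 ...
        try:
            return value.encode('utf-8')
        except UnicodeEncodeError:
            # ... and finally ISO-8859-1; anything else percolates up.
            return value.encode('iso8859-1')

# Each control/dsc field then lands in the metadata association, roughly:
#   obj.metadata[get_or_set_metadatakey(key, session)] = coerce_metadata_value(fields[key])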
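Lastly, a rough sketch of what the new scan subcommands end up doing, assuming a configured
dak database connection (DBConn()); the limit value is arbitrary. "dak metadata scan-source"
and "dak metadata scan-binary" wrap these calls with a daklog.Logger.

from daklib.metadata import MetadataScanner

# Roughly "dak metadata scan-source --limit=1000":
result = MetadataScanner.scan_all('source', limit=1000)
print '%(processed)d %(type)s processed, %(remaining)d %(type)s remaining' % result

# Roughly "dak metadata scan-binary" (no limit):
result = MetadataScanner.scan_all('binary')
print '%(processed)d %(type)s processed, %(remaining)d %(type)s remaining' % result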