From: Michael Casadevall
Date: Sat, 3 Jan 2009 21:52:53 +0000 (-0500)
Subject: Added content import, merged master, added update2 script,
X-Git-Url: https://git.decadent.org.uk/gitweb/?a=commitdiff_plain;h=5e83da98cf5bd0100a850e32ede0e0b702044d7c;p=dak.git

Added content import, merged master, added update2 script,
added new quotes file, and added commands to dak.py

Signed-off-by: Michael Casadevall
---

diff --git a/dak/.generate_contents.py.swp b/dak/.generate_contents.py.swp
deleted file mode 100644
index d4e83290..00000000
Binary files a/dak/.generate_contents.py.swp and /dev/null differ
diff --git a/dak/dak.py b/dak/dak.py
index 92753ecc..d04eebc2 100755
--- a/dak/dak.py
+++ b/dak/dak.py
@@ -138,6 +138,8 @@ def init():
          "Check for users with no packages in the archive"),
         ("import-archive",
          "Populate SQL database based from an archive tree"),
+        ("import-contents",
+         "Populate SQL database with Contents files"),
         ("import-keyring",
          "Populate fingerprint/uid table based on a new/updated keyring"),
         ("import-ldap-fingerprints",
diff --git a/dak/dakdb/update2.py b/dak/dakdb/update2.py
new file mode 100644
index 00000000..ec9650b1
--- /dev/null
+++ b/dak/dakdb/update2.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+
+# Debian Archive Kit Database Update Script 2
+# Copyright (C) 2009 Michael Casadevall
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+################################################################################
+
+# really, if we want to screw ourselves, let's find a better way.
+# rm -rf /srv/ftp.debian.org
+
+################################################################################
+
+import psycopg2, time
+
+################################################################################
+
+def do_update(self):
+    print "Adding content fields to database"
+
+    try:
+        c = self.db.cursor()
+        c.execute("""CREATE TABLE content_file_paths (
+                     id serial primary key not null,
+                     path text unique not null
+                     )""")
+
+        c.execute("""CREATE TABLE content_file_names (
+                     id serial primary key not null,
+                     file text unique not null
+                     )""")
+
+        c.execute("""CREATE TABLE content_associations (
+                     id serial not null,
+                     binary_pkg int4 not null references binaries(id) on delete cascade,
+                     filepath int4 not null references content_file_paths(id) on delete cascade,
+                     filename int4 not null references content_file_names(id) on delete cascade
+                     );""")
+
+        c.execute("""CREATE FUNCTION comma_concat(text, text) RETURNS text
+                     AS $_$select case
+                     WHEN $2 is null or $2 = '' THEN $1
+                     WHEN $1 is null or $1 = '' THEN $2
+                     ELSE $1 || ',' || $2
+                     END$_$
+                     LANGUAGE sql""")
+
+        c.execute("""CREATE AGGREGATE comma_separated_list (
+                     BASETYPE = text,
+                     SFUNC = comma_concat,
+                     STYPE = text,
+                     INITCOND = ''
+                     );""")
+
+        c.execute("UPDATE config SET value = '2' WHERE name = 'db_revision'")
+        self.db.commit()
+
+        print "REMINDER: Remember to fully regenerate the Contents files before running import-contents"
+        print ""
+        print "Pausing for five seconds ..."
+        time.sleep (5)
+
+    except psycopg2.ProgrammingError, msg:
+        self.db.rollback()
+        print "FATAL: Unable to apply content table update 2!"
+        print "Error Message: " + str(msg)
+        print "Database changes have been rolled back."
diff --git a/dak/generate_contents.py b/dak/generate_contents.py
index 54b70bde..6d84d16b 100755
--- a/dak/generate_contents.py
+++ b/dak/generate_contents.py
@@ -66,7 +66,7 @@ def generate_contents(suites):
         h.close()
 
     # Get our suites, and the architectures
-    for s in suites:
+    for s in [i.lower() for i in suites]:
         suite_id = database.get_suite_id(s)
 
         q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
diff --git a/dak/import_contents.py b/dak/import_contents.py
new file mode 100755
index 00000000..945b9ea6
--- /dev/null
+++ b/dak/import_contents.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# Import contents files
+
+# Copyright (C) 2008, 2009 Michael Casadevall
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+################################################################################
+################################################################################
+
+################################################################################
+
+import sys, os, popen2, tempfile, stat, time, pg
+import re, gzip, apt_pkg
+from daklib import database, utils
+from daklib.dak_exceptions import *
+
+################################################################################
+
+Cnf = None
+projectB = None
+out = None
+AptCnf = None
+
+################################################################################
+
+def usage (exit_code=0):
+    print """Usage: dak import-contents
+Import Contents files
+
+  -h, --help                 show this help and exit
+  -s, --suite=SUITE          only import Contents files for this suite
+"""
+    sys.exit(exit_code)
+
+################################################################################
+
+def import_contents(suites):
+    global projectB, Cnf
+
+    # Start transaction
+    projectB.query("BEGIN WORK")
+
+    # Needed to make sure postgreSQL doesn't freak out on some of the data
+    projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
+
+    # Get our suites, and the architectures
+    for s in suites:
+        suite_id = database.get_suite_id(s)
+
+        q = projectB.query("SELECT s.architecture, a.arch_string FROM suite_architectures s JOIN architecture a ON (s.architecture=a.id) WHERE suite = '%d'" % suite_id)
+
+        arch_list = [ ]
+        for r in q.getresult():
+            if r[1] != "source" and r[1] != "all":
+                arch_list.append((r[0], r[1]))
+
+        arch_all_id = database.get_architecture_id("all")
+
+        for arch in arch_list:
+            print "Processing %s/%s" % (s, arch[1])
+            arch_id = database.get_architecture_id(arch[1])
+            f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+
+            # Get line count
+            lines = f.readlines()
+            num_of_lines = len(lines)
+
+            # Ok, the file cursor is at the first entry, now comes the fun 'lets parse' bit
+            lines_processed = 0
+            found_header = False
+
+            for line in lines:
+                if found_header == False:
+                    if not line:
+                        print "Unable to find end of Contents-%s.gz header!" % ( arch[1])
+                        sys.exit(255)
+
+                    lines_processed += 1
+                    p = re.compile('^FILE')
+                    if p.match(line):
+                        found_header = True
+                    continue
+
+                # The format is simple enough, *filename*, *section/package1,section/package2,etc*
+                # Each file appears once per Contents file, so first, use some regex match
+                # to split the two bits
+
+                # Print out progress bar
+                print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, (float(lines_processed)/num_of_lines)),
+
+                # regex lifted from packages.d.o code
+                p = re.compile('^(.+?)\s+(\S+)$')
+                matchs = p.findall(line)
+                filename = matchs[0][0]
+                packages = matchs[0][1].split(',')
+
+                # Iterate through each file's packages
+                for package in packages:
+                    p = re.compile('(\S+)/(\S+)$')
+                    matchs = p.findall(package)
+
+                    # Needed since the DB is unicode, and these files
+                    # are ASCII
+                    section_name = matchs[0][0]
+                    package_name = matchs[0][1]
+
+                    section_id = database.get_section_id(section_name)
+                    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+
+                    if package_id == None:
+                        # Likely got an arch all package
+                        package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+
+                    database.insert_content_path(package_id, filename)
+
+                lines_processed += 1
+            f.close()
+
+    # Commit work
+    print "Committing to database ..."
+    projectB.query("COMMIT")
+
+################################################################################
+
+def main ():
+    global Cnf, projectB, out
+    out = sys.stdout
+
+    Cnf = utils.get_conf()
+
+    Arguments = [('h',"help","Import-Contents::Options::Help"),
+                 ('s',"suite","Import-Contents::Options::Suite","HasArg"),
+                ]
+
+    for i in [ "help", "suite" ]:
+        if not Cnf.has_key("Import-Contents::Options::%s" % (i)):
+            Cnf["Import-Contents::Options::%s" % (i)] = ""
+
+    suites = apt_pkg.ParseCommandLine(Cnf,Arguments,sys.argv)
+    Options = Cnf.SubTree("Import-Contents::Options")
+
+    if Options["Help"]:
+        usage()
+
+    if Options["Suite"]:
+        suites = utils.split_args(Options["Suite"])
+    else:
+        suites = Cnf.SubTree("Suite").List()
+
+    projectB = pg.connect(Cnf["DB::Name"], Cnf["DB::Host"], int(Cnf["DB::Port"]))
+    database.init(Cnf, projectB)
+
+    import_contents(suites)
+
+#######################################################################################
+
+if __name__ == '__main__':
+    main()
diff --git a/dak/update_db.py b/dak/update_db.py
index e59a558c..7d89e6bf 100755
--- a/dak/update_db.py
+++ b/dak/update_db.py
@@ -36,7 +36,7 @@ from daklib import utils
 
 Cnf = None
 projectB = None
 
-required_database_schema = 1
+required_database_schema = 2
 
 ################################################################################
diff --git a/daklib/database.py b/daklib/database.py
index 1f659606..c39c83b1 100755
--- a/daklib/database.py
+++ b/daklib/database.py
@@ -45,6 +45,7 @@ suite_version_cache = {}
 suite_bin_version_cache = {}
 content_path_id_cache = {}
 content_file_id_cache = {}
+insert_contents_file_cache = {}
 
 ################################################################################
 
@@ -250,14 +251,14 @@ def get_suite_version(source, suite, arch):
 
     return version
 
-def get_latest_binary_version_id(binary, suite, arch):
+def get_latest_binary_version_id(binary, section, suite, arch):
     global suite_bin_version_cache
 
-    cache_key = "%s_%s" % (binary, suite)
+    cache_key = "%s_%s_%s_%s" % (binary, section, suite, arch)
 
    if suite_bin_version_cache.has_key(cache_key):
        return suite_bin_version_cache[cache_key]
 
-    q = projectB.query("SELECT b.id, b.version FROM binaries b JOIN bin_associations ba ON (b.id = ba.bin) WHERE b.package = '%s' AND b.architecture = '%d' AND ba.suite = '%d'" % (binary, int(arch), int(suite)))
+    q = projectB.query("SELECT b.id, b.version FROM binaries b JOIN bin_associations ba ON (b.id = ba.bin) JOIN override o ON (o.package=b.package) WHERE b.package = '%s' AND b.architecture = '%d' AND ba.suite = '%d' AND o.section = '%d'" % (binary, int(arch), int(suite), int(section)))
 
     highest_bid, highest_version = None, None
 
@@ -266,6 +267,7 @@ def get_latest_binary_version_id(binary, suite, arch):
             highest_bid = bi[0]
             highest_version = bi[1]
 
+    suite_bin_version_cache[cache_key] = highest_bid
     return highest_bid
 
 ################################################################################
@@ -459,6 +461,14 @@ def get_or_set_contents_path_id(path):
 
 ################################################################################
 
 def insert_content_path(bin_id, fullpath):
+    global insert_contents_file_cache
+    cache_key = "%s_%s" % (bin_id, fullpath)
+
+    # have we seen this contents before?
+    # probably only relevant during package import
+    if insert_contents_file_cache.has_key(cache_key):
+        return
+
     # split the path into basename, and pathname
     (path, file) = os.path.split(fullpath)
 
@@ -466,6 +476,13 @@ def insert_content_path(bin_id, fullpath):
     file_id = get_or_set_contents_file_id(file)
     path_id = get_or_set_contents_path_id(path)
 
+    # Determine if we're inserting a duplicate row
+    q = projectB.query("SELECT 1 FROM content_associations WHERE binary_pkg = '%d' AND filepath = '%d' AND filename = '%d'" % (int(bin_id), path_id, file_id))
+    if q.getresult():
+        # Yes we are, return without doing the insert
+        print "Inserting dup row"
+        return
+
     # Put them into content_associations
     projectB.query("INSERT INTO content_associations VALUES (DEFAULT, '%d', '%d', '%d')" % (bin_id, path_id, file_id))
     return
diff --git a/docs/README.quotes b/docs/README.quotes
index 3568ae7a..c696fbeb 100644
--- a/docs/README.quotes
+++ b/docs/README.quotes
@@ -344,3 +344,9 @@ Canadians: This is a lighthouse. Your call.
 elmo: I can't believe people pay you to fix computers
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+* Ganneff ponders how to best write the text to -devel. (need to tell em in
+  case they find more bugs). "We fixed the fucking idiotic broken implementation
+  to be less so" is probably not the nicest, even if perfect valid, way to say so
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
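
A minimal sketch, not part of the commit above: once dak update-db has applied
dakdb/update2.py (bumping db_revision to 2) and dak import-contents has filled
the new tables, the comma_separated_list aggregate lets Contents-style
"path/file  pkg1,pkg2" lines be rebuilt with a single grouped query. The table
and column names below come from update2.py; the exact query generate-contents
ends up using may differ.

#!/usr/bin/env python
# Sketch only: assumes a configured dak setup and the schema created by
# dakdb/update2.py above.

import pg
from daklib import database, utils

Cnf = utils.get_conf()
projectB = pg.connect(Cnf["DB::Name"], Cnf["DB::Host"], int(Cnf["DB::Port"]))
database.init(Cnf, projectB)

# Join the association table back to its path/name/binary lookups and collapse
# each file's package list with the comma_separated_list aggregate.
q = projectB.query("""
    SELECT p.path || '/' || n.file AS filename,
           comma_separated_list(b.package) AS packages
      FROM content_associations ca
      JOIN content_file_paths p ON (ca.filepath = p.id)
      JOIN content_file_names n ON (ca.filename = n.id)
      JOIN binaries b ON (ca.binary_pkg = b.id)
  GROUP BY p.path, n.file""")

for filename, packages in q.getresult():
    print "%-55s %s" % (filename, packages)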