From: Michael Casadevall Date: Sun, 4 Jan 2009 00:27:54 +0000 (-0500) Subject: Modified import_contents to use less selects, increasing performance X-Git-Url: https://git.decadent.org.uk/gitweb/?a=commitdiff_plain;h=9c7be82789039fb58aaf8667652ca4b74bfd6d46;p=dak.git Modified import_contents to use less selects, increasing performance to acceptable limits. Signed-off-by: Michael Casadevall --- diff --git a/dak/import_contents.py b/dak/import_contents.py index 945b9ea6..06691957 100755 --- a/dak/import_contents.py +++ b/dak/import_contents.py @@ -33,6 +33,9 @@ Cnf = None projectB = None out = None AptCnf = None +content_path_id_cache = {} +content_file_id_cache = {} +insert_contents_file_cache = {} ################################################################################ @@ -47,6 +50,50 @@ Import Contents files ################################################################################ + +def set_contents_file_id(file): + global content_file_id_cache + + if not content_file_id_cache.has_key(file): + # since this can be called within a transaction, we can't use currval + q = projectB.query("INSERT INTO content_file_names VALUES (DEFAULT, '%s') RETURNING id" % (file)) + content_file_id_cache[file] = int(q.getresult()[0][0]) + return content_file_id_cache[file] + +################################################################################ + +def set_contents_path_id(path): + global content_path_id_cache + + if not content_path_id_cache.has_key(path): + q = projectB.query("INSERT INTO content_file_paths VALUES (DEFAULT, '%s') RETURNING id" % (path)) + content_path_id_cache[path] = int(q.getresult()[0][0]) + return content_path_id_cache[path] + +################################################################################ + +def insert_content_path(bin_id, fullpath): + global insert_contents_file_cache + cache_key = "%s_%s" % (bin_id, fullpath) + + # have we seen this contents before? 
+ # probably only relevant during package import + if insert_contents_file_cache.has_key(cache_key): + return + + # split the path into basename, and pathname + (path, file) = os.path.split(fullpath) + + # Get the necessary IDs ... + file_id = set_contents_file_id(file) + path_id = set_contents_path_id(path) + + # Put them into content_associations + projectB.query("INSERT INTO content_associations VALUES (DEFAULT, '%d', '%d', '%d')" % (bin_id, path_id, file_id)) + return + +################################################################################ + def import_contents(suites): global projectB, Cnf @@ -56,6 +103,11 @@ def import_contents(suites): # Needed to make sure postgreSQL doesn't freak out on some of the data projectB.query("SET CLIENT_ENCODING TO 'LATIN1'") + # Prep regexs + line_regex = re.compile(r'^(.+?)\s+(\S+)$') + pkg_regex = re.compile(r'(\S+)/(\S+)$') + file_regex = re.compile('^FILE') + # Get our suites, and the architectures for s in suites: suite_id = database.get_suite_id(s) @@ -89,8 +141,7 @@ def import_contents(suites): sys.exit(255) lines_processed += 1 - p = re.compile('^FILE') - if p.match(line): + if file_regex.match(line): found_header = True continue @@ -99,18 +150,16 @@ def import_contents(suites): # to split the two bits # Print out progress bar - print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, (float(lines_processed)/num_of_lines)), + print "\rProcessed %d lines of %d (%%%.2f)" % (lines_processed, num_of_lines, ((float(lines_processed)/num_of_lines)*100)), # regex lifted from packages.d.o code - p = re.compile('^(.+?)\s+(\S+)$') - matchs = p.findall(line) + matchs = line_regex.findall(line) filename = matchs[0][0] packages = matchs[0][1].split(',') # Iterate through each file's packages for package in packages: - p = re.compile('(\S+)/(\S+)$') - matchs = p.findall(package) + matchs = pkg_regex.findall(package) # Needed since the DB is unicode, and these files # are ASCII @@ -124,7 +173,7 @@ def 
import_contents(suites): # Likely got an arch all package package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id) - database.insert_content_path(package_id, filename) + insert_content_path(package_id, filename) lines_processed += 1 f.close() diff --git a/daklib/database.py b/daklib/database.py index c39c83b1..f6bedf3d 100755 --- a/daklib/database.py +++ b/daklib/database.py @@ -431,12 +431,8 @@ def get_or_set_contents_file_id(file): q = projectB.query(sql_select) if not q.getresult(): # since this can be called within a transaction, we can't use currval - q = projectB.query("SELECT nextval('content_file_names_id_seq')") - file_id = int(q.getresult()[0][0]) - projectB.query("INSERT INTO content_file_names VALUES ('%d', '%s')" % (file_id, file)) - content_file_id_cache[file] = file_id - else: - content_file_id_cache[file] = int(q.getresult()[0][0]) + q = projectB.query("INSERT INTO content_file_names VALUES (DEFAULT, '%s') RETURNING id" % (file)) + content_file_id_cache[file] = int(q.getresult()[0][0]) return content_file_id_cache[file] ################################################################################ @@ -449,13 +445,8 @@ def get_or_set_contents_path_id(path): q = projectB.query(sql_select) if not q.getresult(): # since this can be called within a transaction, we can't use currval - q = projectB.query("SELECT nextval('content_file_names_id_seq')") - path_id = int(q.getresult()[0][0]) - projectB.query("INSERT INTO content_file_paths VALUES ('%d', '%s')" % ( path_id, path)) - content_path_id_cache[path] = path_id - else: - content_path_id_cache[path] = int(q.getresult()[0][0]) - + q = projectB.query("INSERT INTO content_file_paths VALUES (DEFAULT, '%s') RETURNING id" % (path)) + content_path_id_cache[path] = int(q.getresult()[0][0]) return content_path_id_cache[path] ################################################################################ @@ -480,7 +471,6 @@ def insert_content_path(bin_id, fullpath): q 
= projectB.query("SELECT 1 FROM content_associations WHERE binary_pkg = '%d' AND filepath = '%d' AND filename = '%d'" % (int(bin_id), path_id, file_id)) if q.getresult(): # Yes we are, return without doing the insert - print "Inserting dup row" return # Put them into content_assiocations