3 """ Populate the DB """
4 # Copyright (C) 2000, 2001, 2002, 2003, 2004, 2006 James Troup <james@nocrew.org>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 ###############################################################################
22 # 04:36|<aj> elmo: you're making me waste 5 seconds per architecture!!!!!! YOU BASTARD!!!!!
24 ###############################################################################
26 # This code is a horrible mess for two reasons:
28 # (o) For Debian's usage, it's doing something like 160k INSERTs,
29 # even on auric, that makes the program unusable unless we get
30 # involved in all sorts of silly optimization games (local dicts to avoid
31 # redundant SELECTS, using COPY FROM rather than INSERTS etc.)
33 # (o) It's very site specific, because I don't expect to use this
34 # script again in a hurry, and I don't want to spend any more time
35 # on it than absolutely necessary.
37 ###############################################################################
39 import commands, os, pg, sys, time
41 from daklib import database
42 from daklib import utils
43 from daklib.dak_exceptions import *
44 from daklib.regexes import re_arch_from_filename, re_taint_free, re_no_epoch, \
45 re_extract_src_version
47 ###############################################################################
55 location_path_cache = {}
59 src_associations_id_serial = 0
60 dsc_files_id_serial = 0
61 files_query_cache = None
62 source_query_cache = None
63 src_associations_query_cache = None
64 dsc_files_query_cache = None
65 orig_tar_gz_cache = {}
67 binaries_id_serial = 0
68 binaries_query_cache = None
69 bin_associations_id_serial = 0
70 bin_associations_query_cache = None
72 source_cache_for_binaries = {}
75 ################################################################################
def usage(exit_code=0):
    """Print the command-line help text, then exit.

    exit_code -- status handed to sys.exit() (default 0).
    """
    # A single parenthesised argument keeps this valid as a Python 2 print
    # statement as well as a function call.
    print("""Usage: dak import-archive
Initializes a projectB database from an existing archive

  -a, --action            actually perform the initalization
  -h, --help              show this help and exit.""")
    sys.exit(exit_code)
85 ###############################################################################
def reject (str, prefix="Rejected: "):
    """Append one line to the module-level reject_message.

    str    -- message text; an empty message is ignored.
    prefix -- text placed in front of the message (default "Rejected: ").
    """
    # reject_message is rebound by +=, so without this declaration the
    # assignment would raise UnboundLocalError.
    global reject_message
    if str:
        reject_message += prefix + str + "\n"
92 ###############################################################################
# check_signature(filename): run gpgv over `filename' and vet what it reports
# on its --status-fd channel; every problem found is recorded via reject().
#
# NOTE(review): this block is incomplete as it stands.  Each line still
# carries the number it had in an earlier listing, indentation is gone, and
# the jumps in that numbering show that statements are missing -- among them
# whatever initialises `keywords', `split' and `args', the conditions guarding
# several reject() calls, and any return statement (process_sources() uses the
# result as a fingerprint).  The comments below describe only what is visible.
94 def check_signature (filename):
# The basename must match re_taint_free before the name is put into the gpgv
# command line built below.
95 if not re_taint_free.match(os.path.basename(filename)):
96 reject("!!WARNING!! tainted filename: '%s'." % (filename))
# gpgv is told to write its status lines to the write end of this pipe.
99 status_read, status_write = os.pipe()
100 cmd = "gpgv --status-fd %s %s %s" \
101 % (status_write, utils.gpg_keyring_args(), filename)
102 (output, status, exit_status) = utils.gpgv_get_status_output(cmd, status_read, status_write)
104 # Process the status-fd output
# Parse problems are collected in internal_error rather than rejected one by
# one; `bad' starts out as the same empty string.
106 bad = internal_error = ""
107 for line in status.split('\n'):
113 internal_error += "gpgv status line is malformed (< 2 atoms) ['%s'].\n" % (line)
# The first two atoms of a status line are the "[GNUPG:]" marker and the
# status keyword.
115 (gnupg, keyword) = split[:2]
116 if gnupg != "[GNUPG:]":
117 internal_error += "gpgv status line is malformed (incorrect prefix '%s').\n" % (gnupg)
# Only NODATA and SIGEXPIRED are allowed to appear more than once.
120 if keywords.has_key(keyword) and keyword != "NODATA" and keyword != "SIGEXPIRED":
121 internal_error += "found duplicate status token ('%s').\n" % (keyword)
124 keywords[keyword] = args
126 # If we failed to parse the status-fd output, let's just whine and bail now
# NOTE(review): the condition guarding these three calls is not visible;
# presumably they run only when internal_error is non-empty -- confirm.
128 reject("internal error while performing signature check on %s." % (filename))
129 reject(internal_error, "")
130 reject("Please report the above errors to the Archive maintainers by replying to this mail.", "")
133 # Now check for obviously bad things in the processed output
# An expired signing key only produces a warning; the cases below reject.
134 if keywords.has_key("SIGEXPIRED"):
135 utils.warn("%s: signing key has expired." % (filename))
136 if keywords.has_key("KEYREVOKED"):
137 reject("key used to sign %s has been revoked." % (filename))
139 if keywords.has_key("BADSIG"):
140 reject("bad signature on %s." % (filename))
# ERRSIG is tolerated when it comes together with NO_PUBKEY.
142 if keywords.has_key("ERRSIG") and not keywords.has_key("NO_PUBKEY"):
143 reject("failed to check signature on %s." % (filename))
# With NO_PUBKEY the first argument of that token is taken as the fingerprint.
145 if keywords.has_key("NO_PUBKEY"):
146 args = keywords["NO_PUBKEY"]
148 reject("internal error while checking signature on %s." % (filename))
151 fingerprint = args[0]
152 if keywords.has_key("BADARMOR"):
153 reject("ascii armour of signature was corrupt in %s." % (filename))
# A file with no signature at all is only warned about; the reject() that
# would have refused it is commented out below.
155 if keywords.has_key("NODATA"):
156 utils.warn("no signature found for %s." % (filename))
158 #reject("no signature found in %s." % (filename))
164 # Next check gpgv exited with a zero return code
165 if exit_status and not keywords.has_key("NO_PUBKEY"):
166 reject("gpgv failed while checking %s." % (filename))
# Both gpgv output streams are appended with an empty prefix so that they
# follow the message above.
168 reject(utils.prefix_multi_line_string(status, " [GPG status-fd output:] "), "")
170 reject(utils.prefix_multi_line_string(output, " [GPG output:] "), "")
173 # Sanity check the good stuff we expect
# Missing VALIDSIG / GOODSIG / SIG_ID are each excused when NO_PUBKEY is set.
174 if not keywords.has_key("VALIDSIG"):
175 if not keywords.has_key("NO_PUBKEY"):
176 reject("signature on %s does not appear to be valid [No VALIDSIG]." % (filename))
# Otherwise the first VALIDSIG argument is taken as the fingerprint.
179 args = keywords["VALIDSIG"]
181 reject("internal error while checking signature on %s." % (filename))
184 fingerprint = args[0]
185 if not keywords.has_key("GOODSIG") and not keywords.has_key("NO_PUBKEY"):
186 reject("signature on %s does not appear to be valid [No GOODSIG]." % (filename))
188 if not keywords.has_key("SIG_ID") and not keywords.has_key("NO_PUBKEY"):
189 reject("signature on %s does not appear to be valid [No SIG_ID]." % (filename))
192 # Finally ensure there's not something we don't recognise
# NOTE(review): the utils.Dict(...) call below is cut off -- its remaining
# keywords and closing parenthesis are not visible.
193 known_keywords = utils.Dict(VALIDSIG="",SIG_ID="",GOODSIG="",BADSIG="",ERRSIG="",
194 SIGEXPIRED="",KEYREVOKED="",NO_PUBKEY="",BADARMOR="",
197 for keyword in keywords.keys():
198 if not known_keywords.has_key(keyword):
199 reject("found unknown status token '%s' from gpgv with args '%r' in %s." % (keyword, keywords[keyword], filename))
207 ################################################################################
def poolify (s, sub):
    """Prepare a filename or directory (s) to be file.filename by stripping
    any part of the location (sub) from it.

    The longest suffix of `sub' that is also a prefix of `s' is removed from
    the front of `s'.  When no suffix of `sub' matches, `s' is returned
    unchanged, so callers can always treat the result as a string.
    """
    sub_len = len(sub)
    # i == 0 tries the whole of `sub'; larger i tries ever shorter suffixes.
    for i in range(sub_len):
        if sub[i:] == s[0:sub_len - i]:
            return s[sub_len - i:]
    return s
def update_archives ():
    """Rebuild the `archive' table from the "Archive" configuration tree.

    Every existing row is deleted, then one row is inserted per configured
    archive holding its name, OriginServer and Description.
    """
    projectB.query("DELETE FROM archive")
    for archive in Cnf.SubTree("Archive").List():
        SubSec = Cnf.SubTree("Archive::%s" % (archive))
        # NOTE(review): values are interpolated straight into the SQL text, so
        # a configuration value containing a single quote breaks the statement.
        projectB.query("INSERT INTO archive (name, origin_server, description) VALUES ('%s', '%s', '%s')"
                       % (archive, SubSec["OriginServer"], SubSec["Description"]))
def update_components ():
    """Rebuild the `component' table from the "Component" configuration tree.

    Every existing row is deleted, then one row is inserted per configured
    component holding its name, Description and MeetsDFSG value.
    """
    projectB.query("DELETE FROM component")
    for component in Cnf.SubTree("Component").List():
        SubSec = Cnf.SubTree("Component::%s" % (component))
        # NOTE(review): values are interpolated straight into the SQL text.
        projectB.query("INSERT INTO component (name, description, meets_dfsg) VALUES ('%s', '%s', '%s')" %
                       (component, SubSec["Description"], SubSec["MeetsDFSG"]))
def update_locations ():
    """Rebuild the `location' table from the "Location" configuration tree.

    A "legacy-mixed" location gets a single row without a component; any
    other location gets one row per configured component.
    """
    projectB.query("DELETE FROM location")
    for location in Cnf.SubTree("Location").List():
        SubSec = Cnf.SubTree("Location::%s" % (location))
        archive_id = database.get_archive_id(SubSec["archive"])
        location_type = SubSec.Find("type")
        if location_type == "legacy-mixed":
            projectB.query("INSERT INTO location (path, archive, type) VALUES ('%s', %d, '%s')" % (location, archive_id, SubSec["type"]))
        else:
            # Without this branch the per-component rows would also be
            # written for legacy-mixed locations.
            for component in Cnf.SubTree("Component").List():
                component_id = database.get_component_id(component)
                projectB.query("INSERT INTO location (path, component, archive, type) VALUES ('%s', %d, %d, '%s')" %
                               (location, component_id, archive_id, SubSec["type"]))
def update_architectures ():
    """Rebuild the `architecture' table from the "Architectures" tree.

    Each configured key becomes arch_string; its configuration value becomes
    the description.
    """
    projectB.query("DELETE FROM architecture")
    for arch in Cnf.SubTree("Architectures").List():
        description = Cnf["Architectures::%s" % (arch)]
        projectB.query("INSERT INTO architecture (arch_string, description) VALUES ('%s', '%s')" % (arch, description))
def update_suites ():
    """Rebuild the `suite' table and its suite_architectures links.

    For each configured suite a row is inserted under the lower-cased suite
    name, the optional Version/Origin/Description values are filled in, and
    one suite_architectures row is added per architecture of the suite.
    """
    projectB.query("DELETE FROM suite")
    for suite in Cnf.SubTree("Suite").List():
        SubSec = Cnf.SubTree("Suite::%s" %(suite))
        suite_name = suite.lower()
        projectB.query("INSERT INTO suite (suite_name) VALUES ('%s')" % suite_name)
        for i in ("Version", "Origin", "Description"):
            if SubSec.has_key(i):
                projectB.query("UPDATE suite SET %s = '%s' WHERE suite_name = '%s'" % (i.lower(), SubSec[i], suite_name))
        # currval('suite_id_seq') refers to the row inserted above, so these
        # inserts must stay inside the per-suite loop.
        # NOTE(review): get_suite_architectures is not defined in the part of
        # the file that is visible -- confirm where it comes from.
        for architecture in get_suite_architectures(suite):
            architecture_id = database.get_architecture_id (architecture)
            projectB.query("INSERT INTO suite_architectures (suite, architecture) VALUES (currval('suite_id_seq'), %d)" % (architecture_id))
def update_override_type():
    """Rebuild the `override_type' table from the "OverrideType" value list."""
    projectB.query("DELETE FROM override_type")
    for override_type in Cnf.ValueList("OverrideType"):
        projectB.query("INSERT INTO override_type (type) VALUES ('%s')" % (override_type))
def update_priority():
    """Rebuild the `priority' table from the "Priority" configuration tree.

    Each configured key becomes the priority name; its configuration value is
    written unquoted as the level.
    """
    projectB.query("DELETE FROM priority")
    for priority in Cnf.SubTree("Priority").List():
        level = Cnf["Priority::%s" % (priority)]
        projectB.query("INSERT INTO priority (priority, level) VALUES ('%s', %s)" % (priority, level))
def update_section():
    """Rebuild the `section' table for every component/section combination.

    Sections of the 'main' component are stored bare.  For any other
    component the component name is attached either in front ("component/")
    or behind ("/component"), depending on whether
    Control-Overrides::ComponentPosition is "prefix".
    """
    projectB.query("DELETE FROM section")
    for component in Cnf.SubTree("Component").List():
        # Both parts default to empty so that neither is ever left unset.
        prefix = ""
        suffix = ""
        if Cnf["Control-Overrides::ComponentPosition"] == "prefix":
            if component != 'main':
                prefix = component + '/'
        else:
            if component != 'main':
                suffix = '/' + component
        for section in Cnf.ValueList("Section"):
            projectB.query("INSERT INTO section (section) VALUES ('%s%s%s')" % (prefix, section, suffix))
def get_location_path(directory):
    """Return the location path matching `directory', caching the answer.

    directory -- fragment matched against location.path with the `~'
                 operator of the SELECT below.

    The first path returned by the query is cached in location_path_cache and
    returned.  When the query yields no row, utils.fubar() is called.
    """
    global location_path_cache

    if directory in location_path_cache:
        return location_path_cache[directory]

    q = projectB.query("SELECT DISTINCT path FROM location WHERE path ~ '%s'" % (directory))
    try:
        path = q.getresult()[0][0]
    except IndexError:
        # An empty result list has no [0].
        # NOTE(review): presumably utils.fubar() does not return -- confirm.
        utils.fubar("[import-archive] get_location_path(): Couldn't get path for %s" % (directory))
    location_path_cache[directory] = path
    return path
303 ################################################################################
def get_or_set_files_id (filename, size, md5sum, location_id):
    """Return the files-table id for a file, allocating one on first sight.

    filename, size, md5sum -- strings identifying the file (they are joined
                              with "_" to build the cache key).
    location_id            -- integer id of the file's location.

    A file not seen before gets the next value of files_id_serial and a
    tab-separated row is appended to files_query_cache; the last column is
    written as \\N.
    """
    global files_id_cache, files_id_serial, files_query_cache

    cache_key = "_".join((filename, size, md5sum, repr(location_id)))
    if cache_key not in files_id_cache:
        # Each new file needs its own id; without the increment every row
        # would reuse the same serial.
        files_id_serial += 1
        files_query_cache.write("%d\t%s\t%s\t%s\t%d\t\\N\n" % (files_id_serial, filename, size, md5sum, location_id))
        files_id_cache[cache_key] = files_id_serial

    return files_id_cache[cache_key]
316 ###############################################################################
# process_sources(filename, suite, component, archive): walk a Sources index
# stanza by stanza and append rows for the source, dsc_files and
# src_associations tables to the matching *_query_cache files; the files
# referenced by each .dsc are registered through get_or_set_files_id().
#
# NOTE(review): this block is incomplete as it stands.  Each line still
# carries the number it had in an earlier listing, indentation is gone, and
# the jumps in that numbering show that statements are missing (for instance
# the `try:' matching the except clause, and the assignment of `files_id').
# The comments below describe only what is visible.
318 def process_sources (filename, suite, component, archive):
319 global source_cache, source_query_cache, src_associations_query_cache, dsc_files_query_cache, source_id_serial, src_associations_id_serial, dsc_files_id_serial, source_cache_for_binaries, orig_tar_gz_cache, reject_message
# Suites are looked up by their lower-cased name.
321 suite = suite.lower()
322 suite_id = database.get_suite_id(suite)
324 file = utils.open_file (filename)
325 except CantOpenError:
326 utils.warn("can't open '%s'" % (filename))
# NOTE(review): apt_pkg is not imported in the part of the file that is
# visible -- confirm.
328 Scanner = apt_pkg.ParseTagFile(file)
329 while Scanner.Step() != 0:
330 package = Scanner.Section["package"]
331 version = Scanner.Section["version"]
332 directory = Scanner.Section["directory"]
# The .dsc name is built from the version with its epoch removed.
333 dsc_file = os.path.join(Cnf["Dir::Root"], directory, "%s_%s.dsc" % (package, re_no_epoch.sub('', version)))
334 # Sometimes the Directory path is a lie; check in the pool
335 if not os.path.exists(dsc_file):
336 if directory.split('/')[0] == "dists":
337 directory = Cnf["Dir::PoolRoot"] + utils.poolify(package, component)
338 dsc_file = os.path.join(Cnf["Dir::Root"], directory, "%s_%s.dsc" % (package, re_no_epoch.sub('', version)))
339 if not os.path.exists(dsc_file):
340 utils.fubar("%s not found." % (dsc_file))
# The install date is the modification time of the .dsc, as YYYY-MM-DD.
341 install_date = time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(dsc_file)))
342 fingerprint = check_signature(dsc_file)
343 fingerprint_id = database.get_or_set_fingerprint_id(fingerprint)
# NOTE(review): the condition guarding this call is not visible; presumably
# it fires only when reject_message is non-empty -- confirm.
345 utils.fubar("%s: %s" % (dsc_file, reject_message))
346 maintainer = Scanner.Section["maintainer"]
# Single quotes in the maintainer string are backslash-escaped before lookup.
347 maintainer = maintainer.replace("'", "\\'")
348 maintainer_id = database.get_or_set_maintainer_id(maintainer)
# The first path element of the directory selects the location.
349 location = get_location_path(directory.split('/')[0])
350 location_id = database.get_location_id (location, component, archive)
351 if not directory.endswith("/"):
353 directory = poolify (directory, location)
354 if directory != "" and not directory.endswith("/"):
356 no_epoch_version = re_no_epoch.sub('', version)
357 # Add all files referenced by the .dsc to the files table
# Each non-empty line of the Files field holds: md5sum, size, filename.
359 for line in Scanner.Section["files"].split('\n'):
361 (md5sum, size, filename) = line.strip().split()
362 # Don't duplicate .orig.tar.gz's
# An .orig.tar.gz is keyed on name, size and md5sum only, so the same tarball
# met again reuses the id it got first.
363 if filename.endswith(".orig.tar.gz"):
364 cache_key = "%s_%s_%s" % (filename, size, md5sum)
365 if orig_tar_gz_cache.has_key(cache_key):
366 id = orig_tar_gz_cache[cache_key]
368 id = get_or_set_files_id (directory + filename, size, md5sum, location_id)
369 orig_tar_gz_cache[cache_key] = id
371 id = get_or_set_files_id (directory + filename, size, md5sum, location_id)
373 # If this is the .dsc itself; save the ID for later.
374 if filename.endswith(".dsc"):
376 filename = directory + package + '_' + no_epoch_version + '.dsc'
# A source row is written once per package/version pair; cache_key and
# nasty_key are built from the same two values.
377 cache_key = "%s_%s" % (package, version)
378 if not source_cache.has_key(cache_key):
379 nasty_key = "%s_%s" % (package, version)
380 source_id_serial += 1
# source_cache_for_binaries keeps the first id seen for a package/version; it
# is read by process_packages() to link binaries to their source.
381 if not source_cache_for_binaries.has_key(nasty_key):
382 source_cache_for_binaries[nasty_key] = source_id_serial
383 tmp_source_id = source_id_serial
384 source_cache[cache_key] = source_id_serial
# NOTE(review): `files_id' is used here but its assignment is not visible.
385 source_query_cache.write("%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (source_id_serial, package, version, maintainer_id, files_id, install_date, fingerprint_id))
387 dsc_files_id_serial += 1
388 dsc_files_query_cache.write("%d\t%d\t%d\n" % (dsc_files_id_serial, tmp_source_id,id))
# A source already recorded just has its cached id reused.
390 tmp_source_id = source_cache[cache_key]
# An association row ties the source id to the suite.
392 src_associations_id_serial += 1
393 src_associations_query_cache.write("%d\t%d\t%d\n" % (src_associations_id_serial, suite_id, tmp_source_id))
397 ###############################################################################
# process_packages(filename, suite, component, archive): walk a Packages
# index stanza by stanza and append rows for the binaries and
# bin_associations tables to the matching *_query_cache files; each package
# file is registered through get_or_set_files_id().
#
# NOTE(review): this block is incomplete as it stands.  Each line still
# carries the number it had in an earlier listing, indentation is gone, and
# the jumps in that numbering show that statements are missing (for instance
# the `try:' matching the except clause and everything that maintains
# `count_total', `count_bad' and `type').  The comments below describe only
# what is visible.
399 def process_packages (filename, suite, component, archive):
400 global arch_all_cache, binary_cache, binaries_id_serial, binaries_query_cache, bin_associations_id_serial, bin_associations_query_cache, reject_message
# Suites are looked up by their lower-cased name.
404 suite = suite.lower()
405 suite_id = database.get_suite_id(suite)
407 file = utils.open_file (filename)
408 except CantOpenError:
409 utils.warn("can't open '%s'" % (filename))
# NOTE(review): apt_pkg is not imported in the part of the file that is
# visible -- confirm.
411 Scanner = apt_pkg.ParseTagFile(file)
412 while Scanner.Step() != 0:
413 package = Scanner.Section["package"]
414 version = Scanner.Section["version"]
415 maintainer = Scanner.Section["maintainer"]
# Single quotes in the maintainer string are backslash-escaped before lookup.
416 maintainer = maintainer.replace("'", "\\'")
417 maintainer_id = database.get_or_set_maintainer_id(maintainer)
418 architecture = Scanner.Section["architecture"]
419 architecture_id = database.get_architecture_id (architecture)
# No signature is checked for binaries: every row gets the "NOSIG" fingerprint.
420 fingerprint = "NOSIG"
421 fingerprint_id = database.get_or_set_fingerprint_id(fingerprint)
# NOTE(review): the branch taken when there is no Source field is not visible.
422 if not Scanner.Section.has_key("source"):
425 source = Scanner.Section["source"]
# A Source field containing "(" is matched against re_extract_src_version and
# group 2 of the match is used as the source version.
427 if source.find("(") != -1:
428 m = re_extract_src_version.match(source)
430 source_version = m.group(2)
# Without an explicit source version the binary's own version is used.
431 if not source_version:
432 source_version = version
433 filename = Scanner.Section["filename"]
434 if filename.endswith(".deb"):
# The first path element of the filename selects the location; for that
# lookup the component has any "/debian-installer" part removed.
438 location = get_location_path(filename.split('/')[0])
439 location_id = database.get_location_id (location, component.replace("/debian-installer", ""), archive)
440 filename = poolify (filename, location)
# For arch-all packages, whatever re_arch_from_filename matches in the
# filename is replaced with "binary-all".
441 if architecture == "all":
442 filename = re_arch_from_filename.sub("binary-all", filename)
# The source id comes from the cache filled by process_sources(); it is None
# when no matching source was recorded.
443 cache_key = "%s_%s" % (source, source_version)
444 source_id = source_cache_for_binaries.get(cache_key, None)
445 size = Scanner.Section["size"]
446 md5sum = Scanner.Section["md5sum"]
447 files_id = get_or_set_files_id (filename, size, md5sum, location_id)
# arch_all_cache remembers every package/version/source/arch/location/file/
# suite combination already handled.
448 cache_key = "%s_%s_%s_%d_%d_%d_%d" % (package, version, repr(source_id), architecture_id, location_id, files_id, suite_id)
449 if not arch_all_cache.has_key(cache_key):
450 arch_all_cache[cache_key] = 1
# A binaries row is written once per package/version/source/arch combination.
451 cache_key = "%s_%s_%s_%d" % (package, version, repr(source_id), architecture_id)
452 if not binary_cache.has_key(cache_key):
457 source_id = repr(source_id)
458 binaries_id_serial += 1
# NOTE(review): no assignment to `type' is visible before this use.
459 binaries_query_cache.write("%d\t%s\t%s\t%d\t%s\t%d\t%d\t%s\t%s\n" % (binaries_id_serial, package, version, maintainer_id, source_id, architecture_id, files_id, type, fingerprint_id))
460 binary_cache[cache_key] = binaries_id_serial
461 tmp_binaries_id = binaries_id_serial
# A binary already recorded just has its cached id reused.
463 tmp_binaries_id = binary_cache[cache_key]
# An association row ties the binary id to the suite.
465 bin_associations_id_serial += 1
466 bin_associations_query_cache.write("%d\t%d\t%d\n" % (bin_associations_id_serial, suite_id, tmp_binaries_id))
# Two summary forms: one with a computed percentage, one with fixed zeros.
# NOTE(review): the condition choosing between them is not visible.
471 print "%d binary packages processed; %d with no source match which is %.2f%%" % (count_total, count_bad, (float(count_bad)/count_total)*100)
473 print "%d binary packages processed; 0 with no source match which is 0%%" % (count_total)
475 ###############################################################################
def do_sources(sources, suite, component, server):
    """Decompress one Sources.gz and feed the result to process_sources().

    sources   -- path of the gzip-compressed Sources index.
    suite     -- suite name passed through to process_sources().
    component -- component name passed through to process_sources().
    server    -- archive name passed through to process_sources().

    The index is gunzipped into a temporary file, which is removed again
    whether or not processing succeeds.
    """
    # NOTE(review): `fd' is neither used nor closed here -- confirm whether
    # utils.temp_filename() expects its caller to close it.
    (fd, temp_filename) = utils.temp_filename()
    try:
        (result, output) = commands.getstatusoutput("gunzip -c %s > %s" % (sources, temp_filename))
        if result != 0:
            utils.fubar("Gunzip invocation failed!\n%s" % (output), result)
        print('Processing '+sources+'...')
        process_sources (temp_filename, suite, component, server)
    finally:
        # Do not leave the temporary file behind when processing fails.
        os.unlink(temp_filename)
486 ###############################################################################
def do_da_do_da ():
    """Re-create the database and import the whole archive into it.

    Steps, in order:
      1. read the configuration and command line; show the help text when
         asked, and refuse to run unless -a/--action was given;
      2. run init_pool.sql through psql and connect to the database;
      3. fill the static tables from the configuration;
      4. walk every configured location, turning its Sources and Packages
         indices into tab-separated dump files in Import-Archive::ExportDir;
      5. load those dumps with COPY ... FROM and commit;
      6. run add_constraints.sql through psql.
    """
    global Cnf, projectB, files_query_cache, source_query_cache, src_associations_query_cache, dsc_files_query_cache, bin_associations_query_cache, binaries_query_cache

    Cnf = utils.get_conf()
    Arguments = [('a', "action", "Import-Archive::Options::Action"),
                 ('h', "help", "Import-Archive::Options::Help")]
    # Make sure both options exist so the lookups below cannot fail.
    for i in [ "action", "help" ]:
        if not Cnf.has_key("Import-Archive::Options::%s" % (i)):
            Cnf["Import-Archive::Options::%s" % (i)] = ""

    apt_pkg.ParseCommandLine(Cnf, Arguments, sys.argv)

    Options = Cnf.SubTree("Import-Archive::Options")
    if Options["Help"]:
        usage()

    if not Options["Action"]:
        utils.warn("""no -a/--action given; not doing anything.
Please read the documentation before running this script.
""")
        usage(1)

    print("Re-Creating DB...")
    (result, output) = commands.getstatusoutput("psql -f init_pool.sql template1")
    if result != 0:
        utils.fubar("psql invocation failed!\n", result)
    print(output)

    projectB = pg.connect(Cnf["DB::Name"], Cnf["DB::Host"], int(Cnf["DB::Port"]))

    database.init (Cnf, projectB)

    print("Adding static tables from conf file...")
    projectB.query("BEGIN WORK")
    # Order matters: locations look up archive and component ids, and suites
    # look up architecture ids.
    update_architectures()
    update_components()
    update_archives()
    update_locations()
    update_suites()
    update_override_type()
    update_priority()
    update_section()
    projectB.query("COMMIT WORK")

    export_dir = Cnf["Import-Archive::ExportDir"]
    files_query_cache = utils.open_file(export_dir+"files","w")
    source_query_cache = utils.open_file(export_dir+"source","w")
    src_associations_query_cache = utils.open_file(export_dir+"src_associations","w")
    dsc_files_query_cache = utils.open_file(export_dir+"dsc_files","w")
    binaries_query_cache = utils.open_file(export_dir+"binaries","w")
    bin_associations_query_cache = utils.open_file(export_dir+"bin_associations","w")

    projectB.query("BEGIN WORK")
    # Process Sources files to populate `source' and friends
    for location in Cnf.SubTree("Location").List():
        SubSec = Cnf.SubTree("Location::%s" % (location))
        server = SubSec["Archive"]
        location_type = Cnf.Find("Location::%s::Type" % (location))
        if location_type == "legacy-mixed":
            sources = location + 'Sources.gz'
            suite = Cnf.Find("Location::%s::Suite" % (location))
            do_sources(sources, suite, "", server)
        elif location_type == "legacy" or location_type == "pool":
            for suite in Cnf.ValueList("Location::%s::Suites" % (location)):
                for component in Cnf.SubTree("Component").List():
                    sources = Cnf["Dir::Root"] + "dists/" + Cnf["Suite::%s::CodeName" % (suite)] + '/' + component + '/source/' + 'Sources.gz'
                    do_sources(sources, suite, component, server)
        else:
            utils.fubar("Unknown location type ('%s')." % (location_type))

    # Process Packages files to populate `binaries' and friends
    for location in Cnf.SubTree("Location").List():
        SubSec = Cnf.SubTree("Location::%s" % (location))
        server = SubSec["Archive"]
        location_type = Cnf.Find("Location::%s::Type" % (location))
        if location_type == "legacy-mixed":
            packages = location + 'Packages'
            suite = Cnf.Find("Location::%s::Suite" % (location))
            print('Processing '+location+'...')
            process_packages (packages, suite, "", server)
        elif location_type == "legacy" or location_type == "pool":
            for suite in Cnf.ValueList("Location::%s::Suites" % (location)):
                # udeb components are the suite's UdebComponents with
                # "/debian-installer" appended.
                udeb_components = [x + "/debian-installer"
                                   for x in Cnf.ValueList("Suite::%s::UdebComponents" % suite)]
                for component in list(Cnf.SubTree("Component").List()) + udeb_components:
                    # NOTE(review): get_suite_architectures is not defined in
                    # the part of the file that is visible -- confirm.
                    architectures = filter(utils.real_arch, get_suite_architectures(suite))
                    for architecture in architectures:
                        packages = Cnf["Dir::Root"] + "dists/" + Cnf["Suite::%s::CodeName" % (suite)] + '/' + component + '/binary-' + architecture + '/Packages'
                        print('Processing '+packages+'...')
                        process_packages (packages, suite, component, server)

    # The dump files must be closed before COPY reads them.
    files_query_cache.close()
    source_query_cache.close()
    src_associations_query_cache.close()
    dsc_files_query_cache.close()
    binaries_query_cache.close()
    bin_associations_query_cache.close()
    for table in ("files", "source", "src_associations", "dsc_files", "binaries", "bin_associations"):
        print("Writing data to `%s' table..." % (table))
        projectB.query("COPY %s FROM '%s'" % (table, export_dir+table))
    print("Committing...")
    projectB.query("COMMIT WORK")

    # Add the constraints and otherwise generally clean up the database.
    # See add_constraints.sql for more details...

    print("Running add_constraints.sql...")
    (result, output) = commands.getstatusoutput("psql %s < add_constraints.sql" % (Cnf["DB::Name"]))
    print(output)
    if result != 0:
        utils.fubar("psql invocation failed!\n%s" % (output), result)
611 ################################################################################
def main():
    """Entry point: run do_da_do_da() through utils.try_with_debug()."""
    utils.try_with_debug(do_da_do_da)

################################################################################

if __name__ == '__main__':
    main()