]> git.decadent.org.uk Git - dak.git/blob - dak/contents.py
some docstrings
[dak.git] / dak / contents.py
1 #!/usr/bin/env python
2 """
3 Create all the contents files
4
5 @contact: Debian FTPMaster <ftpmaster@debian.org>
6 @copyright: 2008, 2009 Michael Casadevall <mcasadevall@debian.org>
7 @copyright: 2009 Mike O'Connor <stew@debian.org>
8 @license: GNU General Public License version 2 or later
9 """
10
11 ################################################################################
12
13 # This program is free software; you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation; either version 2 of the License, or
16 # (at your option) any later version.
17
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 # GNU General Public License for more details.
22
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26
27 ################################################################################
28
29 # <Ganneff> there is the idea to slowly replace contents files
30 # <Ganneff> with a new generation of such files.
31 # <Ganneff> having more info.
32
33 # <Ganneff> of course that wont help for now where we need to generate them :)
34
35 ################################################################################
36
37 import sys
38 import os
39 import logging
40 import gzip
41 import threading
42 import traceback
43 import Queue
44 import apt_pkg
45 import datetime
46 import traceback
47 from daklib import utils
48 from daklib.binary import Binary
49 from daklib.config import Config
50 from daklib.dbconn import *
51
52 ################################################################################
53
def usage (exit_code=0):
    """
    Print the command line help of C{dak contents} and terminate the program.

    @type exit_code: int
    @param exit_code: status passed to C{sys.exit} after the help was printed
    """
    help_text = """Usage: dak contents [options] command [arguments]

COMMANDS
    generate
        generate Contents-$arch.gz files

    bootstrap_bin
        scan the debs in the existing pool and load contents into the bin_contents table

    bootstrap
        copy data from the bin_contents table into the deb_contents / udeb_contents tables

    cruft
        remove files/paths which are no longer referenced by a binary

OPTIONS
     -h, --help
        show this help and exit

     -v, --verbose
        show verbose information messages

     -q, --quiet
        supress all output but errors

     -s, --suite={stable,testing,unstable,...}
        only operate on a single suite
"""
    print(help_text)
    sys.exit(exit_code)
84
85 ################################################################################
86
87 # where in dak.conf all of our configuration will be stowed
88
# Every option of this command is stored below the "Contents::Options" subtree.
options_prefix = "::".join(("Contents", "Options"))

# Root logger; main() configures its level and output stream.
log = logging.getLogger()
93
94 ################################################################################
95
class EndOfContents(object):
    """
    Sentinel marking the end of the stream of contents files.

    The worker threads recognise it by type (C{isinstance}), so an instance
    carries no state of its own.
    """
101
class OneAtATime(object):
    """
    a one space queue which sits between multiple possible producers
    and multiple possible consumers

    The single slot is C{next_in_line}; C{None} means "empty", so C{None}
    itself cannot be transported.  Producers wait on C{write_lock} until the
    slot is free, consumers wait on C{read_lock} until it is filled.

    Setting C{die} to True makes a call that would otherwise have to wait
    return C{None} instead.  Threads that are already waiting are not woken
    up by the flag alone.
    """
    def __init__(self):
        self.next_in_line = None
        self.read_lock = threading.Condition()
        self.write_lock = threading.Condition()
        self.die = False

    def enqueue(self, next):
        """
        Put C{next} into the slot, waiting until the slot is free.

        @param next: the item to hand over (must not be C{None})
        @return: always C{None}; when C{die} is set while the slot is
                 occupied the item is dropped
        """
        self.write_lock.acquire()
        try:
            # compare against None so that falsy items (0, "", ...) still
            # count as "slot occupied"
            while self.next_in_line is not None:
                if self.die:
                    return
                self.write_lock.wait()
            self.next_in_line = next
        finally:
            # released on every path, including the early return above
            self.write_lock.release()

        # wake up one consumer waiting for an item
        self.read_lock.acquire()
        try:
            self.read_lock.notify()
        finally:
            self.read_lock.release()

    def dequeue(self):
        """
        Take the item out of the slot, waiting until one is available.

        @return: the item, or C{None} when C{die} is set while the slot is empty
        """
        self.read_lock.acquire()
        try:
            while self.next_in_line is None:
                if self.die:
                    return
                self.read_lock.wait()
            result = self.next_in_line
            self.next_in_line = None
        finally:
            self.read_lock.release()

        # wake up one producer waiting for the slot to become free
        self.write_lock.acquire()
        try:
            self.write_lock.notify()
        finally:
            self.write_lock.release()

        return result
143
144
class ContentsWorkThread(threading.Thread):
    """
    Base class for one stage of the contents pipeline.

    A stage repeatedly takes an item from C{upstream}, processes it with
    C{_run} and hands it to C{downstream} (if there is one).  Subclasses
    implement C{_run}.
    """
    def __init__(self, upstream, downstream):
        """
        @param upstream: queue to read work items from (needs C{dequeue()})
        @param downstream: queue to pass finished items to (needs
                           C{enqueue(item)}), or None for the last stage
        """
        threading.Thread.__init__(self)
        self.upstream = upstream
        self.downstream = downstream

    def _run(self, contents_file):
        """
        Process a single work item; must be provided by the subclass.
        """
        raise NotImplementedError("%s does not implement _run" % self.__class__.__name__)

    def run(self):
        """
        Work loop of the thread.

        Ends when an L{EndOfContents} marker arrives (the marker is forwarded
        downstream first) or when the upstream queue returns C{None}.  An
        exception while handling one item is printed and the item is not
        forwarded; the loop then continues with the next item.
        """
        while True:
            try:
                contents_file = self.upstream.dequeue()
                if contents_file is None:
                    # OneAtATime.dequeue() only returns None when the queue
                    # was told to die; without this check the loop would spin
                    # forever, failing in _run(None) on every round
                    break

                if isinstance(contents_file, EndOfContents):
                    if self.downstream:
                        self.downstream.enqueue(contents_file)
                    break

                started = datetime.datetime.now()
                print("%s start: %s" % (self,contents_file) )
                self._run(contents_file)
                print("%s finished: %s in %d seconds" % (self, contents_file, (datetime.datetime.now()-started).seconds ))
                if self.downstream:
                    self.downstream.enqueue(contents_file)
            except Exception:
                traceback.print_exc()
170
class QueryThread(ContentsWorkThread):
    """
    Pipeline stage which calls C{query()} on every contents file.

    Construction is inherited unchanged: C{QueryThread(upstream, downstream)}.
    """
    def __repr__(self):
        return "QueryThread"
    __str__ = __repr__

    def _run(self, contents_file):
        """
        Run the query of C{contents_file}.
        """
        contents_file.query()
181
class IngestThread(ContentsWorkThread):
    """
    Pipeline stage which calls C{ingest()} on every contents file.

    Construction is inherited unchanged: C{IngestThread(upstream, downstream)}.
    """
    def __repr__(self):
        return "IngestThread"
    __str__ = __repr__

    def _run(self, contents_file):
        """
        Let C{contents_file} read in its query results.
        """
        contents_file.ingest()
192
class SortThread(ContentsWorkThread):
    """
    Pipeline stage which computes the sorted list of file names.

    Construction is inherited unchanged: C{SortThread(upstream, downstream)}.
    """
    def __repr__(self):
        return "SortThread"
    __str__ = __repr__

    def _run(self, contents_file):
        """
        Store the sorted keys of C{contents_file.filenames} in
        C{contents_file.sorted_keys}.
        """
        names = list(contents_file.filenames.keys())
        names.sort()
        contents_file.sorted_keys = names
203
class OutputThread(ContentsWorkThread):
    """
    Pipeline stage which writes the contents file to disk.
    """
    def __init__(self, upstream, downstream):
        ContentsWorkThread.__init__(self, upstream, downstream)

    def __str__(self):
        return "OutputThread"
    __repr__ = __str__

    def _run(self, contents_file):
        """
        Write one "filename<TAB>packages" line per sorted key, then drop the
        in-memory data of C{contents_file}.

        The file handle is closed before the item is handed downstream so
        that buffered data is on disk when a later stage (gzip) reads the
        file.
        """
        contents_file.open_file()
        try:
            write = contents_file.filehandle.write
            for fname in contents_file.sorted_keys:
                write("%s\t%s\n" % (fname,contents_file.filenames[fname]))
        finally:
            # closing again later (ContentFile.cleanup) is harmless
            contents_file.filehandle.close()
        contents_file.sorted_keys = None
        contents_file.filenames.clear()
218
class GzipThread(ContentsWorkThread):
    """
    Pipeline stage which compresses the written contents file with gzip.
    """
    def __init__(self, upstream, downstream):
        ContentsWorkThread.__init__(self, upstream, downstream)

    def __str__(self):
        return "GzipThread"
    __repr__ = __str__

    def _run(self, contents_file):
        """
        Run C{gzip -f} on C{contents_file.filename}.

        The command is started without a shell so that file names containing
        spaces or shell metacharacters are passed through unchanged.  A
        non-zero exit status is logged.
        """
        import subprocess

        status = subprocess.call(["gzip", "-f", "--", contents_file.filename])
        if status != 0:
            log.error("gzip exited with status %s for %s" % (status, contents_file.filename))
229
class ContentFile(object):
    """
    One contents file on its way through the generation pipeline.

    Subclasses provide C{query()}, which has to set C{self.session} and
    C{self.results}.  C{ingest()} then fills C{self.filenames}; C{open_file()}
    creates the output file and writes the header.
    """

    # Header text shared by all contents files; None until it was loaded, and
    # also when no header is configured or it could not be read.
    header = None

    def __init__(self,
                 filename,
                 suite_str,
                 suite_id):
        """
        @param filename: path of the (uncompressed) file to write
        @param suite_str: name of the suite
        @param suite_id: database id of the suite
        """
        self.filename = filename
        self.filenames = {}        # file name -> package list string
        self.sorted_keys = None
        self.suite_str = suite_str
        self.suite_id = suite_id
        self.session = None
        self.filehandle = None
        self.results = None

    def __str__(self):
        return self.filename
    __repr__ = __str__

    def cleanup(self):
        """
        Drop the collected data and close the file handle and the session,
        as far as they were opened.
        """
        self.filenames = None
        self.sorted_keys = None
        if self.filehandle is not None:
            self.filehandle.close()
        if self.session is not None:
            self.session.close()

    def ingest(self):
        """
        Read all rows of C{self.results} into C{self.filenames} and close the
        session afterwards.  Each row is a (filename, package) pair.
        """
        while True:
            row = self.results.fetchone()
            if not row:
                break
            filename, package = row
            self.filenames[filename] = package

        self.session.close()

    def open_file(self):
        """
        Create the output file (and its directory) and write the header.

        The file is opened as plain text; compression happens in a later step.
        """
        # NOTE(review): the original built Config()["Contents::Root"] +
        # self.filename but never used it.  The file is still created at
        # self.filename, which is also the path the gzip step compresses --
        # confirm whether Contents::Root should be applied in both places.
        filedir = os.path.dirname(self.filename)
        # dirname is "" for a bare file name; makedirs("") would fail
        if filedir and not os.path.isdir(filedir):
            os.makedirs(filedir)
        self.filehandle = open(self.filename, "w")
        self._write_header()

    def _write_header(self):
        """
        Write the header to the open file, if there is a header.
        """
        header = self._get_header()
        if header:
            self.filehandle.write(header)

    @classmethod
    def _get_header(cls):
        """
        Internal method to return the header for Contents.gz files

        This is boilerplate which explains the contents of the file and how
        it can be used.

        The text is read from the template named by C{Contents::Header} and
        kept in C{ContentFile.header}.

        @return: the header text, or None if none is configured or the
                 template could not be read (the error is logged)
        """
        if not ContentFile.header:
            cnf = Config()
            if cnf.has_key("Contents::Header"):
                try:
                    h = open(os.path.join( cnf["Dir::Templates"],
                                           cnf["Contents::Header"] ), "r")
                    try:
                        ContentFile.header = h.read()
                    finally:
                        h.close()
                except Exception:
                    # best effort: a missing header must not stop generation
                    log.error( "error opening header file: %s\n%s" % (cnf["Contents::Header"],
                                                                      traceback.format_exc() ))
                    ContentFile.header = None
            else:
                ContentFile.header = None

        return ContentFile.header
306
307
class DebContentFile(ContentFile):
    """
    Contents file for the .deb packages of one suite and architecture.
    """
    def __init__(self,
                 filename,
                 suite_str,
                 suite_id,
                 arch_str,
                 arch_id):
        """
        @param filename: path of the (uncompressed) file to write
        @param suite_str: name of the suite
        @param suite_id: database id of the suite
        @param arch_str: name of the architecture
        @param arch_id: database id of the architecture
        """
        ContentFile.__init__(self,
                             filename,
                             suite_str,
                             suite_id )
        self.arch_str = arch_str
        self.arch_id = arch_id

    def query(self):
        """
        Open a session and start the query for this suite and architecture.

        The result is kept in C{self.results}; every row is a pair of file
        name and the aggregated list of "section/package" entries.
        """
        self.session = DBConn().session()

        # GROUP BY is needed because the package list is aggregated per file
        # name, as in the udeb_contents query.
        # NOTE(review): arch=2 is hard-coded, presumably the id of
        # architecture "all" -- confirm against the architecture table.
        self.results = self.session.execute("""SELECT filename, comma_separated_list(section || '/' || package)
        FROM deb_contents
        WHERE ( arch=2 or arch = :arch) AND suite = :suite
        GROUP BY filename
        """, { 'arch':self.arch_id, 'suite':self.suite_id } )
329
class UdebContentFile(ContentFile):
    """
    Contents file for the .udeb packages of one suite.
    """
    def __init__(self,
                 filename,
                 suite_str,
                 suite_id,
                 section_name,
                 section_id):
        """
        @param filename: path of the (uncompressed) file to write
        @param suite_str: name of the suite
        @param suite_id: database id of the suite
        @param section_name: name of the section
        @param section_id: database id of the section
        """
        ContentFile.__init__(self,
                             filename,
                             suite_str,
                             suite_id )
        # keep the section instead of silently dropping the arguments
        # NOTE(review): query() does not filter by section yet -- confirm
        # whether it should
        self.section_name = section_name
        self.section_id = section_id

    def query(self):
        """
        Open a session and start the query for this suite.

        The result is kept in C{self.results}; every row is a pair of file
        name and the aggregated list of "section/package" entries.
        """
        self.session = DBConn().session()

        self.results = self.session.execute("""SELECT filename, comma_separated_list(section || '/' || package)
        FROM udeb_contents
        WHERE suite = :suite
        group by filename
        """ , { 'suite': self.suite_id } )
350
class Contents(object):
    """
    Class capable of generating Contents-$arch.gz files
    """
    def __init__(self):
        self.header = None

    def reject(self, message):
        """
        Log C{message} as an error; passed to L{Binary} as reject callback.
        """
        log.error("E: %s" % message)

    def cruft(self):
        """
        remove files/paths from the DB which are no longer referenced
        by binaries and clean the temporary table
        """
        session = DBConn().session()

        # clear out all of the temporarily stored content associations
        # this should be run only after p-a has run.  after a p-a
        # run we should have either accepted or rejected every package
        # so there should no longer be anything in the queue
        session.query(PendingContentAssociation).delete()

        # delete any filenames we are storing which have no binary associated
        # with them
        used_filenames = session.query(ContentAssociation.filename_id).distinct()
        stale_filenames = session.query(ContentFilename)
        stale_filenames = stale_filenames.filter(~ContentFilename.cafilename_id.in_(used_filenames))
        stale_filenames.delete()

        # delete any paths we are storing which have no binary associated with
        # them
        used_paths = session.query(ContentAssociation.filepath_id).distinct()
        stale_paths = session.query(ContentFilepath)
        stale_paths = stale_paths.filter(~ContentFilepath.cafilepath_id.in_(used_paths))
        stale_paths.delete()

        session.commit()

    def bootstrap_bin(self):
        """
        scan the existing debs in the pool to populate the bin_contents table
        """
        pooldir = Config()[ 'Dir::Pool' ]

        session = DBConn().session()

        for binary in session.query(DBBinary).yield_per(100):
            print( "binary: %s" % binary.package )
            filename = binary.poolfile.filename

            # Check for existing contents
            existingq = session.execute( "select 1 from bin_contents where binary_id=:id", {'id':binary.binary_id} )
            if existingq.fetchone():
                log.debug( "already imported: %s" % (filename))
                continue

            # We don't have existing contents so import them
            log.debug( "scanning: %s" % (filename) )

            debfile = os.path.join(pooldir, filename)
            if os.path.exists(debfile):
                Binary(debfile, self.reject).scan_package(binary.binary_id, True)
            else:
                log.error("missing .deb: %s" % filename)

    def bootstrap(self):
        """
        copy data from the bin_contents table into the deb_contents /
        udeb_contents tables

        For every override of a type ending in "deb", the binaries of that
        package in the override's suite are looked up and their file lists
        are inserted, unless the target table already has rows for the
        binary.  Progress is shown as '.' (already present) or '+' (copied).
        """
        session = DBConn().session()

        # get a mapping of all the override types we care about (right now .deb an .udeb)
        override_type_map = {}
        for override_type in session.query(OverrideType).all():
            if override_type.overridetype.endswith('deb' ):
                override_type_map[override_type.overridetype_id] = override_type.overridetype

        for override in session.query(Override).yield_per(100):
            if override.overridetype_id not in override_type_map:
                #this isn't an override we care about
                continue

            type_name = override_type_map[override.overridetype_id]

            binaries = session.execute("""SELECT b.id, b.architecture
                                    FROM binaries b
                                    JOIN bin_associations ba ON ba.bin=b.id
                                    WHERE ba.suite=:suite
                                    AND b.package=:package""", {'suite':override.suite_id, 'package':override.package})
            while True:
                binary = binaries.fetchone()
                if not binary:
                    break

                exists = session.execute("SELECT 1 FROM %s_contents WHERE binary_id=:id limit 1" % type_name, {'id':binary.id})

                if exists.fetchone():
                    sys.stdout.write('. ')
                    continue

                sys.stdout.write('+ ')

                session.execute( """INSERT INTO %s_contents (filename,section,package,binary_id,arch,suite)
                              SELECT file, :section, :package, :binary_id, :arch, :suite
                              FROM bin_contents
                              WHERE binary_id=:binary_id;""" % type_name,
                           { 'section' : override.section_id,
                             'package' : override.package,
                             'binary_id' : binary.id,
                             'arch' : binary.architecture,
                             'suite' : override.suite_id } )
                session.commit()

    def generate_threaded(self):
        """
        Generate contents files for both deb and udeb using the threaded
        pipeline.

        NOTE(review): this was the first of two methods named C{generate};
        the second definition replaced it, so it could never be called.  It
        is kept under its own name and C{generate} still resolves to the
        second definition -- confirm which of the two is meant to be used.
        """
        self.deb_generate()
        self.udeb_generate()

    def _start_pipeline(self):
        """
        Create and start the five pipeline stages
        (query, ingest, sort, output, gzip).

        @return: tuple of the queue to feed and the last thread to join
        """
        inputtoquery = OneAtATime()
        querytoingest = OneAtATime()
        ingesttosort = OneAtATime()
        sorttooutput = OneAtATime()
        outputtogzip = OneAtATime()

        stages = [ QueryThread(inputtoquery,querytoingest),
                   IngestThread(querytoingest,ingesttosort),
                   SortThread(ingesttosort,sorttooutput),
                   OutputThread(sorttooutput,outputtogzip),
                   GzipThread(outputtogzip, None) ]

        for stage in stages:
            stage.start()

        return inputtoquery, stages[-1]

    def deb_generate(self):
        """
        Generate Contents-$arch.gz files for every available arch in each given suite.
        """
        session = DBConn().session()
        suites = self._suites()

        inputtoquery, last_stage = self._start_pipeline()

        try:
            # Get our suites, and the architectures
            for suite in [i.lower() for i in suites]:
                suite_id = get_suite(suite, session).suite_id
                print( "got suite_id: %s for suite: %s" % (suite_id, suite ) )
                arch_list = self._arches(suite_id, session)

                for (arch_id,arch_str) in arch_list:
                    print( "suite: %s, arch: %s time: %s" %(suite_id, arch_id, datetime.datetime.now().isoformat()) )

                    filename = "dists/%s/Contents-%s" % (suite, arch_str)
                    cf = DebContentFile(filename, suite, suite_id, arch_str, arch_id)
                    inputtoquery.enqueue( cf )
        finally:
            # always shut the pipeline down; the worker threads would
            # otherwise wait for input forever and keep the process alive
            inputtoquery.enqueue( EndOfContents() )
            last_stage.join()

    def udeb_generate(self):
        """
        Generate the contents files for udebs.

        TODO: not implemented yet, no work is queued.  The pipeline is
        started and shut down again right away, so that no worker thread is
        left waiting for input.
        """
        inputtoquery, last_stage = self._start_pipeline()
        inputtoquery.enqueue( EndOfContents() )
        last_stage.join()

    def generate(self):
        """
        Generate Contents-$arch.gz files for every available arch in each given suite.
        """
        # NOTE(review): GzippedContentWriter and get_suite_contents are not
        # defined in this file; presumably they come from the daklib.dbconn
        # star import -- confirm.
        session = DBConn().session()

        arch_all_id = get_architecture("all", session).arch_id

        # The MORE fun part. Ok, udebs need their own contents files, udeb, and udeb-nf (not-free)
        # This is HORRIBLY debian specific :-/
        for dtype, section_name, fn_pattern in \
              [('deb',  None,                        "dists/%s/Contents-%s.gz"),
               ('udeb', "debian-installer",          "dists/%s/Contents-udeb-%s.gz"),
               ('udeb', "non-free/debian-installer", "dists/%s/Contents-udeb-nf-%s.gz")]:

            overridetype = get_override_type(dtype, session)

            # For udebs, we only look in certain sections (see the for loop above)
            section = None
            if section_name is not None:
                section = get_section(section_name, session)

            # Get our suites
            for suite in which_suites(session):
                # Which architectures do we need to work on
                arch_list = get_suite_architectures(suite.suite_name, skipsrc=True, skipall=True, session=session)

                # Set up our file writer dictionary
                file_writers = {}
                try:
                    # One file writer per arch
                    for arch in arch_list:
                        # use the suite's name, not the suite object itself
                        file_writers[arch.arch_id] = GzippedContentWriter(fn_pattern % (suite.suite_name, arch.arch_string))

                    # the row's section gets its own name so that it does not
                    # replace the section filter used for the next suite
                    for filename, file_section, package, arch_id in \
                            get_suite_contents(suite, overridetype, section, session=session).fetchall():

                        if arch_id == arch_all_id:
                            # It's arch all, so all contents files get it
                            for writer in file_writers.values():
                                writer.write(filename, file_section, package)
                        elif arch_id in file_writers:
                            file_writers[arch_id].write(filename, file_section, package)

                finally:
                    # close all the files
                    for writer in file_writers.values():
                        writer.finish()

    def _suites(self):
        """
        return a list of suites to operate on

        These are the suites given with the Suite option if present,
        otherwise all entries of the "Suite" configuration subtree.
        """
        cnf = Config()
        suite_key = "%s::%s" % (options_prefix, "Suite")
        if cnf.has_key(suite_key):
            return utils.split_args(cnf[suite_key])

        return cnf.SubTree("Suite").List()

    def _arches(self, suite, session):
        """
        return a list of archs to operate on

        @param suite: database id of the suite
        @param session: database session used for the query
        @return: list of (architecture id, architecture name) tuples without
                 the "source" and "all" architectures
        """
        arch_list = []
        arches = session.execute(
            """SELECT s.architecture, a.arch_string
            FROM suite_architectures s
            JOIN architecture a ON (s.architecture=a.id)
            WHERE suite = :suite_id""",
            {'suite_id':suite } )

        while True:
            r = arches.fetchone()
            if not r:
                break

            if r[1] not in ("source", "all"):
                arch_list.append((r[0], r[1]))

        return arch_list
622
623
624 ################################################################################
625
def main():
    """
    Command line entry point: parse the options, set up logging and run the
    requested command on a new L{Contents} object.

    Asking for help prints the usage and exits with status 0; a missing or
    unknown command prints the usage and exits with status 1.
    """
    cnf = Config()

    arguments = [('h',"help", "%s::%s" % (options_prefix,"Help")),
                 ('s',"suite", "%s::%s" % (options_prefix,"Suite"),"HasArg"),
                 ('q',"quiet", "%s::%s" % (options_prefix,"Quiet")),
                 ('v',"verbose", "%s::%s" % (options_prefix,"Verbose")),
                ]

    commands = {'generate' : Contents.generate,
                'bootstrap_bin' : Contents.bootstrap_bin,
                'bootstrap' : Contents.bootstrap,
                'cruft' : Contents.cruft,
                }

    args = apt_pkg.ParseCommandLine(cnf.Cnf, arguments,sys.argv)

    # an explicit request for help is not an error
    if cnf.has_key("%s::%s" % (options_prefix,"Help")):
        usage()

    # a missing or unknown command is: report failure to the caller
    if (len(args) < 1) or args[0] not in commands:
        usage(1)

    # quiet wins over verbose when both are given
    level=logging.INFO
    if cnf.has_key("%s::%s" % (options_prefix,"Quiet")):
        level=logging.ERROR

    elif cnf.has_key("%s::%s" % (options_prefix,"Verbose")):
        level=logging.DEBUG

    logging.basicConfig( level=level,
                         format='%(asctime)s %(levelname)s %(message)s',
                         stream = sys.stderr )

    commands[args[0]](Contents())
662
def which_suites(session):
    """
    Return the suites to operate on, as looked up with C{get_suite}.

    The names come from the Suite option if it is set, otherwise from all
    entries of the "Suite" configuration subtree; they are lowercased before
    the lookup.

    @param session: database session passed on to C{get_suite}
    """
    cnf = Config()
    suite_key = "%s::%s" % (options_prefix, "Suite")

    if cnf.has_key(suite_key):
        names = utils.split_args(cnf[suite_key])
    else:
        names = cnf.SubTree("Suite").List()

    return [get_suite(name.lower(), session) for name in names]
673
674
# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    main()