3 """ Various statistical pr0nography fun and games """
4 # Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org>
5 # Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org>
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 ################################################################################
23 # <aj> can we change the standards instead?
25 # <aj> whatever we're not conforming to
26 # <aj> if there's no written standard, why don't we declare linux as
27 # the defacto standard
30 # [aj's attempt to avoid ABI changes for released architecture(s)]
32 ################################################################################
37 from datetime import datetime
38 from email.utils import mktime_tz, parsedate_tz
39 from mailbox import mbox
40 from os import listdir, system, unlink
41 from os.path import isfile, join, splitext
42 from re import findall, DOTALL, MULTILINE
43 from sys import stderr
44 from yaml import safe_load, safe_dump
46 from daklib import utils
47 from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture
49 ################################################################################
# Log month ("YYYY-MM") that parse_actions compares each log's date against
# to choose between the old_ACTIONS/old_ACTION patterns and new_ACTIONS.
FORMAT_SWITCH = '2009-08'
# Names that parse_actions tests actors against before counting their actions.
blacklisted = ('dak', 'katie')

# All patterns are raw strings so that \d, \S and \| reach the regex engine
# unchanged instead of relying on unrecognised string escapes.

# Groups: (14-digit timestamp, "Moving to new" or "ACCEPT-TO-NEW").
NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)'
       r'\|(Moving to new|ACCEPT-TO-NEW)')
# Groups: (14-digit timestamp, third "|" field, word following "NEW ").
new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]'
# One group: the text between a "program start" and a "program end" record.
old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|'
               r'(?:lisa|process-new)\|program end')
# Groups: (14-digit timestamp, "Accepting changes" or "rejected").
old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|'
66 ################################################################################
# Print the help text for `dak stats`, listing the available MODEs.
# NOTE(review): the embedded numbering jumps 69->72, 72->74, 74->76 and
# 79->83, so part of the help text, the closing of the triple-quoted string
# and whatever consumes `exit_code` are not visible here -- confirm against
# the original file before editing.
68 def usage(exit_code=0):
69 print """Usage: dak stats MODE
72 -h, --help show this help and exit.
74 The following MODEs are available:
76 arch-space - displays space used by each architecture
77 pkg-nums - displays the number of packages by suite/architecture
78 daily-install - displays daily install stats suitable for graphing
79 new - stores stats about the NEW queue
83 ################################################################################
# Print the summed file size recorded for each architecture, followed by a
# "Source" line.
# NOTE(review): the embedded numbering skips 92 and 94, and `j` below is not
# bound by any visible line -- presumably a loop over `q` was lost; confirm
# against the original file.
85 def per_arch_space_use():
86 session = DBConn().session()
# Sum files.size per architecture name for files referenced by binaries,
# ordered by that total.
87 q = session.execute("""
88 SELECT a.arch_string as Architecture, sum(f.size) AS sum
89 FROM files f, binaries b, architecture a
90 WHERE a.id=b.architecture AND f.id=b.file
91 GROUP BY a.arch_string ORDER BY sum""").fetchall()
# The name column is left-justified, padded and truncated to 15 characters.
93 print "%-15.15s %s" % (j[0], j[1])
# Total size of files whose name matches the diff.gz / tar.gz / dsc pattern.
# NOTE(review): the dots in the pattern are unescaped, so if `~` is a regex
# match each of them matches any character -- confirm this is intended.
95 q = session.execute("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'").fetchall()
96 print "%-15.15s %s" % ("Source", q[0][0])
98 ################################################################################
# Print one "date packages size" line per date from the log file "2001-11".
# NOTE(review): the embedded numbering has gaps (101, 105, 107-108, 110-111,
# 113, 120-123); the assignments of `stats`, `program`, `action` and `date`,
# the bodies of the two filtering `if`s and the header of the output loop are
# not visible here -- confirm against the original file.
100 def daily_install_stats():
102 f = utils.open_file("2001-11")
103 for line in f.readlines():
# Log records are "|"-separated fields.
104 split = line.strip().split('|')
# Only records written by "katie" or "process-accepted" are of interest.
106 if program != "katie" and program != "process-accepted":
# Only the "installing changes" and "installed" actions are of interest.
109 if action != "installing changes" and action != "installed":
# First record for this date: start its counters at zero.
112 if not stats.has_key(date):
114 stats[date]["packages"] = 0
115 stats[date]["size"] = 0.0
# "installing changes" counts one package; "installed" adds field 5 to the
# date's running size.
116 if action == "installing changes":
117 stats[date]["packages"] += 1
118 elif action == "installed":
119 stats[date]["size"] += float(split[5])
124 packages = stats[date]["packages"]
# The accumulated size is divided by 1024 twice and truncated to an integer.
125 size = int(stats[date]["size"] / 1024.0 / 1024.0)
126 print "%s %s %s" % (date, packages, size)
128 ################################################################################
def output_format(suite):
    """Abbreviate a suite name to the first character of each word.

    Words are the "-" separated components of ``suite``; the initials are
    joined with "-" again, so "oldstable-proposed-updates" becomes "o-p-u".
    An empty component contributes an empty initial instead of raising
    IndexError.
    """
    # word[:1] rather than word[0] so that "" and "a--b" do not raise.
    return "-".join([word[:1] for word in suite.split("-")])
# Build a text table of package counts with one row per architecture and one
# column per suite, using counts read from the database.
# NOTE(review): the embedded numbering has gaps (145-149, 161, 177-178,
# 180-181, 191, 193, 198, 200, 208-209, 212-213).  The initialisation of
# `suites`, `suite_ids`, `arches`, `arch_ids`, `d`, `suite_arches` and
# `suite_id_list`, the definition of `longest`, the alternative to the
# `has_key` test near the end and whatever is finally done with `output` are
# not visible here -- confirm against the original file.
144 def number_of_packages():
150 session = DBConn().session()
151 # Build up suite mapping
152 for i in session.query(Suite).all():
153 suites[i.suite_id] = i.suite_name
154 suite_ids[i.suite_name] = i.suite_id
155 # Build up architecture mapping
156 for i in session.query(Architecture).all():
157 arches[i.arch_id] = i.arch_string
158 arch_ids[i.arch_string] = i.arch_id
159 # Pre-create the dictionary
160 for suite_id in suites.keys():
162 for arch_id in arches.keys():
163 d[suite_id][arch_id] = 0
164 # Get the raw data for binaries
165 # Simulate 'GROUP by suite, architecture' with a dictionary
166 # XXX: Why don't we just get the DB to do this?
167 for i in session.execute("""SELECT suite, architecture, COUNT(suite)
168 FROM bin_associations
169 LEFT JOIN binaries ON bin = binaries.id
170 GROUP BY suite, architecture""").fetchall():
171 d[ i[0] ][ i[1] ] = i[2]
172 # Get the raw data for source
173 arch_id = arch_ids["source"]
174 for i in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
175 (suite_id, count) = i
176 d[suite_id][arch_id] = d[suite_id][arch_id] + count
# For each suite, record the names of the architectures it carries as the
# keys of suite_arches[suite_id]; the "" values are never read in the
# visible code.
179 suite_list = suites.values()
182 for suite in suite_list:
183 suite_id = suite_ids[suite]
184 suite_arches[suite_id] = {}
185 for arch in get_suite_architectures(suite):
186 suite_arches[suite_id][arch.arch_string] = ""
187 suite_id_list.append(suite_id)
# Column labels are the abbreviated suite names from output_format().
188 output_list = [ output_format(i) for i in suite_list ]
189 longest_suite = longest(output_list)
190 arch_list = arches.values()
192 longest_arch = longest(arch_list)
# Header row: a blank architecture cell, then one centred cell per suite,
# followed by a rule of dashes as long as the header built so far.
194 output = (" "*longest_arch) + " |"
195 for suite in output_list:
196 output = output + suite.center(longest_suite)+" |"
197 output = output + "\n"+(len(output)*"-")+"\n"
# One row per architecture name, with one right-justified cell per suite.
199 arch_list = arches.values()
201 longest_arch = longest(arch_list)
202 for arch in arch_list:
203 arch_id = arch_ids[arch]
204 output = output + arch.center(longest_arch)+" |"
205 for suite_id in suite_id_list:
# The count is formatted only when the suite carries this architecture.
206 if suite_arches[suite_id].has_key(arch):
207 count = "%d" % d[suite_id][arch_id]
210 output = output + count.rjust(longest_suite)+" |"
211 output = output + "\n"
214 ################################################################################
# Count the NEW-queue arrivals found in the log text `data` and return the
# timestamp tracked in `latest_timestamp`.
# Every match of the NEW pattern is added to the 'NEW' counter of its month
# (keyed by parse_timestamp) and of the 'history' entry in the module-level
# `stats` dictionary.
# NOTE(review): the embedded numbering skips 217, 220 and 222; the binding of
# `timestamp` from `entry` and the body of the `stats['timestamp'] >=
# timestamp` test are not visible here -- confirm against the original file.
216 def parse_new_uploads(data):
218 latest_timestamp = stats['timestamp']
219 for entry in findall(NEW, data, MULTILINE):
# Compares the entry with the timestamp stored from an earlier run.
221 if stats['timestamp'] >= timestamp:
223 date = parse_timestamp(timestamp)
# First event seen for this month: create its zeroed counters.
224 if date not in stats:
225 stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
226 'REJECT': 0, 'PROD': 0}, 'members': {}}
227 stats[date]['stats']['NEW'] += 1
228 stats['history']['stats']['NEW'] += 1
229 latest_timestamp = timestamp
230 return latest_timestamp
# Count actions found in the log text `data`, per month and per member, in
# the module-level `stats` dictionary, and return the timestamp tracked in
# `latest_timestamp`.
# `logdate` is compared as a string with FORMAT_SWITCH: logs up to and
# including that month are scanned with old_ACTIONS/old_ACTION, logs from
# that month onwards with new_ACTIONS (the switch month itself goes through
# both branches).
# NOTE(review): the embedded numbering has gaps (234, 240, 242, 244, 246-247,
# 249, 258, 262, 265, 268-269, 271, 276, 278, 284).  The bindings of
# `timestamp`, `action` and `member`, the bodies of the blacklist and
# timestamp tests, the values assigned to `action` in the old format and the
# tails of several dict literals are not visible here -- confirm against the
# original file.
233 def parse_actions(data, logdate):
235 latest_timestamp = stats['timestamp']
# Old log format: each batch is the text between a "program start" and a
# "program end" record.
236 if logdate <= FORMAT_SWITCH:
237 for batch in findall(old_ACTIONS, data, DOTALL):
# The first whitespace-separated token of the batch names the actor.
238 who = batch.split()[0]
239 if who in blacklisted:
241 for entry in findall(old_ACTION, batch, MULTILINE):
243 if action.startswith('Accepting'):
245 elif action.startswith('rejected'):
# Compares the entry with the timestamp stored from an earlier run.
248 if stats['timestamp'] >= timestamp:
250 date = parse_timestamp(entry[0])
251 if date not in stats:
252 stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
253 'REJECT': 0, 'PROD': 0}, 'members': {}}
254 stats[date]['stats'][action] += 1
255 stats['history']['stats'][action] += 1
256 if who not in stats[date]['members']:
257 stats[date]['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
259 stats[date]['members'][who][action] += 1
260 if who not in stats['history']['members']:
261 stats['history']['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
263 stats['history']['members'][who][action] += 1
264 latest_timestamp = timestamp
# New log format: one new_ACTIONS match per action.
266 if logdate >= FORMAT_SWITCH:
267 for entry in findall(new_ACTIONS, data, MULTILINE):
270 if stats['timestamp'] >= timestamp:
272 date = parse_timestamp(timestamp)
273 if date not in stats:
274 stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
275 'REJECT': 0, 'PROD': 0}, 'members': {}}
277 if member in blacklisted:
# This repeats the month-creation check made a few lines above.
279 if date not in stats:
280 stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
281 'REJECT': 0, 'PROD': 0}, 'members': {}}
282 if member not in stats[date]['members']:
283 stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
285 if member not in stats['history']['members']:
286 stats['history']['members'][member] = {'ACCEPT': 0,
287 'REJECT': 0, 'PROD': 0}
# Update the month and history totals, overall and for this member.
288 stats[date]['stats'][action] += 1
289 stats[date]['members'][member][action] += 1
290 stats['history']['stats'][action] += 1
291 stats['history']['members'][member][action] += 1
292 latest_timestamp = timestamp
293 return latest_timestamp
# Count 'PROD' events for the month `logdate` from the archived mailbox
# mail/archive/mail-<maildate>.xz under Dir::Base, updating the module-level
# `stats` dictionary per month and per member.
# NOTE(review): the embedded numbering has gaps (297-298, 303, 309, 311-312,
# 321, 329-331); the body of the `isfile` test, the lines around the `users`
# lookup (possibly error handling), the tail of one dict literal and any
# clean-up of the temporary file are not visible here -- confirm against the
# original file.
296 def parse_prod(logdate):
# Keep the last two characters of each "-" separated part of logdate, so
# "2013-04" gives "1304".
299 maildate = ''.join([x[-2:] for x in logdate.split('-')])
300 mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
301 'mail-%s.xz' % maildate)
302 if not isfile(mailarchive):
# Decompress the archive into a temporary file so that mbox can read it.
# NOTE(review): the command is built by string formatting and run through
# the shell; both paths come from configuration here -- confirm they can
# never contain shell metacharacters.
304 (fd, tmpfile) = utils.temp_filename(utils.get_conf()['Dir::TempPath'])
305 system('xzcat %s > %s' % (mailarchive, tmpfile))
306 for message in mbox(tmpfile):
# Only messages whose subject starts with 'Comments regarding' are counted.
307 if (message['subject'] and
308 message['subject'].startswith('Comments regarding')):
# The key into `users` is the From header without its last
# whitespace-separated token.
310 member = users[' '.join(message['From'].split()[:-1])]
# Turn the Date header into the same 14-digit timestamp form used by the logs.
313 ts = mktime_tz(parsedate_tz(message['date']))
314 timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
315 date = parse_timestamp(timestamp)
316 if date not in stats:
317 stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
318 'REJECT': 0, 'PROD': 0}, 'members': {}}
319 if member not in stats[date]['members']:
320 stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
322 if member not in stats['history']['members']:
323 stats['history']['members'][member] = {'ACCEPT': 0,
324 'REJECT': 0, 'PROD': 0}
# Update the month and history totals, overall and for this member.
325 stats[date]['stats']['PROD'] += 1
326 stats[date]['members'][member]['PROD'] += 1
327 stats['history']['stats']['PROD'] += 1
328 stats['history']['members'][member]['PROD'] += 1
def parse_timestamp(timestamp):
    """Return the "YYYY-MM" month key for a "YYYYMMDDhhmmss" timestamp string.

    Only the first six characters are read.  The year is formatted without
    padding and the month is zero-padded to two digits.  Raises ValueError
    when either part is not an integer.
    """
    year = int(timestamp[:4])
    month = int(timestamp[4:6])
    return '%d-%02d' % (year, month)
# Update the NEW-queue statistics kept in the YAML file `yaml` from the log
# files found in the directory `logdir`.
# Statistics are loaded from `yaml` with safe_load; the visible alternative
# starts from an empty history with timestamp '19700101000000'.  Each log is
# passed to parse_new_uploads and parse_actions, and the greatest timestamp
# they return is stored in stats['timestamp'] before `yaml` is opened for
# writing.
# NOTE(review): the embedded numbering has gaps (339-341, 344-346, 352-353,
# 356, 358, 365-367, 369, 376-379, 382-383); the condition choosing between
# loading and initialising `stats`, the body of the `log < ...` test, the
# reads that bind `data`, any temporary-file clean-up and what is written to
# `yaml` are not visible here -- confirm against the original file.
338 def new_stats(logdir, yaml):
342 with open(yaml, 'r') as fd:
343 stats = safe_load(fd)
347 stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
348 'REJECT': 0, 'PROD': 0}, 'members': {}},
349 'timestamp': '19700101000000'}
350 latest_timestamp = stats['timestamp']
351 for fn in sorted(listdir(logdir)):
# The log name is the file name without its extension; it is compared as a
# string with the month of the stored timestamp.
354 log = splitext(fn)[0]
355 if log < parse_timestamp(stats['timestamp']):
357 logfile = join(logdir, fn)
359 if fn.endswith('.bz2'):
360 # This hack is required because python2 does not support
361 # multi-stream files (http://bugs.python.org/issue1625)
362 (fd, tmpfile) = utils.temp_filename(Cnf['Dir::TempPath'])
363 system('bzcat %s > %s' % (logfile, tmpfile))
364 with open(tmpfile, 'r') as fd:
368 with open(logfile, 'r') as fd:
# Keep the greatest timestamp reported by either parser.
370 ts = parse_new_uploads(data)
371 if ts > latest_timestamp:
372 latest_timestamp = ts
373 ts = parse_actions(data, log)
374 if ts > latest_timestamp:
375 latest_timestamp = ts
380 stats['timestamp'] = latest_timestamp
381 with open(yaml, 'w') as fd:
384 ################################################################################
# Command-line entry code: read the configuration, parse the command line,
# validate the MODE argument and dispatch to the matching statistics routine.
# NOTE(review): the embedded numbering has gaps (385-389, 392, 395, 397,
# 399-402, 404-405, 408, 411, 413, 415, 417, 420, 423, 425-426, 428, 430+).
# The enclosing `def` line, the loop binding `i`, the conditions in front of
# the argument checks, the calls made for "arch-space" and "pkg-nums", the
# branch header in front of the `new_stats` call and the body of the
# `__main__` guard are not visible here.  `apt_pkg` and `sys` are used below
# but are not among the visible imports -- confirm against the original file.
390 Cnf = utils.get_conf()
# Register -h/--help and make sure its configuration key exists.
391 Arguments = [('h',"help","Stats::Options::Help")]
393 if not Cnf.has_key("Stats::Options::%s" % (i)):
394 Cnf["Stats::Options::%s" % (i)] = ""
396 args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)
398 Options = Cnf.subtree("Stats::Options")
403 utils.warn("dak stats requires a MODE argument")
# Per the warning texts, only the "new" mode takes a further argument, the
# output file.
406 if args[0].lower() != "new":
407 utils.warn("dak stats accepts only one MODE argument")
409 elif args[0].lower() == "new":
410 utils.warn("new MODE requires an output file")
# The mode is matched in lower case.
412 mode = args[0].lower()
414 if mode == "arch-space":
416 elif mode == "pkg-nums":
418 elif mode == "daily-install":
419 daily_install_stats()
# The second argument is the YAML file passed to new_stats.
421 users = utils.get_users_from_ldap()
422 new_stats(Cnf["Dir::Log"], args[1])
424 utils.warn("unknown mode '%s'" % (mode))
427 ################################################################################
429 if __name__ == '__main__':