#!/usr/bin/env python
-# Various statistical pr0nography fun and games
-# Copyright (C) 2000, 2001, 2002, 2003 James Troup <james@nocrew.org>
-# $Id: saffron,v 1.3 2005-11-15 09:50:32 ajt Exp $
+""" Various statistical pr0nography fun and games """
+# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org>
+# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
################################################################################
-import pg, sys;
-import utils;
-import apt_pkg;
+import sys
+import apt_pkg
+
+from datetime import datetime
+from email.utils import mktime_tz, parsedate_tz
+from mailbox import mbox
+from os import listdir, system, unlink
+from os.path import isfile, join, splitext
+from re import findall, DOTALL, MULTILINE
+from sys import stderr
+from yaml import safe_load, safe_dump
+
+from daklib import utils
+from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture
################################################################################
# Global configuration (apt_pkg Configuration), set up in main().
Cnf = None

# Accumulated NEW-queue statistics, keyed by 'YYYY-MM' plus a 'history'
# aggregate and a 'timestamp' high-water mark; (re)loaded in new_stats().
stats = {}
# LDAP name -> member mapping, filled in main() for the "new" mode.
users = {}
# NOTE(review): shadows the Python 2 builtin 'buffer' and appears unused
# in this file — confirm before removing.
buffer = 0
# Month in which the dak log format changed (old vs. new action lines).
FORMAT_SWITCH = '2009-08'
# Automated "members" whose actions must not be attributed to a person.
blacklisted = ('dak', 'katie')

# Regex patterns as raw strings: '\d', '\S' and '\|' are regex escapes,
# not string escapes; non-raw versions rely on Python passing unknown
# escapes through and raise invalid-escape warnings on newer Pythons.
NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)'
       r'\|(Moving to new|ACCEPT-TO-NEW)')
new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]'
old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|'
               r'(?:lisa|process-new)\|program end')
old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|'
################################################################################
def usage(exit_code=0):
- print """Usage: saffron STAT
+ print """Usage: dak stats MODE
Print various stats.
-h, --help show this help and exit.
-The following STAT modes are available:
+The following MODEs are available:
arch-space - displays space used by each architecture
pkg-nums - displays the number of packages by suite/architecture
daily-install - displays daily install stats suitable for graphing
+ new - stores stats about the NEW queue
"""
sys.exit(exit_code)
################################################################################
def per_arch_space_use():
- q = projectB.query("""
-SELECT a.arch_string as Architecture, sum(f.size)
+ session = DBConn().session()
+ q = session.execute("""
+SELECT a.arch_string as Architecture, sum(f.size) AS sum
FROM files f, binaries b, architecture a
WHERE a.id=b.architecture AND f.id=b.file
- GROUP BY a.arch_string""");
- print q;
- q = projectB.query("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'");
- print q;
+ GROUP BY a.arch_string ORDER BY sum""").fetchall()
+ for j in q:
+ print "%-15.15s %s" % (j[0], j[1])
+ print
+ q = session.execute("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'").fetchall()
+ print "%-15.15s %s" % ("Source", q[0][0])
################################################################################
def daily_install_stats():
- stats = {};
- file = utils.open_file("2001-11");
- for line in file.readlines():
- split = line.strip().split('~');
- program = split[1];
- if program != "katie":
- continue;
- action = split[2];
+ stats = {}
+ f = utils.open_file("2001-11")
+ for line in f.readlines():
+ split = line.strip().split('|')
+ program = split[1]
+ if program != "katie" and program != "process-accepted":
+ continue
+ action = split[2]
if action != "installing changes" and action != "installed":
- continue;
- date = split[0][:8];
+ continue
+ date = split[0][:8]
if not stats.has_key(date):
- stats[date] = {};
- stats[date]["packages"] = 0;
- stats[date]["size"] = 0.0;
+ stats[date] = {}
+ stats[date]["packages"] = 0
+ stats[date]["size"] = 0.0
if action == "installing changes":
- stats[date]["packages"] += 1;
+ stats[date]["packages"] += 1
elif action == "installed":
- stats[date]["size"] += float(split[5]);
+ stats[date]["size"] += float(split[5])
- dates = stats.keys();
- dates.sort();
+ dates = stats.keys()
+ dates.sort()
for date in dates:
packages = stats[date]["packages"]
size = int(stats[date]["size"] / 1024.0 / 1024.0)
- print "%s %s %s" % (date, packages, size);
+ print "%s %s %s" % (date, packages, size)
################################################################################
def longest(strings):
    """Return the length of the longest string in *strings* (0 if empty).

    The parameter was renamed from 'list', which shadowed the builtin;
    all callers in this file pass it positionally, so the rename is
    backward-compatible.
    """
    if not strings:
        return 0
    return max(len(s) for s in strings)
def output_format(suite):
    """Abbreviate a suite name to its initials, e.g. 'proposed-updates' -> 'p-u'."""
    return "-".join([word[0] for word in suite.split("-")])
def number_of_packages():
- arches = {};
- arch_ids = {};
- suites = {};
- suite_ids = {};
- d = {};
+ arches = {}
+ arch_ids = {}
+ suites = {}
+ suite_ids = {}
+ d = {}
+ session = DBConn().session()
# Build up suite mapping
- q = projectB.query("SELECT id, suite_name FROM suite");
- suite_ql = q.getresult();
- for i in suite_ql:
- (id, name) = i;
- suites[id] = name;
- suite_ids[name] = id;
+ for i in session.query(Suite).all():
+ suites[i.suite_id] = i.suite_name
+ suite_ids[i.suite_name] = i.suite_id
# Build up architecture mapping
- q = projectB.query("SELECT id, arch_string FROM architecture");
- for i in q.getresult():
- (id, name) = i;
- arches[id] = name;
- arch_ids[name] = id;
+ for i in session.query(Architecture).all():
+ arches[i.arch_id] = i.arch_string
+ arch_ids[i.arch_string] = i.arch_id
# Pre-create the dictionary
for suite_id in suites.keys():
- d[suite_id] = {};
+ d[suite_id] = {}
for arch_id in arches.keys():
- d[suite_id][arch_id] = 0;
+ d[suite_id][arch_id] = 0
# Get the raw data for binaries
- q = projectB.query("""
-SELECT ba.suite, b.architecture
- FROM binaries b, bin_associations ba
- WHERE b.id = ba.bin""");
# Simultate 'GROUP by suite, architecture' with a dictionary
- for i in q.getresult():
- (suite_id, arch_id) = i;
- d[suite_id][arch_id] = d[suite_id][arch_id] + 1;
+ # XXX: Why don't we just get the DB to do this?
+ for i in session.execute("""SELECT suite, architecture, COUNT(suite)
+ FROM bin_associations
+ LEFT JOIN binaries ON bin = binaries.id
+ GROUP BY suite, architecture""").fetchall():
+ d[ i[0] ][ i[1] ] = i[2]
# Get the raw data for source
- arch_id = arch_ids["source"];
- q = projectB.query("""
-SELECT suite, count(suite) FROM src_associations GROUP BY suite;""");
- for i in q.getresult():
- (suite_id, count) = i;
- d[suite_id][arch_id] = d[suite_id][arch_id] + count;
+ arch_id = arch_ids["source"]
+ for i in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
+ (suite_id, count) = i
+ d[suite_id][arch_id] = d[suite_id][arch_id] + count
## Print the results
# Setup
- suite_list = suites.values();
- suite_list.sort(suite_sort);
- suite_id_list = [];
- suite_arches = {};
+ suite_list = suites.values()
+ suite_id_list = []
+ suite_arches = {}
for suite in suite_list:
- suite_id = suite_ids[suite];
- suite_arches[suite_id] = {};
- for arch in Cnf.ValueList("Suite::%s::Architectures" % (suite)):
- suite_arches[suite_id][arch] = "";
- suite_id_list.append(suite_id);
- output_list = map(lambda x: output_format(x), suite_list);
- longest_suite = longest(output_list);
- arch_list = arches.values();
- arch_list.sort();
- longest_arch = longest(arch_list);
+ suite_id = suite_ids[suite]
+ suite_arches[suite_id] = {}
+ for arch in get_suite_architectures(suite):
+ suite_arches[suite_id][arch.arch_string] = ""
+ suite_id_list.append(suite_id)
+ output_list = [ output_format(i) for i in suite_list ]
+ longest_suite = longest(output_list)
+ arch_list = arches.values()
+ arch_list.sort()
+ longest_arch = longest(arch_list)
# Header
output = (" "*longest_arch) + " |"
for suite in output_list:
- output = output + suite.center(longest_suite)+" |";
- output = output + "\n"+(len(output)*"-")+"\n";
+ output = output + suite.center(longest_suite)+" |"
+ output = output + "\n"+(len(output)*"-")+"\n"
# per-arch data
- arch_list = arches.values();
- arch_list.sort();
- longest_arch = longest(arch_list);
+ arch_list = arches.values()
+ arch_list.sort()
+ longest_arch = longest(arch_list)
for arch in arch_list:
- arch_id = arch_ids[arch];
- output = output + arch.center(longest_arch)+" |";
+ arch_id = arch_ids[arch]
+ output = output + arch.center(longest_arch)+" |"
for suite_id in suite_id_list:
if suite_arches[suite_id].has_key(arch):
- count = repr(d[suite_id][arch_id]);
+ count = "%d" % d[suite_id][arch_id]
else:
- count = "-";
- output = output + count.rjust(longest_suite)+" |";
- output = output + "\n";
- print output;
+ count = "-"
+ output = output + count.rjust(longest_suite)+" |"
+ output = output + "\n"
+ print output
+
+################################################################################
+
def parse_new_uploads(data):
    """Count NEW-queue arrivals found in *data*, updating the global stats.

    Only entries newer than stats['timestamp'] are counted.  Returns the
    newest timestamp seen (or the previous high-water mark if none).
    """
    global stats
    newest = stats['timestamp']
    for match in findall(NEW, data, MULTILINE):
        ts = match[0]
        if ts <= stats['timestamp']:
            continue
        bucket = parse_timestamp(ts)
        if bucket not in stats:
            stats[bucket] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                       'REJECT': 0, 'PROD': 0}, 'members': {}}
        stats[bucket]['stats']['NEW'] += 1
        stats['history']['stats']['NEW'] += 1
        newest = ts
    return newest
+
+
def parse_actions(data, logdate):
    """Parse NEW-queue ACCEPT/REJECT actions out of one log file's contents.

    Handles both the pre- and post-FORMAT_SWITCH log formats (a log dated
    exactly at the switch month is scanned with both).  Updates the global
    stats per month and per member, and returns the newest timestamp
    processed.

    Fix: the new-format branch initialised stats[date] twice with an
    identical block; the dead duplicate has been removed (no behavior
    change — the second init could never fire).
    """
    global stats
    latest_timestamp = stats['timestamp']
    if logdate <= FORMAT_SWITCH:
        # Old format: actions are grouped into per-member batches framed
        # by "program start" / "program end" lines.
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith('Accepting'):
                    action = 'ACCEPT'
                elif action.startswith('rejected'):
                    action = 'REJECT'
                timestamp = entry[0]
                if stats['timestamp'] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                   'REJECT': 0, 'PROD': 0}, 'members': {}}
                stats[date]['stats'][action] += 1
                stats['history']['stats'][action] += 1
                if who not in stats[date]['members']:
                    stats[date]['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                   'PROD': 0}
                stats[date]['members'][who][action] += 1
                if who not in stats['history']['members']:
                    stats['history']['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                        'PROD': 0}
                stats['history']['members'][who][action] += 1
                latest_timestamp = timestamp
        # PROD actions for old logs only exist in the mail archive.
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats['timestamp'] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            # NB: the month bucket is created even when the actor is
            # blacklisted below — this matches the previous behavior.
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                               'REJECT': 0, 'PROD': 0}, 'members': {}}
            member = entry[1]
            if member in blacklisted:
                continue
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                  'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats'][action] += 1
            stats[date]['members'][member][action] += 1
            stats['history']['stats'][action] += 1
            stats['history']['members'][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp
+
+
def parse_prod(logdate):
    """Count PROD mails sent during *logdate*'s month.

    Decompresses the month's xz mail archive to a temp file, scans it for
    "Comments regarding" messages, and attributes them to team members via
    the global users (LDAP) map.  Messages from senders not in that map
    are skipped.

    Fix: the temp file is now removed in a finally block, so it is not
    leaked when mbox parsing raises.
    """
    global stats
    global users
    # '2009-08' -> '0908': archives are named mail-YYMM.xz.
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    # NOTE(review): the fd returned by utils.temp_filename is never
    # closed here — confirm whether it leaks a descriptor.
    (fd, tmpfile) = utils.temp_filename(utils.get_conf()['Dir::TempPath'])
    system('xzcat %s > %s' % (mailarchive, tmpfile))
    try:
        for message in mbox(tmpfile):
            if (message['subject'] and
                    message['subject'].startswith('Comments regarding')):
                try:
                    member = users[' '.join(message['From'].split()[:-1])]
                except KeyError:
                    continue
                ts = mktime_tz(parsedate_tz(message['date']))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                   'REJECT': 0, 'PROD': 0}, 'members': {}}
                if member not in stats[date]['members']:
                    stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                      'PROD': 0}
                if member not in stats['history']['members']:
                    stats['history']['members'][member] = {'ACCEPT': 0,
                                                           'REJECT': 0, 'PROD': 0}
                stats[date]['stats']['PROD'] += 1
                stats[date]['members'][member]['PROD'] += 1
                stats['history']['stats']['PROD'] += 1
                stats['history']['members'][member]['PROD'] += 1
    finally:
        # always remove the decompressed copy, even if parsing fails
        unlink(tmpfile)
+
+
def parse_timestamp(timestamp):
    """Reduce a YYYYMMDDHHMMSS stamp to its 'YYYY-MM' month key."""
    year, month = int(timestamp[:4]), int(timestamp[4:6])
    return '%d-%02d' % (year, month)
+
+
def new_stats(logdir, yaml):
    """Update NEW-queue statistics from the dak logs in *logdir* and
    store them in the YAML file *yaml*.

    Previously stored stats are loaded first so only log entries newer
    than the recorded 'timestamp' high-water mark are (re)processed.
    Prints one '.' to stderr per log file as progress.

    Fixes: temp file removal moved into a finally block (no leak when
    the read raises); comment typo corrected.
    """
    global Cnf
    global stats
    try:
        with open(yaml, 'r') as fd:
            stats = safe_load(fd)
    except IOError:
        # no stats file yet — start from scratch below
        pass
    if not stats:
        stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
                 'REJECT': 0, 'PROD': 0}, 'members': {}},
                 'timestamp': '19700101000000'}
    latest_timestamp = stats['timestamp']
    for fn in sorted(listdir(logdir)):
        if fn == 'current':
            continue
        log = splitext(fn)[0]
        # skip whole months we have already fully processed
        if log < parse_timestamp(stats['timestamp']):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            if fn.endswith('.bz2'):
                # This hack is required because python2 does not support
                # multi-stream files (http://bugs.python.org/issue1625)
                # NOTE(review): the fd from utils.temp_filename is never
                # closed — confirm whether it leaks a descriptor.
                (fd, tmpfile) = utils.temp_filename(Cnf['Dir::TempPath'])
                system('bzcat %s > %s' % (logfile, tmpfile))
                try:
                    with open(tmpfile, 'r') as fd:
                        data = fd.read()
                finally:
                    # never leave the decompressed copy behind
                    unlink(tmpfile)
            else:
                with open(logfile, 'r') as fd:
                    data = fd.read()
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write('.')
            stderr.flush()
    stderr.write('\n')
    stderr.flush()
    stats['timestamp'] = latest_timestamp
    with open(yaml, 'w') as fd:
        safe_dump(stats, fd)
################################################################################
def main ():
    """Entry point: validate arguments and dispatch to the chosen MODE."""
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [('h',"help","Stats::Options::Help")]
    # make sure the option keys exist even when absent from the config
    for opt in [ "help" ]:
        if not Cnf.has_key("Stats::Options::%s" % (opt)):
            Cnf["Stats::Options::%s" % (opt)] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    # every mode takes exactly one argument, except "new" which also
    # requires an output file
    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)
################################################################################
# script entry point
if __name__ == '__main__':
    main()