#!/usr/bin/env python3

# data aggregation tool - hostlist collected histograms on numerical data

__version__ = "2.2.1"

# Copyright (C) 2010 Peter Kjellström <cap@nsc.liu.se>
#               
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.

import sys
import optparse
import math
import os
import stat
import time
from pprint import pprint
from hostlist import collect_hostlist
from hostlist import expand_hostlist

from subprocess import getoutput


def dprint(dstr):
    """Print a debug message on stdout, but only when --debug is active."""
    if not opts.debug:
        return
    print("Debug: %s" % dstr)

def eprint(estr, terminate=True):
    """Write an error message to stderr and, unless told otherwise, exit.

    estr:      message text (prefixed with "Error: ")
    terminate: when true (the default) the program exits with status 1
    """
    message = "Error: %s\n" % estr
    sys.stderr.write(message)
    if not terminate:
        return
    sys.exit(1)

def wprint(wstr):
    """Write a warning message (prefixed with "Warning: ") to stderr."""
    message = "Warning: %s\n" % wstr
    sys.stderr.write(message)

def iprint(istr):
    """Print an informational message on stdout, but only when --verbose is active."""
    if not opts.verbose:
        return
    print("Info: %s" % istr)

def gettermwidth():
    """Return the terminal width in columns, falling back to 80.

    The width is taken from the second field of "stty size" run against
    /dev/tty; any failure (no tty, unparsable output) yields the fallback.
    """
    try:
        sizefields = getoutput("/bin/stty -F /dev/tty size").split()
        return int(sizefields[1])
    except Exception:
        return 80

def redboldstr(instr):
    """Wrap instr in ANSI bold+red escape codes.

    The string is returned untouched when color is set to 'never' or when
    it is empty.
    """
    colorize = opts.color != 'never' and instr != ""
    if colorize:
        return "\033[1m\033[31m%s\033[0m" % instr
    return instr

# Opens cache file in read or write mode. Do lots of checks for write case.
def opencachefile(fmode):
    """Open the per-session cache file and return the file object.

    The file lives in $TMPDIR/dbuck-<euid>/ (with /tmp as the default for
    TMPDIR) and is named cache-<session id>.

    fmode: mode string passed to open(). For "w" the cache directory is
           created if missing (under a 077 umask) and refused if it is
           group/world writable or not owned by the effective user.

    Raises PermissionError when the directory permissions are unsafe, and
    whatever OSError mkdir/stat/open produce. The process umask is always
    restored before returning or raising.
    """
    tmpdir = os.path.join(os.environ.get("TMPDIR", "/tmp"),
                          "dbuck-%i" % os.geteuid())
    dprint("trying to open cache file in dir %s in mode %s" % (tmpdir, fmode))
    cachepath = tmpdir + "/cache-%i" % os.getsid(os.getpid())

    if fmode != "w":
        return open(cachepath, fmode)

    # Directory and file must not be accessible to anyone else
    oldmask = os.umask(0o77)
    try:
        if not os.path.isdir(tmpdir):
            iprint("creating cache directory: %s" % tmpdir)
            wprint("this version automatically saves a copy of the last data")
            wprint("for more information see man page (--previous, --no-cache)")
            os.mkdir(tmpdir)
        statstrct = os.stat(tmpdir)
        if ((stat.S_IWOTH & statstrct.st_mode) or
            (stat.S_IWGRP & statstrct.st_mode) or
            (statstrct.st_uid != os.geteuid())):
            eprint("incorrect permissions on tmpdir: %s" % tmpdir, terminate=False)
            raise PermissionError("incorrect permissions on tmpdir: %s" % tmpdir)
        return open(cachepath, fmode)
    finally:
        # Restore the caller's umask on success and on every failure path
        os.umask(oldmask)

def readdata(cachefile):
    """Return the raw input as a list of lines.

    With --previous the lines are read from cachefile. Otherwise they are
    read from stdin and, unless --no-cache is set, also written to
    cachefile (a failed write only produces a warning).

    cachefile: open file object for the cache; it is only used (and then
               closed) with --previous or when caching is enabled.

    Exits the program if the cache cannot be read or if stdin reading is
    interrupted from the keyboard.
    """
    if opts.previous:
        if opts.verbose:
            cachestat = os.stat(cachefile.name)
            iprint("reading data from cache file created: %s" % time.ctime(cachestat.st_mtime))
        try:
            rawdata = cachefile.readlines()
        except (OSError, ValueError):
            # ValueError also covers undecodable content (UnicodeDecodeError)
            eprint("unable to read from cache file")
        finally:
            cachefile.close()
        return rawdata

    try:
        rawdata = sys.stdin.readlines()
    except KeyboardInterrupt:
        iprint("caught keyboard interrupt, exiting...")
        sys.exit(1)

    if not opts.no_cache:
        # Caching is best effort: I/O problems are reported but not fatal.
        # The with-statement closes the file even if the write fails.
        try:
            with cachefile:
                cachefile.writelines(rawdata)
        except (OSError, ValueError):
            wprint("unable to write to cache file")

    return rawdata


## Statistical functions

def mean(list):
    """Return the arithmetic mean of the values in list."""
    total = sum(list)
    count = len(list)
    return total / count

def median(list):
    """Return the median of the values in list.

    For an even number of values the two middle values are averaged.
    """
    ordered = sorted(list)
    half, odd = divmod(len(ordered), 2)
    if odd:
        return ordered[half]
    return (ordered[half - 1] + ordered[half]) / 2

def stdev(list):
    """Return the population standard deviation of the values in list."""
    center = mean(list)
    squared = [(center - x)**2 for x in list]
    return math.sqrt(mean(squared))

# Clean and refine indata: list of STRING -> list of [ "hostname", float value ]
def refine_data(rawlist):
    """Parse raw input lines into [name, value] pairs.

    Characters listed in opts.field_separators are replaced by spaces in
    rawlist (in place). When opts.key is unset, the value column is guessed
    from the last three lines and stored in opts.key. Lines that have no
    float at column opts.key are rejected.

    Returns a tuple (cleandata, nreject): the list of [name, value] pairs
    and the number of rejected lines. In anonymous mode every name is "na".
    """
    dprint("read in %i lines of data" % len(rawlist))

    # Every additional separator is turned into a plain space
    for sepchar in opts.field_separators:
        dprint("adding additional field separator: \"%s\"" % sepchar)
        for idx, entry in enumerate(rawlist):
            rawlist[idx] = entry.replace(sepchar, " ")

    if opts.key is None:
        if len(rawlist) < 3:
            eprint("not enough data for auto-detect, please use -k")

        # Anonymous data may carry its value in column 0; otherwise the
        # search starts at column 1
        firstcol = 0 if opts.anonymous else 1

        # One candidate column per inspected line (the last three lines)
        candidates = []
        total = len(rawlist)
        for lnum in range(total - 3, total):
            fields = rawlist[lnum].strip().split()
            if len(fields) < 2:
                candidates.append(0)
                continue
            # The first column that converts to a float is the candidate
            for col in range(firstcol, len(fields)):
                try:
                    float(fields[col])
                except ValueError:
                    continue
                dprint("auto-detect row=%i found data at column %i" % (lnum, col))
                candidates.append(col)
                break
        dprint("key list after auto-detect: %s" % str(candidates))

        # A candidate wins when more than half of the entries agree on it
        for candidate in candidates:
            if candidates.count(candidate) > (len(candidates) // 2):
                opts.key = candidate
                iprint("auto-detected data at column: %i" % candidate)
                break

        # No winner found (or winner was 'bad line')
        if opts.key is None or (opts.key == 0 and not opts.anonymous):
            eprint("Unable to auto-detect KEY from data")

    cleandata = []
    nreject = 0
    for line in rawlist:
        fields = line.strip().split()
        try:
            value = float(fields[opts.key])
        except (ValueError, IndexError):
            nreject += 1
            iprint("rejected line: \"%s\"" % line.strip())
            continue
        name = "na" if opts.anonymous else fields[0]
        cleandata.append([ name.strip(":"), value ])

    return (cleandata, nreject)

def addoverflowbuckets(blist, rmin, vmax):
    """Add an underflow and an overflow bucket to blist, in place.

    The underflow bucket (upper bound rmin) becomes the first element and
    the overflow bucket (upper bound vmax) the last; both carry a
    'special' marker.
    """
    underflow = {'ub': rmin, 'special': 'underflow'}
    overflow = {'ub': vmax, 'special': 'overflow'}
    blist[:0] = [underflow]
    blist += [overflow]

# New bucket creation function
def create_buckets_new(valuelist, num, vmin, vmax):
    """Split the range vmin..vmax into num equally wide buckets.

    Returns a list of num dicts, each holding its upper bound under 'ub'.
    The last upper bound is set to exactly vmax so that accumulated
    floating point error cannot leave it short. valuelist is not used.
    """
    dprint("range is %f to %f split into %i buckets" % (vmin, vmax, num))
    span = vmax - vmin
    bounds = []
    upper = vmin
    for _ in range(num):
        upper += span / num
        bounds.append(upper)
    blist = [{'ub': bound} for bound in bounds]
    blist[-1]['ub'] = vmax
    return blist

def parse_range(rstr):
    """Parse a "LOW-HI" range argument into a list of two floats.

    Splitting on "-" leaves an empty string in front of every negative
    number; each such empty string is folded into the element after it as
    a leading minus sign. Exits with an error message on malformed input
    or when LOW is not strictly smaller than HI.
    """
    parts = rstr.split("-")
    dprint("list representation of raw range argument: %s" % str(parts))
    try:
        while '' in parts:
            pos = parts.index('')
            parts[pos + 1] = '-' + parts[pos + 1]
            del parts[pos]
            dprint("processed one \"-\" character in range argument")
    except IndexError:
        # A trailing "-" has nothing after it to attach to
        eprint("invalid range specified")
    dprint("list representation of processed range argument: %s" % str(parts))
    try:
        bounds = [float(part) for part in parts]
    except ValueError:
        eprint("invalid range specified")
    if len(bounds) != 2 or bounds[0] >= bounds[1]:
        eprint("inverted, incomplete or null range")
    return bounds

##
### Main program
##        

# Command line definition. The parsed result is stored in the module-level
# "opts" object, which the helper functions above (dprint, iprint,
# redboldstr, readdata, refine_data) read directly.
optp = optparse.OptionParser(usage=("usage: %prog [options] < DATA" +
                                    "\n       %prog [options] --previous" +
                                    "\n\nNote: long options can be abbreviated as long as unambiguous"))
# Input interpretation and output mode
optp.add_option("-a", "--anonymous",
                action="store_true", default=False,
                help="anonymous data, only handle data (implies --bars and allows -k0)")
optp.add_option("-b", "--bars",
                action="store_true", default=False,
                help="draw histogram style bars instead of list of keys")
optp.add_option("--color",
                action="store", type="string", metavar="WHEN", default="auto",
                help="allow colors in output; WHEN can be 'always', 'never', 'auto' (default: 'auto')")
optp.add_option("--no-cache",
                action="store_true", default=False,
                help="do not save a cached copy of the data for later use (see also --previous)")
optp.add_option("--highlight-hostlist",
                action="store", type="string", metavar="HOSTLIST",
                help="highlight the specified HOSTLIST in the output table")
# Value range and bucket layout
optp.add_option("-r", "--range",
                action="store", type="string", metavar="LOW-HI",
                help="explicitly specify a value range")
optp.add_option("-z", "--zero",
                action="store_true", default=False,
                help="first bucket starts at zero not at lowest value")
optp.add_option("-o", "--show-overflow",
                action="store_true", default=False,
                help="include two extra buckets (over- and under-flow)")
# A default of None for --key triggers auto-detection in refine_data()
optp.add_option("-k", "--key",
                action="store", type="int", default=None,
                help="use data at position KEY (default: auto)")
optp.add_option("-n", "--nbuckets",
                action="store", type="int", default=5,
                help="number of buckets to use (default: %default)")
optp.add_option("-p", "--previous",
                action="store_true", default=False,
                help="read cached data from previous run instead of normal stdin")
optp.add_option("-s", "--statistics",
                action="store_true", default=False,
                help="include a statistical summary")
optp.add_option("-S", "--chop-long-lines",
                action="store_true", default=False,
                help="chop too long lines / enforce one output line per bucket")
optp.add_option("-t", "--field-separators",
                action="store", type="string", default="",
                help="_additional_ field separators (default: \"\")")
# Diagnostics (no help text, so optparse lists them without a description)
optp.add_option("-v", "--verbose",
                action="store_true", default=False)
optp.add_option("--debug",
                action="store_true", default=False)
(opts, args) = optp.parse_args(sys.argv[1:])

# --debug implies --verbose
if opts.debug:
    opts.verbose = True

# Positional arguments are not accepted
if args:
    optp.print_help()
    sys.exit(1)

if opts.nbuckets < 1:
    eprint("number of buckets must be a positive integer")

if opts.show_overflow and not opts.range and not opts.zero:
    wprint("--show-overflow only has meaning with --range or --zero")

# An explicit range takes precedence over --zero
if opts.range:
    if opts.zero:
        wprint("ignoring --zero because of --range use")
        opts.zero = False
    (rmin, rmax) = parse_range(opts.range)

# In 'auto' mode colors are dropped when stdout is not a terminal
if opts.color == 'auto' and not sys.stdout.isatty():
    opts.color = 'never'

if opts.anonymous:
    opts.bars = True
elif opts.key == 0:
    eprint("-k0 only possible with --anonymous")

# The cache is read with --previous, written otherwise, and left alone
# entirely with --no-cache
if opts.previous or not opts.no_cache:
    try:
        cachefile = opencachefile("r" if opts.previous else "w")
    except BaseException:
        eprint("unable to open cachefile")
else:
    cachefile = None

rawdata = readdata(cachefile)

termwidth = gettermwidth()
dprint("termwidth: %i" % termwidth)

# Convert list of str -> list of [ hostname, value ], discarding bad lines
if not rawdata:
    eprint("no data found")
(data, nbadlines) = refine_data(rawdata)
if not data:
    eprint("no data found")

# Order the entries by value and keep the bare values in a list of their own
data.sort(key=lambda entry: entry[1])
valuelist = [entry[1] for entry in data]

# vmin/vmax describe the data, rmin/rmax the range covered by the regular
# buckets (given by --range, or derived from the data here)
vmin = min(valuelist)
vmax = max(valuelist)
if not opts.range:
    rmin = 0.0 if opts.zero else vmin
    rmax = vmax

if opts.range or opts.zero:
    nunder = sum(1 for x in valuelist if x < rmin)
    nover = sum(1 for x in valuelist if x > rmax)

# Create the bucket list
newbuckets = create_buckets_new(valuelist, opts.nbuckets, rmin, rmax)
if opts.range or opts.zero:
    addoverflowbuckets(newbuckets, rmin, vmax)

# Every bucket starts out with an empty list of nodes
for bucket in newbuckets:
    bucket['nodelist'] = []

if opts.debug:
    dprint("bucketlist created:")
    for nbucket, bucket in enumerate(newbuckets):
        dprint(" bucket[%i] %f" % (nbucket, bucket['ub']))

# Dump out some statistics
if opts.statistics:
    print("Statistical summary")
    print("-" * (termwidth - 1))

    # Integer rows first, the over/underflow counts only when they exist
    countrows = [("Number of values", len(valuelist)),
                 ("Number of rejected lines", nbadlines)]
    if opts.range or opts.zero:
        countrows.append(("Number of overflow values", nover))
        countrows.append(("Number of underflow values", nunder))
    for (label, count) in countrows:
        print(" %-26s: %i" % (label, count))

    # valuelist is sorted, so its ends are the extremes
    floatrows = [("Min value", valuelist[0]),
                 ("Max value", valuelist[-1]),
                 ("Mean", mean(valuelist)),
                 ("Median", median(valuelist)),
                 ("Standard deviation", stdev(valuelist)),
                 ("Sum", sum(valuelist))]
    for (label, number) in floatrows:
        print(" %-26s: %f" % (label, number))
    print()


if opts.debug:
    pprint(["final newbuckets", newbuckets])

# Populate the buckets with data
# "data" was sorted by value above, so one forward-moving bucket index is
# enough: advance while the current bucket's upper bound is below the value.
currentbucket = 0
for (node, value) in data:
    # The second condition moves a value equal to rmin out of the underflow
    # bucket (index 0, marked 'special') and into the first regular bucket.
    while ((newbuckets[currentbucket]['ub'] < value) or
           (value == rmin and currentbucket == 0 and 'special' in newbuckets[0])):
        currentbucket += 1
        if opts.debug:
            print()
        dprint("filling next bucket")
    # In debug mode one dot is written per value placed in a bucket
    if opts.debug:
        sys.stdout.write(".")
    newbuckets[currentbucket]['nodelist'].append(node)
if opts.debug:
    print()

# Compute number of characters needed for printing values and number of nodes.
# All outer bounds are considered, not just the largest value: a negative
# lower bound such as "-1000.00" is wider than a small positive maximum, and
# a too small width breaks both the column alignment and the calculation of
# the space left for the node list. (valuelist is sorted, so its ends are
# the smallest and largest value.)
ncharvalue = max(len("%.2f" % bound)
                 for bound in (valuelist[0], valuelist[-1], rmin, rmax))
# The count column is at least 3 wide to fit the "CNT" header
ncharnodecnt = max(len(str(max(len(bucket['nodelist']) for bucket in newbuckets))), 3)

dprint("value pad: %i" % ncharvalue)
dprint("node count pad: %i" % ncharnodecnt)

# With --verbose a header line is printed, padded to match the columns below
if opts.verbose:
    lowpad = " " * (ncharvalue - 3)
    hipad = " " * (ncharvalue - 2)
    cntpad = " " * (ncharnodecnt - 3)
    header = "%sLOW-%sHI: %sCNT" % (lowpad, hipad, cntpad)
    if not opts.bars:
        header += "  HOSTLIST"
    print(header)
    print("-" * termwidth)

# Expand the hostlist to highlight once; membership is tested per bucket later
if opts.highlight_hostlist:
    highlight_set = set(expand_hostlist(opts.highlight_hostlist))

# To later be able to scale the bars find the largest bucket
if opts.bars:
    maxbucketsize = 0
    for bucket in newbuckets:
        # Over/underflow buckets only count when they are going to be shown
        if not opts.show_overflow and 'special' in bucket:
            continue
        maxbucketsize = max(maxbucketsize, len(bucket['nodelist']))
    dprint("largest number of values in any bucket: %i" % maxbucketsize)
    # Columns left for the bars after the LOW-HI and CNT columns
    barspace = termwidth - (ncharvalue * 2 + ncharnodecnt + 5)
    # On a very narrow terminal barspace can be zero or negative; assume at
    # least one column so the division cannot fail and the bars still scale
    barscale = max(1.0, maxbucketsize / float(max(barspace, 1)))
    dprint("available space for bars %i" % barspace)
    dprint("barscale %f" % barscale)

# Main output print
# One line per bucket: "LOW-HI: CNT  KEYS". "lower" carries the previous
# bucket's upper bound forward as the lower bound of the next line; it starts
# at the smallest value (valuelist is sorted).
lower = valuelist[0]
for bucket in newbuckets:
    # figure out the padding for each column
    pad1 = (ncharvalue - len(str("%.2f" % lower))) * " "
    pad2 = (ncharvalue - len(str("%.2f" % bucket['ub']))) * " "
    pad3 = (ncharnodecnt - len(str(len(bucket['nodelist'])))) * " "

    # Columns left on the line after the value and count columns
    availablespace = termwidth - (ncharvalue * 2 + ncharnodecnt + 5)
    dprint("available space for nodelist: %i" % availablespace)
    # Three output modes for keys: bars, highlight and normal
    if opts.bars:
        if opts.highlight_hostlist:
            # Split the bucket into highlighted (hnum) and normal (nnum) entries
            tnum = len(bucket['nodelist'])
            hnum = 0
            for i in highlight_set:
                hnum += bucket['nodelist'].count(i)
            nnum = tnum - hnum
            # Always show highlighted data even if it rounds down to zero characters
            if ((hnum > 0) and (int(hnum/barscale) == 0)):
                roundup = 1
            else:
                roundup = 0
            # Bars are only scaled down to fit when --chop-long-lines is given
            if opts.chop_long_lines:
                nodeliststr = ( redboldstr("#" * (int(hnum/barscale) + roundup)) +
                                "#" * int(nnum/barscale) )
            else:
                nodeliststr = redboldstr("#" * hnum) + "#" * nnum
        else:
            nodeliststr = "#" * len(bucket['nodelist'])
            if opts.chop_long_lines:
                nodeliststr = nodeliststr[:int(len(nodeliststr)/barscale)]
    elif opts.highlight_hostlist:
        # Highlighted hosts are listed first, followed by the remaining hosts
        hnodeliststr = collect_hostlist(highlight_set & set(bucket['nodelist']))
        nnodeliststr = collect_hostlist(set(bucket['nodelist']) - highlight_set)
        dprint("len(hnodeliststr) is: %i" % len(hnodeliststr))
        dprint("len(nnodeliststr) is: %i" % len(nnodeliststr))
        # A separating comma is only needed when both parts are non-empty
        if hnodeliststr and nnodeliststr:
            sep = ","
            availablespace -= 1
        else:
            sep = ""
        dprint("sep is %s" % str(not (sep == "")))
        # Lengths are measured on the plain strings, before redboldstr() adds
        # its escape sequences
        if (not opts.chop_long_lines or
            len(hnodeliststr + nnodeliststr) <= availablespace):
            # Everything fits (or chopping was not requested)
            nodeliststr = redboldstr(hnodeliststr) + sep + nnodeliststr
        elif ((len(hnodeliststr) + 3) <= availablespace):
            # The highlighted part fits: chop the normal part and mark with "..."
            availablespace = availablespace + len(sep) - len(hnodeliststr)
            nodeliststr = redboldstr(hnodeliststr)
            nnodeliststr = sep + nnodeliststr
            nodeliststr += nnodeliststr[:(availablespace - 3)] + "..."
        else:
            # Not even the highlighted part fits: chop it and drop the rest
            availablespace += len(sep)
            nodeliststr = redboldstr(hnodeliststr[:(availablespace - 3)] + "...")
    else:
        nodeliststr = collect_hostlist(bucket['nodelist'])
        if (opts.chop_long_lines and
            len(nodeliststr) > availablespace):
            nodeliststr = nodeliststr[:(availablespace - 3)] + "..."

    # No gap after the count when there is nothing to print for this bucket
    pad4 = '' if len(bucket['nodelist']) == 0 else '  '
    if 'special' in bucket:
        # Over/underflow buckets are printed only on request, labelled by
        # name and padded to the width a regular LOW-HI field would occupy
        if opts.show_overflow:
            specpad = " " * max(len("%s%.2f-%s%.2f" %
                                    (pad1, lower ,pad2, bucket['ub'])) - 9,
                                0)
            print("%-9s%s: %s%i%s%s" % (bucket['special'], specpad,
                                        pad3, len(bucket['nodelist']),
                                        pad4, nodeliststr))
    else:
        print("%s%.2f-%s%.2f: %s%i%s%s" % (pad1, lower,
                                           pad2, bucket['ub'],
                                           pad3, len(bucket['nodelist']),
                                           pad4, nodeliststr))
    lower = bucket['ub']
