#!/usr/bin/python


## from http://www.basic.nwu.edu/statguidefiles/conting_anal_ass_viol.html
##
## no joke.
##
## A standard (and conservative) rule of thumb (due to Cochran) is to avoid using the chi-square test for contingency tables with expected cell frequencies less than 1, or when more than 20% of the contingency table cells have expected cell frequencies less than 5.
##

from phil import *

import stats ## for the chisquare calculation -- taken from the web

def Help():
    """Print the command-line usage message and terminate the script."""
    messages = (
        '\nUsage: %s <code-file> {-v}\n'%(argv[0]),
        '-v for maximum verbosity\n',
    )
    ## each message goes out through its own print, so each is followed
    ## by the newline that print appends
    for message in messages:
        print(message)
    exit()

## No arguments at all: show the usage message and quit.
if len(argv)<=1:
    Help()

## Split the optional -v flag(s) from the positional arguments, so the flag
## may appear before or after the code file.
args = [a for a in argv[1:] if a != '-v']
if len(args) < len(argv) - 1:
    ## at least one -v was given
    VERBOSE = 1
    VERBOSE_CHISQ_THRESHOLD = 100
else:
    VERBOSE = 0
    VERBOSE_CHISQ_THRESHOLD = 1000000

## Only -v was given: there is no code file to read.
if not args:
    Help()


EXCLUDE_EXPOSED = 1

## Take the code file from the flag-free argument list; argv[1] would be
## '-v' itself when the flag is given first.
code_file = args[0]

############################################ functions
class TABLE:
    """Two-way contingency table of counts keyed by (row_key, column_key).

    Attributes:
        keys:  {0: [row keys], 1: [column keys]}, in the order the keys are
               first met while iterating the input dictionary.
        table: {(row_key, column_key): count} holding an entry for every
               row/column combination; combinations that are absent from
               the input get a count of 0.
    """

    def __init__(self, T):
        """Build the full rectangular table from the sparse dictionary T.

        T maps (row_key, column_key) tuples to counts.
        """
        self.keys = {0:[], 1:[]}
        for k in T.keys():
            for i in range(2):
                if k[i] not in self.keys[i]:
                    self.keys[i].append(k[i])

        self.table = {}
        for k0 in self.keys[0]:
            for k1 in self.keys[1]:
                key = (k0,k1)
                ## missing combinations are explicit zero cells
                self.table[key] = T.get(key, 0)

    def total(self):
        """Return the sum of all cell counts (0 for an empty table)."""
        return sum(self.table.values())

    def frequency(self,ii):
        """Return {key: fraction of the total count} along axis ii.

        ii is 0 for row keys and 1 for column keys.  When the table total
        is zero every fraction is reported as 0.0 instead of dividing by
        zero.
        """
        frequency = {}
        for k in self.keys[ii]:
            frequency[k] = 0

        total = 0
        for k in self.table.keys():
            count = self.table[k]
            frequency[ k[ii] ] = frequency[ k[ii] ] + count
            total = total + count

        for k in self.keys[ii]:
            if total:
                frequency[k] = float( frequency[k] ) / total
            else:
                frequency[k] = 0.0
        return frequency


    def cochran( self ): ## false signals failure of Cochran condition
        """Check the Cochran rule of thumb on the expected cell counts.

        Returns False as soon as an expected count below 1 is seen, or as
        soon as the number of cells with an expected count below 5 reaches
        20% of the number of cells; returns True otherwise.  The reason for
        a failure is printed when the module-level VERBOSE flag is set.
        """
        f0 = self.frequency(0)
        f1 = self.frequency(1)

        N = len(f0.keys())
        M = len(f1.keys())

        min5_limit = 0.2 * N * M
        min5_count = 0
        total = self.total()
        for k in self.table.keys():
            ## expected count under independence of rows and columns
            expected = f0[k[0]] * f1[k[1]] * total
            if expected<1:
                if VERBOSE:
                    print(' '.join(['cochran fail: expected<1:',
                                    str(expected)]))
                return False
            if expected < 5:
                min5_count = min5_count + 1
                if min5_count >= min5_limit:
                    if VERBOSE:
                        print(' '.join(['cochran fail: min5_count >= 20%:',
                                        str(min5_count), str(N*M)]))
                    return False

        return True

    def merge(self,a,b,ii):
        """Pool the two keys a and b of axis ii into one new key (a,b).

        The counts of the cells belonging to a and b are added together
        under the new key, the old cells are removed, and self.keys[ii] is
        updated accordingly.  The table total is unchanged.
        """
        assert ii in [0,1]
        assert a in self.keys[ii] and b in self.keys[ii]
        ## merging a key with itself would leave the table half-updated
        assert a != b
        ab = (a,b) ## new key

        start_count = self.total() ## for debugging

        ## collect the affected cells first so the dictionary is never
        ## modified while it is being traversed
        moved = [k for k in self.table.keys() if k[ii] in [a,b]]
        for k in moved:
            count = self.table[k]
            del ( self.table[k] )

            if ii==0:
                new_k = (ab, k[1])
            else:
                new_k = (k[0], ab)

            ## Increment comes from the helper module; presumably it adds
            ## count to table[new_k], creating the entry when it is
            ## missing -- TODO confirm
            Increment( self.table, new_k, count)

        assert start_count == self.total()
        self.keys[ii].append(ab)
        for xx in [a,b]:
            assert xx in self.keys[ii]
            self.keys[ii].remove( xx )

    def show(self):
        """Print the table: one header line with the column keys, then one
        line per row giving 'actual (expected)' for each cell followed by
        the row key."""
        f0 = self.frequency(0)
        f1 = self.frequency(1)
        total = self.total()

        print(' '.join(['columns:'] + [str(k1) for k1 in self.keys[1]]))

        for k0 in self.keys[0]:
            fields = []
            for k1 in self.keys[1]:
                actual = self.table[(k0,k1)]
                expected = f0[k0] * f1[k1] * total
                fields.append( '%6d (%9.3f)'%(actual,expected) )
            fields.append( 'row_key = ' )
            fields.append( str(k0) )
            print(' '.join(fields))



### should this be a method function? who knows...
def Chisq( table ): ## table coming in is just a dictionary
    T = TABLE( table )
    N = len(T.keys[0])
    M = len(T.keys[1])
    if VERBOSE:print 'calc chisq: init_N: %d init_M: %d'%(N,M)

    fail = N<2 or M<2

    while not fail and not T.cochran():

        f0 = T.frequency(0)
        f1 = T.frequency(1)

        l0 = map(lambda x,f=f0: [f[x], x], f0.keys())
        l0.sort()
        l1 = map(lambda x,f=f1: [f[x], x], f1.keys())
        l1.sort()

        if len(l0)>2 and ( l0[0][0] < l1[0][0] or len(l1)<=2):
            ## merge two smallest keys in l0
            column = 0
            a = l0[0][1]
            b = l0[1][1]
            if VERBOSE:print 'merge-freq:',l0[0][0],l0[1][0]
        elif len(l1)>2:
            ## merge two smallest keys in l1
            column = 1
            a = l1[0][1]
            b = l1[1][1]
            if VERBOSE:print 'merge-freq:',l1[0][0],l1[1][0]

        else:
            if VERBOSE:log('chisq failed: 2x2 table fails Cochran\n')
            fail = 1
            break

        if VERBOSE:print 'merge:',a,b,column
        T.merge(a,b,column)

    if fail:
        return 0.0,0,1.0

    assert T.cochran() ## debugging
    chisq = 0.0
    freq0 = T.frequency(0)
    freq1 = T.frequency(1)
    N = len(freq0.keys())
    M = len(freq1.keys())
    assert N>=2 and M>=2

    total = T.total()

    for k0 in T.keys[0]:
        f0 = freq0[k0]
        for k1 in T.keys[1]:
            f1 = freq1[k1]
            expected = f0 * f1 * total
            assert expected >=1
            actual = T.table[(k0,k1)]
            #print k0,k1,actual,expected
            chisq = chisq + ( actual-expected)**2 / expected

    if chisq > VERBOSE_CHISQ_THRESHOLD:
        print 'verbose chisq:',chisq
        T.show()

    df = (N-1)*(M-1)
    pval = stats.lchisqprob( chisq, df)
    return chisq, df, pval

########################################################################
########################################################################
########################################################################
########################################################################
########################################################################




## Read the code file directly rather than through `grep` in a shell: a
## command line built by string concatenation breaks (or runs something
## else) when the file name contains spaces or shell metacharacters.
code_fh = open(code_file)
try:
    code_lines = code_fh.readlines()
finally:
    code_fh.close()

## feature names: every field after the first on the first line that
## contains CODE_NAMES
name_lines = [raw for raw in code_lines if 'CODE_NAMES' in raw]
if not name_lines:
    raise SystemExit('no CODE_NAMES line found in %s'%(code_file))
names = name_lines[0].split()[1:]

## read labels for codes
## each line starting with MAP supplies the feature name (field 1), the
## integer code (field 3) and its label (field 5)
feature = {}
lines = [raw.split() for raw in code_lines if raw.startswith('MAP')]
for line in lines:
    name = line[1]
    counter = int(line[3])
    rep = line[5]
    feature[ (name,counter)] = rep

def Exclude( index, counter):
    """Tell whether code `counter` of feature number `index` is left out.

    With the module-level EXCLUDE_EXPOSED flag set, a code is excluded
    when its label in `feature` starts with '-1,' (presumably the label of
    the exposed state -- TODO confirm against the code file format).
    With the flag cleared nothing is excluded.

    Raises KeyError when (names[index], counter) has no label and the
    flag is set.
    """
    ## flag off: keep everything (the former expression excluded everything)
    if not EXCLUDE_EXPOSED:
        return False
    return feature[ (names[index],counter)] [:3] == '-1,'



## Collect the whitespace-split "CODE:" lines.  The file is read directly
## rather than through `grep` in a shell, so file names containing spaces
## or shell metacharacters are handled safely.
code_fh = open(code_file)
try:
    lines = [raw.split() for raw in code_fh.readlines()
             if raw.startswith('CODE:')]
finally:
    code_fh.close()

bar_code = {}
count = {}
N = 0

for line in lines:
    if not N:
        ## the first line fixes the number of features per decoy
        N = len(line) - 2
        log('num features: %d num_decoys: %d\n'%(N,len(lines)))
        for i in range(N):
            count[i] = {}

    elif len(line) -2 != N:
        log('bad line: %s\n'%(' '.join(line)))
        continue

    ## field 1 is the decoy name, the remaining fields are its integer codes
    code = [int(field) for field in line[2:]]
    name = line[1]

    bar_code[name] = code


## now get pairwise correlations

for i1 in range(N):
    for i2 in range(i1+1,N):

        ## setup the table of counts:
        T = {}
        for name in bar_code.keys():
            c1 = bar_code[name][i1]
            c2 = bar_code[name][i2]

            if Exclude(i1,c1) or Exclude(i2,c2):
                continue
            Increment( T, ( (i1,c1), (i2,c2) ) )

        chisq,df,pval = Chisq(T)

        #if chisq>0: ## number of lines is used for calculating p-val later on
        print('CHISQ %4d %4d %6s %6s %4d %4d %9.3f %2d %g'\
              %(i1,i2,names[i1],names[i2],0,0,chisq,df,pval))



