/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "rdc_typ.h"

long double rdc_typ::FisherExact(Int4 red_out, Int4 black_out,Int4 red_in,Int4 black_in, BooLean onetail, FILE *fp)
// USAGE: exact red_out black_out red_in black_in 
{
	Int4	s,time1,red,black,in,out;

	time1=time(NULL); 
/*************** Fisher Exact Test for a 2x2 contingency table *****
 *  observed:
 *
 *             red    black
 *      ----+-------+-------+-------
 *      out |   a   |   b   | a + b
 *      ----+-------+-------+-------
 *      in  |   c   |   d   | c + d
 *      ----+-------+-------+-------
 *          | a + c | b + d |a+b+c+d
 *
 *      see p. 81 of thesis.
 *
 **********************************************************************/
	long double	K,P,Q,end,p,OneTail,TwoTail,total;
	Int4	i,j,a,b,c,d;
	BooLean	failed=FALSE;
// 1. Find the most extreme configuration???
//  ... or leave this as it is because not interested in opposite arrangement?
	a = red_out; b = black_out;
	c = red_in; d = black_in;
	red = a+c; black = b+d; out = a+b; in = c+d;
        K = lgammal(out+1.0) + lgammal(red+1.0)
                + lgammal(black+1.0) + lgammal(in+1.0) - lgammal(a+b+c+d+1.0);
	if(isinf(K)==1){ fprintf(stderr,"input overflow (+infinity)\n"); failed=TRUE; }
	else if(isinf(K)==-1){ fprintf(stderr,"input underflow (-infinity)\n"); failed=TRUE; }
        P = lgammal(a+1.0) + lgammal(b+1.0) + lgammal(c+1.0) + lgammal(d+1.0);
	if(isinf(P)==1){ fprintf(stderr,"input overflow (+infinity)\n"); failed=TRUE; }
	else if(isinf(P)==-1){ fprintf(stderr,"input underflow (-infinity)\n"); failed=TRUE; }
	OneTail=TwoTail=total=0.0;
#if 0
	// if(red_out==3228) fp=stdout;
	if(onetail==FALSE){
	  end = MINIMUM(long double,a,d);
          for(i=MAXIMUM(Int4,-b,-c); i <= end; i++) {
            Q = lgammal(a-i+1.0) + lgammal(b+i+1.0) + lgammal(c+i+1.0) + lgammal(d-i+1.0);
	    p = expl(K-Q);
	    if(p==0.0) failed=TRUE;
            if(P <= Q){
		TwoTail += p;
		if((a-i) >= a){ OneTail += p;
            	  if(0) fprintf(stdout," [%d : %d | %d : %d] (Q = %Lg; = = %Lg)\n", 
			(a-i), (b+i),(c+i),(d-i),Q,p);
		}
	    } total+=p;
          }
	} else {
	  Int4 ro,bo,ri,bi;
          for(i=MINIMUM(Int4,red,out),j=0; i >= red_out; j++,i--){
	    ro=i; bo=out-ro; ri=red-ro; bi=black-bo;
            Q = lgammal(ro+1.0) + lgammal(bo+1.0) + lgammal(ri+1.0) + lgammal(bi+1.0);
	    p = expl(K-Q);
	    if(p==0.0) failed=TRUE;
	    OneTail += p;
            if(0 && fp) fprintf(stdout," [%d : %d | %d : %d] (Q = %Lg; = = %Lg)\n", 
			ro,ri,bo,bi,Q,p);
	  }
	}
#else
#if 0	// this does not appear to be formulated correctly...
        if(fp)fprintf(stdout,"========= for two tail test ============\n");
        end = MINIMUM(long double,a,d);
        for(i=MAXIMUM(Int4,-b,-c); i <= end; i++) {
            Q = lgammal(a-i+1.0) + lgammal(b+i+1.0) + lgammal(c+i+1.0) + lgammal(d-i+1.0);
            p = expl(K-Q);
            if(p==0.0) failed=TRUE;
            if(P <= Q){
                TwoTail += p;
                if((a-i) >= a){ 
                  if(fp) fprintf(stdout," [%d : %d | %d : %d] (Q = %Lg; = = %Lg)\n",
                        (a-i), (b+i),(c+i),(d-i),Q,p);
                }
            } total+=p;
        }
#else
assert(onetail);
#endif
 	if(fp)fprintf(stdout,"========= for one tail test ============\n");
	Int4 ro,bo,ri,bi;
        for(ro=red_out; ro <= red; ro++){
            bo=out-ro; ri=red-ro; bi=black-bo;
            if(bo < 0 || ri < 0 || bi < 0) break;
            Q = lgammal(ro+1.0) + lgammal(bo+1.0) + lgammal(ri+1.0) + lgammal(bi+1.0);
            p = expl(K-Q);
            if(p==0.0) failed=TRUE;
            OneTail += p;
            if(0 && fp) fprintf(stdout," [%d : %d | %d : %d] (Q = %Lg; = = %Lg) Sum=%Lg\n",
                        ro,ri,bo,bi,Q,p,OneTail);
        }
#endif
	if(fp){
	   fprintf(fp,"Drew %d balls out of an urn with %d red and %d black balls\n",
			out,red,black);
	   fprintf(fp,"Among these there were %d red balls.\n",red_out);
	   fprintf(fp,"One tail prob = %Lg\n",OneTail);
	   // fprintf(fp,"Two tail prob = %Lg\n",TwoTail);
	   // fprintf(fp,"total = %.9Lg\n",total);
	   if(failed) fprintf(stdout,"WARNING: Underflow occurred: test is unreliable\n");
	   // fprintf(stderr,"LDBL_MAX = %Lg\n",LDBL_MIN);
	}
	if(failed) return 0.0; 
	else if(onetail) return OneTail;
	else return TwoTail;
	// else return OneTail;
}

Int4 rdc_typ::Run(FILE *ofp,Int4 pairA,Int4 pairB,Int4 rank)
{
                swt_typ *swt=0; 
                BooLean PassedInHSW=FALSE;
                hsw_typ hsw=0;
		if(filename){
		  FILE *fp=0;
                  char str[200];
                  sprintf(str,"%s.hsw",filename);
                  if((fp=fopen(str,"r")) == NULL){        // then create file...
                    swt = new swt_typ(cma);
                    hsw=swt->RtnHSW( ); fp = open_file(filename,".hsw","w");
                    FWriteHSW(fp,hsw); fclose(fp);
                  } else {
                    hsw=FReadHSW(fp,AB,cma); fclose(fp); swt = new swt_typ(hsw);
                    PassedInHSW=TRUE;
                  }
		} 
                // long double **rtn=RunRtnLogP(ofp,pairA,pairB,rank);
                long double **rtn=RunRtnLogP(ofp,pairA,pairB,rank,swt);
                for(Int4 i=0; rtn[i]; i++) free(rtn[i]); free(rtn);
		if(swt){
                  delete swt;
                  if(PassedInHSW) NilHSW(hsw);
		} return 0;
}

long double	**rdc_typ::RunRtnLogP(FILE *ofp,Int4 pairA,Int4 pairB, Int4 DC_rank,swt_typ *swt)
{
	if(pairA > NumColumnsCMSA(cma) || pairB > NumColumnsCMSA(cma)){
			print_error("FATAL: -pairs input out of range");
	} 
	if(pairA == pairB) print_error("FATAL: -pairs input error");
	Int4 i,j,NN,resIJ[25][25],resI[25],resJ[25];
	long double	**rtn=0;
	NEWP(rtn,nAlpha(AB) +5, long double);
	for(i=0; i <= nAlpha(AB); i++){ 
	   resI[i]=resJ[i]=0;
	   NEW(rtn[i],nAlpha(AB) +5, long double);
	   for(j=0; j <= nAlpha(AB); j++){ resIJ[i][j]=0; }
	}
	Int4 n,rA,rB,sq,N=NumSeqsCMSA(cma);
    if(swt != 0){ // use sequence weights (swt_typ)
// fprintf(stderr,"======= downweighting for sequence redundancy ======\n");
	double ResIJ[25][25],ResI[25],ResJ[25];
	for(i=0; i <= nAlpha(AB); i++){ 
	   ResI[i]=ResJ[i]=0.0;
	   for(j=0; j <= nAlpha(AB); j++){ ResIJ[i][j]=0; }
	}
	double  *SqWt=swt->RtnAveSqWt();
        for(Int4 sq=1;sq <= N; sq++) {
		rA=ResidueCMSA(1,sq,pairA,cma);
		rB=ResidueCMSA(1,sq,pairB,cma);
		ResIJ[rA][rB] += SqWt[sq];
		ResI[rA] += SqWt[sq]; ResJ[rB] += SqWt[sq];
// fprintf(stderr,"\tsq=%d; ResIJ[%c][%c]=%g\n",sq,AlphaChar(rA,AB),AlphaChar(rB,AB),ResIJ[rA][rB]);
	}
	Int4 Ni,Nj;
	for(Ni=Nj=0,i=0; i <= nAlpha(AB); i++){ 
	   resI[i]= (Int4) round(ResI[i]); resJ[i]= (Int4) round(ResJ[i]);
	   for(j=0; j <= nAlpha(AB); j++){ resIJ[i][j]=(Int4) round(ResIJ[i][j]); }
	   Ni += resI[i]; Nj += resJ[i];
// fprintf(stderr,"resI[%d]=%d; ResI[%d]=%lf; Ni=%d\n",i,resI[i],i,ResI[i],Ni);
// fprintf(stderr,"resJ[%d]=%d; ResJ[%d]=%lf; Nj=%d\n",i,resJ[i],i,ResJ[i],Nj);
	}
	if(Ni < Nj) Ni=Nj; else if(Ni > Nj) Nj=Ni;
	NN=Ni;
// fprintf(stderr,"NN = %d; N = %d\n\n",NN,N);
    } else {	// use actual unweighted counts
// fprintf(stderr,"======= using actual sequence counts (no downweighting) ======\n");
        for(Int4 sq=1;sq <= N; sq++) {
		rA=ResidueCMSA(1,sq,pairA,cma);
		rB=ResidueCMSA(1,sq,pairB,cma);
		resIJ[rA][rB]++; resI[rA]++; resJ[rB]++;
	} NN=N;
    }
	dh_type dH=dheap(500,4);
	Int4 n_i[500],n_j[500];
	for(n=0,i=1; i <= nAlpha(AB); i++){ 
	   for(j=1; j <= nAlpha(AB); j++){
		n++;
		if(1 || resIJ[i][j] > 0){
		   insrtHeap(n,-(keytyp)resIJ[i][j],dH); n_i[n]=i; n_j[n]=j;
		}
	   }
	}
	long double *Exact[5];
	NEW(Exact[1],n+9,long double);
	NEW(Exact[2],n+9,long double);
	Int4	totHits=0,sumHits=0; 
	dh_type dHp[5]; dHp[1]=dheap(500,4); dHp[2]=dheap(500,4);
	// fprintf(stdout,"rank\tresA\tresB\tobs\texp\t X \texact\n");
	Int4 rank=0;
	for(rank=1; (n=delminHeap(dH)) != 0; rank++){
		i=n_i[n]; j=n_j[n];
		totHits += resIJ[i][j];
		if(resI[i] > 0 && resJ[j] > 0){
		  double pI=(double)resI[i]/(double)NN;
		  double pJ=(double)resJ[j]/(double)NN;
		  double exp=pI*pJ*(double)NN;
		  Int4 red=resI[i]; 
		  Int4 Out=resJ[j];
		  Int4 red_out=resIJ[i][j]; 
		  Int4 black_out=Out-red_out;
		  Int4 red_in=red-red_out;
		  Int4 In = NN-Out;
		  Int4 black=NN-red;
		  Int4 black_in=black-black_out;

		  FILE *efp=0; // efp=stderr;	
		  BooLean onetail=TRUE,twotail=FALSE;
		  double d;
assert(rtn[i][j] == 0.0);
char cA,cB; cA=AlphaChar(i,AB); cB=AlphaChar(j,AB);
		  if(exp < (double)resIJ[i][j]){
		    long double DD=this->FisherExact(red_out, black_out,red_in,black_in,onetail,efp);
		    if(DD == 0.0){ rtn[i][j]=-1.0; d=-(double) DBL_MAX; }
		    else { rtn[i][j]=DD; d=(double) logl(DD); }
		    Exact[1][n]=DD; insrtHeap(n,d,dHp[1]); 
if(0) fprintf(stderr,"%c%d..%c%d: log10l(%Lg)=%Lf; rtn=%Lg; DD=%Lg\n",
		cA,pairA,cB,pairB,Exact[1][n],-log10l(Exact[1][n]),rtn[i][j],DD);
		  } else if(exp > (double)resIJ[i][j]){
		    long double DD=this->FisherExact(red_in,black_in,red_out,black_out,onetail,efp);
		    if(DD == 0.0){ rtn[i][j]=2.0; d=-(double) DBL_MAX; }
		    else { rtn[i][j]=-DD; d=(double) logl(DD); }
		    Exact[2][n]=DD; insrtHeap(n,d,dHp[2]); 
if(0) fprintf(stderr,"%c%d..%c%d: log10l(%Lg)=%Lf; rtn=%Lg; -DD=%Lg\n",
		cA,pairA,cB,pairB,Exact[2][n],-log10l(Exact[2][n]),rtn[i][j],-DD);
		  } else rtn[i][j]=-2.0;
		} 
	} // fprintf(stdout,"\n\n\n");
/*************** Fisher Exact Test for a 2x2 contingency table *****
 *  observed:
 *
 *             red    black
 *      ----+-------+-------+-------
 *      out |  ro   |  bo   | oo		oo=resJ[j];
 *      ----+-------+-------+-------
 *      in  |  ri   |  bi   | ii
 *      ----+-------+-------+-------
 *          |  rr   |  bb   |  N	rr=resI[i]; bb=resJ[j];
 *
 **********************************************************************/
	fprintf(ofp,"\n======================== %d. Pairs for columns %d & %d ========================\n",
		DC_rank,pairA,pairB);
	fprintf(ofp,"\n........................ Elevated residue pairs ........................\n");
	fprintf(ofp,"rank\tresI\tresJ\tIJ\texpIJ\tI!J\t!IJ\t!I!J\tratio\t%cseqs\t1-tail prob\n",'%');
	for(rank=1,sumHits=0; (n=delminHeap(dHp[1])) != 0; rank++){
		i=n_i[n]; j=n_j[n];
		char cA,cB; cA=AlphaChar(i,AB); cB=AlphaChar(j,AB);
		double pI=(double)resI[i]/(double)NN;
		double pJ=(double)resJ[j]/(double)NN;
		double exp=pI*pJ*(double)NN;
		double d=(double) resIJ[i][j]/exp;
		sumHits+=resIJ[i][j];
		double rr=(double)resI[i];
		double oo=(double)resJ[j];
		double ro=(double)resIJ[i][j];
		double bo=oo-ro;
		double ri=rr-ro;
		double ii=(double)NN - oo;
		double bb=(double)NN - rr;
		double bi=ii-ri;


#if 0	// DEBUG...
	        double eR=pI*(double)NN;
	        double eO=pJ*(double)NN;
		double eRO=pI*pJ*(double)NN;
		double eBO=eO-eRO;
		double eRI=eR - eRO;
	        double eI=(double)NN-eO;
		double eB= (double)NN-eR;
		double eBI=eB - eBO;

		double chsq,dd=(ro-eRO)*(ro-eRO)/eRO; chsq=dd;
		fprintf(stderr,"dd=%lg\n",dd);
		dd = (bo-eBO)*(bo-eBO)/eBO;
		fprintf(stderr,"dd=%lg\n",dd);
		chsq += dd;
		dd = (ri-eRI)*(ri-eRI)/eRI;
		fprintf(stderr,"dd=%lg\n",dd);
		chsq += dd;
		dd = (bi-eBI)*(bi-eBI)/eBI;
		fprintf(stderr,"dd=%lg\n",dd);
		chsq += dd;

#if 0
		long double LD=FisherExact((Int4)ro,(Int4)bo,(Int4)ri,(Int4)bi,TRUE,stdout);
// FisherExact((Int4)bi,(Int4)ri,(Int4)bo,(Int4)ro,FALSE, stdout);
#elif 0
		fprintf(stdout,"\t%g\t%g\t%g\t%g\tchisq=%g\n",ro,bo,ri,bi,chsq);
#endif
#endif
		long double DD=Exact[1][n];
		if(DD > 0.01) continue;
		if(resIJ[i][j] > 0 || exp >= 0.9){
		   if(exp >= 10.0){
		    fprintf(ofp,"%d\t%c%d\t%c%d\t%d\t%.0f\t%d\t%d\t%d\t%.2f\t%.1f%c\t%.3Lg\n",
			rank,cA,pairA,cB,pairB,resIJ[i][j],exp,(Int4)bo,(Int4)ri,(Int4)bi,
			d,(100.0*(double)sumHits)/(double)totHits,'%',DD); 
		   } else {
		    fprintf(ofp,"%d\t%c%d\t%c%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\t%.1f%c\t%.3Lg\n",
			rank,cA,pairA,cB,pairB,resIJ[i][j],exp,(Int4)bo,(Int4)ri,(Int4)bi,
			d,(100.0*(double)sumHits)/(double)totHits,'%',DD); 
		   }
		}
	}
	fprintf(ofp,"\n........................ Reduced residue pairs ........................\n");
	fprintf(ofp,"rank\tresI\tresJ\tIJ\texpIJ\tI!J\t!IJ\t!I!J\tratio\t%cseqs\t1-tail prob\n",'%');
	for(rank=1, sumHits=0; (n=delminHeap(dHp[2])) != 0; rank++){
		i=n_i[n]; j=n_j[n];
		char cA,cB; cA=AlphaChar(i,AB); cB=AlphaChar(j,AB);
		double pI=(double)resI[i]/(double)NN;
		double pJ=(double)resJ[j]/(double)NN;
		double exp=pI*pJ*(double)NN;
		double d=(double) resIJ[i][j]/exp;
		sumHits+=resIJ[i][j];
		double rr=(double)resI[i];
		double oo=(double)resJ[j];
		double ro=(double)resIJ[i][j];
		double bo=oo-ro;
		double ri=rr-ro;
		double ii=(double)NN - oo;
		double bb=(double)NN-rr;
		double bi=ii-ri;
		long double DD=Exact[2][n];
		if(DD > 0.01) continue;
		if(exp >= 0.9){
		   if(exp >= 10.0){
		    fprintf(ofp,"%d\t%c%d\t%c%d\t%d\t%.0f\t%d\t%d\t%d\t%.2f\t%.1f%c\t%.3Lg\n",
			rank,cA,pairA,cB,pairB,resIJ[i][j],exp,(Int4)bo,(Int4)ri,(Int4)bi,
			d,(100.0*(double)sumHits)/(double)totHits,'%',DD); 
		   } else {
		    fprintf(ofp,"%d\t%c%d\t%c%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\t%.1f%c\t%.3Lg\n",
			rank,cA,pairA,cB,pairB,resIJ[i][j],exp,(Int4)bo,(Int4)ri,(Int4)bi,
			d,(100.0*(double)sumHits)/(double)totHits,'%',DD); 
		   }
		}
	} Nildheap(dH); Nildheap(dHp[1]); Nildheap(dHp[2]);
	free(Exact[1]); free(Exact[2]);
	return rtn;
}


