/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "edc_typ.h"

Int4    dcm_typ::pvcalcD(Int4 L,Int4 D,Int4 *pos,long double &pval,BooLean jeff,BooLean cflag)
/*	Subroutine to calculate P-values for			*/
/*	initial match clusters in sequences			*/
/*								*/
/*	Program by Stephen F. Altschul				*/
/*	Version 1.11.3;  January 26, 2017			*/
/* 	int	jeff;		=  Flat: 0;  Jeffreys': 1	*/
/* 	int	cflag;          =   p-value correction		*/
// This version uses long double...(afn modification) 
{
 	Int4	M,m,n,x,y,minm,maxm,bestm;
 	long double	sum,off,score,bestsc=-1,pv;
 	static	long double	*lni,*lrat,*offset;
 	static	Int4	oldL=0,oldD=0;

/*	Check that input makes sense				*/
 	if (L<2 || D<1 || D>=L) return(-1);
 	pos[0]=0;
 	for (m=1;m<=D;++m) if (pos[m]<=pos[m-1] || pos[m]>L) return(-1);

/*	Initialize arrays					*/

 	if (L>oldL) {
 		if (oldL) { free(lni); free(lrat); free(offset); }
 		lni= (long double *) calloc(L+3,sizeof(long double));
 		lrat= (long double *) calloc(L+3,sizeof(long double));
 		offset= (long double *) calloc(L+3,sizeof(long double));
 		lni[0]=lrat[0]=sum=0;
		// Purify is reporting an uninitialized memory read for library function logl() 
 		for (x=1;x<L;++x) { lni[x]=x*logl(x); sum+=logl(x); lrat[x]=lni[x]-sum; }
 		lni[L]=L*logl(L);
 	}
 	if (L!=oldL || D!= oldD) for (x=1;x<L;++x) offset[x]=0;
 	oldL=L; oldD=D;

/*	Find best front-weighted cut				*/

 	bestm=0;
 	for (M=1;M<=D;++M) if (M*L > D*(x=pos[M])) {
 		y=L-x;
 		if (offset[x]==0) {
 			sum=0; off=lrat[x]+lrat[y];
 			minm= (y>=D) ? 0 : D-y;
 			maxm= (x<=D) ? x : D;
 			for (m=minm;m<=maxm;++m) {
 				n=D-m; sum+=expl(lrat[m]+lrat[x-m]+lrat[n]+lrat[y-n]-off);
 			}
 			offset[x]=lni[x]+lni[y]+logl(sum);
 			if (jeff==0) offset[x]+=logl(1.0/(x*x)+1.0/(y*y))/2;
 			offset[y]=offset[x];
 		} n=D-M; score=lni[M]+lni[x-M]+lni[n]+lni[y-n]-offset[x];
 		if (bestm==0 || score > bestsc) { bestm=M; bestsc=score; }
 	}

/*	Return best number of matches and p-value		*/
 	pval=1.0;
 	if(bestm) {
 		bestsc-=lni[D]+lni[L-D]-lni[L];
 		// pv = expl(-bestsc)*sqrtl(D/3.14159)/2;
 		pv = expl(-bestsc)*sqrtl(D/3.14159)/4;
 		pv*= (jeff) ? logl(1.0242*L) : L-1;
 		if (cflag) pv*=(L-D+1-2.0/(D+1))/(L-1);
 		pval = pv/(1+pv); 
 	} return(bestm);
}

long double  dcm_typ::LnFact(Int4 n)
/* static variables are guaranteed to be initialized to zero */
{
        static long double lnft[501];

        if (n <= 1) return 0.0;
// fprintf(stderr,"n=%d\n",n);
        if (n <= 500) return lnft[n] ? lnft[n] : (lnft[n]=lgammal((long double)n+1.0));
        else return lgammal((long double) n+1.0);
}

long double  dcm_typ::OneTailCumHyperGeomProb(Int4 Nr,Int4 Nb, Int4 rd)
#if 0   //****************************************************
  N total balls with Nr red balls and Nb = N-Nr black balls.
  Choose Nr balls at random.  The probability that the
  group so chosen will contain rd or more red balls is returned.
#endif  //****************************************************
{
        long double  p,K;

        if(Nb == 0) return 1.0;
        K = (LnFact(Nr)+LnFact(Nb)-LnFact(Nb+Nr)+LnFact(Nr)+LnFact(Nb));
        for(p=0.0; rd <= Nr; rd++){
           p += expl(K-LnFact(rd)-LnFact(Nr-rd)-LnFact(Nr-rd)-LnFact(Nb-Nr+rd));
        } return p;
}

long double  dcm_typ::CumHyperGeomProb(Int4 N1,Int4 N2, Int4 n,Int4 x)
#if 0   //****************************************************
  N total balls with N1 red balls and N2 = N-N1 black balls.
  Choose n balls at random.  The probability that the
  group so chosen will contain x or more red balls is
  given by: p=CumHyperGeomProb(N1,N2,n,x).
#endif  //****************************************************
{
        Int4    end;
        long double  p,K;

        if(x == 0) return 1.0;
        end = MINIMUM(Int4,N1,n);
        K = (LnFact(N1)+LnFact(N2)-LnFact(N2+N1)+LnFact(n)+LnFact(N2+N1-n));
        for(p=0.0; x <= end; x++){
           p += expl(K-LnFact(x)-LnFact(N1-x)-LnFact(n-x)-LnFact(N2-n+x));
        } return p;
}

Int4	dcm_typ::pvcalcICA_BIU(Int4 L,Int4 D,Int4 *pos,long double *binp,Int4 jeff,Int4 cflag,
		long double &pval)
#if 0 /*********************************************************************
/*      Subroutine to calculate P-values for                    */
/*      initial match clusters in sequences                     */
/*      using ball & urn information                            */
/*                                                              */
/*      Program by Stephen F. Altschul                          */
/*      Version 1.11.3;  August 2, 2017                         */
        Int4     L;
        Int4     D;
        Int4     *pos;
        long double  *binp;          /*   Ball & urn p-values        */
        Int4     jeff;           /*   Flat: 0;  Jeffreys': 1     */
        Int4     cflag;          /*   p-value correction         */
        long double  *pval;
#endif
{
        Int4    M,m,n,x,y,minm,maxm,bestm;
        long double  sum,off,score,bestsc=-1,pv;
        long double  Offset,temp;
#if 0	// moved to dcm_typ
        static  long double  *lni,*lrat,*offset;
        static  Int4     oldL=0;
        static  Int4     oldD=0;
#endif

#if 1	// check for possible math overflow...
	if(L > (Int4) INT4_MAX){
		print_error("FATAL: Input array length > INT4_MAX");
	}
#endif

/*      Check that input makes sense                            */
        if(L<2 || D<1 || D>=L) return(-1);
        // for(pos[0]=0,m=1;m<=D;++m) fprintf(stderr,"%d: pos[m]=%d\n",m,pos[m]);
        for(pos[0]=0,m=1;m<=D;++m) if(pos[m]<=pos[m-1] || pos[m]>L) return(-1);

/*      Initialize arrays                                       */
        if (L>oldL) {
                if (oldL) { free(lni); free(lrat); free(offset); }
                lni= (long double *) calloc(L+1,sizeof(long double));
                lrat= (long double *) calloc(L,sizeof(long double));
                offset= (long double *) calloc(L,sizeof(long double));
                lni[0]=lrat[0]=sum=0;
                for (x=1;x<L;++x) {
                        lni[x]=x*logl(x);
                        sum+=logl(x);
                        lrat[x]=lni[x]-sum;
                } lni[L]=L*logl(L);
        }
        if (L!=oldL || D!= oldD) for (x=1;x<L;++x) offset[x]=0;
        oldL=L; oldD=D;

/*      Find best front-weighted cut                            */
        bestm=0;
        Offset=lni[D]+lni[L-D]-lni[L];
        for(M=1;M<=D;++M){
	    if (M*L > D*(x=pos[M])) {
                y=L-x;
                if(offset[x]==0) {
                        sum=0;
                        off=lrat[x]+lrat[y];
                        minm= (y>=D) ? 0 : D-y;
                        maxm= (x<=D) ? x : D;
                        for(m=minm;m<=maxm;++m) {
                                n=D-m;
                                sum+=expl(lrat[m]+lrat[x-m]+lrat[n]+lrat[y-n]-off);
                        } offset[y]=offset[x]=lni[x]+lni[y]+logl(sum);
                } n=D-M;
                score=lni[M]+lni[x-M]+lni[n]+lni[y-n]-offset[x]-Offset;
		// ^Offset needed to get p-value.
                temp=expl(-score);       // Log-odds -> odds ratio       // expm1l() ?
                temp/=1+temp;           // Odds ratio -> p-value        
                temp*=binp[M];          // Add ball & urn information   
                temp*=1-logl(temp);      // Two-factor correction: Unified P-value.
                score = -logl(temp);      // p-value -> -log p-value.
#if 0
                if(jeff==0) score-=logl(1.0/(x*x)+1.0/(y*y))/2;	
		// ^ Fisher information correction.
#else
		long double XX,YY;
                if(jeff==0){
		  XX=(long double)x*(long double)x;
		  YY=(long double)y*(long double)y;
                  score-=logl(1.0/XX+1.0/YY)/2;	// Fisher information correction.
		}
#endif
                if(bestm==0 || score > bestsc) { bestm=M; bestsc=score; }
#if 0
		if(XX < 0 || YY < 0) 
		{
		   fprintf(stderr,
			"M=%d; L=%d; M*L=%d > %d = D*pos[M]; D=%d; pos[M]=%d\n",
				M,L,M*L,D*pos[M],D,pos[M]);
		   fprintf(stderr,
			"M=%d; n=%d; lni[M]=%Lg; lni[x-M]=%Lg; lni[n]=%Lg; lni[y-n]=%Lg;\n",
			M,n,lni[M],lni[x-M],lni[n],lni[y-n]);
		   fprintf(stderr,
			" offset[x]=%Lg; score=%Lg.\n",offset[x],score);
		   fprintf(stderr,"score2=%Lg; x*x=%d; y*y=%d.\n",score,x*x,y*y);
		   fprintf(stderr,
			"bestm=%d; score=%Lg; bestsc=%Lg; x=%d; Offset=%Lg.\n",
			bestm,score,bestsc,x,Offset);
		}
#endif
	    }
        }

/*      Return best number of matches and p-value               */
        pval=1.0;
        if(bestm){
                pv = expl(-bestsc)*sqrtl(D/3.14159)/4; 
                pv *= (jeff) ? logl(1.024*L) : L-1;
                if (cflag) pv*=(L-D+1-2.0/(D+1))/(L-1);
		pval = -expm1l(-pv);	// convert back from E-value to P-value.
	/**********************************************************************************
		expm1l computes the e (Euler's number, 2.7182818) raised to the given
		power arg, minus 1.0. This function is more accurate than the expression
		exp(arg)-1.0 if arg is close to zero.
	**********************************************************************************/
        }
#if 0
	fprintf(stderr,"bestm=%d; pval=%Lg; pv = %Lg; bestsc=%Lg\n",
		bestm,pval,pv,bestsc);
#endif
	return(bestm);
}

Int4	dcm_typ::ScorePermutation(Int4 *P, Int4 N)
{
	Int4	score,i,j,k,n;
	for(score=0,i=1; i<=N; i++){ score += P[i]*(N-i+1); }
	// for(score=0,i=1; i<=N; i++){ score += P[i]*pow(2,N-i); }
	return score;
}

double	dcm_typ::PermutationPvalue(FILE *fp,Int4 N, Int4 *input)
{
	Int4	i,j,k,x,n,*permute;
	Int4	minscore,maxscore,diff;
	double	d;
	NEW(permute,N+9,Int4);
	for(j=1; j <=N; j++) permute[j]=j;
	minscore = ScorePermutation(permute,N);
	for(j=1; j <=N; j++) permute[j]=N-j+1;
	maxscore = ScorePermutation(permute,N);
	diff=maxscore-minscore; d=(double)diff/100.0;
	if(efptr) fprintf(fp,"range of possible scores = %d to %d\n",minscore,maxscore);
	for(j=1; j <=N; j++) permute[j]=0;
	dh_type	dH=dheap(N+2,3);
	for(j=1; j <= N; j++){ insrtHeap(j,(keytyp) input[j],dH); }
	for(i=1;!emptyHeap(dH); i++){
		j=delminHeap(dH); permute[j] = i; 
		// fprintf(stderr,"%d. j=%d \n",i,j);
	}
	Nildheap(dH);
	if(efptr) for(j=1; j <= N; j++) fprintf(fp,"%d,",permute[j]); fprintf(fp,"\n");
	h_type	HG=Histogram("permutation scores",minscore,maxscore,d);
	Int4	nLower=0,M=1000000,s,S,*Permute;
	if(PermuteMillions > 1) M=M*PermuteMillions;
	NEW(Permute,N+9,Int4);
	s=ScorePermutation(permute,N);
	dH=dheap(N+5,3);
	for(n=1; n <= M; n++){
	   for(j=1; j <= N; j++){ insrtHeap(j,(keytyp) Random(),dH); }
	   for(i=1;!emptyHeap(dH); i++){ Permute[i]=delminHeap(dH); }
	   S = ScorePermutation(Permute,N);
	   IncdHist(S,HG);
	   if(S <= s) nLower++;
	}
	double	Z,stdev,norm_p,mean=MeanHist(HG);
	stdev=sqrt(VarianceHist(HG));
	Z = (mean-s)/stdev; norm_p=erfc(Z/sqrt(2))/2;
	if(efptr) PutHist(fp,60,HG); 
	NilHist(HG);
	d=(double)nLower/(double)M;
	// fprintf(fp,"Permute: = %d (p=%.3g; norm_p(Z=%.3f)=%.3g)\n",s,d,Z,norm_p);
	// fprintf(fp,"Permute:\t%d\t%d\t%.3g;\t%.3f\t%.3f\n",N,s,Z,d,norm_p);
	fprintf(fp,"Permute: %d\t%.3f\t%.3g\t%.3g\n",N,Z,d,norm_p);
	Nildheap(dH); free(permute); free(Permute);
	return d;
}

