#include "stdinc.h"
#include "residues.h"
#include "cmsa.h"
#include "cma_gmb.h"

#define	MAXNUM	4000		/* largest number of sequences accepted per alignment */
#define	MAXLEN	500000		/* largest residue count / column count accepted per sequence */

/*
 * The position tables are arrays of heap rows rather than fixed
 * [MAXNUM+1][MAXLEN+1] automatic arrays: two such arrays would need
 * roughly 8 GB of stack each, which is what made the earlier fixed-size
 * version crash on entry.  Rows are allocated only for sequences that are
 * actually present in the input.
 */

/* Free every row of a (MAXNUM+1)-entry row table; unused entries are NULL. */
static void	sp_free_rows(int **rows)
{
	int	i;

	for (i = 0; i <= MAXNUM; i++) { free(rows[i]); rows[i] = NULL; }
}

/*
 * Allocate one zero-filled row.  Index 0 is never written by the readers but
 * is read by RunSP_score2 (as position 0), so the row must start out zeroed;
 * one spare trailing slot keeps the "position + 1" look-ahead in
 * RunSP_score2 inside the row even for malformed input.
 * Returns NULL (after reporting on stderr) when memory is exhausted.
 */
static int	*sp_new_row(void)
{
	/* explicit cast: this file is compiled as C++ */
	int	*row = (int *) calloc((size_t) MAXLEN + 2, sizeof(int));

	if (row == NULL) fprintf(stderr,"Out of memory.\n");
	return row;
}

/*
 * Read the gold standard alignment and then the test alignment.
 *
 * Both streams are scanned character by character; only five characters are
 * significant and everything else (sequence numbers, punctuation, newlines)
 * is skipped:
 *   's'  starts a new sequence and resets its counters,
 *   'm'  advances both the residue counter and the column counter,
 *   'i'  advances only the residue counter,
 *   'd'  advances only the column counter,
 *   'e'  ends the sequence and triggers the consistency checks.
 * (Presumably match / insert / delete operations of an HMM-style alignment.)
 * Operation characters that appear before the first 's' are ignored.
 *
 * Output tables (all 1-based):
 *   seqpos[s][c]  gold: residue position of sequence s in column c for 'm';
 *                 for 'd' it is 0, or, when signed_gaps is nonzero, minus the
 *                 number of residues seen so far.
 *   hmmpos[s][r]  test: column holding residue r of sequence s, 0 for 'i'.
 *   Len[s]        residue count of gold sequence s (set at its 'e').
 *   *Num, *hmmlen, *hmmlen2   sequence count, gold and test column counts.
 *
 * Progress and error messages go to stderr.  Returns 0 on success and 1 on
 * any error; rows allocated so far stay in the tables and must be released
 * by the caller with sp_free_rows() in either case.
 */
static int	sp_load(FILE *testfile, FILE *goldfile, int signed_gaps,
			int **seqpos, int **hmmpos, int *Len,
			int *Num, int *hmmlen, int *hmmlen2)
{
	int	c,num,len,h;

	*Num = *hmmlen = *hmmlen2 = 0;

/*	Read in gold standard alignment	*/

	num = len = h = 0;
	while ((c = fgetc(goldfile)) != EOF) {
		if (c == 's') {
			if (++num > MAXNUM) {
				fprintf(stderr,"Too many gold standard sequences.\n");
				return 1;
			}
			if ((seqpos[num] = sp_new_row()) == NULL) return 1;
			len = h = 0;
			continue;
		}
		if (num == 0) continue;		/* nothing to attach this character to yet */
		if (c == 'm' || c == 'i') {
			if (++len > MAXLEN) {
				fprintf(stderr,"Gold standard sequence %d is too long.\n",num);
				return 1;
			}
		}
		if (c == 'm' || c == 'd') {
			if (++h > MAXLEN) {
				fprintf(stderr,"Gold standard sequence %d is too long.\n",num);
				return 1;
			}
			if (c == 'm') seqpos[num][h] = len;
			else seqpos[num][h] = signed_gaps ? -len : 0;
		}
		if (c == 'e') {
			if (num == 1) *hmmlen = h;
			else if (h != *hmmlen) {
				fprintf(stderr,"Gold standard sequences have unequal numbers of match states.\n");
				return 1;
			}
			Len[num] = len;
		}
	}
	*Num = num;
	fprintf(stderr,"Number of sequences = %4d\n",*Num);
	fprintf(stderr,"Gold HMM length =     %4d\n",*hmmlen);

/*	Read in test alignment	*/

	num = len = h = 0;
	while ((c = fgetc(testfile)) != EOF) {
		if (c == 's') {
			if (++num > *Num) {
				fprintf(stderr,"More test than gold standard sequences.\n");
				return 1;
			}
			if ((hmmpos[num] = sp_new_row()) == NULL) return 1;
			len = h = 0;
			continue;
		}
		if (num == 0) continue;
		/* the column counter is advanced first so that an 'm' records its own column */
		if (c == 'm' || c == 'd') {
			if (++h > MAXLEN) {
				fprintf(stderr,"Test sequence %d is too long.\n",num);
				return 1;
			}
		}
		if (c == 'm' || c == 'i') {
			if (++len > MAXLEN) {
				fprintf(stderr,"Test sequence %d is too long.\n",num);
				return 1;
			}
			hmmpos[num][len] = (c == 'm') ? h : 0;
		}
		if (c == 'e') {
			if (num == 1) *hmmlen2 = h;
			else if (h != *hmmlen2) {
				fprintf(stderr,"Test sequences have unequal numbers of match states.\n");
				return 1;
			}
			if (len != Len[num]) {
				fprintf(stderr,"Gold standard and test sequences %d have unequal lengths: %d and %d\n",
					num,Len[num],len);
				return 1;
			}
		}
	}
	if (num != *Num) {
		fprintf(stderr,"Gold standard and test sets have unequal numbers of sequences: %d and %d\n",
			*Num,num);
		return 1;
	}
	fprintf(stderr,"Test HMM length =     %4d\n",*hmmlen2);
	return 0;
}

/*
 * Variant of RunSP_score that also scores residue-versus-gap pairs.
 *
 * Gold deletions are stored as minus the number of residues preceding the
 * gap, so a pair made of one aligned residue and one gap in a gold column can
 * be checked against the test alignment as well.  The number of pairs tested
 * is the number of all sequence pairs over all gold columns minus the pairs
 * in which both members are gaps.
 *
 * testfile, goldfile: streams positioned at the start of the operation
 * listings (see sp_load for the format).  The results are written to stderr.
 * When no pair can be tested the ratio is printed as 0 instead of dividing
 * by zero.  Returns 0 on success, 1 on malformed input or memory exhaustion.
 */
int	RunSP_score2(FILE *testfile,FILE *goldfile)
{
	int	*hmmpos[MAXNUM+1] = {0};
	int	*seqpos[MAXNUM+1] = {0};
	int	Len[MAXNUM+1] = {0};
	int	i,j,k,h1,h2,temp,pos1,pos2,Num,hmmlen,hmmlen2,rc;
	long long	score,tests,Tests;	/* pair counts exceed the range of int for large alignments */

	rc = sp_load(testfile,goldfile,1,seqpos,hmmpos,Len,&Num,&hmmlen,&hmmlen2);
	if (rc == 0) {

/*	Calculate SP-Score	*/

		tests = Tests = (long long) hmmlen*Num*(Num-1)/2;
		for (i=1;i<=hmmlen;++i) {
			temp=0;
			for (j=1;j<=Num;++j) if (seqpos[j][i]<=0) ++temp;
			tests -= (long long) temp*(temp-1)/2;	/* gap/gap pairs are not tested */
		}
		score=0;
		for (i=1;i<=hmmlen;++i) for (j=1;j<Num;++j) {
			/* for a gap, look at the test column of the residue preceding it */
			h1 = ((pos1=seqpos[j][i])>0) ? hmmpos[j][pos1]:hmmpos[j][-pos1];
			for (k=j+1;k<=Num;++k) {
				h2= ((pos2=seqpos[k][i])>0) ? hmmpos[k][pos2]:hmmpos[k][-pos2];
				if (pos1>0 && pos2>0) {
					/* residue/residue: both must share one test column */
					if (h1>0 && h1==h2) ++score;
				}
				else if (pos1>0) {
					/* residue/gap: the test column of the residue must lie between
					   the columns of the residues flanking the gap in sequence k */
					if (h1>0 && (pos2==0 || (h2>0 && h2<h1)) && (-pos2 == Len[k]
								|| hmmpos[k][-pos2+1]>h1)) ++score;
				}
				else if (pos2>0) {
					/* gap/residue: mirror image of the case above */
					if (h2>0 && (pos1==0 || (h1>0 && h1<h2)) && (-pos1 == Len[j]
								|| hmmpos[j][-pos1+1]>h2)) ++score;
				}
			}
		}
		fprintf(stderr,"Tests = %7lld /%7lld\n",tests,Tests);
		fprintf(stderr,"SP-score (%d) = %7lld /%7lld (%.3f)\n",hmmlen2,score,tests,
			(tests > 0) ? (float) score/(float) tests : 0.0f);
	}
	sp_free_rows(seqpos); sp_free_rows(hmmpos);
	return rc;
}

/*
 * Sum-of-pairs score of a test alignment against a gold standard alignment.
 *
 * For every gold column, each pair of sequences that both have a residue in
 * that column is one test; the pair scores when the test alignment places
 * those two residues in one and the same (nonzero) column.
 *
 * testfile, goldfile: streams positioned at the start of the operation
 * listings (see sp_load for the format).  The results are written to stderr.
 * When no pair can be tested the ratio is printed as 0 instead of dividing
 * by zero.  Returns 0 on success, 1 on malformed input or memory exhaustion.
 */
int	RunSP_score(FILE *testfile,FILE *goldfile)
{
	int	*hmmpos[MAXNUM+1] = {0};
	int	*seqpos[MAXNUM+1] = {0};
	int	Len[MAXNUM+1] = {0};
	int	i,j,k,h,temp,pos1,pos2,Num,hmmlen,hmmlen2,rc;
	long long	score,tests;	/* pair counts exceed the range of int for large alignments */

	rc = sp_load(testfile,goldfile,0,seqpos,hmmpos,Len,&Num,&hmmlen,&hmmlen2);
	if (rc == 0) {

/*	Calculate SP-Score	*/

		score=tests=0;
		for (i=1;i<=hmmlen;++i) {
			temp=0;		/* sequences with a residue in gold column i */
			for (j=1;j<=Num;++j) {
				if ((pos1=seqpos[j][i]) == 0) continue;
				++temp;
				if ((h=hmmpos[j][pos1]) == 0) continue;	/* unaligned in the test: cannot score */
				for (k=j+1;k<=Num;++k)
					if ((pos2=seqpos[k][i]) && (h==hmmpos[k][pos2])) ++score;
			}
			tests += ((long long) temp*temp-temp)/2;
		}
		fprintf(stderr,"Tests =   %7lld /%7lld\n",tests,(long long) hmmlen*Num*(Num-1)/2);
		fprintf(stderr,"SP-score (%d) = %7lld /%7lld (%.5f)\n",hmmlen2,score,tests,
			(tests > 0) ? (float) score/(float) tests : 0.0f);
	}
	sp_free_rows(seqpos); sp_free_rows(hmmpos);
	return rc;
}

#define USAGE_START     "USAGE: SP_score test_paths gold_paths\n\
   options:\n\
     -x         - dummy\n\n"

/*
 * SP_score <test alignment> <gold alignment>
 *
 * Reads the two alignments named on the command line with ReadCMSA2() and,
 * for each, writes one line per sequence ("<index>: <operations>") to an
 * anonymous temporary file.  The operation string returned by
 * gsq->Operation() is lower-cased and its first character is replaced by
 * 's', which is the character the scoring routines use to detect the start
 * of a sequence.  Both temporary files are then rewound and handed to
 * RunSP_score(), whose return value becomes the exit status.
 */
int main(int argc, char *argv[])
{
     Int4	i;
     FILE 	*fp[3];		// fp[1] = test alignment, fp[2] = gold standard; fp[0] is unused

     if(argc != 3) print_error(USAGE_START);
#if 1
     a_type AB=MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);
     for(i=1; i <=2; i++){
	cma_typ cma=ReadCMSA2(argv[i],AB);
#if 0
	if(LengthCMSA(1,cma) >= 998){
		double	fractDeleted=0.80;
		Int4 col_deleted=RmGappyColumnsCMSA(fractDeleted, cma);
		fprintf(stderr,"\n %d columns removed",col_deleted);
	}
#endif
#if 1
	fp[i]=tmpfile();
	if(fp[i] == NULL){	// tmpfile() returns NULL when no temporary file can be created
		fprintf(stderr,"SP_score: unable to create a temporary file for %s\n",argv[i]);
		if(i == 2) fclose(fp[1]);
		TotalNilCMSA(cma); NilAlpha(AB);
		return 1;
	}
#else
	fp[i]=open_file(argv[i],".tmp","w");
#endif
	// NOTE(review): gss and len are not used below; the calls are kept in case
	// gssCMSA()/LengthCMSA() have side effects -- confirm and remove.
        gss_typ *gss=gssCMSA(cma);
        Int4    NN=NumSeqsCMSA(cma),len=LengthCMSA(1,cma);
        for(Int4 sq=1; sq <= NN; sq++){
              gsq_typ *gsq=gsqCMSA(sq,cma);
              Int4 *Sites=GetPosSitesCMSA(sq,cma);
              char c,*operation=gsq->Operation(nBlksCMSA(cma),Sites,LengthsCMSA(cma));
	      free(Sites);
	      // Mark the start of the sequence with 's' and lower-case the rest;
	      // tolower() requires a value representable as unsigned char.
              for(Int4 s=0; (c=operation[s]) != 0; s++){
                if(s==0) operation[s]='s'; else operation[s] = tolower((unsigned char) c);
              }
#if 0
              Int4 Nt=gsq->OverHangN( );
              Int4 Ct=gsq->OverHangC( );
              // fprintf(stdout,"%d(%d): [%d]%s[%d]\n",sq,strlen(operation),Nt,operation,Ct);
              // fprintf(stderr,"%d: %s\n",sq,operation);
#endif
              fprintf(fp[i],"%d: %s\n",sq,operation);
	      free(operation);
        } TotalNilCMSA(cma); 
#if 1
	rewind(fp[i]);		// the scorer reads the file from the beginning
#else
	fclose(fp[i]); fp[i]=open_file(argv[i],".tmp","r");
#endif
     } NilAlpha(AB);
#else
// char c=0; while((c=fgetc(fp[1])) != EOF)  fprintf(stderr,"%c",c); rewind(fp[1]);
fp[1]=open_file(argv[1],".tmp","r");
fp[2]=open_file(argv[2],".tmp","r");
#endif
     i=RunSP_score(fp[1],fp[2]); fclose(fp[1]); fclose(fp[2]);
     // i=RunSP_score2(fp[1],fp[2]); fclose(fp[1]); fclose(fp[2]);
     return i;
}

