/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include <stdio.h>

#include "sma.h"
#include "cmsa.h"
#include "gibbs.h"
#include "residues.h"

/* Usage text for the msadiff sub-command; run_msadiff() hands it to
 * print_error() whenever its arguments are rejected. */
#define	USAGE_MSADIFF "USAGE: msadiff msafile1 msafile2 [options]\n\
   options:\n\
     -o         - reorder sequences in msa using order in file1\n\
     -g         - put gap lengths\n\
     -p         - put difference \n\
     -m<file>   - merge alignments\n\
\n\n"

/******************************* run_msadiff ********************************
 * Compare, reorder, merge or report on two alignment files.
 *
 *   argv[1], argv[2] - names of the two MSA files (passed to ReadSMA as is)
 *   argv[3..]        - options, each of which must begin with '-':
 *        -g        print the gap lengths of argv[1] only
 *        -m<file>  intersect both alignments, refine the result and write
 *                  it under the fixed name "test_msadiff"
 *        -p        print the differences to stdout
 *        -o        reorder argv[2] following argv[1], output to stdout
 *        (none)    report the differences on stderr (DiffSMA)
 *   If several options are given, the first one in the order
 *   -g, -m, -p, -o decides what is done.
 *
 * Returns 0.  Bad arguments and unreadable alignments are reported through
 * print_error(USAGE_MSADIFF); print_error() is assumed not to return.
 ****************************************************************************/
int	run_msadiff(Int4 argc,char *argv[])
{
	Int4	arg,i,j;
	Int4	time1;
	long	elapsed;
	char	*name=NULL;
	sma_typ	MA[3];
	BooLean	reorder=FALSE,gaps=FALSE,merge=FALSE,putdiff=FALSE;
	cma_typ	ma1,ma2,ma;
	char	*argument[200];
	Int4	*num,narg;
	st_type	S;

	time1=time(NULL);
	if(argc < 3) print_error(USAGE_MSADIFF);
	for(arg = 3; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE_MSADIFF);
	   switch(argv[arg][1]) {
	     case 'm': merge=TRUE; name = argv[arg]+2; break;
	     case 'g': gaps=TRUE; break;
	     case 'p': putdiff=TRUE; break;
	     case 'o': reorder=TRUE; break;
	     default: print_error(USAGE_MSADIFF);
	   }
	}

	if(gaps){
		/* Only the first alignment is reported. */
		MA[1]=ReadSMA(argv[1]);
		if(MA[1]==NULL) print_error(USAGE_MSADIFF);
		fprintf(stdout,"gaps for %s: \n",argv[1]);
		PutGapsSMA(stdout, MA[1]);
		NilSMA(MA[1]);
	} else if(merge){
		MA[1]=ReadSMA(argv[1]);
		MA[2]=ReadSMA(argv[2]);
		if(MA[1]==NULL || MA[2]==NULL) print_error(USAGE_MSADIFF);
		ma1=SMA2CMSA(name,MA[1]);
		ma2=SMA2CMSA(name,MA[2]);
		ma = IntersectionCMSA(ma1, ma2);

		/* NOTE(review): the block lengths gathered here are not consumed
		 * by anything below; the calls are kept as they were and the
		 * array is now released instead of being leaked. */
		S = SitesCMSA(ma);
		NEW(num,nBlksCMSA(ma)+3,Int4);
		for(j=1; j<=nBlksCMSA(ma); j++) num[j] = SiteLen(j,S);

		narg=string2argv(argument,"-t1");
		SimulatedAnnealingGibbs(narg,argument,&ma,'D',150);

		PutAlnCMSA("test_msadiff", ma,NULL);
		free(num);
		NilSMA(MA[1]); NilSMA(MA[2]);
		NilCMSA(ma1); NilCMSA(ma2);
		NilCMSA(ma);
	} else if(putdiff){
		MA[1]=ReadSMA(argv[1]);
		MA[2]=ReadSMA(argv[2]);
		if(MA[1]==NULL || MA[2]==NULL) print_error(USAGE_MSADIFF);
		PutDiffSMA(stdout, MA[1], MA[2]);
		NilSMA(MA[1]); NilSMA(MA[2]);
	} else if(reorder){
		/* The alignment comes from the second file; the first file
		 * name is handed to ReOrderSMA as the ordering source. */
		MA[1]=ReadSMA(argv[2]);
		if(MA[1]==NULL) print_error(USAGE_MSADIFF);
		ReOrderSMA(stdout, argv[1], MA[1]);
		NilSMA(MA[1]);
	} else {
		for(i=1; i<=2; i++){
			MA[i]=ReadSMA(argv[i]);
			if(MA[i]==NULL) print_error(USAGE_MSADIFF);
		}
		DiffSMA(stderr,MA[1], MA[2]);
		NilSMA(MA[1]); NilSMA(MA[2]);
	}

	/* time() yields a time_t, so convert explicitly to match %ld. */
	elapsed=(long)(time(NULL)-time1);
	fprintf(stderr,"\ttime: %ld seconds (%0.2f minutes)\n",
			elapsed,(float)elapsed/60.0);
	return 0;
}

/* Usage text for the msa2rtf sub-command; msa2rtf() hands it to
 * print_error() (and to RealOption()) when its arguments are rejected. */
#define	USAGE_MSA2RTF	"USAGE: msa2rtf msafafile [options]\n\
   msafafile = fafile with aligned sequences\n\
   options:\n\
     -p<real>        - cumulative binomial probability cutoff\n\
     -K<fafile>      - output only those sequences in <fafile>\n\
     -I<real>:<real> - low and high column information cutoff settings\n\
\n\n"

/********************************* msa2rtf **********************************
 * Read the alignment "<argv[1]>.msa" and write it as "<argv[1]>.rtf".
 *
 *   argv[1]  - file name prefix
 *   argv[2..] options, each of which must begin with '-':
 *        -p<real>         cumulative binomial probability cutoff in [0,1]
 *                         (default 0.00001); its log10 is passed to SMA2RTF
 *        -K<fafile>       sequence file loaded with SeqSet() and passed to
 *                         SMA2RTF as the key set; the name must be non-empty
 *        -I<real>:<real>  low:high column information cutoffs (defaults
 *                         1.8:2.5); low must exceed 0.1 and high >= low
 *
 * Returns 0.  Invalid arguments, an over-long prefix or an unreadable
 * alignment are reported through print_error(), which is assumed not
 * to return.
 ****************************************************************************/
static int msa2rtf(Int4 argc,char *argv[])
{
	Int4	arg;
	Int4	time1;
	long	elapsed;
	int	len;
	sma_typ	MA;
	double	cbp_cut=0.00001,infoLO=1.8,infoHI=2.5;
	char	str[1024];
	FILE	*fp;
	ss_type	key_seq=0;
	char	*key_name=0;

	time1=time(NULL);
	if(argc < 2) print_error(USAGE_MSA2RTF);
	for(arg = 2; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE_MSA2RTF);
	   switch(argv[arg][1]) {
	     case 'p':
		cbp_cut=RealOption(argv[arg],'p',0.0,1.0,USAGE_MSA2RTF);
		break;
	     case 'K':
		if(argv[arg][2]==0) print_error(USAGE_MSA2RTF);
		key_name=argv[arg]+2;
		break;
	     case 'I':
		if(sscanf(argv[arg],"-I%lf:%lf",&infoLO,&infoHI) != 2)
			print_error(USAGE_MSA2RTF);
		if(infoLO <= 0.1 || infoHI < infoLO) print_error(USAGE_MSA2RTF);
		break;
	     default: print_error(USAGE_MSA2RTF);
	   }
	}

	/* Build the input name without ever writing past the buffer. */
	len=snprintf(str,sizeof(str),"%s.msa",argv[1]);
	if(len < 0 || (size_t)len >= sizeof(str))
		print_error("fatal!: file name prefix is too long");
	MA=ReadSMA(str);
	if(MA==NULL) print_error(USAGE_MSA2RTF);

	fp=open_file(argv[1],".rtf","w");
	fprintf(stderr,"Purging sequences to compute pattern significance...\n");
	if(key_name) key_seq = SeqSet(key_name,AlphaSMA(MA));
	SMA2RTF(fp, log10(cbp_cut), infoLO,infoHI,key_seq,MA);
	fclose(fp);

	/* The key set exists only when -K was given; MA is always released. */
	if(key_name) NilSeqSet(key_seq);
	NilSMA(MA);

	/* time() yields a time_t, so convert explicitly to match %ld. */
	elapsed=(long)(time(NULL)-time1);
	fprintf(stderr,"\ttime: %ld seconds (%0.2f minutes)\n",
			elapsed,(float)elapsed/60.0);
	return 0;
}

/* Table of 21 per-residue scores; the inline comments name the residue each
 * entry belongs to (X first, then C G A S T N D E Q K, then
 * R H W Y F V I L M P).
 * NOTE(review): going by the name these are presumably BLOSUM62-based
 * insert-state emission scores in scaled bits -- confirm the unit and the
 * scale factor before relying on them.
 * Nothing in the visible part of this file reads the table. */
static long blsm62_ins_emit_bits2K[21] = { 0,   // X
// C    G    A    S    T    N    D    E    Q    K
-500, 399,-149, 359, 117, 275, 233,  43,  45, 210,
// R    H    W    Y    F    V    I    L    M    P
  96, 106,-294,-249,-381,-369,-626,-466,-720, 394};

/* Top-level usage text for tweakmsa, covering the three sub-commands
 * (R, D and O) that main() dispatches on; main() hands it to print_error()
 * and to the option parsers whenever the command line is rejected. */
#define	USAGE_START	"tweak block based (blk_gismo) MSA(s)\n\
   USAGE 1: tweakmsa R <prefix> [options]\n\
      Create a rtf file where <prefix>.msa is an input MSA file\n\
    options:\n\
     -p<real>        - cumulative binomial probability cutoff\n\
     -K<fafile>      - output only those sequences in <fafile>\n\
     -I<real>:<real> - low and high column information cutoff settings\n\
   USAGE 2: tweakmsa D <prefix1> <prefix2> [options]\n\
        finds the difference between two MSAs\n\
    options:\n\
     -o         - reorder sequences in msa using order in file1\n\
     -g         - put gap lengths\n\
     -p         - put difference \n\
     -m<file>   - merge alignments\n\
   USAGE 3: tweakmsa O <prefix> [options]\n\
    options:\n\
     -c            - cluster sequences into related sets (default)\n\
     -e<int>       - eliminate sequence <int> from the alignment\n\
     -p<int>       - purge MA sequences at cutoff <int>\n\
     -S<int>-<int> - Remove sequences from alignment with lengths outside range\n\
     -r<real>      - Remove poor sequences from alignment at E-value = <real>\n\
     -R<array>     - remove blocks from alignment\n\
     -I<int>:<int> - reput sequences with flanking regions\n\
     -T<blk>:<n>   - remove n residues from front of block (negative = end)\n\
     -N<n>         - remove n residues from N-terminal block and sequences\n\
     -C<n>         - remove n residues from C-terminal block and sequences\n\
     -x<int>       - fill in gaps between block in fafile with x's to a gap of <int>\n\
\n\n"

/********************************** main ************************************
 * tweakmsa driver.  argv[1] must be exactly one letter:
 *
 *   R  hand the remaining arguments to msa2rtf()
 *   D  hand the remaining arguments to run_msadiff()
 *   O  edit the alignment "<argv[2]>.msa"; the last mode option on the
 *      command line selects the operation (default -c):
 *        -N<n> / -C<n>  trim sequences and first / last block; writes
 *                       <prefix>.new and <prefix>.new.msa
 *        -T<blk>:<n>    truncate one block; alignment goes to stdout
 *        -e<int>        drop one sequence;          .new and .new.msa
 *        -S<int>-<int>  keep lengths inside range;  .new and .new.msa
 *        -r<real>       FixSMA at the given cutoff; .new and .new.msa
 *        -p<int>        purge at the given cutoff;  .new and .new.msa
 *        -I<int>:<int>  re-put sequences with flanks to stdout
 *        -c             cluster, alignment to stdout
 *        -R<array>      remove blocks; written through PutAlnCMSA under
 *                       the name <prefix>.new
 *        -x<int>        writes <prefix>.stuffed
 *        -z             accepted and ignored
 *
 * For R and D the sub-command letter is dropped, so the callee sees the
 * program name in argv[0] followed by the remaining arguments.
 *
 * Returns 0 (or the sub-command's result).  Every rejected command line
 * goes through print_error(), which is assumed not to return.
 ****************************************************************************/
int	main(Int4 argc,char *argv[])
{
	Int4	arg,i,s,n,N,x;
	Int4	cutoff=-999,blk=0,lenrm=0,mingap=0,left=0,right=0;
	Int4	min=0,max=0,seqid=0;
	Int4	time1;
	Int4	*value;
	long	elapsed;
	int	len;
	char	str[1024],mode='c',*rm=NULL;
	float	Cut=0.0;
	sma_typ	MA,MA2;
	cma_typ	cmsa;
	BooLean	*rm_blk,*good;
	ss_type	data=NULL;
	e_type	*ListE;
	a_type	A;
	FILE	*fp;

	time1=time(NULL);
	if(argc < 2) print_error(USAGE_START);
	/* Test the first character before the second so that an empty
	 * argument is never read past its terminator. */
	if(argv[1][0]==0 || argv[1][1]!=0) print_error(USAGE_START);

	switch(argv[1][0]){
	  case 'R':
		if(argc < 3) print_error(USAGE_START);
		/* Move the program name into slot 1 and pass argv+1: the callee
		 * sees the same vector as before without a fixed-size copy. */
		argv[1]=argv[0];
		return msa2rtf(argc-1,argv+1);
	  case 'D':
		if(argc < 4) print_error(USAGE_START);
		argv[1]=argv[0];
		return run_msadiff(argc-1,argv+1);
	  case 'O':
		break;
	  default:
		print_error(USAGE_START);
	}

	/* ---- sub-command O: parse the options ---- */
	if(argc < 3) print_error(USAGE_START);
	for(arg = 3; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE_START);
	   switch(argv[arg][1]) {
	     case 'C': mode = 'C';
		lenrm=IntOption(argv[arg],'C',1,5000,USAGE_START); break;
	     case 'N': mode = 'N';
		lenrm=IntOption(argv[arg],'N',1,5000,USAGE_START); break;
	     case 'T': mode = 'T';
		if(sscanf(argv[arg],"-T%d:%d",&blk,&lenrm) != 2)
			print_error(USAGE_START);
		break;
	     case 'S': mode = 'S';
		/* Parsed as signed values so that %d matches the
		 * destination and the later length comparison is
		 * signed against signed. */
		if(sscanf(argv[arg],"-S%d-%d",&min,&max) != 2
				|| min < 0 || max < 0)
			print_error(USAGE_START);
		break;
	     case 'I': mode = 'I';
		if(sscanf(argv[arg],"-I%d:%d",&left,&right) != 2)
			print_error(USAGE_START);
		break;
	     case 'e': seqid=IntOption(argv[arg],'e',1,5000,USAGE_START);
		mode = 'e'; break;
	     case 'p': cutoff=IntOption(argv[arg],'p',2,5000,USAGE_START);
		mode = 'p'; break;
	     case 'r': Cut=RealOption(argv[arg],'r',-99999,500000,USAGE_START);
		mode = 'r'; break;
	     case 'c': mode = 'c'; break;
	     case 'R': mode = 'R'; rm = argv[arg]+2; break;
	     case 'x': mingap=IntOption(argv[arg],'x',1,50,USAGE_START);
		mode = 'x'; break;
	     case 'z': break;
	     default: print_error(USAGE_START);
	   }
	}

	/* ---- load "<prefix>.msa" ---- */
	len=snprintf(str,sizeof(str),"%s.msa",argv[2]);
	if(len < 0 || (size_t)len >= sizeof(str))
		print_error("fatal!: file name prefix is too long");
	MA=ReadSMA(str);
	if(MA==NULL) print_error(USAGE_START);
	A=AlphaSMA(MA);

	/* ---- carry out the selected operation ---- */
	switch(mode) {
	  case 'N':
		data = SeqSet(argv[2],AlphaSMA(MA));
		fp = open_file(argv[2],".new","w");
		PutTrimmedSeqSet(fp, lenrm, data);
		fclose(fp);
		fp = open_file(argv[2],".new.msa","w");
		PutTruncateBlkSMA(fp, 1, lenrm, MA);
		fclose(fp);
		break;
	  case 'C':
		/* A negative length is passed for the C-terminal side. */
		data = SeqSet(argv[2],AlphaSMA(MA));
		fp = open_file(argv[2],".new","w");
		PutTrimmedSeqSet(fp, -lenrm, data);
		fclose(fp);
		fp = open_file(argv[2],".new.msa","w");
		PutTruncateBlkSMA(fp, ntypSMA(MA), -lenrm, MA);
		fclose(fp);
		break;
	  case 'T':
		PutTruncateBlkSMA(stdout, blk, lenrm, MA);
		break;
	  case 'e':
		data = SeqSet(argv[2],AlphaSMA(MA));
		N = NSeqsSeqSet(data);
		if(seqid > N) print_error(USAGE_START);
		NEW(good,N+3,BooLean);
		fp = open_file(argv[2],".new","w");
		for(n=0,i=1; i<= N; i++) {
		   if(i == seqid) continue;
		   good[i] = TRUE; n++;
		   PutSeq(fp,SeqSetE(i,data),A);
		}
		fclose(fp);
		if(n < 1) print_error("fatal!: removes all sequences!");
		MA2 = RmSMA(good,n,MA);
		fp = open_file(argv[2],".new.msa","w");
		PutSMA(fp, MA2); fclose(fp);
		NilSMA(MA2); free(good);
		break;
	  case 'S':
		data = SeqSet(argv[2],AlphaSMA(MA));
		N = NSeqsSeqSet(data);
		NEW(good,N+3,BooLean);
		fp = open_file(argv[2],".new","w");
		for(n=0,i=1; i<= N; i++) {
		   x = LenSeq(SeqSetE(i,data));
		   if(x < min || x > max) continue;
		   good[i] = TRUE; n++;
		   PutSeq(fp,SeqSetE(i,data),A);
		}
		fclose(fp);
		if(n < 1) print_error("fatal!: removes all sequences!");
		MA2 = RmSMA(good,n,MA);
		fp = open_file(argv[2],".new.msa","w");
		PutSMA(fp, MA2); fclose(fp);
		NilSMA(MA2); free(good);
		break;
	  case 'r':
		data = SeqSet(argv[2],AlphaSMA(MA));
		N = NSeqsSeqSet(data);
		good = FixSMA(Cut, &MA2, MA);
		fp = open_file(argv[2],".new.msa","w");
		PutSMA(fp, MA2); fclose(fp);
		fp = open_file(argv[2],".new","w");
		for(i=1; i<= N; i++) if(good[i]) PutSeq(fp,SeqSetE(i,data),A);
		fclose(fp);
		NilSMA(MA2); free(good);
		break;
	  case 'p':
		data = SeqSet(argv[2],AlphaSMA(MA));
		fp = open_file(argv[2],".new.msa","w");
		good = PutPurgeSMA(fp,cutoff,MA); fclose(fp);
		N = NSeqsSeqSet(data);
		fp = open_file(argv[2],".new","w");
		for(i=1; i<= N; i++) {
		   if(good[i]) PutSeq(fp,SeqSetE(i,data),AlphaSMA(MA));
		}
		fclose(fp); free(good);
		break;
	  case 'I':
		data = SeqSet(argv[2],AlphaSMA(MA));
		N = NSeqsSeqSet(data);
		NEW(ListE,N+2,e_type);
		for(i=1; i<= N; i++) ListE[i] = SeqSetE(i,data);
		RePutSeqsSMA(stdout, ListE, left, right, MA);
		free(ListE);
		break;
	  case 'c':
		ClusterSMA(MA);
		PutSMA(stdout, MA);
		break;
	  case 'R':
		if(rm==NULL) print_error(USAGE_START);
		NEW(rm_blk,ntypSMA(MA) + 2, BooLean);
		NEW(value,ntypSMA(MA) + 10, Int4);
		n=ParseIntegers(rm, value, USAGE_START);
		for(i=1; i <=n; i++){
		   s=value[i];
		   /* NOTE(review): 0 is let through although blocks look
		    * 1-based elsewhere in this function -- confirm. */
		   if(s < 0 || s > ntypSMA(MA)) print_error(USAGE_START);
		   else rm_blk[s]=TRUE;
		}
		/* str still holds "<prefix>.msa" from the load above. */
		cmsa = SubSMA2CMSA(str, rm_blk,MA);
		len=snprintf(str,sizeof(str),"%s.new",argv[2]);
		if(len < 0 || (size_t)len >= sizeof(str))
			print_error("fatal!: file name prefix is too long");
		PutAlnCMSA(str,cmsa,NULL);
		free(rm_blk); free(value);
		NilCMSA(cmsa);
		break;
	  case 'x':
		cmsa = SMA2CMSA(argv[2],MA);
		fp = open_file(argv[2],".stuffed","w");
		PutStuffedSeqSetCMSA(fp,mingap,cmsa);
		fclose(fp);
		NilCMSA(cmsa);
		break;
	  default : print_error(USAGE_START); break;
	}

	if(data != NULL) NilSeqSet(data);
	NilSMA(MA);
	if(mode != 'T'){
		/* time() yields a time_t, so convert explicitly to match %ld. */
		elapsed=(long)(time(NULL)-time1);
		fprintf(stderr,"\ttime: %ld seconds (%0.2f minutes)\n",
				elapsed,(float)elapsed/60.0);
	}
	return 0;
}

