/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "hat_typ.h"

// Construct a hat_typ.  All set-up work is delegated to Init(), which is
// handed the same stream pointer that was passed to the constructor.
hat_typ::hat_typ(FILE *fp)
{
	this->Init(fp);
}

// Initialization hook invoked by the constructor.
// Currently an empty stub: fp is accepted but never read, and no state is set.
void	hat_typ::Init(FILE *fp)
{

}

// Clean-up hook.  Currently an empty stub: the body performs no work.
void	hat_typ::Free( )
{

}

void    hat_typ::hat_srch(FILE *fp)
// Intended to search a set of sequences.
// Not implemented yet: the body is empty and fp is unused.
{
}

// Output routine placeholder: the body is empty, so nothing is written to fp.
void    hat_typ::Put(FILE *fp)
{
}

void	TrimToTemplateCMSA(cma_typ *IN_CMA, Int4 num_cma_files)
// Check each input alignment for truncations at either end relative to the
// template and modify the cma files accordingly.
//
//   IN_CMA[0]                  template alignment.
//   IN_CMA[1..num_cma_files]   input alignments; IN_CMA[s] is compared against
//                              sequence s+1 of the template.
//
// When block 1 of an input alignment is longer than the corresponding template
// sequence, columns are removed from the front and then from the back of block 1
// until the lengths agree, and IN_CMA[s] is replaced by the result of
// MinimizeFirstSeqCMSA() (the previous object is passed to TotalNilCMSA()).
// Problems are reported through print_error().
{
   cma_typ templateCMA = IN_CMA[0];

   for(Int4 file = 1; file <= num_cma_files; file++){
	fprintf(stderr,"================================ %d: %s.\n", file,NameCMSA(IN_CMA[file]));

	e_type templateSeq = TrueSeqCMSA(file+1,templateCMA);
	if(LenSeq(templateSeq) > LengthCMSA(1,IN_CMA[file])){
		print_error("TrimToTemplateCMSA() input error 1");
	}
	// Lengths already agree: this input file needs no change.
	if(LenSeq(templateSeq) == LengthCMSA(1,IN_CMA[file])) continue;

	cma_typ	trimCMA = IN_CMA[file];
	e_type	firstSeq = TrueSeqCMSA(1,trimCMA);
	Int4	offset;
	// IsSubSeq() yields 1 if templateSeq is a subseq of firstSeq.
	char	isSub = IsSubSeq(templateSeq,firstSeq,&offset,FALSE);
	if(isSub != 1) print_error("Template and cma files are incompatible");

	// Remove N-terminal columns (always the first column of block 1).
	for(Int4 left = offset; left > 0; left--){
		if(LengthCMSA(1,trimCMA) <= 3){
			print_error("TrimToTemplateCMSA() input error 2");
		}
		RmColumnMSA(1,1,trimCMA);
	}

	// Remove C-terminal columns (always the current last column of block 1).
	if(LenSeq(templateSeq) < LengthCMSA(1,trimCMA)){
		Int4 surplus = LengthCMSA(1,trimCMA) - LenSeq(templateSeq);
		while(surplus > 0){
			Int4 lastCol = LengthCMSA(1,trimCMA);
			if(lastCol <= 3){
				print_error("TrimToTemplateCMSA() input error 3");
			}
			RmColumnMSA(1, lastCol, trimCMA);
			surplus--;
		}
	}

	IN_CMA[file] = MinimizeFirstSeqCMSA(trimCMA);
	TotalNilCMSA(trimCMA);
   }
}

/********************************************************************** 

{()MVIRPATPELLRDGELDAVLMVKLLL()}*	template root consensus.
{()---NVTgnevlAPA--VVAP--NTLFYKV--()}*	template subgroup consensus.

   {()NVTGNEVLAPAVVAPNTLFYKV()}*	subgroup consensus.

  Need to add deletion ('-') columns at ends to  match template...
 **********************************************************************/
cma_typ	GrowEdgesCMSA(BooLean add2Nterm, Int4 NumGaps, cma_typ cma)
// Grow (rather than trim) block 1 of cma by NumGaps columns, one column per
// InsertColCMSA() call.
//
//   add2Nterm  non-zero: each call is InsertColCMSA(1,FALSE,cma);
//              zero:     each call is InsertColCMSA(1,TRUE,cma).
//              (Presumably FALSE/TRUE select the N- versus C-terminal end of the
//               block -- confirm against InsertColCMSA().)
//   NumGaps    number of columns to add; nothing is added when it is < 1.
//
// Returns cma itself on success, or 0 as soon as one InsertColCMSA() call fails
// (cma is not released in that case).
// NOTE(review): an older, disabled comment here suggested ExtendFakeToRealCMSA(cma)
// should have been applied beforehand -- verify against the callers.
{
    for(Int4 remaining = NumGaps; remaining > 0; remaining--){
	if(add2Nterm){
		if(!InsertColCMSA(1,FALSE,cma)) return 0;
	} else if(!InsertColCMSA(1,TRUE,cma)){
		return 0;
	}
    }
    return cma;
}

BooLean	IsOkayTemplateCMA(cma_typ TemplateCMA)
// Validate a template alignment: no sequence may carry an insertion at either
// terminus.  Sequences 1..NumSeqsCMSA(TemplateCMA) are examined in order; for the
// first offender the insert size and sequence number are printed to stderr, the
// gapped sequence is dumped, and print_error() is called.  Returns TRUE when all
// sequences pass and FALSE if control returns from print_error().
{
	Int4 seqNo = 1;
	while(seqNo <= NumSeqsCMSA(TemplateCMA)){
	   gsq_typ *gapped = gsqCMSA(seqNo,TemplateCMA);

	   // Insertion reported at position 0, i.e. ahead of the first column.
	   Int4 extra = gapped->Insertion(0);
	   if(extra > 0){
		fprintf(stderr,
			"**** Fatal: %d residue insert at N-terminus of template sequence %d: ****\n",
			extra,seqNo);
		gapped->Put(stderr,60,AlphabetCMSA(TemplateCMA));
		print_error("********** Template file formating error *********");
		return FALSE;
	   }

	   // Insertion reported by CheckForInsertsAtEnds(), treated as C-terminal.
	   extra = gapped->CheckForInsertsAtEnds( );
	   if(extra > 0){
		fprintf(stderr,
			"**** Fatal: %d residue insert at C-terminus of template sequence %d: ****\n",
			extra,seqNo);
		gapped->Put(stderr,60,AlphabetCMSA(TemplateCMA));
		print_error("********** Template file formating error *********");
		return FALSE;
	   }
	   seqNo++;
	}
	return TRUE;
}

// Reference only (compiled out): prototypes of CMSA routines declared elsewhere.
// Most of them are called by PutVSIregionsCMSA() in this file; they are listed
// here as a reminder of the argument order.
#if 0
BooLean IsDeletedCMSA(UInt4 n, UInt4 r, cma_typ cma);
BooLean IsDeletedCMSA(UInt4 blk, UInt4 n, UInt4 r,
        cma_typ cma);
unsigned short InsertionCMSA(UInt4 blk,UInt4 n,UInt4 i,
        cma_typ cma);
unsigned short InsertionCMSA(UInt4 n,UInt4 i,cma_typ cma);
Int4    NumColumnsCMSA(cma_typ msa);
Int4    TotalLenCMSA(cma_typ msa);
Int4    FakeToRealCMA(Int4 sq,Int4 s, cma_typ cma);
Int4    ResidueCMSA(register Int4 t, register Int4 n, register Int4 s, cma_typ cma);
#endif

#if 1	// Add routine for outputting regions in vsi format.
void	PutVSIregionsCMSA(FILE *fp,Int4 sq,char *color,Int4 *start,Int4 *end,
		Int4 num_regions,cma_typ cma)
// Write the given alignment regions of sequence sq to fp in vsi format.
//
//   fp           output stream; ALL output of this routine goes here.
//   sq           sequence within cma whose positions are reported.
//   color        color[1..num_regions]: one color character per region.
//   start, end   start[1..num_regions], end[1..num_regions]: first and last
//                alignment column of each region (columns are 1-based; the
//                regions are consumed in array order while scanning columns).
//   num_regions  number of regions.
//   cma          alignment; must consist of exactly one block (asserted).
//
// Output: a header line "1-<r>.W20" where r is FakeToRealCMA() of the last column,
// then for every region "<first>-<last>.<color>" on its own line, followed by two
// newlines.  Column numbers are translated with FakeToRealCMA() (presumably
// alignment column -> residue position in sq; confirm against its definition).
// If a column strictly inside a region carries an insertion, the range is split
// there: "<r>.<color>," closes the current piece and "<r'>-" opens the next one,
// r' being the translated position of the following column.
//
// NOTE(review): a region with start[n] == end[n] only ever matches the start
// branch, so its range is opened but never closed -- confirm that callers do not
// pass single-column regions.
{
	Int4	r,i,c,n;
	ss_type	data=DataCMSA(cma);
	a_type	AB=SeqSetA(data);	// needed only by the disabled DEBUG dump below.

	n=1;				// region currently being written.
	assert(nBlksCMSA(cma) ==1);
	c=TotalLenCMSA(cma);
	r = FakeToRealCMA(sq,c,cma);
	fprintf(fp,"1-%d.W20\n",r);
	for(c=1; c <= TotalLenCMSA(cma); c++){
	   if(n > num_regions) break;	// all regions emitted; n never decreases.
	   r = FakeToRealCMA(sq,c,cma);
	   if(c == start[n]){
		// A deleted start column is reported as the position after r.
		if(IsDeletedCMSA(sq,c,cma)) fprintf(fp,"%d-",r+1);
		else fprintf(fp,"%d-",r);
	   } else if(c == end[n]){
		// Deleted and non-deleted end columns are reported identically.
		fprintf(fp,"%d.%c\n",r,color[n]);
		n++;
	   } else if(c > start[n] && c < end[n]){
		if((i=InsertionCMSA(sq,c,cma))){
		     // Split the range around the insertion.  This must go to fp
		     // (not stdout) so that the region line stays in one piece.
		     fprintf(fp,"%d.%c,",r,color[n]);
		     r = FakeToRealCMA(sq,c+1,cma);
		     fprintf(fp,"%d-",r);
		}
	   } else if(c > end[n]) n++;	// column lies beyond this region: move on.
	} fprintf(fp,"\n\n");
#if 0	// DEBUG.
	for(c=1; c <= TotalLenCMSA(cma); c++){
		if(IsDeletedCMSA(sq,c,cma)){
			printf("%d(%d): -\n",c,r);
		} else {
			Int4 aa=ResidueCMSA(1, sq, c, cma);
			printf("%d(%d): %c\n",c,r,AlphaChar(aa,AB));
		} 
		if((i=InsertionCMSA(sq,c,cma))){
			printf("%d(%d): (%d)\n",c,r,i);
		}
	} fprintf(fp,"\n\n");
#endif
}
#endif

Int4	ParseInputRegionsVSI(char *input_string,Int4 **Start,Int4 **End,
		char **Colors,const char *usage)
// Parse a list of colored regions for vsi output.
//
//   input_string  comma-separated regions of the form <color>:<start>..<end>,
//                 e.g. "R:6..20,O:32..45,Y:55..68." (the trailing '.' is not
//                 enforced).  The number of regions is taken to be the number
//                 of ':' characters in the string.
//   Start, End    receive Int4 arrays allocated here with NEW(); region k is
//                 stored at index k, for k = 1..return value.
//   Colors        receives a char array with the color character of each region,
//                 indexed the same way.
//   usage         message handed to print_error() on any problem.
//
// Returns the number of regions.  print_error(usage) is called when a region does
// not parse, when a ',' separator is missing, when a region ends before it
// starts, or when a region starts before the previous one ends.
{
	Int4	i,*S,*E,NumRegions=0;
	int	start,end;	// plain int: matches the %d conversions whatever Int4 is.
	char	color,*C,*str;

	for(i=0; input_string[i]; i++){
		if(input_string[i] == ':') NumRegions++;
	}
	
	NEW(S,NumRegions+3,Int4); NEW(E,NumRegions+3,Int4);
	NEW(C,NumRegions+3,char);
	str=input_string;
	// All regions but the last must be followed by a ',' separator.
	for(i=1; i < NumRegions; i++){
	  if(sscanf(str,"%c:%d..%d,",&color,&start,&end) != 3){
		print_error(usage);
	  } else { S[i]=start; E[i]=end; C[i]=color; }
	  // Advance to the character after the next ','.  Stop at the terminating
	  // NUL as well, so that a missing separator is reported instead of
	  // scanning past the end of the string.
	  while(str[0] != 0 && str[0] != ','){ str++; }
	  if(str[0] == ',') str++;
	  else print_error(usage);
	}
	// Last region (index i, which is 1 when no ':' was found at all).
	if(sscanf(str,"%c:%d..%d.",&color,&start,&end) != 3){
		print_error(usage);
	} else { S[i]=start; E[i]=end; C[i]=color; }
	// Check start and end: each region well-formed, regions in ascending order.
	for(i=1; i < NumRegions; i++){
		if(E[i] < S[i]) print_error(usage);
		if(S[i+1] < E[i]) print_error(usage);
	} if(E[i] < S[i]) print_error(usage);
	*Start=S; *End=E; *Colors=C;
	return NumRegions;
}

#if 0	// not used...
char	*AddInsertToOperationArray3(Int4 start, Int4 end, char *operation)
// ========== Add an insertion to an operational array. ===========
// Builds and returns a new operation string (allocated with NEW, length of the
// input plus 5) in which profile columns start..end are turned into an insertion:
//   - 'M'/'m' inside start..end  -> 'I'
//   - 'D'/'d' inside start..end  -> dropped from the output
//   - everything else            -> copied unchanged
// The input is read from index 1 up to (not including) the next 'E'; the output
// begins and ends with 'E' and is NUL-terminated.  Any character other than
// M, m, D, d, i, I before that 'E' triggers print_error().
// The input string is not modified.
{
	
	char state,*new_operation=0;
	Int4 j,o,no,column;
	Int4 trace_length=strlen(operation);
	NEW(new_operation,trace_length+5,char);
	new_operation[0]='E'; 

	// o = read index, no = write index, column = current profile column.
	// j and state are updated but never read.
	for(no=1,o=j=1,column=1,state='E'; operation[o] != 'E'; o++){
          switch(operation[o]){
            case 'M': 
            case 'm': 
		if(column >= start && column <=end){
			   new_operation[no]='I'; 
		} else new_operation[no]=operation[o];
		no++; j++; column++; 
		break;
            case 'D': 
            case 'd': // deletion in sequence relative to profile.
		if(column >= start && column <=end){
			   // do nothing in new_operation; 
		} else { new_operation[no]=operation[o]; no++; }
                column++; break;
            case 'i': // insert is between profile blocks;
		new_operation[no]=operation[o]; no++; 
		j++; break;
            case 'I': // Insert ('-') within a profile block; delete from seq.
		new_operation[no]=operation[o]; no++; 
		j++; break;
            default:
            print_error("operation( ): input error"); break;
          }  state=operation[o];
       	}
	new_operation[no]='E'; no++; new_operation[no]=0;
	return new_operation;
}
#endif

cma_typ	*ConversionViaTemplateCMSA(cma_typ TemplateCMA, cma_typ *IN_CMA)
//************************************************************************************
//            NEW ROUTINE TO CONVERT INPUT CMAFILES GIVEN A TEMPLATE.
// any changes to this code should be copied to the corresponding files in hat_typ.cc
//
// For every non-null IN_CMA[II], II = 1..NumSeqsCMSA(TemplateCMA)-1, a copy of the
// input alignment is reshaped so that its block 1 follows the gap pattern of
// sequence II+1 in TemplateCMA:
//   - deleted positions at the template ends  -> columns added with GrowEdgesCMSA();
//   - insertions in the template sequence     -> the matching columns of the copy
//                                                are turned into insertions with
//                                                ConvertColsToInsertsCMSA();
//   - internal deletions in the template      -> columns added to the copy with
//                                                InsertColumnsCMSA().
// Returns a newly allocated array (NEW, num_cma_files+3 slots); slot II holds the
// converted copy.  Slots belonging to null inputs are never assigned here.
// The input alignments are only read and copied (CopyCMSA).
// Inconsistencies between template and input are reported through exit(1),
// print_error() or assert().
//************************************************************************************
{ 
	// The template holds one sequence per input file in addition to its first one.
	Int4	II,num_cma_files=NumSeqsCMSA(TemplateCMA)-1;
	e_type	trueE,fakeE;
	cma_typ	*OUT_CMA,cma,cma0;
	a_type	AB=AlphabetCMSA(TemplateCMA);

   NEW(OUT_CMA,num_cma_files+3,cma_typ);
   for(II=1; II <= num_cma_files; II++){	
	if(IN_CMA[II]==0) continue;	// no alignment supplied for this template row.
	// else fprintf(stderr,"\n%d.%s:\n",II,NameCMSA(IN_CMA[II]));
#if 0	// DEBUG
	{
	  char    str[100]; sprintf(str,"_aln%d.cma",II);
	  FILE *fptr=open_file("debug",str,"w"); PutCMSA(fptr,IN_CMA[II]); fclose(fptr);
	}
#endif
	cma=0;		// CMA[1] is template cmafile.

	//*********** 1. Confirm consistency between template and alignments. ***********
	Int4 len_template=LengthCMSA(1,TemplateCMA);
	Int4 len_cma=LengthCMSA(1,IN_CMA[II]);
	Int4 st,sc,ins,ndel=0;
	// Template row II+1 must be identical to the first sequence of input file II.
	trueE = TrueSeqCMSA(II+1,TemplateCMA); fakeE = FakeSeqCMSA(1,IN_CMA[II]);
	if(!IdentSeqs(trueE,fakeE)){
	    fprintf(stderr,"\n%s:\n",NameCMSA(IN_CMA[II]));
	    PutSeq(stderr,trueE,AB); PutSeq(stderr,fakeE,AB);
	    fprintf(stderr,"Fatal: trueE and fakeE don't match in cma file %d\n",II+1);
	    exit(1);
	}
	assert(IdentSeqs(TrueSeqCMSA(1,IN_CMA[II]),fakeE));	
	assert(LenSeq(trueE) == len_cma);	// sequence 
	// No insertion is allowed after the last template column.
	if(InsertionCMSA(1,II+1,len_template,TemplateCMA) > 0){
		fprintf(stderr,"Fatal: insertion within cma file %d\n",II+1); exit(1);
	}

	//********** 2. If deletions on either end of template,
	//                find C- and N-terminal adjustments. ***********
// PutCMSA(stderr,IN_CMA[II]);	// DEBUG...
	cma=CopyCMSA(IN_CMA[II]);	// all edits below are applied to this copy.
// PutCMSA(stderr,cma);	// DEBUG...
	// Number of deleted template positions at the N- and C-terminal ends.
	Int4 Nt_adj=0,Ct_adj=0;
	// This solves issues with deletions on the ends. 
	// Need to convert GK{()---  ->  {(GK)---- .
	if(IsDeletedCMSA(1,II+1,1,TemplateCMA) 
		|| IsDeletedCMSA(1,II+1,len_template,TemplateCMA)){
        	ExtendFakeToRealCMSA(cma);
		// ^ else need to fix GetEdgeBlocksSite() routine!!!
	}
	if(IsDeletedCMSA(1,II+1,1,TemplateCMA)){	// no deletions allowed on ends.
	   fprintf(stderr,"Warning: first position deleted in template alignment");
           fprintf(stderr," %d (File %d)\n",II+1,II);
	   // Count the run of deleted positions starting at template column 1.
	   for(st=0; IsDeletedCMSA(1,II+1,st+1,TemplateCMA); st++) ; 
	   // ^ move in until reach a residue.
	   if(GrowEdgesCMSA(TRUE,st,cma) == 0){	// cma retained..
		WriteCMSA("debug1.cma", cma);
		print_error("hat_typ GrowEdgesCMSA( ) error");
	   } Nt_adj=st;
	}
	if(IsDeletedCMSA(1,II+1,len_template,TemplateCMA)){
		// The corresponding C-terminal warning is disabled by if(0).
		if(0) fprintf(stderr,
		   "Warning: last position deleted in template alignment %d (File %d)\n",
		    II+1,II);
		// Count the run of deleted positions ending at the last template column.
		for(st=0; IsDeletedCMSA(1,II+1,len_template-st,TemplateCMA); st++) ;
		if(GrowEdgesCMSA(FALSE,st,cma) == 0){
			WriteCMSA("debug2.cma", cma);
			print_error("hat_typ GrowEdgesCMSA( ) error");
		} Ct_adj=st;
	}
	// Re-read the length: the copy may have grown in step 2.
	len_cma=LengthCMSA(1,cma);  // Ct_adj=Nt_adj=0;
// PutCMSA(stderr,cma);	// DEBUG...
	//*********** 3. Iterate through full lengths of both template and input alignments. ***********
	// sc = site in cma;  st = site in template
	// Start from the C-terminal end of both template and cma alignment.
	// The terminal runs handled in step 2 (Nt_adj, Ct_adj) are skipped.
	for(st=len_template-Ct_adj,sc=len_cma-Ct_adj; st > Nt_adj; )
	{
	   assert(sc > Nt_adj);
	   //******************** 3a. Disallow insertions directly following deletions. ***************
	   if(IsDeletedCMSA(1,II+1,st,TemplateCMA) && InsertionCMSA(1,II+1,st,TemplateCMA)){
		fprintf(stderr,"FATAL! -> %d = '%s':",II,NameCMSA(IN_CMA[II]));
		print_error("Input alignment contains a deletion next to an insertion");
	   }
	   ins=InsertionCMSA(1,II+1,st,TemplateCMA);  // Get # insertions at site in template.
	   //******************** 3b. Insertion in template found. ***************
	   if(ins > 0){			// Insertion in template alignment.
		// Disallow insertions next to deletions. (REDUNDANT CHECK)
		assert(!(IsDeletedCMSA(1,II+1,st,TemplateCMA) 
			|| IsDeletedCMSA(1,II+1,st+1,TemplateCMA)));
		sc-=ins; 	// decrement site in cma by # inserted residues...
		// The converted alignment is a new object; the old copy is released
		// with NilCMSA() and cma is re-pointed at the new one.
		cma0=ConvertColsToInsertsCMSA(cma,1,sc+1,sc+ins); NilCMSA(cma); cma=cma0;
// fprintf(stderr,"ConvertColsToInsertsCMSA(cma,%d,%d)\n",sc+1,sc+ins);
// PutCMSA(stderr,cma);	// DEBUG...
		// Convert aligned columns at sc+1 to sc+ins into an insertion.
		st--; sc--; 
	   	assert(sc >= 0);
	   //******************** 3c. One or more deletions in template found. ***************
	   } else if(IsDeletedCMSA(1,II+1,st,TemplateCMA)){
	     // Deletion in template alignment.
	     ndel=0;
	     //******************** 3ci. Count the number of deletions to right of site in input alignment. ***************
	     // Walk st leftwards over the whole run of deleted template positions;
	     // sc is not moved because the input has no columns for them yet.
	     do {
		if(st <= 1) break;	// never step below the first template column.
#if 1		// Disallow insertions next to deletions. (REDUNDANT CHECK)
		if(!(InsertionCMSA(1,II+1,st,TemplateCMA) == 0 && 
				InsertionCMSA(1,II+1,st-1,TemplateCMA) == 0)){
		    // Dump diagnostics about the offending template row, then abort.
		    fprintf(stderr,"II = %d; st = %d\n",II,st);
		    fprintf(stderr,"InsertionCMSA(st) = %d\n",
				InsertionCMSA(1,II+1,st,TemplateCMA));
		    e_type tmpE=TrueSeqCMSA(II+1,TemplateCMA);
		    PutSeqID(stderr,tmpE);
		    fprintf(stderr,"\n");
		    PutSeq(stderr,tmpE,AlphabetCMSA(TemplateCMA));
		    gsq_typ *gsq=gsqCMSA(II+1,TemplateCMA);
		    gsq->Put(stderr,AlphabetCMSA(TemplateCMA));
		    print_error("Insertions next to deletions disallowed in template");
		    // NOTE(review): the lines below only run if print_error() returns;
		    // presumably it terminates the program -- confirm.
		    fprintf(stderr,"InsertionCMSA(st-1) = %d\n",
				InsertionCMSA(1,II+1,st-1,TemplateCMA));
			assert(InsertionCMSA(1,II+1,st,TemplateCMA) == 0 && 
				InsertionCMSA(1,II+1,st-1,TemplateCMA) == 0);
		}
#endif
		ndel++; st--;
	     } while(IsDeletedCMSA(1,II+1,st,TemplateCMA));
	     // fprintf(stderr,"end of loop ...6-%d.c\n",II);

	   //******************** 3cii. Add deletions to right of site in input alignment. ***************
	     if(ndel > 0){	// REDUNDANT IF STATEMENT...
	        cma0=InsertColumnsCMSA(cma,1,sc,ndel); // add '-'s to right of sc.
		if(cma0==0) print_error("InsertColumnsCMSA() error");
		// As in 3b: release the old copy and continue with the new alignment.
		NilCMSA(cma); cma=cma0;
// fprintf(stderr,"InsertColumnsCMSA(%d,%d,cma)\n",sc,ndel);
// PutCMSA(stderr,cma);
	     } 
	   //******************** 3d. Input and template match at this position. ***************
	   } else { st--; sc--; }	// Match in template alignment.
	}
	OUT_CMA[II]=cma;	// converted copy for input file II.
   } return OUT_CMA;
}

