/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "PurgeMSA_usage.h"
#include "cmsa.h"
#include "swt_typ.h"
#include "cma_gmb.h"

#include "residues.h"
#include "blosum62.h"
#include "set_typ.h"


void	PutFastRepSetCMSA(FILE *fp_err, FILE *fp, Int4 percent_ident,Int4 *Nset,cma_typ cma)
{
#if 0
	BooLean	*list=RtnFastRepSetCMSA(fp_err,percent_ident,Nset,cma);
	PutSelectCMSA(fp,list,cma); free(list); 
#else
	BooLean	keep_first=FALSE;
	if(percent_ident < 0){ keep_first=TRUE; percent_ident=-percent_ident; }
	set_typ InSet=MakeSet(NumSeqsCMSA(cma)+4); FillSet(InSet);
	set_typ	Set=RtnFastRepSetCMSA(fp_err, percent_ident,InSet,cma);
	if(keep_first) AddSet(1,Set);
	PutInSetCMSA(fp,Set,cma); NilSet(Set); NilSet(InSet);
#endif
}

FILE *OpenFileToRead(char *argv1)
// Open an alignment file for reading: "<argv1>.cma" is tried first and, when
// that cannot be opened, open_file(argv1,".mma","r") is used instead.
// Returns the stream obtained; the caller is responsible for closing it.
{
	FILE	*fp=0;
	char	str[500];
	// snprintf() bounds the write; the former sprintf() overran str[] whenever
	// the base name plus ".cma" did not fit.
	int	n=snprintf(str,sizeof(str),"%s.cma",argv1);
	// A truncated name would designate a different file, so do not try it.
	if(n > 0 && (size_t) n < sizeof(str)){ fp=fopen(str,"r"); }
	if(fp == NULL){ fp = open_file(argv1,".mma","r"); }
	return fp;
}


Int4	PutUniqueMergedMinColCMSA(char *filename, char *matstr, double MinCol, a_type AB)
// Three-stage filter applied to the alignment(s) found via OpenFileToRead(filename):
//   1. (-m)      merge all alignments in the file into a single one;
//   2. (-mincol) keep only sequences whose fraction of defined (non-undefined)
//                residues over the block is >= MinCol; a histogram of these
//                fractions is printed to stdout;
//   3. (-U)      flag every later sequence that IdentSeqs() reports as identical
//                to an earlier one, then write the result to <filename><matstr>.
// Returns the number of sequences that passed stage 2 (counted before stage 3);
// returns 0, without writing an output file, when none passed.
{
	Int4	Number,file;
	FILE	*fp=OpenFileToRead(filename);
	cma_typ	cma,*IN_CMA=MultiReadCMSA(fp,&Number,AB); fclose(fp);

	// IN_CMA[] is indexed from 1; make sure element 1 exists before using it.
	if(Number < 1) print_error("PutUniqueMergedMinColCMSA(): no alignment found in input file");

	// -m option
	assert(nBlksCMSA(IN_CMA[1]) == 1);
	if(Number > 1){
	   fp=tmpfile();
	   if(fp == NULL) print_error("PutUniqueMergedMinColCMSA(): unable to create temporary file");
	   PutMergedCMSA(fp,Number,IN_CMA); 
	   for(file=1; file <= Number; file++) TotalNilCMSA(IN_CMA[file]); 
	   free(IN_CMA);
	   // Round-trip through the temporary file to obtain the merged alignment.
	   rewind(fp); cma=ReadCMSA(fp,AB); fclose(fp);
	} else { cma=IN_CMA[1]; free(IN_CMA); }

	// -mincol=0.75 option
	h_type HG=Histogram("fraction aligned",0,1,0.025);
	Int4	i,j,s,J,n,Len,na;
	Int4    N = NumSeqsCMSA(cma);
        BooLean *skip; NEW(skip,N+3,BooLean);
        for(J=1; J <= N; J++){ skip[J]=TRUE; }
	Len=LengthCMSA(1,cma);
        for(n=0,J=1; J <= N; J++){
		// Count the columns at which sequence J has a defined residue.
		for(na=0,s=1 ; s <= Len;s++){
			Int4 r=ResidueCMSA(1,J,s,cma);
			// Use this function's own alphabet (AB); the former 'A' is
			// not declared in this scope.
			if(r != UndefAlpha(AB)) na++; 
		}
		double fr=(double)na/(double)Len;
		if(fr >= MinCol){ skip[J]=FALSE; n++; }
		IncdHist(fr, HG);
	} 
	if(n > 0){
		fp=tmpfile(); 
		if(fp == NULL) print_error("PutUniqueMergedMinColCMSA(): unable to create temporary file");
		PutSelectCMSA(fp,skip,cma); 
		TotalNilCMSA(cma); rewind(fp); cma=ReadCMSA(fp,AB); fclose(fp);
		PutHist(stdout,60,HG); NilHist(HG); fflush(stdout); free(skip);
	} else {
		PutHist(stdout,60,HG); NilHist(HG); fflush(stdout);
		free(skip); TotalNilCMSA(cma);
		return 0;
	}

	// -U option...
	ss_type data = TrueDataCMSA(cma);
	// NOTE(review): the loop below reads skip[] before writing it, so it
	// assumes NEW() returns zero-filled memory -- confirm against the macro.
	N=NSeqsSeqSet(data); NEW(skip,N+3,BooLean); 
        for(i=1;i < N; i++) {
	   if(skip[i]) continue;
	   e_type  qE=SeqSetE(i,data);
	   // Progress indicator (percent done) every 1000 query sequences.
	   if(i % 1000 == 0) fprintf(stderr,"\r%.1f",100.0*((double)i/(double)N));
       	   for(j=i+1;j <= N; j++) {
		if(skip[j]) continue;
		if(IdentSeqs(qE,SeqSetE(j,data))) skip[j]=TRUE;
	   }
	}
	fp = open_file(filename,matstr,"w");
	PutSelectCMSA(fp,skip,cma); free(skip); fclose(fp);
	TotalNilCMSA(cma);
	return n;
}


void	CreateWriteHSW(FILE *fp,cma_typ cma)
// Create the sequence-weight (HSW) data for a single-block alignment and write
// it to fp with FWriteHSW().  Diagnostics (length, number of weighted
// sequences, elapsed write time) are printed to stderr.
{
        assert(nBlksCMSA(cma) ==1); 

	swt_typ swt = swt_typ(cma);
	// The next two results are not used here; the calls are retained in case
	// swt_typ computes its tables on first access -- TODO confirm and remove.
	UInt4   wtFactor=swt.WtFactor();
	double  **WtCnts1 = swt.ObsWtCnts();
	hsw_typ hsw=swt.RtnHSW( );
	fprintf(stderr,"Length = %d; NWtSq = %d\n",hsw->Length,hsw->NWtSq);

	// Sum the integer weights and rescale by the weight factor.
	unsigned char **RtnSqWt;
	UInt4   i,*AveSqIWt=swt.GetIntegerWts(&RtnSqWt),Total;
	for(i=1,Total=0; i <= swt.NumWtSeqs(); i++)  Total += AveSqIWt[i];
	fprintf(stderr,"WtNumSeqs = %.3f\n",(double)Total/(double)swt.WtFactor());

	Int4 time1=time(NULL); 
	FWriteHSW(fp,hsw);
	// time(NULL)-time1 has type time_t, which does not match "%d"; convert
	// to int explicitly before formatting.
	int elapsed=(int)(time(NULL)-time1);
	fprintf(stderr, "\ttime hsw write: %d seconds (%0.2f minutes)\n",
                        elapsed,(float)elapsed/60.0);
}

Int4	InsDelTransCMSA(Int4 *InDelTrans, cma_typ cma)
// Pass every sequence's operation string through IronOutOperation(), rebuild
// an alignment from the resulting strings with MakeCMSA() and write it to
// "junk_test.cma".  A message is printed to stderr whenever a string needed
// at least one fix.
// NOTE(review): as written, InDelTrans is never stored to and the return
// value is always 0 (the hit counter is initialised but never incremented);
// the arrays allocated here and the rebuilt alignment are not released.
{
    char    **Operation=0;
    Int4    *Start;
    Int4    n_hit_seqs=0,pos[4],len=LengthCMSA(1,cma),nSeqs=NumSeqsCMSA(cma);
    gss_typ *gss=gssCMSA(cma);
    a_type  A=AlphabetCMSA(cma);
    e_type  *ListE;

    // Arrays are indexed 1..nSeqs.
    NEWP(Operation,nSeqs+3,char);
    NEW(Start,nSeqs+3,Int4);
    NEW(ListE,nSeqs+3,e_type);

    Int4    sq=1;
    while(sq <= nSeqs){
	char *op_str=gss->Operation(sq);
	Int4 n_fixed=IronOutOperation(op_str);
	if(n_fixed > 0){
	      fprintf(stderr,"\n --> '%s'\n num_fix=%d;\n",NameCMSA(cma),n_fixed);
	}
	Operation[sq]=op_str;
	Start[sq]=TruePosCMSA(sq,1,cma);
	ListE[sq]=TrueSeqCMSA(sq,cma);
	sq++;
    }

    // Rebuild the alignment from the collected sequences, operations and starts.
    cma_typ rebuilt=MakeCMSA(ListE,nSeqs,Operation,Start,cma);
    FILE *out = open_file("junk_test",".cma","w");
    PutCMSA(out,rebuilt);
    fclose(out);
    return n_hit_seqs;
}

Int4	InsDelTransCMSA2(Int4 *InDelTrans, cma_typ cma)
// Scan a single-block alignment for places where a deletion is directly
// followed by an insertion, or an insertion by a deletion, reporting each
// occurrence on stderr.
//   *InDelTrans : receives the total number of such transitions found.
//   returns     : the number of sequences with at least one transition.
// NOTE(review): the function itself warns that IsDeletedCMSA(sq,s,cma) is
// suspect; the two "find start site" loops pass s directly while every other
// call passes pos[1]+s-1, and those loops have no upper bound on s.
{
    if(nBlksCMSA(cma) != 1) print_error("InsDelTransCMSA2() requires a single block");
    Int4	sq,hits,sq_hits,s,pos[4],N=NumSeqsCMSA(cma);

    for(hits=sq_hits=0,sq=1; sq<=N; sq++){
	e_type sE=TrueSeqCMSA(sq,cma);
        BooLean	found=FALSE;
	// pos[1] is filled in by this call, so it must not live inside assert():
	// with NDEBUG defined the call would disappear and pos[] stay unset.
	if(!PosSiteCMSA(1, sq, pos, cma)){
		print_error("InsDelTransCMSA2(): PosSiteCMSA() failed for block 1");
	}

	Int4	del,ins;
// NEED TO TRIM DELETIONS AND INSERTIONS FROM ENDS...
// 1. Find start sites...
	fprintf(stderr,"WARNING: InsDelTransCMSA2() likely has a bug in IsDeletedCMSA(sq,s,cma)!!\n");
	for(s=1; IsDeletedCMSA(sq,s,cma); ) { s++; }
// 2. Find matching regions...
	// first check for deletions followed by insertions...
	for( ; s <= LengthCMSA(1,cma); s++){
		for(del=0; IsDeletedCMSA(sq,pos[1]+s-1,cma); ){
			del++; 
			if(s >= LengthCMSA(1,cma)){ s++; break; }
			if(InsertionCMSA(sq,pos[1]+s-1,cma)){
				// Print the sequence identifier once, on its first hit.
				if(!found){
				   fprintf(stderr,"%4d: ", sq);
				   PutSeqID(stderr,sE);
				   fprintf(stderr,"\n       (");
				} else fprintf(stderr,"       (");
				if(!found){ sq_hits++; found=TRUE; } 
				ins=InsertionCMSA(sq,pos[1]+s-1,cma);
				fprintf(stderr," site=%d; del = %d; ins = %d)\n",
					pos[1]+s-1,del,ins);
				
				hits++;
			} s++;
		}
		// at this point !IsDeletedCMSA(sq,s,cma).
		if(s >= LengthCMSA(1,cma)) break;
	}
	// Next check for insertions followed by deletions...
/***************************************************************************************
Example:
 (implied model:     Mmmmmmmmmmmmiidddmmmmmmmmmmmmmmmdiiidmmmmmmmmmmmmmd )
 Real:      plwlbvrpfIEVIGKENICGApg---IVASNHRSHLDPPVL-dee-GGILKHMRAIPLR-rainsg*
 Fake:               IEVIGKENICGA  xxxIVASNHRSHLDPPVLx   xGGILKHMRAIPLRx
 inserts:  9         000000000002  0000000000000000003   000000000000006
 del_bit:  0         000000000000  1110000000000000001   100000000000001
 f2r[f]:   0        10...
 f:        0         1...

 ***************************************************************************************/
	for(s=1; IsDeletedCMSA(sq,s,cma); ) { s++; }
	for( ; s < LengthCMSA(1,cma); s++){
	   if(!IsDeletedCMSA(sq,pos[1]+s-1,cma)){ 	// Apg---
		ins=InsertionCMSA(sq,pos[1]+s-1,cma);	// ins=2
		if(ins && IsDeletedCMSA(sq,pos[1]+s,cma)){ // next site is deleted.
		        Int4 s0=s; 	// site carrying the insertion, for the report.
		        for(del=0; IsDeletedCMSA(sq,pos[1]+s,cma); ){ 
				del++; s++; if(s >= LengthCMSA(1,cma)) break;
			}
			// Print the sequence identifier once, on its first hit.
			if(!found){
				   fprintf(stderr,"%4d: ", sq);
				   PutSeqID(stderr,sE);
				   fprintf(stderr,"\n       (");
			} else fprintf(stderr,"       (");
			if(!found){ sq_hits++; found=TRUE; } 
			fprintf(stderr," site=%d; ins = %d; del = %d)\n",
				pos[1]+s0-1,ins,del);
			hits++;
	        }
	   }
	   if(s >= LengthCMSA(1,cma)) break;
	}
    }
    *InDelTrans=hits;
    return sq_hits;
}

Int4	Check4NtermExtendCMSA(cma_typ cma)
// Debugging aid: returns how many sequences in cma have TruePosCMSA(sq,1,cma)
// greater than 1, i.e. an N-terminal extension of the kind that has caused
// core dumps elsewhere.
{
    const Int4	nSeqs=NumSeqsCMSA(cma);
    Int4	nExtended=0;
    Int4	sq=1;

    while(sq <= nSeqs){
	const Int4 first_pos=TruePosCMSA(sq,1,cma);
	if(first_pos > 1){ nExtended++; }
	sq++;
    }
    return nExtended;
}

Int4	RmQueryGapsCMSA(double cutoff, cma_typ &cma)
// Remove columns that are deleted in the first sequence.
// Each run of consecutive deleted columns is handled in one "cycle", working
// from the C-terminal end toward the N-terminal end; whenever a replacement
// alignment is produced the old one is freed and cma is rebound to the new
// one.  The first and last columns are never marked for removal.
//   cutoff  : not used by this implementation.
//   returns : the total number of columns that were marked as deleted.
// (Check4NtermExtendCMSA() can be used to debug inputs with N-terminal
//  extensions.)
{
    Int4	s,total=0,cycle=0;
    BooLean	*Delete;

    ExtendFakeToRealCMSA(cma);	// fixes problem with adding insertions!

    // NOTE(review): entries 0, 1 and Length are never assigned below, so the
    // scan assumes NEW() returns zero-filled memory -- confirm against the macro.
    NEW(Delete, LengthCMSA(1,cma)+3, BooLean);
    // Don't remove columns on ends!!
    for(s=2 ; s < LengthCMSA(1,cma); s++){ Delete[s]=IsDeletedCMSA(1,1,s,cma); }

    // Len is fixed up front: cma shrinks as runs are processed, but only at
    // columns at or above the one currently being examined.
    Int4 Len=LengthCMSA(1,cma);
    for(s = Len; s > 0; s--){
	if(Delete[s]){
	   cycle++;
	   Int4 end=s; 
	   // Walk back to the first column of this run; it then spans s..end.
	   while(Delete[s]){ total++; s--; }
	   s++;
	   // routines from cma_gmb.cc
	   cma_typ rcma=0;
	   if(s==Len){ rcma=TrimBlkCMSA(cma,1,0,(end-s+1), 2); }
	   else if(end==1){ rcma=TrimBlkCMSA(cma,1,(end-s+1),0, 2); }
	   else { rcma=ConvertColsToInsertsCMSA(cma,1,s,end); }
	   if(rcma){ NilCMSA(cma); cma=rcma; }
	   fprintf(stderr,"\n********************** cycle %d **************************\n",cycle);
	   // The per-cycle file name that used to be sprintf()'d into a 200-byte
	   // buffer here was never used and could overflow on long alignment
	   // names; it has been removed.
	}
    } free(Delete);
    return total;
}

static void     PrintError(const char *usage,const char *version)
// Print the version text to stderr, then pass the usage text to print_error().
{
	fprintf(stderr,"%s",version);
	print_error(usage);
}


int	main(Int4 argc,char *argv[])
// PurgeMSA driver.
//   argv[1] : base name of the input alignment file.
//   argv[2] : minimum percent of aligned columns per sequence (10..100).
//   argv[3] : percent identity used for purging (10..100).
//   options : -iron            remove "wrinkles" from the purged alignment;
//             -iron=<fraction> as above, then also call RmGappyColumnsCMSA()
//                              with that fraction (0 < fraction <= 1).
// Writes <argv[1]>_Match<mincol>.cma and <argv[1]>_Match<mincol>_U<purge>.cma.
{ 
	Int4    x,time1,arg,purge=-1;
	char	status=0;
	double	fractDeleted=0;
	BooLean	IronOut=FALSE;
	double	MinCol=0;
	UInt4   seed=7061950;
	a_type	A;
	cma_typ	xcma=0,cma=0;
	FILE	*fp;

	time1=time(NULL); 
	TurnOffLicenseStatement();
	if(argc < 4){ PrintError(USAGE_PURGE_MSA,PURGE_MSA_VERSION); }
        if(sscanf(argv[2],"%d",&x) == 1){
	     if(x < 10 || x > 100) print_error(USAGE_PURGE_MSA);
	     else MinCol=(double)x/100.0; 
	} else print_error(USAGE_PURGE_MSA);
        if(sscanf(argv[3],"%d",&x) == 1){
	     if(x < 10 || x > 100) print_error(USAGE_PURGE_MSA); else purge=x; 
	} else print_error(USAGE_PURGE_MSA);
	
	for(arg = 4; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE_PURGE_MSA);
	   switch(argv[arg][1]) {
             case 'i': 
	     {
		   if(sscanf(argv[arg],"-iron=%lf",&fractDeleted) == 1){
                        if(fractDeleted <= 0.0 || fractDeleted > 1.0){
                                print_error(USAGE_PURGE_MSA);
                        } else IronOut=TRUE;
		   // Comparing 6 characters includes the terminating NUL, so
		   // this matches "-iron" exactly.
                   } else if(strncmp(argv[arg],"-iron",6) ==0){
                        IronOut=TRUE;
                   } else { print_error(USAGE_PURGE_MSA); }
	     } break;
	     default: print_error(USAGE_PURGE_MSA);
	   }
	}
	if(seed == 7061950) seed = (UInt4) time(NULL);
	sRandom(seed);
	A = MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);
	cma=0;
	char matstr[50];
	sprintf(matstr,"_Match%.0f.cma",100.0*MinCol);
	// Stage 1: merge, apply the minimum-column filter and drop duplicates.
	Int4 left=PutUniqueMergedMinColCMSA(argv[1],matstr,MinCol,A);
	if(left > 0){
	   // Stage 2: purge the filtered alignment at the requested identity.
	   fprintf(stderr,"opening %s%s\n",argv[1],matstr);
	   fp = open_file(argv[1],matstr,"r");
	   xcma=ReadCMSA(fp,A); fclose(fp); 
	   sprintf(matstr,"_Match%.0f_U%d.cma",100.0*MinCol,purge);
	   fp = open_file(argv[1],matstr,"w");
	   Int4 Nset,Nsq=NumSeqsCMSA(xcma);
	   set_typ InSet=MakeSet(Nsq+4); FillSet(InSet);
	   set_typ Set=RtnFastRepSetCMSA(stderr,purge,InSet,xcma);
	   PutInSetCMSA(fp,Set,xcma); Nset=CardSet(Set); NilSet(Set); NilSet(InSet);
	   fprintf(stdout,"File \"%s\": (%d/%d removed; %d remain).\n", NameCMSA(xcma),Nsq-Nset,Nsq,Nset);
	   fclose(fp); TotalNilCMSA(xcma); xcma=0;

	   if(IronOut){
	      // Stage 3 (optional): re-read the purged file, iron it out and
	      // overwrite it in place.
	      fp = open_file(argv[1],matstr,"r"); xcma=ReadCMSA(fp,A); fclose(fp); 
              cma=RmWrinklesCMSA(xcma);
	      if(fractDeleted > 0){
                Int4 col_deleted=RmGappyColumnsCMSA(fractDeleted,cma);
                fprintf(stderr,"\n %d columns removed",col_deleted);
	      }
	      fp = open_file(argv[1],matstr,"w"); PutCMSA(fp,cma); fclose(fp);
	      if(xcma) NilCMSA(xcma); xcma=0;
	      if(cma) TotalNilCMSA(cma); cma=0;
           }
#if 0
		fprintf(stderr,"opening %s%s\n",argv[1],matstr);
	        fp = open_file(argv[1],matstr,"r");
		xcma=ReadCMSA(fp,A); fclose(fp); 
		sprintf(matstr,"_Match%.0f_U%d.hsw",100.0*MinCol,purge);
		fprintf(stderr,"creating %s%s\n",argv[1],matstr);
	        fp = open_file(argv[1],matstr,"w");
		CreateWriteHSW(fp,xcma); fclose(fp);
		TotalNilCMSA(xcma); xcma=0;
#endif
	}
	if(cma) TotalNilCMSA(cma);
	// if(data != NULL) NilSeqSet(data);
	NilAlpha(A);
	// time(NULL)-time1 has type time_t, which does not match "%d"; convert
	// to int explicitly before formatting.
	int elapsed=(int)(time(NULL)-time1);
	fprintf(stderr, "\ttime: %d seconds (%0.2f minutes)\n",
                        elapsed,(float)elapsed/60.0);
	fprintf(stderr, "\nPurgeMSA was successful.\n");
	return status;
}

