/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#if 0
#include "rpm_typ.h"
#else
#include "sma.h"
#include "cmsa.h"
#include "residues.h"
#include "gpsi_typ.h"
#include "editcma.h"
#include "table.h"
#include "sset.h"
#include "wdigraph.h"
#include "selexCMSA.h"
#include "dheap.h"
#include "swaln.h"
#include "tax_typ.h"
#include "set_typ.h"
#include "hat_typ.h"
#include "hpt_typ.h"
#endif

/* Usage text handed to print_error() whenever the command line is malformed.
 * Keep it in sync with the option parser in RunMatchCMA(). */
#define	USAGE_START	"\
    USAGE 1: matchcma multicma_file pattern [options]\n\
      example pattern: L31,D32,LIV74,G75,R81,W84,FY96,R105,E108,AS109\n\
      (a lowercase residue set, e.g. liv74, matches any residue NOT in the set)\n\
    USAGE 2: matchcma multicma_file <int> [options]\n\
              (show residues occurring in column <int>)\n\
    Note: All MSAs must have the same number of columns.\n\
   options:\n\
     -C         Count simulated sequences in each of the input cma files (output from -S).\n\
     -H      	Output Files for viewing a heat map of matching sequences\n\
     -M=<int>	Output a File of sequences with <= <int> mismatches\n\
     -M      	Output a File of perfectly matching sequences\n\
     -hm=<str>  Suffix for pdb heat map file (default: a running call number)\n\
     -Q=<str>   Input Pattern corresponds to cma file named <str>.cma\n\
     -Min=<int> minimum number of sequences in cma file to output results\n\
     -same      Don't sort input sets by score; output order same as input order\n\
     -S         Emit simulated sequences for each of the input cma files.\n\
     -Print=<str> print the input cma file named <str> to standard output\n\
     -print     print out sequences in input file as multiple NameCMA.seq files\n\
     -x		dummy option\n\
\n\n"

/* Size of the per-pattern arrays in RunMatchCMA().  Those arrays are indexed
 * from 1, so at most MAX_PATTERN_POSITIONS-1 pattern positions are accepted. */
#define MAX_PATTERN_POSITIONS 100

/*********************************************************
Key superfamilies:

AAA+
P-loop GTPases
EEP
GNAT
RecA (??)


RhoD
vWFA (VWA)
Helicase_Ct (DEAD)

 *********************************************************/

/*********************************************************
RunMatchCMA(): match a residue pattern against every alignment in a multi-cma file.

  argv[1] = name of the multiple cma file (opened as given, no extension added).
  argv[2] = comma separated pattern, e.g. "L31,D32,LIV74"; a residue set that
            starts with a lowercase letter is inverted (matches anything else).
  argv[3..argc-1] = options; see USAGE_START.

  Modes (selected by the options):
    -S         print one simulated sequence per input sequence to stdout; exit(1).
    -C         report how the sequence ids (Set<int>, cd<int>, Random, Reject)
               are distributed over the input files; returns 0.
    -Print=<s> print the input cma whose name is <s> to stdout; exit(1).
    -print     write <name>.seq and <name>.cma for every input cma; exit(1).
    (default)  print, for every input cma, the percentage of sequences matching
               the pattern at each position.  -M/-M=<int> additionally write the
               sequences with at most <int> mismatches+deletions to a file and
               -H writes a pdb-format heat map.

  Malformed input is reported through print_error().  Returns 0 on completion.
  NOTE: the exit(1) calls on the -S/-Print/-print paths are kept as they were,
  since scripts may rely on that status.
 *********************************************************/
int	RunMatchCMA(Int4 argc,char *argv[])
{ 
	Int4	arg,i,j,k,n,N;
	time_t	time1;
	char	str[1000],mode=' ',tmp_str[1024],*HM=0;
	cma_typ	cma=0;
	a_type	AB;
	UInt4   seed=7061950;
	Int4	Number;
	ss_type	data;
	h_type	HG=0;
	dh_type dH=0;
	FILE	*fp;
	BooLean	SortByScore=TRUE,PrintSets=FALSE,HeatMap=FALSE;
	Int4	MaxMisMatches=0;

	Int4	MinNumSeq=0;		// afn: 11/13/09.

	// Pattern arrays are indexed from 1 to NumResidues.
	BooLean InvertResSet[MAX_PATTERN_POSITIONS];
        char    Residue[MAX_PATTERN_POSITIONS];
        Int4    NumResidues;
	char    residue_str[MAX_PATTERN_POSITIONS][30];
	sst_typ Residues[MAX_PATTERN_POSITIONS];
	Int4    Position[MAX_PATTERN_POSITIONS],Position0[MAX_PATTERN_POSITIONS];
	char	query_cma[200],print_file_name[500];
	BooLean	PrintCMA=FALSE,PrintCMD=TRUE;
	query_cma[0]=0; print_file_name[0]=0;

	time1=time(NULL); 
	if(argc < 3) print_error(USAGE_START);
	TurnOffLicenseStatement();
	//************************** Parse options **************************
	// Every %s / %[..] conversion carries a width one less than its buffer
	// so that an over-long argument cannot overrun the stack arrays.
	for(arg = 3; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE_START);
	   switch(argv[arg][1]) {
             case 'C': 
		if(argv[arg][2]==0){ mode = 'C'; PrintCMD=FALSE; }
		else print_error(USAGE_START);
		break;
             case 'H': 
		if(argv[arg][2]==0){ HeatMap=TRUE; }
		break;
             case 'h': 
		if(sscanf(argv[arg],"-hm=%999s",str)==1){
		   if(HM) free(HM);	// option given twice: keep only the last one.
		   HM=AllocString(str);
		} else print_error(USAGE_START);
		break;
             case 'M': 
		if(sscanf(argv[arg],"-Min=%d",&MinNumSeq)==1){
			if(MinNumSeq < 1) print_error(USAGE_START);
		} else if(argv[arg][2]==0){ mode = 'M';
		} else if(argv[arg][2]=='='){ mode = 'M'; 
		   if(sscanf(argv[arg],"-M=%d",&MaxMisMatches)!=1) print_error(USAGE_START);
		} else print_error(USAGE_START);
		break;
             case 'P': 
		if(sscanf(argv[arg],"-Print=%499s",print_file_name)!=1) print_error(USAGE_START);
		PrintCMA=TRUE; PrintCMD=FALSE;
		break;
             case 'p': 
		if(strcmp("-print",argv[arg]) == 0){ PrintSets=TRUE; }
		else print_error(USAGE_START);
		break;
             case 'Q': 
		if(sscanf(argv[arg],"-Q=%199s",query_cma)!=1) print_error(USAGE_START);
		break;
             case 'S': 
		if(argv[arg][2]==0){ mode = 'S'; PrintCMA=TRUE; PrintCMD=FALSE; }
		else print_error(USAGE_START);
		break;
             case 's': 
		if(strcmp("-same",argv[arg]) == 0){ SortByScore=FALSE; }
		else print_error(USAGE_START);
		break;
             case 'x': mode = 'x'; break;
	     default: print_error(USAGE_START);
	   }
	}
	if(seed == 7061950) seed = (UInt4) time(NULL);
	sRandom(seed);
	AB = MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);

	//************************** Read optional query cma (-Q) **************************
	if(query_cma[0] != 0){
	  snprintf(str,sizeof(str),"%s.cma",query_cma);
	  cma=ReadCMSA2(str,AB);
	  // Test for a failed read BEFORE looking inside the alignment.
	  if(!cma) print_error("cma file read error");
	  if(nBlksCMSA(cma) != 1) print_error("cma file error: not single block.");
	}

	if(PrintCMD) printf("command-line: matchcma %s %s\n\n",argv[1],argv[2]);
	//************************** Open multiple cma file **************************
	fp = open_file(argv[1],"","r");
	cma_typ *IN_CMA=MultiReadCMSA(fp,&Number,AB);
	fclose(fp);
	// Every mode below indexes IN_CMA[1..Number] (and -C divides by Number).
	if(Number < 1) print_error("cma file error: less than one cma file");

	if(mode == 'S'){	// generate simulated sequences.
	  Int4 rpts=1, Gap_Len[4]={0,0,0,0};
	  for(Int4 f=1; f <= Number; f++){
		e_type *Seq=SimulatedSeqsCMSA(IN_CMA[f],NumSeqsCMSA(IN_CMA[f]),rpts,Gap_Len);
		char new_info[200]; 
		data=TrueDataCMSA(IN_CMA[f]);
		for(Int4 sq=1; sq <= NumSeqsCMSA(IN_CMA[f]); sq++)
		{
			// Label simulated sequence sq with the taxonomy of real sequence sq.
			e_type	E=SeqSetE(sq,data);
			char    *phylum=PhylumSeq(E);
			char    K=KingdomSeq(E);
			snprintf(new_info,sizeof(new_info),"%s {<%s(%c)>}seq%d",
					NameCMSA(IN_CMA[f]),phylum,K,sq);
			ChangeInfoSeq(new_info,Seq[sq]);
			PutSeq(stdout,Seq[sq],AB);
		}
	  } exit(1);
	} else if(mode=='C'){	// count simulated output sequences...
	  Int4	x,MaxSize=100000; // 100,000 is max id for CDD.
	  Int4	NumSets=0,fID[500],fCnt[500];	// For reverse comparison...
	  Int4	NumIdent=0;
	  set_typ Set=MakeSet(MaxSize); 
	  ClearSet(Set);
	  for(Int4 f=1; f <= Number; f++){  // assume f == 1 is Root & f == Number is reject.
		Int4 n2,ii,ID=0;
		cma=IN_CMA[f];
		e_type Ei,Ej;
		char str2[108];
		data = TrueDataCMSA(cma); N = NSeqsSeqSet(data);
		BooLean	*counted; NEW(counted,N+2,BooLean);
	        fprintf(stdout,"%s(%d) = ",NameCMSA(cma),N); 
		for(j=1; j<= N; j++) {
		 if(counted[j]) continue; 
		 Ej=SeqSetE(j,data); StrSeqID(str,100,Ej);
		 // Accepted ids: Set<int>, cd<int>, Random or Reject (the last two get ID 0).
		 if(sscanf(str,"Set%d",&ID) != 1){
		   if(sscanf(str,"cd%d",&ID) != 1){
		     if(strcmp(str,"Random") != 0 && strcmp(str,"Reject") != 0){
			fprintf(stderr,"Name = %s",str);
			print_error("Input names invalid");
		     } else ID=0;
		   }
		 }
		 if(ID > 0){
		 	if(!MemberSet(ID,Set)){  // first 'f' found.
			   AddSet(ID,Set);
			   if(f > 1 && f < Number){
			      // fID/fCnt are indexed from 1, so 499 entries are usable.
			      if(NumSets >= 499) print_error("Too many distinct sets for -C option");
			      NumSets++; fCnt[NumSets]=1;
			      fID[NumSets]=ID; 
			   }
			} else if(f > 1 && f < Number){	// found in previous f set.
			   for(x = 1; x <= NumSets; x++){
			   	if(fID[x] == ID){ fCnt[x]++; break; }
			   }
			}
		 }
		 // Count (and mark) all not-yet-counted sequences sharing this id.
		 for(n2=0,ii=1; ii<= N; ii++) {
		   if(counted[ii]) continue;
		   Ei=SeqSetE(ii,data); StrSeqID(str2,100,Ei);
		   if(strcmp(str2,str) == 0){
			counted[ii]=TRUE; n2++;
			if(n2==1){
				if(ID==0) fprintf(stdout," Random"); 
				else fprintf(stdout," Set%d",ID); 
				fflush(stdout);
			}
		   }
		 }
		 if(n2 == N) NumIdent++;	// the whole file consists of a single id.
		 fprintf(stdout,"(%d)",n2); fflush(stdout);
		} free(counted); fprintf(stdout,"\n"); fflush(stdout);
	  } double d=100.0*((double)NumIdent/(double)Number);
	  fprintf(stdout,"\n identical = %d out of %d sets = %.1f%c\n",NumIdent,Number,d,'%'); 
	  fprintf(stdout," PERCENT %.1f\n\n",d);
	  NilSet(Set);
	  return 0;
	}
	// print out subsets...
	if(PrintCMA){
	  for(Int4 f=1; f <= Number; f++){
		char *str1=NameCMSA(IN_CMA[f]);
		if(strcmp(str1,print_file_name)==0){
			PutCMSA(stdout,IN_CMA[f]);
		}
	  } exit(1);
	} else if(PrintSets){
	  for(Int4 f=1; f <= Number; f++){
		fp = open_file(NameCMSA(IN_CMA[f]),".seq","w");
		PutSeqSetEs(fp,TrueDataCMSA(IN_CMA[f]));
		fclose(fp);
		fp = open_file(NameCMSA(IN_CMA[f]),".cma","w");
		PutCMSA(fp,IN_CMA[f]); fclose(fp);
	  } exit(1);
	}

	if(cma && LengthCMSA(1,IN_CMA[1]) != LengthCMSA(1,cma)){
		print_error("-Q option input error.");
	}

	//************************** Read pattern **************************
	// Each item is <letters><int>.  Position[] keeps the number as typed;
	// Position0[] is the alignment column, obtained through RealToFakeCMSA()
	// on the -Q alignment when one was given.
        char *Arg=argv[2];
	NumResidues=0;
        do {
             NumResidues++;  // increment...
             if(NumResidues >= MAX_PATTERN_POSITIONS)
                 print_error("Too many input patterns");
	     // width 29 == sizeof(residue_str[0])-1; longer sets fail the %d and are rejected.
             if(sscanf(Arg,"%29[a-zA-Z]%d",residue_str[NumResidues], &Position[NumResidues]) != 2){
                 print_error(USAGE_START);
             } Residue[NumResidues] = residue_str[NumResidues][0];
	     if(cma) k = RealToFakeCMSA(1, Position[NumResidues], cma);
             else k = Position[NumResidues];
	     if(k < 1 || k > LengthCMSA(1,IN_CMA[1])){
		print_error("pattern input error 1");
	     } 
	     Position0[NumResidues] = k;
             while(Arg[0] != ',' && Arg[0] != 0) Arg++;
             if(Arg[0] == ',') Arg++;
        } while(Arg[0]);

	//************************** interpret pattern **************************
	// Build Residues[i], the set of residue codes accepted at pattern position i.
        for(i = 1; i <= NumResidues; i++){
           if(islower((unsigned char)Residue[i])){
                InvertResSet[i]=TRUE;
                Residue[i]=toupper((unsigned char)Residue[i]);
           } else InvertResSet[i]=FALSE;
           Int4 m = strlen(residue_str[i]);
           if(InvertResSet[i]){
             sst_typ NotThese=0;
             for(j=0; j<m; j++){
                char aa=residue_str[i][j];
                sst_typ tmp_set=SsetLet(AlphaCode(aa,AB));
                NotThese= UnionSset(NotThese,tmp_set);
             }
             Residues[i] = 0;   // Empty set...
             for(j=0; j <= nAlpha(AB); j++){	// ...then add every code not listed.
                if(!MemSset(j,NotThese)){
                    sst_typ tmp_set=SsetLet(j);
                    Residues[i] = UnionSset(Residues[i],tmp_set);
                }
             }
           } else {
             Residues[i] = 0;   // Empty set...
             for(j=0; j<m; j++){
                char aa=residue_str[i][j];
                sst_typ tmp_set=SsetLet(AlphaCode(aa,AB));
                Residues[i] = UnionSset(Residues[i],tmp_set);
             }
           }
        }

	//************************** allocate arrays,etc. **************************
	// Match/MisMatch/Deleted[i][j]: # sequences of cma i that match / mismatch /
	//	have residue code 0 at pattern position j.
	// NumMatch/NumMisMatch/NumDeleted[i][c]: # sequences of cma i with exactly c
	//	matches / mismatches / code-0 positions.
	UInt4	**MisMatch,**Match,**NumMatch,**NumMisMatch,*TotalMatch,*TotalMisMatch;
	UInt4	**Deleted,**NumDeleted,*TotalDeleted;
	Int4	*SORTED;
	char	**NAME=0;
	NEWP(MisMatch,Number +2, UInt4); NEWP(Match,Number +2, UInt4);
	NEWP(Deleted,Number +2, UInt4); NEWP(NumDeleted,Number +2, UInt4);
	NEWP(NumMatch,Number +2, UInt4); NEW(TotalMatch,Number +2, UInt4);
	NEWP(NumMisMatch,Number +2, UInt4); NEW(TotalMisMatch,Number +2, UInt4);
	NEW(TotalDeleted,Number +2, UInt4);
	NEW(SORTED,Number +2, Int4);
	NEWP(NAME,Number +2, char);
        for(i=1; i <= Number; i++){
		if(nBlksCMSA(IN_CMA[i]) != 1) print_error("cma file error; not single block.");
		if(LengthCMSA(1,IN_CMA[i]) != LengthCMSA(1,IN_CMA[1]))
			print_error("FATAL: cma lengths not the same!");
		NEW(MisMatch[i],NumResidues+2, UInt4); NEW(Match[i],NumResidues+2, UInt4);
		NEW(NumMatch[i],NumResidues+2, UInt4); NEW(NumMisMatch[i],NumResidues+2, UInt4);
		NEW(Deleted[i],NumResidues+2, UInt4); NEW(NumDeleted[i],NumResidues+2, UInt4);
	}

	//************************** compute pattern matches **************************
	FILE	*mfp=0;		// output file for -M; opened on the first cma with a hit.
	BooLean	*skip=0;	// -M only: skip[sq] is TRUE for sequences NOT to be output.
	Int4 hits=0,miss=0,del=0;
        for(i=1; i <= Number; i++){
	   if(mode == 'M'){
		NEW(skip,NumSeqsCMSA(IN_CMA[i])+3,BooLean);	// set to FALSE.
	   }
	   Int4 sq;
	   for(sq=1; sq <= NumSeqsCMSA(IN_CMA[i]); sq++){
                for(hits=miss=del=0,j=1; j <= NumResidues; j++){
		    Int4 r1 = ResidueCMSA(1,sq,Position0[j],IN_CMA[i]);
		    if(MemSset(r1,Residues[j])){ Match[i][j]++; hits++; }
		    else if(r1 != 0){ MisMatch[i][j]++; miss++; }
		    else { Deleted[i][j]++; del++; }	// residue code 0.
		} NumMatch[i][hits]++;
		NumDeleted[i][del]++;
		TotalDeleted[i] += del;
		TotalMatch[i] += hits;	
		NumMisMatch[i][miss]++;
		TotalMisMatch[i] += miss;	
		if(skip && (miss + del) > MaxMisMatches){ skip[sq]=TRUE; } 
           }
	   if(skip){
	     Int4 kept=0;
	     for(sq=1; sq <= NumSeqsCMSA(IN_CMA[i]); sq++){ if(skip[sq] == FALSE) kept++; }
	     if(kept > 0){
		if(mfp==0){
		   if(NumResidues == 2){
			// File name prefix "<set1><col1>_<set2><col2>"; the sets can hold
			// many letters, so format into a generous, bounded buffer.
			char	Str[200];
			char    *str1=GetPatternFromSST(Residues[1],AB);
			char    *str2=GetPatternFromSST(Residues[2],AB);
			snprintf(Str,sizeof(Str),"%s%d_%s%d",str1,Position0[1],
				str2,Position0[2]);
			free(str1); free(str2);
		        mfp = open_file(Str,argv[1],"w");
		   } else mfp = open_file("Match_",argv[1],"w");
		} PutSelectCMSA(mfp,skip,IN_CMA[i]); 
	     } free(skip); skip=0;
	   }
	}
	if(mfp) fclose(mfp);

	//************************** sort profiles by average # matches **************************
	// NOTE(review): the order computed here is overwritten by the percent-based
	// sort below and this histogram is never printed; kept as in the original.
	HG=Histogram("Average # Matches", 0,NumResidues+1,0.5);
	dH=dheap(Number+5,4);
        for(i=1; i <= Number; i++){	// over number of families...
	    Int4 nsq=NumSeqsCMSA(IN_CMA[i]);
	    // average matches per sequence; an empty alignment scores 0 instead of 0/0.
	    keytyp key=(nsq > 0) ? (keytyp) TotalMatch[i]/ (keytyp) nsq : (keytyp) 0;
	    if(SortByScore) insrtHeap(i,-key,dH); else insrtHeap(i,i,dH);
	    IncdHist((double)key,HG);
	}
	for(i=0;(j=delminHeap(dH)) != 0; ){ i++; SORTED[i]=j; }
	assert(i == Number);
	Nildheap(dH);
	NilHist(HG);

	//************************** sort profiles by average percent matches **************************
	dH=dheap(Number+5,4);
        for(i=1; i <= Number; i++){	// over number of families...
	    double ave_percent=0.0;
            for(j=1; j <= NumResidues; j++){
	     hits=Match[i][j];
	     Int4 nsq=NumSeqsCMSA(IN_CMA[i]);
	     if(nsq > 0){
	       ave_percent += 100.0*(double)hits/(double)nsq;
	     }
	    }
	    keytyp key=(keytyp) ave_percent/(keytyp)NumResidues;
	    if(SortByScore) insrtHeap(i,-key,dH); else insrtHeap(i,i,dH);
	}
	for(i=0;(j=delminHeap(dH)) != 0; ){ i++; SORTED[i]=j; }
	assert(i == Number);
	Nildheap(dH);

	//************************** output pattern **************************
	printf("       PATTERN: ");
        for(j=1; j <= NumResidues; j++)
	{
             snprintf(str,sizeof(str),"%s%d",residue_str[j],Position[j]);
	     printf("%6s ",str);
	} printf(" Total_sq AvePercent\n");

	//************************** open heat map file (-H) **************************
	FILE *hmfp=0;
	static Int4 calls=0;	// numbers the heat map files over repeated calls.
	Int4 AtomNum=1;
	double	pct,space=2.6;
	if(HeatMap){
	   if(HM){ calls++; snprintf(tmp_str,sizeof(tmp_str),"%s_%s",argv[1],HM); }
	   else { calls++; snprintf(tmp_str,sizeof(tmp_str),"%s_%d",argv[1],calls); }
	   hmfp=open_file(tmp_str,".pdb","w"); // Heat map using pymol.
	   // First record set: a reference scale of 42 atoms with values 0, 2.5, ..., 102.5.
	   fprintf(hmfp,"HEADER    PercentScale\n");
	   for(pct=0.0,j=0; j <= 41; j++){
	     fprintf(hmfp,
		      "HETATM%5d  C   XXX A   1    %8.3f%8.3f%8.3f%6.2f%6.2f      1XXX\n",
                                AtomNum,(double)(j+2)*space,4.0*space,0.0,1.0,pct); 
	     pct = pct + 2.5; AtomNum++;
	   } fprintf(hmfp,"END\n");
	}

	//************************** output matches per position **************************
	HG=Histogram("Average Percent Matches",0,101,5.0);
        for(n=1; n <= Number; n++){
	   // okay: every position matches in >= 50% of sequences ('+' suffix);
	   // bad:  no position reaches 25% ('-' suffix).
	   BooLean okay=TRUE,bad=TRUE;
	   i=SORTED[n];

	   Int4 nsq=NumSeqsCMSA(IN_CMA[i]);
	   if(nsq < MinNumSeq) continue;	// afn: 11/13/09.

	   NAME[n]=NameCMSA(IN_CMA[i]);
	   snprintf(tmp_str,sizeof(tmp_str),"%s",NameCMSA(IN_CMA[i]));
	   tmp_str[10]=0;	// show at most 10 characters of the name.
	   printf("%3d %10s: ",i,tmp_str);
	   double ave_percent=0.0;
	   if(hmfp) fprintf(hmfp,"HEADER    %s\n",NameCMSA(IN_CMA[i]));
           for(j=1; j <=NumResidues; j++) {
	     // A pattern whose residue set starts with 'X' reports the code-0 count instead.
	     if(residue_str[j][0] == 'X') hits=Deleted[i][j];
	     else hits=Match[i][j];
	     nsq=Match[i][j] + MisMatch[i][j] + Deleted[i][j];
	     if(hits==0){
		okay=FALSE;
		printf("     . ");
		pct=0.0;
	     } else {
	        pct=100.0*(double)hits/(double)nsq;
		if(pct < 50.0) okay=FALSE;
		if(pct >= 25.0) bad=FALSE;
		ave_percent += pct;
	        printf("%6.1lf ",pct);
	     }
	     assert(pct <= 100.0);
	     if(hmfp && pct > 0.1) fprintf(hmfp,
		 "HETATM%5d  C   XXX B   1    %8.3f%8.3f%8.3f%6.2f%6.2f      1XXX\n",
                                AtomNum,(double)j*space,(double)-n*space,0.0,1.0,pct);
	     AtomNum++;
	   }
	   if(hmfp) fprintf(hmfp,"END\n");
	   double Nsq = (double)(TotalMatch[i] + TotalMisMatch[i])/(double) NumResidues;
	   double ave=ave_percent/(double)NumResidues;
	   IncdHist(ave,HG);
	   if(okay) printf(" %8.1f (%.1f)+\n",Nsq,ave);
	   else if(bad) printf(" %8.1f (%.1f)-\n",Nsq,ave);
	   else printf(" %8.1f (%.1f)\n",Nsq,ave);
	} 
	if(hmfp) fclose(hmfp); 

	//************************** output pattern (footer) **************************
	printf("       PATTERN: ");
        for(j=1; j <= NumResidues; j++)
	{
             snprintf(str,sizeof(str),"%s%d",residue_str[j],Position[j]);
	     printf("%6s ",str);
	} printf(" Total_sq AvePercent\n\n");

	//************************** output raw counts for a single family **************************
	if(Number==1 && NumResidues == 2){ // = number of families...
	  printf(" PTTRN: Match Mismatch\n");
          for(j=1; j <= NumResidues; j++){
             fprintf(stdout,"   %s%d: %d  %d\n",
			residue_str[j],Position[j],Match[1][j],MisMatch[1][j]);
	  } printf("\n\n");
	  printf("exact %d %d %d %d\n\n",
		Match[1][1],MisMatch[1][1],Match[1][2],MisMatch[1][2]);
	}
	NilHist(HG);

	//************************** free memory **************************
        for(i=1; i <= Number; i++){
		free(MisMatch[i]); free(Match[i]); free(NumMatch[i]); free(NumMisMatch[i]);
		free(Deleted[i]); free(NumDeleted[i]);
	}
	free(MisMatch); free(Match); free(NumMatch); free(NumMisMatch);
	free(TotalMatch); free(TotalMisMatch);
	free(Deleted); free(NumDeleted); free(TotalDeleted);
	free(SORTED); free(NAME);
	if(HM) free(HM);
	if(cma) TotalNilCMSA(cma);
	// NOTE(review): IN_CMA[] is not released here; confirm the proper destructor.
	NilAlpha(AB);
	long elapsed=(long)(time(NULL)-time1);	// time_t difference; printed as long.
	fprintf(stderr,"\ttime: %ld seconds (%0.2f minutes)\n",
                        elapsed,(float)elapsed/60.0);
	return 0;
}

#if 0	// moved to rpm_typ.cc in librpm.
// PutHeatMaps(): for every hpt column that carries a "-P=<pattern>" argument,
// call RunMatchCMA() on argv[1] with that pattern and the -H option.
// The cma array (indexed from 1) must agree with the hpt in number and names
// of sets, and all alignments must share the length of cma[1].  Returns 0.
Int4	PutHeatMaps(cma_typ *cma,Int4 Number,hpt_typ *Hpt,a_type AB,
		int argc, char *argv[])
{
	char	tmp_str[103];	// only needed by the disabled -hm= code below.
	
	//============= Check input for consistency ================
	if(Hpt->NumSets() != Number) print_error("hpt and cma files inconsistent");
	Int4 Len=LengthCMSA(1,cma[1]);
	for(Int4 set=1; set <= Number; set++){
	   if(LengthCMSA(1,cma[set]) != Len){
		print_error("cma file lengths are inconsistent");
	   }
	   Int4 nSeqs=NumSeqsCMSA(cma[set]);	// queried as in the original; value unused.
	   if(strcmp(NameCMSA(cma[set]),Hpt->SetName(set)) != 0){
		print_error("hpt and cma files are inconsistent");
	   }
	   fprintf(stderr,"%d. '%s' == '%s'\n",
		set,NameCMSA(cma[set]),Hpt->SetName(set));
	}

	//============= Get pattern info from Hpt Args ================
	for(Int4 col=1; col <= Hpt->NumBPPS(); col++){
	    Int4 nArgs=Hpt->nArg(col);
	    char **Args=Hpt->Argv(col);
	    for(Int4 a=0; a < nArgs; a++){
	      if(strncmp(Args[a],"-P=",3) != 0) continue;
	      // Build "<prog> <file> <pattern> -H"; only the first -P= of a column is used.
	      char	*Argv[10];
	      Int4	Argc=0;
	      Argv[Argc++]=argv[0];
	      Argv[Argc++]=argv[1];
	      Argv[Argc++]=AllocString(Args[a]+3);
	      fprintf(stderr,"Argv[%d]=%s\n",Argc,Argv[Argc-1]);
	      Argv[Argc++]=AllocString("-H");
#if 0
	      sprintf(tmp_str,"-hm=%s",Hpt->GrpName(col));
	      Argv[Argc]=AllocString(tmp_str); Argc++; fprintf(stderr,"Argv3=%s\n",Argv[4]);
#endif
	      RunMatchCMA(Argc,Argv); 
	      // Entries 0 and 1 are borrowed from the caller; free only our copies.
	      for(Int4 x=2; x < Argc; x++) free(Argv[x]);
	      break;
	    }
	}
	fprintf(stderr,"\n");
	return 0;
}


Int4     ParsePttrns(char *str,set_typ set)
/** WARNING: index starts at 0 not 1 as for ParseIntegers() **/
// Parse an argument of the form "-P=<LETTERS><int>[,<LETTERS><int>...]".
//   str = the argument string; anything not starting with "-P=" is ignored.
//   set = set of positions used; cleared first, then every parsed <int> is added.
// Each parsed pattern is echoed to stderr (the list ends with a newline).
// Returns the number of patterns parsed, or 0 when str is not a "-P=" argument.
// A malformed pattern list is reported through print_error().
// NOTE(review): <int> is handed to AddSet() unchecked; the caller is presumed
// to size 'set' for the largest position -- confirm.
{
        Int4    n=0,r;		// n must start at 0: it is returned for non "-P=" input.
	char	*s,ptrn[26];

	ClearSet(set);
	if(strncmp(str,"-P=",3) != 0) return n;
	s=str+3;
	do {
	     // width 25 == sizeof(ptrn)-1; an over-long residue set fails the %d.
             if(sscanf(s,"%25[A-Z]%d",ptrn,&r) != 2){
		print_error("ParsePttrns() input error"); 
	     }
	     n++; AddSet(r,set);
	     while(s[0] != ',' && s[0] != 0) s++;	// move to the next item.
	     if(s[0] == 0){ fprintf(stderr,"%s%d\n",ptrn,r); break; }
	     fprintf(stderr,"%s%d ",ptrn,r);
	     s++;
	} while(1);
	return n;
}


// GetCrossConserved(): look for alignment columns, outside the positions
// already used by the hpt patterns, where a residue pattern from the fixed
// table below is conserved (>= 80%) in some but not nearly all of the input
// alignments, and run RunMatchCMA() on each such pattern/column.
//   cma[1..Number] = input alignments, all of the same length;
//   Hpt            = hyperpartition whose set names must equal the cma names;
//   argv[0],argv[1] are passed on to RunMatchCMA().
// Progress and results are written to stderr.  Returns 0.
// NOTE(review): the sets allocated here (ptrn[], SetO, USet, XSet, set[][])
// are not released within this function.
Int4	GetCrossConserved(cma_typ *cma,Int4 Number,hpt_typ *Hpt,a_type AB,
		int argc, char *argv[])
{
	Int4	i,j,k,x,n,sq,N,M,nPttrns,Len;
	char	tmp_str[103];
	
	// create N x M sets,each of size Number, where N = Len & M = nPttrns
	// create a single 
        // Medium (Based on chemical-geometrical mimicry)
        // e.g., Q can look like part of H if in the right conformation.
	// see: rst_typ *rst=new rst_typ('M');
	// Table of residue patterns, indexed from 1 and terminated by a 0 entry.
        const char *Pttrn[] = { 0,
	   "C","G","GA","A","AS","S","ST","SN","T","N","ND","NH","NQ","NQH","NDE",
           "D","DE","DEQ","E","EQ","EDQ","Q","QE","QK","QR","QH",
	   "QKR","K","KR","R","RQ","RK","H","HY",
           "W","WY","WYF","Y","YF","YHW","YHF","YHWF","F","FL",
           "V","VI","VL","VIL","VILM","I","IL",
           "L","LM","M","P",0,0,0};

	//============= Check input for consistency ================
	// Count the table entries (up to the first 0 after index 0).
	for(k=1,nPttrns=0; Pttrn[k] != 0; k++){
	    // fprintf(stderr,"%d = %s\n",k,Pttrn[k]);
	    nPttrns++;
	}
	if(Number != Hpt->NumSets()) print_error("hpt and cma files inconsistent");
	Len=LengthCMSA(1,cma[1]);
	Int4 maxN=0;	// largest number of sequences in any input alignment.
	for(j=1; j <= Number; j++){
	   if(Len != LengthCMSA(1,cma[j])){
		print_error("cma file lengths are inconsistent");
	   } N=NumSeqsCMSA(cma[j]);
	   if(strcmp(NameCMSA(cma[j]),Hpt->SetName(j)) != 0) 
		print_error("hpt and cma files are inconsistent");
	   fprintf(stderr,"%d. '%s' == '%s'\n",
		j,NameCMSA(cma[j]),Hpt->SetName(j));
	   if(N > maxN) maxN=N;
	}

	//============= Get FG & BG info from Hpt Args ================
	// Echo the hyperpartition to stderr: one line per set, one character per column.
	for(Int4 r=1; r <= Hpt->NumSets(); r++){
	   for(Int4 c=1; c <= Hpt->NumBPPS(); c++){
	     //Int4 argc=Hpt->RtnArgC(j);
	     char state=Hpt->RtnHyperPartition(r,c);
	     fprintf(stderr,"%c",state);
	   } fprintf(stderr,"\n");
	} fprintf(stderr,"\n\n");

	//============= Get pattern info from Hpt Args ================
	// ptrn[c] = positions named by the first pattern argument of hpt column c;
	// SetO starts out holding every column 1..Len.
	set_typ	*ptrn; NEW(ptrn,Hpt->NumBPPS()+3,set_typ);
	set_typ SetO= MakeSet(Len+3); ClearSet(SetO);
	for(i=1; i <= Len; i++) AddSet(i,SetO);
	for(Int4 c=1; c <= Hpt->NumBPPS(); c++){
	    ptrn[c]=MakeSet(Len+3); 
	    Int4 argc=Hpt->nArg(c);	// note: shadows the argc parameter.
	    char **Str=Hpt->Argv(c);
	    // char *sst=Hpt->sst_str(c);
	    // char    *GetPatternFromSST(sst, AB);
	    // fprintf(stderr,"%s\n",sst);
	    for(i=0; i < argc; i++){
	      // fprintf(stderr,"%s\n",Str[i]);
	      // sst_typ tmp_set=SsetLet(AlphaCode(aa,AB));
	      // usst = UnionSset(usst,SsetLet(x));
	      // DisjointSset(s,r);
	      n=ParsePttrns(Str[i],ptrn[c]);
	      if(n > 0){	// stop at the first argument that yields patterns.
		fprintf(stderr,"n=%d\n",n);
		fprintf(stderr,"Pttrn=%s\n",Str[i]);
		PutSet(stderr,ptrn[c]); // NilSet(pttn[c]);
		break;
	      }
	    }
	} fprintf(stderr,"\n");
	fprintf(stderr,"NumCols=%d; NumCMAs=%d\n",Hpt->NumBPPS(),Number);

	//============= Get Union & Intersection of pattern positions ================
	// NOTE(review): presumably USet becomes the union and XSet the intersection
	// of all ptrn[c], and SetO the columns not in USet; this depends on the
	// semantics of UnionSet/IntersectSet3/IntersectNotSet -- confirm.
	set_typ USet = CopySet(ptrn[1]); 
	set_typ XSet = CopySet(ptrn[1]); 
	for(Int4 c=2; c <= Hpt->NumBPPS(); c++){
	    UnionSet(USet,ptrn[c]);
	    IntersectSet3(XSet,ptrn[c]);
	}
	PutSet(stderr,USet);
	IntersectNotSet(SetO,USet);
	PutSet(stderr,SetO);
	PutSet(stderr,XSet);

	//============= Allocate sets for pattern matches ================
	// set[i][k] = alignments (by index j) in which pattern k is conserved at column i.
	set_typ **set; 
	NEWP(set,Len+3,set_typ);
	for(i=1; i <= Len; i++){
	    NEW(set[i],nPttrns+3,set_typ);
	    for(k=1; k <= nPttrns; k++){
		set[i][k]=MakeSet(Number + 3); ClearSet(set[i][k]);
	    }
	}
	//============= Create sets of nodes for pattern matches ================
	char	c;
	unsigned char r,R;
	double dd=0.0;
	Int4	m,blk=1;
	for(i=1; i <= Len; i++){
	   fprintf(stderr,"site %d out of %d \n",i,Len);
	   for(k=1; k <= nPttrns; k++){
	      for(j=1; j <= Number; j++){
	        N=NumSeqsCMSA(cma[j]);
		// dd = fraction of the N sequences whose residue at column i is
		// one of the letters of pattern k.
		for(dd=0.0,sq=1; sq <=N; sq++){
		  for(x=0; (c=Pttrn[k][x]);x++){
		    R=AlphaCode(c,AB);
		    r=ResidueCMSA(blk,sq,i,cma[j]);
		    if(r ==R) dd+=1.0;
		  }
		} dd = dd/(double)N;
		if(dd >= 0.80) AddSet(j,set[i][k]);
	      }
	   }
	}
	//============= output results for pattern matches ================
	for(i=1; i <= Len; i++){
	    if(MemberSet(i,USet)) continue;	// column already used by an hpt pattern.
	    for(k=1; k <= nPttrns; k++){
		n=CardSet(set[i][k]);
		dd=(double)n/(double)Number;	// fraction of alignments conserving pattern k.
		if(0 && dd > 0.5){	// disabled diagnostic.
		   fprintf(stderr,"%d (%.2lf): %s\n",i,dd,Pttrn[k]);
		}
		// Conserved in at least two alignments but in fewer than 90% of them:
		// run RunMatchCMA() with the single pattern "<letters><column>".
		if(n >= 2 && dd < 0.90){
		   fprintf(stderr,"%s%d %d\n",Pttrn[k],i,n);
		   Int4 Argc=0;
		   char	*Argv[10];
		   Argv[0]=argv[0]; Argc++;
		   Argv[1]=argv[1]; Argc++;
		   char Strn[100];
		   sprintf(Strn,"%s%d",Pttrn[k],i);
		   Argv[2]=AllocString(Strn); Argc++;
		   // Argv[3]=AllocString("-H"); Argc++;
		   RunMatchCMA(Argc,Argv); 
		   for(j=2; j < Argc; j++) free(Argv[j]);	// free only the copies made here.
		}
	    }
	}
	return 0;
}
#endif

// main(): entry point for matchcma.
//   USAGE 2 (argv[2] starts with a digit): print the residue frequencies found
//           in column <int> of every alignment in the multi-cma file argv[1].
//   USAGE 1 (otherwise): hand the command line to RunMatchCMA().
// Fewer than two arguments, or an out-of-range column, is reported through
// print_error().  Returns 0 for USAGE 2 and RunMatchCMA()'s value for USAGE 1.
int	main(Int4 argc,char *argv[])
{ 
#if 0	// can eventually delete this...
	if(argc < 3) print_error("matchcma <mma_file> <hpt_file> \n");
	Int4 Number;
	a_type AB = MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);
	FILE *fp = open_file(argv[1],"","r");
	cma_typ *IN_CMA=MultiReadCMSA(fp,&Number,AB);
	fclose(fp);
	fp = open_file(argv[2],"","r");
	hpt_typ *hpt = new hpt_typ(fp); fclose(fp);
	hpt->Put(stderr);
	PutHeatMaps(IN_CMA,Number,hpt,AB,argc,argv);
	// GetCrossConserved(IN_CMA,Number,hpt,AB,argc,argv);
	delete hpt;
	return 1;
#elif 1
	// fprintf(stderr,"argv[2]='%s'\n",argv[2]);
	if(argc < 3) print_error(USAGE_START);
	// <ctype.h> functions need an unsigned char value (plain char may be negative).
	else if(isdigit((unsigned char)argv[2][0])){
	   Int4		Number,site;
	   BooLean	ExcelFormat=FALSE;
	   a_type AB = MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);
           FILE *fp = open_file(argv[1],"","r");
           // fp=OpenFileToRead(argv[1]);
           cma_typ *IN_CMA=MultiReadCMSA(fp,&Number,AB);
           fclose(fp);
	   // IN_CMA[1] is inspected below, so at least one alignment is required.
	   if(Number < 1) print_error("cma file error: less than one cma file");
	   site=(Int4)atoi(argv[2]);
	   // Columns run from 1 to LengthCMSA() inclusive (the same range that
	   // RunMatchCMA() accepts for pattern positions), so allow the last one too.
	   if(site > 0 && site <= LengthCMSA(1,IN_CMA[1])){
	      PutResFreqCMA(stdout,IN_CMA,Number,site,AB,ExcelFormat);
	   } else print_error("site is out of range");
	   return 0;
	} else return RunMatchCMA(argc,argv);
#elif 0
	rpm_typ *rpm = new rpm_typ(argc,argv);
	// rpm->RunMatchCMA( );
	rpm->PutHeatMaps( );
	rpm->OutputCrossConserved( );
	delete rpm;
#endif
	return 0;
}

