/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "cmc_typ.h"
// #include "blosum62.h"

#if 0
#define WEIGHT_RANDOM_SEQUENCES_MANUALLY 0	// look at Junk/cmp_typ3_14_11.cc if want to add this...
#define DISTINCT_TRIPARTITION_NULL_MODEL 0	// look at Junk/cmp_typ3_14_11.cc if want to add this...

#endif

void	cmc_typ::PartitionByInputSetCMSA(Int4 NumSeedAln)
// Initialize InitSet[1..NumSeedAln+1] from the sequence sets that were passed
// in (passed_in_sets[1..num_passed_in_sets]).
//
// The passed-in sets are first checked for consistency: all must have the same
// capacity, must be pairwise disjoint, and must not contain element 0.
// InitSet[NumSeedAln+1] is the reject (random sequence) set.  It is taken from
// the input when one was supplied (always the case with a checkpoint);
// otherwise it is created here and filled with sequences M+1..N, where
// M = NumSeqsCMSA(TrueMainCMA) and N = NumSeqsCMSA(MainCMA).
{
	// 0. Check the caller-supplied state.
	if(checkpoint){
	   // The '+ 1' accounts for the random set, which is passed in as well.
	   assert(num_passed_in_sets == (NumSeedAln + 1));
	} else {	// implies that passed_in_cma != 0 && passed_in_sets != 0
	   assert(passed_in_cma);
	   assert(TrueMainCMA==passed_in_cma);
	   if(!passed_in_mcma){ assert(NumSeedAln == num_passed_in_sets); }
	}

	// 1. Make sure that input sequence sets are compatible with other input.
	assert(num_passed_in_sets > 0);
	assert(passed_in_sets);
	assert(passed_in_sets[1]);
	Int4	Capacity=SetN(passed_in_sets[1]);
	assert((Capacity - 1) == (NumSeqsCMSA(TrueMainCMA) + NumRandom));
	Int4	a,b;
	for(a=2; a <= num_passed_in_sets; a++){
	   assert(SetN(passed_in_sets[a]) == Capacity);
	}
	// Covered accumulates the union of all of the passed-in sets.
	set_typ	Covered=MakeSet(Capacity); ClearSet(Covered);
	for(a=1; a <= num_passed_in_sets; a++){
	   for(b=a+1; b <= num_passed_in_sets; b++){
		if(CardInterSet(passed_in_sets[a],passed_in_sets[b]) != 0){
		   print_error("cmc_typ error: sequence sets not disjoint");
		}
	   }
	   UnionSet(Covered,passed_in_sets[a]);
	}
	assert(!MemberSet(0,Covered));

	// 2. Create set representation for partitions...
	for(a=1; a <= NumSeedAln; a++) InitSet[a] = passed_in_sets[a];
	Int4	NumAll=NumSeqsCMSA(MainCMA),NumTrue=NumSeqsCMSA(TrueMainCMA);
	Int4	Reject=NumSeedAln+1;	// index of the random (reject) set.
	if(checkpoint==0 && NumSeedAln == num_passed_in_sets){
	   // No reject set was supplied, so build one from the random sequences.
	   assert(CardSet(Covered) == NumSeqsCMSA(TrueMainCMA));
	   InitSet[Reject] = MakeSet(NumAll+1);
	   ClearSet(InitSet[Reject]);
	   for(Int4 sq=NumTrue+1; sq <= NumAll; sq++) AddSet(sq,InitSet[Reject]);
	   // No random seqs placed in other sets and vice versa.
	   assert(CardInterSet(Covered,InitSet[Reject]) == 0);
	} else {
	   InitSet[Reject]=passed_in_sets[Reject]; // reject set also passed in...
	}
	NilSet(Covered);
	ComputeMinSeed2CsqScores();
}

void	cmc_typ::PartitionBySeedAlnCMSA(Int4 NumSeedAln, char *TypeOfSet)
// Create the initial partitioning of the aligned input sequences.
//
// Every sequence of TrueMainCMA is scored, via PseudoAlnScoreTwoCMSA(), against
// sequence 1 of each seed alignment SeedCMA[1..NumSeedAln] and is placed in
// the InitSet[] of the best scoring seed set.  Sequences whose best acceptable
// score is not positive, together with sequences M+1..N of MainCMA
// (M = NumSeqsCMSA(TrueMainCMA), N = NumSeqsCMSA(MainCMA)), are placed in the
// reject set InitSet[NumSeedAln+1].
//
//   NumSeedAln: the number of seed alignments.
//   TypeOfSet:  one character per seed set; for a '?' (miscellaneous) set the
//               minimum score computed by ComputeMinScore() is not enforced.
//
// A seed set that attracts no sequences is reported on stderr; this is fatal
// (print_error) unless that set is of type '?'.
{
	Int4	NumAll=NumSeqsCMSA(MainCMA);
	Int4	set,sq;
	FILE	*dbg=0;	// point this at stderr to see the tables and histograms.

	ComputeMinSeed2CsqScores();	// New routine for labeling sequences...

	//==== 3a. Compute max score between seed alignment consensus seqs. ====
	if(dbg){
	   fprintf(dbg,"   ");
	   for(set=1; set <= NumSeedAln; set++) fprintf(dbg," %5d",set);
	   fprintf(dbg,"\n");
	}
	Int4	*MinScoreInSet=ComputeMinScore(dbg,NumSeedAln,stderr);
	// MaxScoreNonID is not consulted below; it is merely freed at the end.
	Int4	*MaxScoreNonID=ComputeMaxScore(dbg,NumSeedAln,MinScoreInSet,stderr);

	//==== 4. Find the Best Partition for each of the input sequences. ====
	Int2	*Assigned; NEW(Assigned, NumAll+3, Int2);
	h_type	setHG=Histogram("partitions", 0,NumSeedAln,1.0);
	h_type	scoreHG=Histogram("best scores", -100,5000,25.0);
	Int4	NumTrue=NumSeqsCMSA(TrueMainCMA);
	for(sq=1; sq <= NumTrue; sq++){
	   Int4	TopScore=INT4_MIN;
	   Int2	TopSet=0;
	   for(set=1; set <= NumSeedAln; set++){
		Int4 Score=PseudoAlnScoreTwoCMSA(1,SeedCMA[set],sq,TrueMainCMA);
		if(Score <= TopScore) continue;		// not an improvement.
		// Misc ('?') sets compete on score alone; any other set must
		// also reach its own minimum score.
		if(TypeOfSet[set] == '?' || Score >= MinScoreInSet[set]){
		   TopScore=Score; TopSet=set;
		}
	   }
	   if(TopScore <= 0) TopSet=0;		// 0 == rejected.
	   Assigned[sq]=TopSet;
	   IncdHist((double)TopSet,setHG);
	   IncdHist((double)TopScore,scoreHG);
	}
	if(dbg){ PutHist(dbg,50,setHG); PutHist(dbg,50,scoreHG); }
	NilHist(scoreHG); NilHist(setHG);

	//==== 5. Create initialization sets for each partition. ====
	Int4	Reject=NumSeedAln+1;		// index of the reject set.
	for(set=1; set <= Reject; set++){
	   InitSet[set]=MakeSet(SetSize); ClearSet(InitSet[set]);
	}
	InitSet[0]=InitSet[Reject];	// Let 0 point to Reject set as well
	h_type	fillHG=Histogram("partitions", 0,NumSeedAln,1.0);
	for(set=0; set <= NumSeedAln; set++){
	   Int4	NumPlaced=0;
	   for(sq=1; sq <= NumTrue; sq++){
		if(Assigned[sq] != set) continue;
		AddSet(sq,InitSet[set]); 	// Set representation.
		IncdHist((double)set,fillHG);
		NumPlaced++;
	   }
	   if(set > 0 && NumPlaced <= 0){	// an empty seed set will not work...
		fprintf(stderr,"Initialization failed for seed alignment: '%d.%s'\n",
			set,NameCMSA(SeedCMA[set]));
		fprintf(stderr,"input alignment lacks any top matching sequences\n");
		if(TypeOfSet[set] != '?'){
		   print_error(" Try modifying the hyperpartition and/or seed alignment.");
		}
	   }
	}
	free(Assigned);
	for(sq=NumTrue+1; sq <= NumAll; sq++) AddSet(sq,InitSet[0]); 	// == reject set.
	if(dbg) PutHist(dbg,50,fillHG);
	NilHist(fillHG);
	free(MaxScoreNonID);
	free(MinScoreInSet);
	InitSet[0]=0;	// Don't need this pointer anymore.
}

char	**cmc_typ::GetSetRelations(const char *Title,Int4 *nGrpsX, Int4 **GrpsX,set_typ **RtnSetX)
//**************** Identify relationships between sets. *******************
{
	Int4	m,n,i,j,f,g,b,r;
	char **RelateSets;
// subset[n1][n2]='<',superset='>',Intersect='+',Disjoint='0',Identical='='.
	set_typ *SetX;
	NEWP(RelateSets,Hpt->NumBPPS() + 5,char);
	NEW(SetX,Hpt->NumBPPS() + 5,set_typ);
        for(n=1; n<= Hpt->NumBPPS(); n++){
	   NEW(RelateSets[n],Hpt->NumBPPS() + 5,char);
	   SetX[n]=MakeSet(Hpt->NumSets()+1); ClearSet(SetX[n]);
	   for(i=1; i <= nGrpsX[n]; i++){
		g = GrpsX[n][i]; AddSet(g,SetX[n]);
	   } RelateSets[n][n]='=';
	}
	Int4 ci,cn,cm;
	// set_typ Union=MakeSet(Hpt->NumSets()+1); 
        for(n=1; n < Hpt->NumBPPS(); n++){
	   cn=CardSet(SetX[n]); 
           for(m=n+1; m<= Hpt->NumBPPS(); m++){
		ci=CardInterSet(SetX[n],SetX[m]);
		if(ci == 0){ RelateSets[n][m]='0'; RelateSets[m][n]='0'; continue; } // disjoint.
		cm=CardSet(SetX[m]);
		// fprintf(stderr,"c%d=%d;c%d=%d;ci=%d\n",n,cn,m,cm,ci);
		if(cn == ci){	// n is subset; m is superset.
		    if(cn == cm){ RelateSets[n][m]='='; RelateSets[m][n]='='; continue; } // equal.
		    else { RelateSets[n][m]='<'; RelateSets[m][n]='>'; continue; } // subset/superset
		}
		if(cm == ci){	// m is proper subset; n is proper superset.
		    RelateSets[n][m]='>'; RelateSets[m][n]='<'; continue; // subset/superset
		}
		if(ci > 0){ RelateSets[n][m]='+'; RelateSets[m][n]='+'; continue; } // intersect
		else {
		   if(efp) fprintf(efp,"c%d=%d;c%d=%d;ci=%d\n",n,cn,m,cm,ci);
		   // fprintf(stderr,"cn=%d;cm=%d;ci=%d\n",cn,cm,ci);
		   print_error("cmc_typ Init( ): this should not happen");
		}
		// UnionSet3(SetX[n],SetX[m],Union); Int4 cu=CardSet(Union);
	   }
	}
   if(0){
        fprintf(stdout,"\n %s Set Relationships:\n     ",Title);
        for(n=1; n<= Hpt->NumBPPS(); n++){ fprintf(stdout,"%2d ",n); } fprintf(stdout,"\n");
        for(n=1; n <= Hpt->NumBPPS(); n++){
           fprintf(stdout,"%3d: ",n);
           for(m=1; m<= Hpt->NumBPPS(); m++){ fprintf(stdout," %c ",RelateSets[n][m]); }
	   fprintf(stdout,"\n");
	   // NilSet(SetX[n]);
	} fprintf(stdout,"\n");
   }
	// free(SetX);
	*RtnSetX = SetX;
	return RelateSets;
}

void    cmc_typ::GetChnFiles( )
//************** get the chn files for the BPPS analyses. *****************
{
  Int4	ArgC=3,i,j,n;
  FILE	*fp;

  //****** 1. Create display set alignment files (QryCMAs) for each analysis ***********************
  const Int4 max_num_TmpCMA=7000;
  cma_typ	TmpCMA[7004];
  Int4		NumTmpCMA;

  NEW(QryCMAs,Hpt->NumBPPS()+4,cma_typ);
  for(n=1; n<=Hpt->NumBPPS(); n++){
	if(Hpt->GrpsBG(n,1)==0){	// main set...all in foreground...
		print_error("BG=(0) not yet implemented");
	// } else if(passed_in_cma != 0){
	} else if(IsTreeHpt && (checkpoint || passed_in_cma != 0)){
	   // if(IsTreeHpt) assert(Hpt->NumBPPS() == NumDisplayCMA);
	   assert(Hpt->NumBPPS() == NumDisplayCMA);
	   fp = tmpfile(); PutCMSA(fp,DisplayCMA[n]);
	   rewind(fp); QryCMAs[n]=ReadCMSA(fp,AB); fclose(fp);
	} else {
	    if(Hpt->nGrpsFG(n) == 0){
		fprintf(stderr,"Tripartition %d category lacks a foreground set\n",n);
		print_error("Fatal hyperpartition syntax error.");
	    }
	    for(i=1,j=0; i <= Hpt->nGrpsFG(n); i++){
		if(Hpt->GrpsFG(n,i) > NumDisplayCMA){
			if(efp) fprintf(efp,"GrpsFG[%d][%d]=%d > %d.\n",
				n,i,Hpt->GrpsFG(n,i),NumDisplayCMA);
			print_error("Fatal: Number of display sets less than number of FG sets."); 
		} assert(Hpt->GrpsFG(n,i) > 0);
		j++; TmpCMA[j]=DisplayCMA[Hpt->GrpsFG(n,i)];
	    } fp = tmpfile(); 
	    PutMergedCMSA(fp,j,TmpCMA);
  	    rewind(fp); QryCMAs[n]=ReadCMSA(fp,AB); fclose(fp);
	    // -addcsq option adds a csq to each sma file 
	    if(AddCSQ==TRUE){	// then add a consensus sequence.
	       cma_typ ConsCMA = AddConsensusCMSA(QryCMAs[n]);
	       TotalNilCMSA(QryCMAs[n]); QryCMAs[n] = ConsCMA; ConsCMA=0;
	    } // else (e.g., to rerun BPPS).
	    RenameCMSA(Hpt->GrpName(n),QryCMAs[n]);
#if 0	// DEBUGGING...
	    char st[100]; sprintf(st,"_%d.cma",n);
	    fp=open_file("junk",st,"w");
	    PutCMSA(fp,QryCMAs[n]); fclose(fp);
#endif
	    for(i=1; i <= Hpt->nGrpsFG(n); i++) TmpCMA[i]=0;
	}
  }

  //****** 2. Create dummy cma files required by chn format (not used by anything right now). *****
  // if(passed_in_cma != 0) assert(NumSeqsCMSA(QryCMAs[1]) > 0);
  if(checkpoint || passed_in_cma != 0) assert(NumSeqsCMSA(QryCMAs[1]) > 0);
  else assert(NumSeqsCMSA(QryCMAs[1]) > 1);
  BooLean  *skip; NEW(skip,NumSeqsCMSA(QryCMAs[1])+3,BooLean);
  for(i=2; i<=NumSeqsCMSA(QryCMAs[1]); i++) skip[i]=TRUE;
  fp = tmpfile(); PutSelectCMSA(fp,skip,QryCMAs[1]); rewind(fp);
  dummyCMA=ReadCMSA(fp,AB); fclose(fp); free(skip);
  // WriteCMSA("dummy.cma",dummyCMA);	// DEBUG.
 
  hsw_typ HSW=0;  NEWP(chn,Hpt->NumBPPS() + 3, chn_typ);
  if(checkpoint) HSW=checkpoint->hsw;
  else if(passed_in_hsw) HSW=passed_in_hsw;
  //****** 3. Create a chn files for the analysis ***********************
  for(n=1; n<=Hpt->NumBPPS(); n++){
	if(NumSeqsCMSA(QryCMAs[n]) > max_num_TmpCMA){
		fprintf(stderr,"n=%d; Hpt->NumBPPS()=%d; NumSeqsCMSA(QryCMAs[n])=%d\n",
				n,Hpt->NumBPPS(),NumSeqsCMSA(QryCMAs[n]));
		print_error("FATAL: Input seed alignments contain too many sequences");
	} TmpCMA[1]=QryCMAs[n];
#if 0	// DEBUG...
	// if(n==22){ PutSeq(stderr,FakeSeqCMSA(1,QryCMAs[n]),AB); exit(1); }
  	fprintf(stderr,"n=%d; Hpt->NumBPPS()=%d; NumSeqsCMSA(QryCMAs[n])=%d =? %d\n",
			n,Hpt->NumBPPS(),NumSeqsCMSA(QryCMAs[n]),NumSeqsCMSA(TmpCMA[1]));
#endif
	TmpCMA[2]=MainCMA;
	for(i=1,j=2; i <= NumSeqsCMSA(QryCMAs[n]); i++){
		// fprintf(stderr,"n=%d; i=%d; j=%d\n",n,i,j);
		j++; TmpCMA[j]=dummyCMA; // shouldn't matter, as chn does not own these CMA files.
	} NumTmpCMA=j;
  	char	*ArgV[20]; ArgV[0]=AllocString("mcBPPS"); 
	ArgV[1]=AllocString(infile); ArgV[2]=AllocString("-Q"); ArgC=3;
	if(efp) fprintf(efp,"************ Analysis #%d... ************\n",n);
	// if((checkpoint==0 || passed_in_hsw==0) && n==1){
	if(HSW==0){
	    chn[n] = new chn_typ(ArgC,ArgV,NumTmpCMA,TmpCMA,200);
	    assert(!chn[n]->OwnsCMAs()); 
	    HSW=chn[n]->RtnHSW(1);	// Pass Henikoff weights on to other analyses
	} else {
  	    hsw_typ *hsw; NEW(hsw, NumTmpCMA+3, hsw_typ); // Make sure array is long enough..
#if 0	// DEBUG
  	fprintf(stderr,"======> %d: NumSeqsCMSA=%d\n",n,NumSeqsCMSA(TmpCMA[1]));
#endif
	    hsw[1]=HSW; chn[n] = new chn_typ(ArgC,ArgV,NumTmpCMA,TmpCMA,200,hsw);
	    assert(!chn[n]->OwnsCMAs()); free(hsw);
	}
#if 1   // Temporary fix for gaps ('-') in query sequence...need better fix later.
        {
	 IN_CMA=chn[n]->GetIN_CMSA();
         e_type fakeE1=FakeSeqCMSA(1,IN_CMA[1]);
         for(Int4 s=1; s<=LenSeq(fakeE1); s++){
           Int4 r=ResSeq(s,fakeE1);
           if(r==AlphaCode('X',AB)){ // set 'X' to 'A'in fake query seq.
             if(efp) fprintf(efp,"WARNING: setting 'X' at position %d in query to 'A'.\n",s);
             r=AlphaCode('A',AB); EqSeq(s,r,fakeE1);
           }
         }
        }
#endif
	for(i=0; i < ArgC; i++) free(ArgV[i]);
  }
}

void	cmc_typ::RmAbsentSeqs()
//**************** Remove sequences that don't belong. *******************
{
     Int4 i,g,n;
     BooLean	rtn,*InFgBg;
     NEW(InFgBg, Hpt->NumSets()+3,BooLean);
     for(n=1; n<= Hpt->NumBPPS(); n++){	
#if 0	// DEBUG...
cma_typ *in_CMA=chn[n]->GetIN_CMSA();
fprintf(stderr,"&&&&&&& %d. NumSeqsCMSA=%d\n",n,NumSeqsCMSA(in_CMA[1]));
#endif
	for(g=1; g <= Hpt->NumSets(); g++) InFgBg[g]=FALSE;
	for(i=1; i <= Hpt->nGrpsFG(n); i++){ g = Hpt->GrpsFG(n,i); InFgBg[g]=TRUE; }
	for(i=1; i <= Hpt->nGrpsBG(n); i++){ g = Hpt->GrpsBG(n,i); InFgBg[g]=TRUE; }
	for(g=1; g <= Hpt->NumSets(); g++){
	   if(!InFgBg[g]){	// Is set absent from the gth foreground & background?
		for(Int4 sq=1; sq <= NumSeqsCMSA(MainCMA); sq++){
		     if(MemberSet(sq,InitSet[g])){
			// then remove members from either partition.
			rtn=che[n]->RemoveSeq(sq);
		     }
		}
	   }
	}
     } free(InFgBg);
}

void	cmc_typ::FillUpHpt( )
{
	Int4	m,n,i,j,f,g,b,r,s,x;
	if(checkpoint || passed_in_sets) PartitionByInputSetCMSA(NumDisplayCMA);
	else PartitionBySeedAlnCMSA(NumDisplayCMA,Hpt->TypeOfSet());
	SortDisplaySets( );
	ReadSeedPttrns( );   // Get seed patterns for each run using pattern strings. 

	RandomSet=0;
        for(g=1; g<= Hpt->NumSets(); g++){	
		IsFailedSet[g] = FALSE;		// include all sets initially...
		if(strcmp("Random",Hpt->ElmntSetName(g)) == 0){
			if(RandomSet != 0) print_error("FATAL: multiple random sets defined.");
			else RandomSet=g;
		} 
	}
	GetChnFiles( );	 // 3. Create chn files (chn_typ).

	// 3. Create che objects, one for each search based on arguments + seed patterns.
	if(Hpt->NumBPPS() < 1) print_error("too few categories of constraints");
	NEWP(che,Hpt->NumBPPS()+4,che_typ);
	NEWP(sqd,Hpt->NumBPPS()+4,sqd_typ);
	NEWPP(SST,Hpt->NumBPPS()+4,sst_typ);
	NEW(SFBG,Hpt->NumBPPS()+4,char);
	NEW(MaxNumCol,Hpt->NumBPPS()+4,Int4);
	for(Int4 n=1; n <= Hpt->NumBPPS(); n++){ MaxNumCol[n]=DefaultMaxCol; }
	for(Int4 n=1; n <= Hpt->NumBPPS(); n++){ // Call che constructors for each run.
	   if(passed_in_hpt) SetUpNthSrch(n,InitHpt->nArg(n),InitHpt->Argv(n));
	   else SetUpNthSrch(n, InitHpt->nArg(n),InitHpt->Argv(n));
	   IsFailedBPPS[n]=FALSE; 
	}
	this->RmAbsentSeqs();
     //**************** Label sequences to be kept fixed. *******************
     // move this to cmc_sort.cc ???
     Int4 num_sq=NumSeqsCMSA(MainCMA), NumSq=NumSeqsCMSA(TrueMainCMA);
     if(checkpoint){ Labeled=checkpoint->labeled; }
     else {
efp=stderr;
	Int4 cnt=0,sq; 
	Labeled=MakeSet(num_sq+1); ClearSet(Labeled); 
        for(m=0,n=Hpt->NumBPPS(); n > 0; n--){
	   for(cnt=0,sq=1; sq <= num_sq; sq++){
		if(che[n]->MemberGold(sq) && !MemberSet(sq,Labeled))
		   { AddSet(sq,Labeled); cnt++; }
	   } if(efp){ fprintf(efp,"%d labeled sequences for node %d\n",cnt,n); }
	   m += cnt;
	} if(efp){ fprintf(efp,"%d labeled sequences found\n",m); }
#if 1	// Also add best matches within specific groups to Labeled set.
	Int4 Sq0;
  	for(m=0,g=1; g < Hpt->NumSets(); g++){
#if 1	// Label as fixed at most 20 of the highest scoring seqs.
	    Int4 Nj,end=20;
	    for(j=Index1stBest[g]; WorstToBest[g][j] != 0; j++) ; 
	    for(Nj=0,j--;  Nj <= end && j >= Index1stBest[g]; j--,Nj++){
	        Sq0=WorstToBest[g][j];
		if(MemberSet(Sq0,InitSet[g]) && !MemberSet(Sq0,Labeled)){
			AddSet(Sq0,Labeled); m++; 
		}
	    }
#else
	    for(j=Index1stBest[g]; (Sq0=WorstToBest[g][j]) != 0; j++){
		if(MemberSet(Sq0,InitSet[g]) && !MemberSet(Sq0,Labeled)){
			AddSet(Sq0,Labeled); m++; 
		}
	    }
#endif
	}   for(Sq0=NumSq+1; Sq0 <= num_sq; Sq0++) AddSet(Sq0,Labeled);
	//    ^ Random sequences must stay in the rejected set
	if(efp) fprintf(efp,"%d additional seqs labeled based on log-odds scores.\n",m);
#endif
    }
efp=0;
	if(1){ // DEBUG: print out an alignment of labeled sequeces ...
	   FILE *tfp=open_file(infile,"_fixed.cma","w");
	   this->PutFixedSeqs(tfp); fclose(tfp);
	}
     //*********** Create HyperPartition; identify seq. subgroups. *************
     for(g=1; g<= Hpt->NumSets(); g++){
	HyperPartition[g]=Hpt->RtnHyperPartition(g); 
        if(0) fprintf(stderr,"%d. InitSet=%d/%d seqs\n",g,CardSet(InitSet[g]),num_sq);
	GrpSet[g]=CopySet(InitSet[g]); ClearSet(GrpSet[g]); 
	CopySet(GrpSet[g],InitSet[g]); 
     }
     RelateFGs=SetRelationsFG( ); RelateBGs=SetRelationsBG( );
     //**************** Put sequences in their starting partitions. *******************
     // need to make sure that labeled sequences are in all the right partitions...
     // also label BG seqs in same hyperpartition:
     for(g=1; g<= Hpt->NumSets(); g++){	
	  for(Int4 sq=1; sq <= num_sq; sq++){
	   if(MemberSet(sq,GrpSet[g])){		// this sequence is in group g.
             for(n=1; n<= Hpt->NumBPPS(); n++){
		switch (HyperPartition[g][n]){
		  case '+':  assert(!che[n]->RemovedSeq(sq));
	      	    if(!che[n]->MemberFG(sq)){ assert(che[n]->ChngPartition(sq)=='+'); }
		  break;
		  case '-':  assert(!che[n]->RemovedSeq(sq));
	      	    if(!che[n]->MemberBG(sq)){ assert(che[n]->ChngPartition(sq)=='-'); }
		  break;
		  case 'o':  assert(che[n]->RemovedSeq(sq)); break;
		  default: print_error("cmc_typ Init( ) error"); break;
		}
	     }
           }
	  }
      }
	
	FILE *sfp=0;
	// if not defined then find a seed pattern based on above partition.
	for(n=1; n<= Hpt->NumBPPS(); n++){	
	  if(sst_str[n] == 0){
		if(sfp==0 && PutIntermediateFiles){ sfp=open_file(infile,".seeds","w"); }
		sst_str[n]=FindSeedPattern(n); // resets the pattern to the seed pattern.
		// checkpoint is checked in FindSeedPattern(n) routine.
		if(PutIntermediateFiles) fprintf(sfp,"%d: %s\n",n,sst_str[n]);
	  }
     	} 
	if(sfp) fclose(sfp);
	//********* For storing best ***********
	SaveBest=FALSE;		// 
	BestLPR=-99999999999999999999.9;
     	for(g=1; g<= Hpt->NumSets(); g++)
	{ BestSet[g]=MakeSet(num_sq+1); ClearSet(BestSet[g]); }
	Int4	Length=LenSeq(che[1]->KeyE());
	for(n=1; n<= Hpt->NumBPPS(); n++){ NEW(best_sst[n],Length+2,sst_typ); }
	//********* For storing best ***********
	outfp = open_file(infile,"_hpt.out","w");
	PutHyperPartition(outfp); fflush(outfp);
	if(efp){
	   PutHyperPartition(efp);
           // for(j=1; j<=Hpt->NumBPPS(); j++)
           for(j=1; j<=Hpt->NumSets(); j++)
	   {
              fprintf(efp,"%2d(\"%s\"): labeled = %d/%d\n", j,Hpt->ElmntSetName(j),
			CardInterSet(this->Labeled,this->GrpSet[j]),
			CardSet(this->GrpSet[j]));
           }
	}
}

/**************************** Global Variables ******************************/
Int4	cmc_typ::SetUpNthSrch(Int4 Level, Int4 argc,char *argv[])
// Parse the option strings for analysis 'Level' and create its sqd_typ and
// che_typ search objects (sqd[Level], che[Level]).
//
//   Level: index of the analysis (column of the hyperpartition).  When Level
//          is 0 the options are only parsed and nothing is created.
//   argc,argv: the option strings for this analysis.  Every string has to
//          start with '-'.  Options that have been parsed are marked as used
//          by overwriting their second character with a blank, so argv is
//          modified.
//
// Options accepted in the internal mode (Hpt->RtnMode() != 'I'):
//   -A<real>:<real>   A0 and B0 (both > 0).
//   -B=<M|B|A>        stored in SFBG[Level].
//   -col=<min>:<max>  minimum (>= 2) and maximum number of columns.
//   -compare=<char>   an upper case letter.
//   -c=<reals>        two fractions, each within 0.0..0.9 (see ParseReals).
//   -m..., -M...      parsed by IntOption (0..1000) and RealOption (0..1).
//   -N=<int>          contrast (>= 1).
//   -P=<pattern>      seed pattern; used only if none is defined yet.
//   -Ri=<real>        prior Ri; 0 < Ri < 1.
//   -rho=<real>       0 < rho <= 0.5.
//   -verbose          passed on to the che_typ constructor.
// Anything else is reported through print_error(CMC_USAGE_START).
// The public mode (Hpt->RtnMode() == 'I') has been turned off.
//
// Returns 0 when Level == 0 and 1 otherwise.
{ 
	Int4	arg;
	BooLean	verbose=FALSE;
	char	compare='U';
	double	MinKeyFrq=0.5,MaxGapFrq=0.5;
	Int4	min_nats=5;
	double	fract_ignored[20];
	double	A0=GlobalA0,B0=GlobalB0;
	char	Mode='R';	// can only use random order so that all sequences are in FG.
	Int4	Contrast=DefaultMaxCol,contrast=-1;

	double	rho=Global_rho; // 0.003;
	double	PriorRi=GlobalRi; // 0.03;
	Int4	min_num_col=DefaultMinCol;
	Int4	max_num_col=DefaultMaxCol;
	char	*pttrn_str=0;	// owned copy of the -P= pattern, if one was given.

	Int4	n=Level;
	char    sets_mode=this->SetsMode[n];
// if(n==1) sets_mode='R'; // Treat Root node differently. afn5_27_2022.
// ^ when turning this on 'bpps H' no longer works!!
	SFBG[n]='B';

	fract_ignored[1]=0.0; fract_ignored[2]=0.0;
   //********************** for public consumption. *************************
   if(Hpt->RtnMode() == 'I'){	// for public consumption.
	print_error("this mcBPPS option has been turned off");
	Int4	weight=-1,quality=-1,noise=-1;
	for(arg = 0; arg < argc; arg++){
	   fprintf(stderr,"argv[%d] = %s\n",arg,argv[arg]);
	   if(argv[arg][0] != '-') Hpt->PrintError(); 
	   switch(argv[arg][1]) {
	    case 'c': {
                 if(sscanf(argv[arg],"-contrast=%d",&contrast)==1){
		    if(contrast < 2) Hpt->PrintError();
		 } else Hpt->PrintError(); } break;
	     case 'n': 
                 if(sscanf(argv[arg],"-noise=%d",&noise)==1){
		    if(noise < 0 || noise > 50) Hpt->PrintError();
		 } else Hpt->PrintError(); break;
	     case 'q': 
                 if(sscanf(argv[arg],"-quality=%d",&quality)==1){
		    if(quality < 0 || quality > 9) Hpt->PrintError();
		 } else Hpt->PrintError(); break;
	     case 'w': 
                 if(sscanf(argv[arg],"-weight=%d",&weight)==1){
		    if(weight < 2 || weight > 1000) Hpt->PrintError();
		 } else Hpt->PrintError(); break;
	     case ' ': break;	// ignore these...
	     default: Hpt->PrintError(); break;
	   }
	}  // set formal parameters using informal input.
	if(quality != -1){
	   switch (quality){
	     case 0: PriorRi=0.9; break;
	     case 1: PriorRi=0.75; break;
	     case 2: PriorRi=0.5; break;
	     case 3: PriorRi=0.25; break;
	     case 4: PriorRi=0.1; break;
	     case 5: PriorRi=0.05; break;
	     case 6: PriorRi=0.02; break;
	     case 7: PriorRi=0.01; break;
	     case 8: PriorRi=0.005; break;
	     case 9: PriorRi=0.001; break;
	     default: Hpt->PrintError(); break;
	   }
	}
	if(contrast != -1) Contrast=contrast;
	else Contrast=max_num_col;
	if(noise == -1) noise=10;	// 0.90 okay
	if(weight == -1) weight=10;	// -A10:2 by default...
	A0 = 1.0+(double)weight*((100.0-(double) noise)/100.0);
	B0 = 1.0+(double)weight*((double) noise/100.0);
   } else {   //********************** for internal use. *************************
	rho=Global_rho;
	A0=GlobalA0,B0=GlobalB0; PriorRi=GlobalRi; 
	for(Int4 row=1; row <= Hpt->NumSets(); row++){
		char cell=Hpt->Cell(row,Level);
		if(cell == '+' && Hpt->TypeOfSet(row) == '?'){
			A0=MiscGlobalA0,B0=MiscGlobalB0; PriorRi=MiscGlobalRi; break;
		}	// for FGs with a miscellaneous set use more liberal settings.
	}
	for(arg = 0; arg < argc; arg++){
	   if(efp) fprintf(stderr,"argv[%d] = %s\n",arg,argv[arg]);
	   if(argv[arg][0] != '-') print_error(CMC_USAGE_START);
	   switch(argv[arg][1]) {
             case 'A': if(sscanf(argv[arg],"-A%lf:%lf",&A0,&B0)==2){
			if(A0 <= 0.0 || B0 <= 0.0){
	   		   fprintf(stderr,"A0 = %lf; B0 = %lf\n",A0,B0);
			   print_error(CMC_USAGE_START); 
			}
			argv[arg][1] = ' '; 
		       } else print_error(CMC_USAGE_START); 
		break;
	     case 'B':
		if(sscanf(argv[arg],"-B=%c",&SFBG[n])==1){
		  if(!(SFBG[n]=='M' || SFBG[n]=='B' || SFBG[n]=='A')) print_error(CMC_USAGE_START);
		} else print_error(CMC_USAGE_START); 
		  argv[arg][1] = ' '; 
		break;
	     case 'c': 
	      if(sscanf(argv[arg],"-col=%d:%d",&min_num_col,&max_num_col)==2){
			if(min_num_col < 2 || min_num_col > max_num_col){
				fprintf(stderr,"Min(%d)/Max(%d) number columns out of range\n",
					min_num_col,max_num_col);
				print_error(CMC_USAGE_START);
			} argv[arg][1] = ' '; 
	      } else if(sscanf(argv[arg],"-compare=%c",&compare)==1){
			// cast: <cctype> functions require an unsigned char value.
                        if(!isupper(static_cast<unsigned char>(compare))){
				print_error(CMC_USAGE_START);
			}
		  	argv[arg][1] = ' '; 
	      } else {
		// NEW method based on fraction of poorest seqs to ignore.
		// for getting separate values for each partitions
		if(argv[arg][2] != '=') print_error(CMC_USAGE_START);
		Int4 num_reals = ParseReals(argv[arg] + 3,fract_ignored,CMC_USAGE_START);
		if(num_reals != 2) print_error(CMC_USAGE_START);
		// shift the two values from slots 0,1 to slots 1,2.
		fract_ignored[2]=fract_ignored[1];
		fract_ignored[1]=fract_ignored[0];
		if(fract_ignored[2] < 0.0 || fract_ignored[2] > 0.9) print_error(CMC_USAGE_START);
		if(fract_ignored[1] < 0.0 || fract_ignored[1] > 0.9) print_error(CMC_USAGE_START);
                argv[arg][1] = ' '; 
	      } break;
	     case 'm': 
		min_nats=IntOption(argv[arg],'m',0,1000,CMC_USAGE_START); 
                argv[arg][1] = ' '; 
		break;
	     case 'M': MinKeyFrq=RealOption(argv[arg],'M',0.0,1.0,CMC_USAGE_START);
                argv[arg][1] = ' '; break;
	     case 'N': 
             	if(sscanf(argv[arg],"-N=%d",&contrast)==1){
			if(contrast < 1) print_error(CMC_USAGE_START);
			argv[arg][1] = ' ';
                } else print_error(CMC_USAGE_START); 
		break;
	     case 'P': 
		if(argv[arg][2] == '=' && argv[arg][3] == 0) continue; // skip "-P= " strings.
		if(argv[arg][2] == '=' && isalpha(static_cast<unsigned char>(argv[arg][3]))){
			argv[arg][1] = ' ';
			if(pttrn_str) free(pttrn_str);	// a later -P= replaces an earlier one.
			pttrn_str=AllocString(argv[arg]+3);
		} else print_error(CMC_USAGE_START);
		break;
             case 'R':
		if(sscanf(argv[arg],"-Ri=%lf",&PriorRi)==1){
		   if(PriorRi <= 0.0 || PriorRi >= 1.0){
		      fprintf(stderr,"Ri (%.2f) out of range\n",PriorRi);
		      print_error(CMC_USAGE_START);
		   }
		   argv[arg][1] = ' ';
                } else print_error(CMC_USAGE_START); 
		break;
	     case 'r': 
	        if(sscanf(argv[arg],"-rho=%lf",&rho)==1){
		   if(rho <= 0.0 || rho > 0.5){
		        fprintf(stderr,"rho (%.2f) out of range\n",rho);
			print_error(CMC_USAGE_START);
		   }
		   argv[arg][1] = ' '; 
                } else print_error(CMC_USAGE_START); 
		break;
	     case 's': 
#if 0
	        if(sscanf(argv[arg],"-sets=%c",&sets_mode)==1){
		   if(sets_mode != 'M' && sets_mode != 'R' 
			&& sets_mode != 'G' && sets_mode != 'O' && sets_mode != 'L'){
				print_error(CMC_USAGE_START);
		   }
		} else print_error(CMC_USAGE_START);
		argv[arg][1] = ' '; break;
#else
		print_error(CMC_USAGE_START); break;
#endif
	     case 'v':
		// This test used to sit under case 'c', where it could never
		// match, so that "-verbose" was rejected as an illegal option.
		if(strcmp("-verbose",argv[arg]) == 0){ verbose=TRUE; }
		else {
		   fprintf(stderr,"illegal input option (%c)\n",argv[arg][1]);
		   print_error(CMC_USAGE_START);
		}
		break;
	     case 0: print_error(CMC_USAGE_START); break;
	     case ' ': break;	// ignore these...
	     default: 
		fprintf(stderr,"illegal input option (%c)\n",argv[arg][1]);
		print_error(CMC_USAGE_START); break; // do nothing.
	   }   // end of switch scope
	}	// end of argument loop.
	if(contrast != -1) Contrast=contrast;
	else if(GlobalN > 0) Contrast=GlobalN; else Contrast=max_num_col;
    } // End of if(Hpt->RtnMode() == 'I' else 
	if(Level == 0){		// parse only; nothing to create.
	   if(pttrn_str) free(pttrn_str);
	   return 0;
	}

	IN_CMA=chn[Level]->GetIN_CMSA();
	set_mode[n]=sets_mode;
	sqd[n]=new sqd_typ(IN_CMA[2],IN_CMA[1],MinKeyFrq,MaxGapFrq,sets_mode);
	SST[n]=sqd[n]->LegalResSets( ); // Get legal sets for each run (will change during a search).
	// Need to initialize legal sets to be subsets of positions higher up the tree.
	e_type Query=FakeSeqCMSA(1,IN_CMA[1]);
	// categorical distribution given # sets...geometric weighting based on rho 
	double **Rho=GetRhoCategoricalPriors(efp, LenSeq(Query), rho, SST[n], AB);
	if(efp) fprintf(efp," seed pattern %d.\n",n);
	if(sst_str[n] && verbose) fprintf(stderr,"pattern %d: '%s'\n",n,sst_str[n]);
	if(pttrn_str != 0){
	   // The -P= pattern only applies when no pattern is defined yet;
	   // otherwise the copy is released instead of being leaked.
	   if(sst_str[n]==0) sst_str[n]=pttrn_str;
	   else free(pttrn_str);
	   pttrn_str=0;
	}
	// should order the FG and BG sequences to match the input ordering.
	// Then should look for seed patterns.
	che[n] = new che_typ(sst_str[n],chn[n],A0,B0,verbose,Mode,Rho,PriorRi);
	for(Int4 j=1; j <= LenSeq(Query); j++) free(Rho[j]);
	free(Rho);
	che[n]->BeQuiet( );
	che[n]->SetMinNumColumns(min_num_col);
	MaxNumCol[n]=max_num_col;
	che[n]->SetMaxNumColumns(LenSeq(Query)+2);
	if(Contrast > 0) che[n]->SetContrast(Contrast);
#if 0	// DEBUG...
cma_typ *in_CMA=chn[n]->GetIN_CMSA();
fprintf(stderr,"####### %d. NumSeqsCMSA=%d\n",n,NumSeqsCMSA(in_CMA[1]));
#endif
	return 1;
}

Int4	cmc_typ::ReadSeedPttrns( )
//************* get the seed pattern strings from the hyperpartition *************
// For each analysis i = 1..Hpt->NumBPPS():
//   - check that no group is assigned to both the foreground and the
//     background of the analysis (fatal otherwise);
//   - release any pattern string currently stored in sst_str[i] and replace
//     it by a copy of Hpt->sst_str(i), or by 0 when NoSeeds is set or the
//     hyperpartition defines no pattern for that analysis.
// Returns the number of analyses, Hpt->NumBPPS().
{
      Int4 i,f,b;
      for(i=1; i <= Hpt->NumBPPS(); i++){
	   //****************** Read partition information ***************
	   assert(Hpt->GrpName(i)!=0);
	   // check to make sure FG and BG don't overlap; later need to do within hpt_typ only. 
	   for(f=1; f <= Hpt->nGrpsFG(i); f++){
	     for(b=1; b <= Hpt->nGrpsBG(i); b++){
		if(Hpt->GrpsFG(i,f)==Hpt->GrpsBG(i,b)){
		   fprintf(stderr,"Analysis #%d: group %d assigned to both FG & BG sets.\n",
				i,Hpt->GrpsFG(i,f));
		   print_error("Fatal: FG & BG set overlap disallowed.");
		}
	     }
	   }
	   // Free the old string itself, not the sst_str array that holds it.
	   if(sst_str[i]){ free(sst_str[i]); sst_str[i]=0; }
	   if(!NoSeeds && Hpt->sst_str(i)!=0) sst_str[i]=AllocString(Hpt->sst_str(i));
      }
      return Hpt->NumBPPS();
}

