/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "cmc_typ.h"
#include "blosum62.h"

void    cmc_typ::ReSetRelations(FILE *fp)
// Recompute FG and BG sets and relationships.
{
	Int4	x;
	for(x=1; x <= Hpt->NumBPPS(); x++) NilSet(SetBG[x]);  free(SetBG);
	for(x=1; x <= Hpt->NumBPPS(); x++) NilSet(SetFG[x]);  free(SetFG);
	for(x=1; x <= Hpt->NumBPPS(); x++) free(RelateFGs[x]);  free(RelateFGs);
	for(x=1; x <= Hpt->NumBPPS(); x++) free(RelateBGs[x]);  free(RelateBGs);
	RelateFGs=SetRelationsFG( ); RelateBGs=SetRelationsBG( );
	if(fp){
    	  fprintf(fp,"FG set:\n");
	  for(x=1; x <= Hpt->NumBPPS(); x++){ fprintf(fp,"%d. ",x); PutSet(fp,SetFG[x]); }
	  PutSetRelations(fp);
    	  fprintf(fp,"BG set:\n");
	  for(x=1; x <= Hpt->NumBPPS(); x++){ fprintf(fp,"%d. ",x); PutSet(fp,SetBG[x]); }
	  fflush(fp);
	}
}

BooLean	cmc_typ::SampleHpt(FILE *fp,Int4 SampledCol, Int4 Root, wdg_typ &Tree)
// Try to turn background ('-') cells into foreground ('+') cells within the
// miscellaneous ('?') columns of the hyperpartition, keeping a change only when
// it raises the total LPR by at least 10.0.
//   fp         - optional log stream; nothing is logged when it is null.
//   SampledCol - when non-zero, only this column is examined.
//   Root       - root node passed on to AddEdgeToTree() and MergeOldTreeIntoNew().
//   Tree       - optional tree; when non-null an edge is added for every accepted
//                change and, before returning, the old tree is merged into the new
//                one, the old tree is freed and Tree is set to the new tree.
// Always returns TRUE.
// WARNING: this routine assumes direct correspondence between column and rows!!
// So that the diagonal equates rows (sets) with columns (FD-categories)!!
{
    // 3. Change that cell and recompute LPR.
    // 4. If this improves the LPR then accept the change, else revert back... 

    Int4	g,n,r,c,x;
    char	state;
    double	olpr,nlpr;
    Int4	row_neg,row_pos,row_omit,col_neg,col_pos,col_omit;
    Int4	sq,num_sq = NumSeqsCMSA(MainCMA);

    wdg_typ NewTree=0;	// NewTree to merge with old tree...
    if(Tree){ NewTree=MkWdgraph(WdgraphN(Tree),WdgraphM(Tree)); }

    assert(Hpt->NumSets() == Hpt->NumBPPS() +1); // assumes direct correspondence between column and rows!!
    // Start at n=2 because don't want to modify main set...
    for(n=2; n<= Hpt->NumBPPS(); n++){	// 2. Find a column to work on.

//************ Testing this for cdhBPPS routine: adding an internal node...
	if(SampledCol && SampledCol != n) continue;

	g=n; 	// assume direct correspondence between column and rows!!
	if(Hpt->TypeOfSet(g) != '?') continue; // find a misc. column 
    	// 1. Found miscellaneous column to change within the hpt.
	char    **HP=HyperPartition;
	// start with g == 2; don't allow main set to be in foreground!!!
        for(g=2; g < Hpt->NumSets(); g++){		// for each row... Don't look at Rejects...
	    if(Hpt->Cell(g,n) == '+') continue;	// don't look at positive cells right now...
	    if(Hpt->Cell(g,n) == 'o') continue;	// don't look at omitted cells right now...
	    assert(Hpt->Cell(g,n) == '-');
	    Hpt->RowsInColumn(n,row_neg,row_pos,row_omit);   // count # '-' cells in column n.
	    // Only attempt a change while at least two '-' rows remain in this column.
	    if(row_neg < 2) continue; 
	    olpr = CalcTotalLPR(0,FALSE);
	    // Tentatively move every sequence of set g into the foreground of column n.
	    for(sq=1; sq <= num_sq; sq++){
		    if(MemberSet(sq,GrpSet[g])){ che[n]->SetPartition('+',sq); }
	    } nlpr=CalcTotalLPR(0,FALSE);
	    double d=nlpr-olpr;
	    if(d < 10.0){	// then revert back to previous..
	      if(fp) fprintf(fp,"Failed to change HP[%d][%d] from '-' to '+' (delta=%.3f).\n",g,n,d);
		for(sq=1; sq <= num_sq; sq++){
		     if(MemberSet(sq,GrpSet[g])){ che[n]->SetPartition('-',sq); } }
	    } else {	// accept change...
	      BooLean OkayToAdd=TRUE;
	      if(Tree){		// Then add an edge from set in row g to the misc node n within the tree.
		// WARNING: This assumes that Hpt was first generated from a tree! Won't work otherwise.
		x=(Int4) floor(d+0.5);	// LPR gain rounded to the nearest integer; passed to AddEdgeToTree().
		if(Hpt->AddEdgeToTree(n,g,Root,x,Tree,NewTree) == 0) OkayToAdd=FALSE;
		// PrintNewickTreeWDG(stderr,Root, Tree); // need to pass in Root
	      } // Add edges to the input Tree.
	      if(OkayToAdd){ 
	        if(fp) fprintf(fp,"Changed HP[%d][%d] from '-' to '+' (delta=%.3f).\n",g,n,d);
	        // Record the change in the hyperpartition itself and refresh the set relations.
	        Hpt->Change('-','+',g,n); ReSetRelations(); assert(HP[g][n] == '+');
	      } else { // revert back...
		for(sq=1; sq <= num_sq; sq++){ if(MemberSet(sq,GrpSet[g])){ che[n]->SetPartition('-',sq); } }
	      }
	    }
	}
    } 
    // Fold the old tree into the new one, free the old tree and hand the new tree back to the caller.
    if(Tree){ Hpt->MergeOldTreeIntoNew(Root,Tree,NewTree); NilWdgraph(Tree); Tree=NewTree; }
    CalcTotalLPR( );
    return TRUE;
}

Int4	cmc_typ::RemoveSimilarSets( )
// Find pairs of sets whose patterns are largely interchangeable and remove the
// weaker set of each such pair.
//
// For every pair (i,j) of sets that are neither miscellaneous ('?') nor failed,
// the two patterns are swapped, the LPRs are recomputed and the patterns are put
// back.  When the swapped total LPR and both swapped Map[] values are positive, and
// each swapped Map[] value is at least half of the set's own Map[] value, the set
// with the smaller Map[] value is flagged as failed, its sequences are transferred
// to the other set and all of its pattern columns are removed.
// Diagnostics for every positively scoring swap are written to stderr.
// Returns the number of sets removed.
{
	Int4	  g,h,i,j,k,n,lenI,lenJ,NumRemoved=0;
	bpps_typ  *ppsI,*ppsJ;
	sst_typ	  *sstI,*sstJ;
	double	  lpr,d,x,xII,xIJ,xJJ,xJI;

	for(i=1; i < Hpt->NumSets(); i++){	// skip Rejected set..
	   if(Hpt->TypeOfSet(i) == '?') continue;
	   if(IsFailedSet[i]) continue;
	   ppsI=che[i]->BPPS();
	   lenI=ppsI->LenPattern( );
	   sstI=ppsI->RtnCopySST();		// copy of pattern i; freed at the end of this pass.
	   for(j=i+1; j < Hpt->NumSets(); j++){	// skip Rejected set..
	   	if(Hpt->TypeOfSet(j) == '?') continue;
		// Set i can be flagged as failed inside this loop, so it is re-checked here;
		// set j must be checked as well so that an already removed set (no pattern
		// columns left) is never compared again.
	   	if(IsFailedSet[i] || IsFailedSet[j]) continue;
		ppsJ=che[j]->BPPS();
	   	lenJ=ppsJ->LenPattern( );
		assert(lenI == lenJ);
	   	sstJ=ppsJ->RtnCopySST();	// copy of pattern j; freed at the end of this iteration.
		lpr=CalcTotalLPR(0,FALSE); xII =Map[i]; xJJ=Map[j];	// each set with its own pattern.

	// WARNING: this bypasses some restrictions for che_typ, such as min number column requirement...
		ppsI->ReplacePattern(sstJ); ppsJ->ReplacePattern(sstI);
		x=CalcTotalLPR(0,FALSE); xIJ=Map[i]; xJI=Map[j];	// each set with the other's pattern.
		ppsI->ReplacePattern(sstI); ppsJ->ReplacePattern(sstJ); // put pattern back...
		d=CalcTotalLPR(0,FALSE);
		if(x > 0.0 && xIJ > 0.0 && xJI > 0.0){
		  double fxJI=xJI/xJJ;	// fraction of set j's own Map value obtained with pattern i.
		  double fxIJ=xIJ/xII; 	// fraction of set i's own Map value obtained with pattern j.
		  char c=' ';
		  if(fxJI >= 0.50 && fxIJ >= 0.50) c='*';	// '*' marks an interchangeable pair.
		  fprintf(stderr,"%d vs %d swapped lpr = %.2f; lpr =%.2f (%.2f)\n",i,j,x,lpr,d);
		  fprintf(stderr,"   Map[%d] ptrn %d = %.3f (%.3f)%c\n",i,j,xIJ,xII,c);
		  fprintf(stderr,"   Map[%d] ptrn %d = %.3f (%.3f)%c\n",j,i,xJI,xJJ,c);
		  if(c=='*'){		// remove the weaker set (g); keep the other (h).
		    if(xII < xJJ){ IsFailedSet[i]=TRUE; IsFailedBPPS[i]=TRUE;  g=i; h=j; }
		    else { IsFailedSet[j]=TRUE; IsFailedBPPS[j]=TRUE;  g=j; h=i; }
		    TransferAllSeqs(g,h); NumRemoved++; // move sequences from failed set g to h;
		    assert(g <= Hpt->NumBPPS()); n = g;
		    che[n]->SetMinNumColumns(0);	// this is required to remove all columns!!
		    Int4 Length=LenSeq(che[n]->KeyE());
		    for(k=1; k <= Length; k++){ if(che[n]->PttrnPos(k)) che[n]->RemoveColumn(k); }
		  }
		} free(sstJ);
	   } free(sstI);
	} return NumRemoved;
}

double	cmc_typ::SampleSeq(Int4 sq, double oLPR)
// Reassign sequence sq to an elementary set.
// A sequence that belongs to the Labeled set is left untouched and oLPR is
// returned unchanged.  Otherwise the sequence is removed from its current set,
// each category's LPR is evaluated with the sequence in the 'o', '+' and '-'
// partitions, and every non-failed set g is scored as the sum, over all columns,
// of the value selected by row g of the hyperpartition.
// With temperature > 0 a set is drawn with weight exp(score - best) raised to the
// power 300/temperature (sets more than 14 below the best get weight zero);
// otherwise the best scoring set is taken.
// Side effects: updates GrpSet[], the che[] partitions, Map[] and TotalLPR, and
// may call StoreBest().  Returns the new TotalLPR.
{
    if(MemberSet(sq,Labeled)) return oLPR;
    const double NEG_INFINITY=-1.0e+100;
    char	**HP=HyperPartition;
    double	nLPR,bestLPR;
    Int4	n,g,bestG,lastG;

    // 1. find the current set = lastG.
    // (lastG stays unset if the sequence is in no set; it is not read after this loop.)
    for(g=1; g <= Hpt->NumSets(); g++){	
      if(MemberSet(sq,GrpSet[g])){ lastG=g; DeleteSet(sq,GrpSet[lastG]); break;} 
    }

    // 2. Obtain subLPRs for fast computing.
    // ColumnMap is indexed by the partition character; only the 'o', '+' and '-'
    // entries are allocated and used.
    double	*Dbl,*ColumnMap[256]; NEW(ColumnMap['o'],Hpt->NumBPPS()+3,double);
    NEW(ColumnMap['+'],Hpt->NumBPPS()+3,double); NEW(ColumnMap['-'],Hpt->NumBPPS()+3,double);
    // update NullParameters before removing sequence...!
    for(n=1; n<= Hpt->NumBPPS(); n++){
	// Element 0 of SubMap() is stored as category n's value for each placement of sq.
	che[n]->SetPartition('o',sq); Dbl=che[n]->SubMap( ); ColumnMap['o'][n]=Dbl[0];
	// can later speed up the below by only checking the, say, 5 best FG sets.
	che[n]->SetPartition('+',sq); Dbl=che[n]->SubMap( ); ColumnMap['+'][n]=Dbl[0]; 
	che[n]->SetPartition('-',sq); Dbl=che[n]->SubMap( ); ColumnMap['-'][n]=Dbl[0]; 
    }

    // 3. Store LPRs for each set assignment.
    double	*SetLPR; NEW(SetLPR,Hpt->NumSets()+3,double); bestG=0; bestLPR=NEG_INFINITY; 
    double	*deltaProb; NEW(deltaProb,Hpt->NumSets()+3,double);
    for(g=1; g<= Hpt->NumSets(); g++){	
	// Failed sets get the sentinel score so that they can never be chosen as best.
	if(IsFailedSet[g]){ SetLPR[g]=NEG_INFINITY; continue; }
	double d0,dp=0.0,dm=0.0,d_o=0.0;
        for(n=1; n<= Hpt->NumBPPS(); n++){
	   switch(HP[g][n]){	// Ratio of being in the FG vs BG at these positions.
	    case '+': dp += ColumnMap['+'][n]; break;
	    case '-': dm += ColumnMap['-'][n]; break;
	    case 'o': d_o += ColumnMap['o'][n]; break;
	    default: print_error("input error"); break;	
	   }
	} d0= dp + dm + d_o; SetLPR[g]=d0;
	if(d0 > bestLPR){ 
		// fprintf(stderr,"sq %d: LPR %.2f --> %.2f.\n",sq,bestLPR,nLPR);
		bestLPR=d0; bestG=g; 
	}
    }

   // AddSet(sq,GrpSet[lastG]);	// No need to do this...
   // 4. Sample an elementary set for this sequence...
   double s,rand,sum=0.0,d;
   Int4 sampledG=0;
   if(temperature > 0.0){	// then reset bestG based on sampling...
    assert(temperature <= 300.0);
    for(g=1; g <= Hpt->NumSets(); g++){
	d=SetLPR[g]-bestLPR;	// Likelihood ratio vs best.
	assert(d <= 0.0);
	if(d < -14) deltaProb[g]=0.0;   // d < -14 --> < 1 millionth chance to sampled. 
	else { d = std::exp(d);	
	  if(temperature == 300.0){ deltaProb[g]=d; sum += d; }
	  else { 
		// Exponent 300/temperature: values above 1 (temperature below 300) sharpen
		// the distribution towards the best set.
		double y=(300.0/temperature);
		assert(!(d==0 && y <= 0)); assert(d >= 0);
		if(d == 0) deltaProb[g]=0.0;
                else {
		  d = pow(d,y);
		  if(!std::isfinite(d)) assert(!"isfinite failed");   // confirms that d is okay.
		  deltaProb[g]=d; sum += d; 
		}
	  }
	}
    }
    // Pick the first set at which the running total of weights reaches a random
    // point within [0,sum].
    rand = sum * ((double) Random()/(double) RANDOM_MAX); // 0 <= random number <= sum.
    for(s=0.0, g=1; g <= Hpt->NumSets(); g++){
	if(deltaProb[g] == 0.0) continue;
	s += deltaProb[g]; 
	if(s >= rand){ sampledG=g; break; }
    } assert(s >= rand && s <= sum);
   } else sampledG=bestG; 

   // 5. Add the Seq. to the sampled set.  ; SetPartition were probably not set right...
   AddSet(sq,GrpSet[sampledG]); 
   TotalLPR=SetLPR[sampledG];
   for(n=1; n<= Hpt->NumBPPS(); n++){
	che[n]->SetPartition(HP[sampledG][n],sq);
	Map[n]=ColumnMap[HP[sampledG][n]][n]; // <-- This should be the same as CalcTotalLPR( );
   } 

   // 6. Free memory...
   free(SetLPR); free(deltaProb);
   free(ColumnMap['o']); free(ColumnMap['+']); free(ColumnMap['-']); // move below later when use for map!
   
   // 7. Check the value of LPR and, if okay and > best, then store it...
   // Storing is suppressed when SaveBest is off or when any category currently holds
   // more columns than MaxNumCol[n].
   BooLean StoreBestOK=FALSE;
   if(SaveBest) StoreBestOK=TRUE; 
   for(n=1; n<= Hpt->NumBPPS(); n++){ if(che[n]->NumColumns() > MaxNumCol[n]) StoreBestOK=FALSE; }
   CheckValue(TotalLPR);
   if(StoreBestOK && TotalLPR > 0.0 && TotalLPR > BestLPR) StoreBest();
   return TotalLPR;
}

BooLean	cmc_typ::IsConflict(Int4 k, Int4 better, Int4 worse)
// Is there a conflict between the patterns at position k in categories
// 'better' (n) and 'worse' (m)?
//   k      - pattern position; both categories must have a pattern set there.
//   better - the category whose pattern at k takes precedence.
//   worse  - the category whose pattern at k would be dropped on conflict.
// Returns TRUE when the two patterns may not coexist at k, else FALSE.
//
// The decision uses the relationship codes between the two foregrounds
// (RelateFGs[n][m]) and the two backgrounds (RelateBGs[n][m]):
//   '0' disjoint, '=' identical, '+' overlapping, '<' n is a proper subset of m,
//   '>' n is a proper superset of m.
// Rules:
//   - disjoint foregrounds never conflict;
//   - otherwise, disjoint (non-intersecting) patterns never conflict;
//   - a pattern nested in the same direction as the foregrounds ('<' with qst_n
//     inside qst_m, or '>' with qst_m inside qst_n) never conflicts;
//   - every remaining case conflicts under StrictIndepend, and otherwise only
//     when the backgrounds are not disjoint.
{
	const Int4 n=better,m=worse;

	assert(n != m);			// This should not happen.
	sst_typ qst_n=che[n]->BPPS()->RtnSST(k); assert(qst_n);
	sst_typ qst_m=che[m]->BPPS()->RtnSST(k); assert(qst_m);
	const char relFG=RelateFGs[n][m],relBG=RelateBGs[n][m];

	// If the FGs are distinct then the pattern positions are irrelevant.
	if(relFG == '0') return FALSE;

	if(qst_n != qst_m){
	  // Patterns that share no residues cannot clash.
	  sst_typ ist=IntersectSset(qst_n,qst_m);
	  if(ist == 0) return FALSE;
	  switch (relFG){
	   case '=': 			// identical FG sets.
	   case '+': 			// FG sets overlap but are neither identical nor nested.
	     break;
	   case '<':			// FG_n is a proper subset of FG_m
	     if(qst_n == ist) return FALSE;	// qst_n is a proper subset of qst_m; okay.
	     break;
	   case '>':			// FG_n is a proper superset of FG_m
	     if(qst_m == ist) return FALSE;	// qst_m is a proper subset of qst_n; okay.
	     break;
	   default:
	     print_error("cmc_typ::IsConflict(). This should not happen (1).\n");
	     return FALSE;
	  }
	}
	// Either the patterns are identical or they overlap without acceptable nesting.
	if(StrictIndepend) return TRUE;
	return (relBG != '0') ? TRUE : FALSE;	// conflict only if the BGs overlap as well.
}

double	cmc_typ::SampleColumns(BooLean UseNegCol)
// Resample the pattern columns of every non-failed category and return the
// resulting total LPR.
//   UseNegCol - currently has no effect; its only use below is commented out.
// Outline:
//   1. For each category, remove and resample the pattern at every position (two
//      passes); on the first pass a sampled column whose sub-LPR is below -10 is
//      removed again.
//   2-5. Recompute the sub-LPRs, drop every pattern column whose sub-LPR is not
//      positive and order the remaining (category,position) pairs from best to worst.
//   6. Walk that order: once a category holds MaxNumCol[n] columns its remaining
//      columns are removed; a column at a position already used by a better
//      category is removed when IsConflict() reports a conflict.
//   7. Free the work arrays, recompute the LPR and report start/end values on stderr.
#if 0	//*****************************************************************
// Operations: remove or add columns; move sequences up or down.
set options:
   A & B != 0 (Intersecting but 	-->	
   	A < B	  (subset)	-->	only subpatterns allowed.
	A !< B	  (not subset)	-->	no common pattern positions allowed.
   A & B = 0 (disjoint)		-->	All common pattern positions allowed...
 					(As long as transitive non-existent)
	(A | C) & (B | D) != 0 	-->	C and D intersect. 
	Implies that FG BG sampling is coupled between sets.
#endif	//*****************************************************************
{
     Int4  i,j,k,m,n;

// 1. Sample in all columns that most positively contribute to the LPR. 
     double startLLR=CalcTotalLPR(efp); // computes SubLPR[n][k] for all.
     Int4 Length=LenSeq(che[1]->KeyE());
	// Local copy of the temperature, capped at 300 for SamplePattern() below.
	double temp = temperature; 
    if(temp > 300) temp=300;
#if 0	// this seems to work about as well for one test domain.
     for(n=1; n<= Hpt->NumBPPS(); n++){
	if(IsFailedBPPS[n]) continue;
	assert(LenSeq(che[n]->KeyE()) == Length);  // make sure that all same # columns!
        for(k=1; k <= Length; k++) che[n]->AddBestColumn(Map[n],k,sst[n][k]);
     }
#else
    for(j=1; j <=2; j++){  // do this twice to converge on stable alpha...
     if(efp) fprintf(efp,"loading up columns....\n");
     for(n=1; n<= Hpt->NumBPPS(); n++){
	if(IsFailedBPPS[n]) continue;
	assert(LenSeq(che[n]->KeyE()) == Length);  // make sure that all same # columns!
        // for(k=1; k <= Length; k++) che[n]->AddBestColumn(Map[n],k,SST[n][k]);
	// Lift the column-count limits while resampling; the minimum is restored below.
	m=che[n]->RtnMinNumColumns();
	che[n]->SetMinNumColumns(0);
	che[n]->SetMaxNumColumns(Length);	// redundant; also set above...
        for(k=1; k <= Length; k++){
	    che[n]->RemoveColumn(k);	// want to recalculate LPR here.
	    // che[n]->AddBestColumn(-99999999999999.9,k,SST[n][k]);
	    // if(che[n]->SamplePattern(-99999999999999.9,k,SST[n][k],temperature)){
	    if(che[n]->SamplePattern(-99999999999999.9,k,SST[n][k],temp)){
		double  *sub_lpr=che[n]->SubMap();
		if(sub_lpr[k] < -10.0){
		   if(0 && efp){ 
		        bpps_typ *bpps=che[n]->BPPS();
			fprintf(efp,"Added negative col %d from analysis %d (%.2f;%.3f).\n",
				k,n,sub_lpr[k],bpps->Alpha( ));
		   }
		   // Strongly negative columns are removed again on the first pass only.
		   if(j == 1) che[n]->RemoveColumn(k);
		   // alpha changes when add or remove columns and this changes LPR at other sites!
		   // null_LPR depends only on priors --> priors effect zero cutoff.
		   // a negative column can increase the LPR by changing alpha...
		}
	    } else assert(!che[n]->PttrnPos(k)); // if failed then there should be no pattern.
	} che[n]->SetMinNumColumns(m);
	// che[n]->SetMaxNumColumns(MaxNumCol[n]);
     }
    }
#endif

//================================================================================
   // 2. ****************** Create a pattern column max heap... ****************
//================================================================================
   // Entries 1..TotalBestToWorst hold the (position, category) pairs from best to worst.
   Int4 *BestToWorstCol,*BestToWorstCat,TotalBestToWorst;
   {	// begin of pch scope.
     double key;
     pch_typ pch = pch_typ(Hpt->NumBPPS(),Length);
     // 3. Compute the subLPR for all categories and positions...
     CalcTotalLPR(efp,FALSE); // computes SubLPR[n][k] for all.
     if(efp) fprintf(efp,"sampling columns....\n");
     // 4. Insert all positive columns into pch.
     for(j=0,n=1; n<= Hpt->NumBPPS(); n++){
	if(IsFailedBPPS[n]) continue;
	for(k=1; k <= Length; k++){
	   if(che[n]->PttrnPos(k)){
	      key = SubLPR[n][k];
	      // if(UseNegCol || key > 0.0){ j++; pch.Insert(j,key,n,k); }
	      // if(key > -10.0){ j++; pch.Insert(j,key,n,k); }
	      if(key > 0.0){ j++; pch.Insert(j,key,n,k); }
	      else {
		if(efp) fprintf(efp,"Removing negative col %d from analysis %d (%f).\n",k,n,key);
		// che[n]->RemoveColumn(k); // recalculates LPR!!! changes SubLPR[n][k] !!!
		che[n]->RmColButKeepLPR(k); // removes column without recalculating LPR.
	      }
	   } else assert(SubLPR[n][k] == 0.0);
	}
     } TotalBestToWorst=pch.NumItems(); assert(TotalBestToWorst == j);

// 5. Order all pattern-positions by contribution to total LPR;
// fprintf(stderr,"Ordering pattern-positions by contribution to total LPR.\n");
     NEW(BestToWorstCol,TotalBestToWorst+3,Int4);
     NEW(BestToWorstCat,TotalBestToWorst+3,Int4);
     for(j=0; !pch.Empty(); ){
	i=pch.DeleteMax(key,n,k);
	j++; BestToWorstCol[j]=k; BestToWorstCat[j]=n;
     } assert(TotalBestToWorst == j);
   }
//================================================================================
//**************************** end of pch scope. *********************************
//================================================================================

// 6. "Add in" columns starting from the best to the worst. 
// fprintf(stderr,"Adding columns starting from the best to the worst (%d).\n",TotalBestToWorst); 
    // NOTE: TotalAddedCol is indexed by category n (number of columns kept for n);
    // TotalAddedCat is indexed by position k (number of categories kept at k).
    Int4 *TotalAddedCol,*TotalAddedCat;
    NEW(TotalAddedCol,Hpt->NumBPPS()+3,Int4); NEW(TotalAddedCat,Length+3,Int4);
    for(j=1; j <= TotalBestToWorst; j++){
	n=BestToWorstCat[j]; 
	if(n == 0) continue; 			// this column has already been removed.
	if(TotalAddedCol[n] == MaxNumCol[n]){	// if the maximum have been added, then remove the rest.
	  for(i=j; i <= TotalBestToWorst; i++){	// remove all remaining columns in this category.
	    if(BestToWorstCat[i] == n){			// is this the same category?
		k=BestToWorstCol[i]; 
		che[n]->RmColButKeepLPR(k); // removes column without recalculating LPR.
		// che[n]->RemoveColumn(k);
		BestToWorstCat[i] = 0; BestToWorstCol[i]=0;
	    }
	  }
	} else {					// still more left to add.
	  assert(TotalAddedCol[n] < MaxNumCol[n]);
	  k=BestToWorstCol[j];
	  assert(k != 0);
	  if(TotalAddedCat[k] > 0){	// if already added a column here then check compatibility.
	     for(i=j-1; i > 0; i--){		// find the previous categories at this position.
		if(BestToWorstCol[i] == k){		// is this the same column?
		  m=BestToWorstCat[i];			// m is a previous (better) category
		  assert(m != 0 && m != n);		// this should not happen...
		  // fprintf(stderr,"cat %d vs %d at col %d.\n",n,m,k);
		  if(IsConflict(k,m,n)){		// is there a conflict between m & n at k?
		        // fprintf(stderr,"conflict found!\n");
			// che[n]->RemoveColumn(k);	// then remove the worst pattern...
			che[n]->RmColButKeepLPR(k); // removes column without recalculating LPR.
			BestToWorstCat[j] = 0; BestToWorstCol[j]=0;
			break;	// no need to check further...one conflict is enough.
		  } // else fprintf(stderr,"conflict not found.\n");
		}					// else skip over these...
	     }
	     // i reaches 0 only when the scan finished without hitting the break above,
	     // i.e. no conflict was found.
	     if(i == 0){ TotalAddedCat[k]++; TotalAddedCol[n]++; }  // else add column k for n.
	  } else { TotalAddedCat[k]++; TotalAddedCol[n]++; }	// add first pattern at k...
	}
    }
// 7. Free memory.
   free(BestToWorstCol); free(BestToWorstCat); free(TotalAddedCol); free(TotalAddedCat);
#if 1
   double endLLR=CalcTotalLPR(efp); // computes SubLPR[n][k] for all.
   fprintf(stderr,"Column sampling LLR: %.2f to %.2f (%.2f)\n",startLLR,endLLR,endLLR-startLLR);
   return endLLR; 
#else
   return CalcTotalLPR(efp); // only recalculate LPR at very end.
#endif
}

Int4	cmc_typ::RemoveInternalNode(FILE *fp,Int4 node)
/**************************** FD-table *******************************
Set:  1  2  3  4  5  6  7  8  9 10 11 12 13 14 
  1:  +  -  o- o- o  o  o  o  o- o  o  o  -  o  1.cd00142 {31}
  2:  +  +  -  -  o  o  o  o  -  o  o  o  -  o  2.cd00891 {109} <- remove
  3:  +  +- +  -  o  o  o  o  -  o  o  o  -  o  3.cd00896 (274)
  4:  +  +- -  +  -  -  -  -  -  o  o  o  -  o  4.cd05165 {56} 
  5:  +  +- -  +  +  -  -  -  -  o  o  o  -  o  5.cd00894 (25)
  6:  +  +- -  +  -  +  -  -  -  o  o  o  -  o  6.cd05173 (16)
  7:  +  +- -  +  -  -  +  -  -  o  o  o  -  o  7.cd05174 (16)
  8:  +  +- -  +  -  -  -  +  -  o  o  o  -  o  8.cd05175 (23)
  9:  +  +- -  -  o  o  o  o  +  -  -  -  -  o  9.cd05166 {51}
 10:  +  +- -  -  o  o  o  o  +  +  -  -  -  o  10.cd00895 (20)
 11:  +  +- -  -  o  o  o  o  +  -  +  -  -  o  11.cd05176 (19)
 12:  +  +- -  -  o  o  o  o  +  -  -  +  -  o  12.cd05177 (20)
 13:  +  -  o- o- o  o  o  o  o- o  o  o  +  -  13.cd00893 {26}
 14:  +  -  o- o- o  o  o  o  o- o  o  o  +  +  14.cd05167 (276)
 15:  -  o  o  o  o  o  o  o  o  o  o  o  o  o  Rejected (0)
**********************************************************************/
// Remove internal node 'node' from a tree-derived hyperpartition.
// For every later row that has '+' in the node's column, the first later column
// with '+' in that row is taken as the affected child; every set that is in the
// node's background ('-') but omitted ('o') from that child's column is moved
// into the child's background.  Finally the node's column and row are deleted
// from Hpt.
//   fp   - passed to ReSetRelations() after each cell change (may be null).
//   node - index of the set/column to remove; must be an internal ('?') node.
// Always returns 0.
{
	char	**HP=HyperPartition;  // HP[row][col]
	Int4	sq,r,row,c,child;	// no grandchildren affected...
	Int4	num_sq = NumSeqsCMSA(MainCMA);

	// 1. Confirm that node is eligible for removal.
	assert(node > 0 && node <= Hpt->NumBPPS() && node < Hpt->NumSets());
	assert(IsTreeHpt);
	assert(Hpt->TypeOfSet(node) == '?');	// i.e., this is an internal node.
	assert(HP[node][node] == '+');
	
	// 2. Modify the FD-table accordingly.
	for(row=node+1; row < Hpt->NumSets(); row++){	// skip last set == Random & 1st set == node.
	     if(HP[row][node] != '+') continue;		// This row is not affected by the removal.
	     for(child=0,c=node+1; c <= Hpt->NumBPPS(); c++){	// Find first child in row.
		if(HP[row][c] == '+'){ child=c; break; }	// This column is affected...
	     } assert(child != 0);
	     for(r=1; r < Hpt->NumSets(); r++){		// find background sets for 'node'.
		// Only sets in the node's background that are not yet part of the
		// child's column need to be moved.
		if(HP[r][node] != '-' || HP[r][child] != 'o') continue;
		Hpt->Change('o','-',r,child); 	// move this set to child's background.
		for(sq=1; sq <= num_sq; sq++){
		   if(MemberSet(sq,GrpSet[r])){ che[child]->SetPartition('-',sq); } 
		} ReSetRelations(fp); assert(HP[r][child] == '-');
	     }
	}

	// 3. Delete the node's column and row from the hyperpartition.
	Hpt->DeleteBPPS(node);
	Hpt->DeleteRow(node);
	return 0;
}

Int4	cmc_typ::RemoveLeafNode(FILE *fp,Int4 node)
// Remove leaf node 'node' from a tree-derived hyperpartition by calling
// Hpt->DeleteBPPS(node) and Hpt->DeleteRow(node).
//   fp   - not used by this routine.
//   node - index of the set/column to remove; must satisfy
//          0 < node <= NumBPPS, node < NumSets, TypeOfSet(node) != '?' and
//          HP[node][node] == '+'.
// Always returns 0.
{
	char	**HP=HyperPartition;  // HP[row][col]

	// 1. Confirm that node is eligible for removal.
	assert(node > 0 && node <= Hpt->NumBPPS() && node < Hpt->NumSets());
	assert(IsTreeHpt);
	assert(Hpt->TypeOfSet(node) != '?');	// i.e., this is a leaf node.
	assert(HP[node][node] == '+');

	// 2. Delete the node's column and row from the hyperpartition.
	Hpt->DeleteBPPS(node);
	Hpt->DeleteRow(node);
	return 0;
}

Int4	cmc_typ::RemoveFailed( )
// Identify the sets and categories to be removed because their LPR is <= 0,
// move the sequences of failed sets into the main set (set 1) and strip all
// pattern columns from failed categories.  Returns the number of sets that
// failed during this call (including those reported by RemoveSimilarSets()).
// BPPS does not include a Misc Set.
// NOTE(review): in tree mode RemoveInternalNode()/RemoveLeafNode() delete a row and
// a column from Hpt while this loop keeps indexing by the old numbering -- confirm
// that IsFailedSet[]/che[] indices stay aligned afterwards.
{
    Int4	set,cat,pos,failed=0;
    char	**table=HyperPartition;		// table[set][category]

    CalcTotalLPR(0,FALSE);

    // 1. Find failed sets; the last set (Random) and the first set (root) are skipped.
    for(set=Hpt->NumSets() - 1; set > 1; set--){
      if(IsTreeHpt){   // for pmcBPPS program only unless mcBPPS used with -tree option.
	// In a tree each set has its own category with the same index.
	cat=set; assert(set <= Hpt->NumBPPS());
	if(!(Map[cat] <= 0.0 && table[set][cat] == '+')) continue;
	if(IsFailedSet[set]) continue;		// already dealt with.
	failed++;
	IsFailedSet[set]=TRUE;
	assert(!IsFailedBPPS[cat]);
	IsFailedBPPS[cat]=TRUE;
	if(Hpt->TypeOfSet(set) == '?'){ RemoveInternalNode(0,set); }
	else { RemoveLeafNode(0,set); }
      } else {	// if running mcBPPS can't assume that FD-table corresponds to a tree!
	for(cat=1; cat <= Hpt->NumBPPS(); cat++){
	   if(!(Map[cat] <= 0 && table[set][cat] == '+')) continue;
	   if(!IsFailedSet[set]) failed++;	// count each set only once.
	   IsFailedSet[set]=TRUE;
	   IsFailedBPPS[cat]=TRUE;
	}
      }
    }
    if(IsTreeHpt && IsTreePMC){ failed += RemoveSimilarSets( ); }
    if(failed == Hpt->NumSets()) print_error("SemiConvergent test failed...now exiting");

    // 2. Move the sequences of every failed set into the main set (== 1).
    for(set=2; set <= Hpt->NumSets(); set++){
      if(IsFailedSet[set]){ TransferAllSeqs(set,1); }
    }

    // 3. Remove all columns from each failed category.
    Int4 Length=LenSeq(che[1]->KeyE());
    for(cat=1; cat <= Hpt->NumBPPS(); cat++){
	if(!IsFailedBPPS[cat]) continue;
	che[cat]->SetMinNumColumns(0);	// this is required to remove all columns!!
	for(pos=1; pos <= Length; pos++){
	   if(che[cat]->PttrnPos(pos)){ che[cat]->RemoveColumn(pos); }
	}
    }
    CalcTotalLPR(0,FALSE);
    return failed;
}

BooLean	cmc_typ::Sample(Int4 IterStart,Int4 NumRounds,Int4 pruneIter)
// Perform Gibbs sampling on columns and sequences.
//   IterStart - number of the first round to run.
//   NumRounds - number of the last round to run (inclusive).
//   pruneIter - from this round onward RemoveFailed() is called at the start of
//               each round.
// Each round visits every sequence that is not in the Labeled set once, in an
// order given by a heap keyed on Random(), and calls SampleSeq() on it.  After
// every 'Mod' sequences the column configuration is resampled with
// SampleColumns() (only once the LPR is positive).  The temperature is lowered by
// 10 whenever the LPR is positive and its percent change over the current block
// of sequences is <= 1.0; it is set to 0 once it drops below MinTemperature.
// same as Sample2( ) above essentially.
// returns TRUE if converged; else returns FALSE;
// 3. Perform Gibbs sampling on columns and sequences 
{
     double	nLPR,lastLPR,BestThisRound=-999999999.9;
     Int4	sq,g,i,j,k,m,n,s;
     BooLean	RmAllNeg=TRUE;
     dh_type	dH=0;
    char	**HP=HyperPartition;

     assert(nAlpha(AB) == 20);
     DidRestoreBest=FALSE;	// if do sampling need to restore new Best.
  for(Int4 iter=IterStart; iter <= NumRounds; iter++){
     char	tmpstr[100];
     sprintf(tmpstr,"improvement in LPR (iter S%d)",iter);
     fprintf(stderr,"\n***************** iter S%d *******************\n",iter);
     lastLPR=CalcTotalLPR(0);  // Calculates all Map[n].
     BestThisRound=-999999999.9;
     Int4 num_sq = NumSeqsCMSA(MainCMA);

     //============ find failed sets & move to reject set. ==============
     // BooLean	IsTreeHpt=FALSE;	// Does the input FD-table correspond to a tree? i.e., pmcBPPS program.
     // if(strcmp(program_name,"pmcBPPS")==0) IsTreeHpt=TRUE;
     // n receives the number of sets that failed in this round (0 before pruning starts).
     if(iter >= pruneIter) n=RemoveFailed( );
     else n=0; // remove sequences from unfruitful sets.
     nLPR=CalcTotalLPR(0,FALSE);
#if 1	// if removed nodes then need to store best all over again from current LPR.
     if(nLPR > 0.0 && n > 0){ 
	StoreBest();	// stores current LPR as best regardless of previous BestLPR...
     }
#endif
     if(nLPR > 0.0) SaveBest=TRUE;	// start saving the best configuration.

     // Queue all unlabeled sequences with random keys so that they are visited in random order.
     dH = dheap(num_sq+3,4);
     for(sq=1; sq <= num_sq; sq++){
	if(!MemberSet(sq,Labeled)) insrtHeap(sq,(keytyp)Random(),dH); 
     }
     if(efp) fprintf(efp,"sampling sequences....\n");
     double lpr0=lastLPR; 
     Int4 Mod= (Int4) ceil((double) ItemsInHeap(dH)/(double) 25); 
     // reoptimize columns after each 4% of sequences sampled.
     Mod=MAXIMUM(Int4,Mod,5);	// but never more often than every 5 sequences.
     Int4 Mod2=Mod;
     Int4 Mod2m1=Mod2 - 1;
     if(cfp) fprintf(cfp,"%d %.1f %.1f %d\n",
			Iteration,lastLPR,temperature,TotalColumns( ));

     for(i=1; (sq=delminHeap(dH)) != 0; i++) {
	nLPR=SampleSeq(sq,nLPR);  Iteration++;
	if(nLPR > BestThisRound) BestThisRound=nLPR;
        if(i % Mod == 0){	// sample a new column configuration...
	   // need cfp to avoid spike in LPR due to > MaxNumCols prior to Col. Sampling.
           if(cfp) fprintf(cfp,"%d %.1f %.0f %d\n",
			Iteration,nLPR,temperature,TotalColumns( ));
           if(ifp){
		// One line per report: iteration followed by the size of every set; the
		// last set's size is reduced by NumRandom.
        	fprintf(ifp,"%d",Iteration);
		for(s=1; s< Hpt->NumSets(); s++){
		  fprintf(ifp,"\t%d",CardSet(GrpSet[s])); 
		} fprintf(ifp,"\t%d\n",CardSet(GrpSet[s])-NumRandom);
	   }
	   if(nLPR > 0.0)	// wait until finding significant partitions...
	   {
     	     // fprintf(stderr,"sampling columns....\n");
	     // if(iter < 3 || temperature >= 100.0) nLPR=SampleColumns(TRUE);	
	     // ^ Add pattern pos to optimize LPR.
	     if(temperature >= 200.0) nLPR=SampleColumns(TRUE);
	     else nLPR=SampleColumns(FALSE);	// Only add patterns contributing to LPR
	     if(nLPR > BestThisRound) BestThisRound=nLPR;
	     Iteration++;
	   }
	}
	// lpr0 is recorded at the start of each block of Mod2 sequences (i % Mod2 == 1)
	// and compared with the LPR near the end of the block (i % Mod2 == Mod2-1).
	if(i % Mod2 == Mod2m1){
	   double delta=100.0*(nLPR-lpr0)/lpr0;	// compute the % change
	   // double delta=(nLPR-lpr0);	// 
	   // double slope=delta/25.0;	// 
	   // if((delta <= 1.0 || slope < 1.0) && temperature > 0.0)
	   if(nLPR > 0.0 && (delta <= 1.0) && temperature > 0.0) {
		// temperature -= 1.0;
		temperature -= 10.0;
     		if(temperature < MinTemperature) temperature = 0.0;
	   }
#if 1
     	   fprintf(stderr,
		"%d(%d): LPR = %.2f (Last = %.2f; change = %.1f%c)(%.1f K)(%d cols)\n",
			iter,i,nLPR,lpr0,delta,'%',temperature,TotalColumns());
#endif
	} else if(i % Mod2 == 1){ lpr0=nLPR; } 
     } Nildheap(dH);
PutHyperPartition(stderr);
#if 0	// Test SampleHpt() here
     SampleHpt(stderr,3);
#endif
     if(cfp) fflush(cfp); if(ifp) fflush(ifp); 
     PutHyperPartition(outfp); fflush(outfp);

     // double increase=((nLPR -lastLPR)/lastLPR)- 1.0;
     double increase=100.0*((nLPR -lastLPR)/lastLPR);	// % change over the whole round.
     fprintf(stderr,"%d: LPR = %.2f (Last = %.2f; increase = %.5f%c)(%.1f K)\n",
			iter,nLPR,lastLPR,increase,'%',temperature);
     fflush(stderr);
     if(iter == 1 && temperature > 300.0) temperature=300.0;
     // if(iter > 1 && (temperature < 100.0 || increase < 0.1)) 	//  increase less than 1/10th %.
     // Convergence is only tested after the first round, once the temperature is
     // below 100 and the round improved the LPR by less than 0.1%.
     if(iter > 1 && (temperature < 100.0 && increase < 0.1)) 	//  increase less than 1/10th %.
     {
        if(iter >= 3 && BestThisRound <= BestLPR) return TRUE;
						// no improvement after 3 iters --> converged.
        increase=((nLPR -lastLPR)/lastLPR);	// now as a fraction rather than a percentage.
        double ppb = ((double)ppb_increase/1000000000.0);	// ppb_increase is in parts per billion.
        if(increase < ppb){ return TRUE; }
        // if(iter > 4 && temperature < 50.0 && increase < ppb){ return TRUE; }
     }
    } // end of iter "for" loop 
    return FALSE;
}

