/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "scc_typ.h"

gth_typ	*scc_typ::RtnDiGraph(FILE *fp)
// Return a Graph for tree construction.
//
// Allocates a gth_typ over the NumSets sequence sets and fills it in while writing a
// report to fp.  For every non-null Set[i] it:
//   - prints the pattern positions stored in SST[i];
//   - tests every other set j as a candidate superset of i and, for each j that passes
//     all of the filters below, adds an edge to the graph (tail j --> head i);
//   - lists the sets whose intersection ratio with i is <= MaxDistinctRatio and marks i
//     as distinct in the graph when that holds for all NumSets-1 other sets.
// Afterwards every set is offered a link to a temporary Root set; when the link is
// accepted SST[i] and CSQ[i] are replaced by the pattern optimized against the Root.
//
// Side effects visible here: Root is assigned; NodeSize and Overlap (declared outside
// this function) are allocated and filled; Set[Root] is created and destroyed again.
// Returns the graph allocated with 'new' (NOTE(review): presumably owned by the caller).
{
   static UInt4	junk_num=0;	// NOTE(review): not referenced anywhere in this function.
   double d,ratio,lprJI,lprIJ,lpr,lprJ,lprI;	// lprJI is used only by the disabled (#if 0) check below.
   set_typ SetJmI=0;		// scratch set; only the disabled (#if 0) check below reads it.
   fprintf(fp,"=================== Final sets and patterns ===================\n");
   Int4	i;
   // The root takes the slot right after the last set; the asserts require that
   // MaxSetID == NumSets and that the slot is still empty.
   Root=MaxSetID+1; assert(Root==NumSets+1); assert(Set[Root] == 0);
   Set[Root]=MakeSet(SetSize); FillSet(Set[Root]);
// IntersectNotSet(Set[Root], RandomSet);
   gth_typ *gth= new gth_typ(NumSets,MinLPRforEdge);

#if 1	// store key information.
   // NodeSize[i] receives the cardinality of Set[i]; Overlap[i][j] the size of the
   // intersection of sets i and j (both are filled in the loop below).
   // NOTE(review): allocated on every call; nothing in this function frees an earlier allocation.
   NEW(NodeSize,NumSets+4,Int4);
   NEWP(Overlap,NumSets+4,Int4);
   for(i=1; i <= Root; i++) NEW(Overlap[i],NumSets+4,Int4);
#endif

   for(i=1; i <= NumSets; i++){
     if(Set[i]){
	if(SetJmI==0) SetJmI=MakeSet(SetN(Set[i]));	// make a temporary set.
	// N accumulates the counts printed below but is never read afterwards.
	Int4	N=0,n,j,cardI=CardSet(Set[i]),cardJ,cardIJ;
	NodeSize[i]=cardI;

	// fprintf(fp,"Set%d(%d) = %d seqs\n ",i,SetID[i],cardI);
	fprintf(fp,"\nSet%d(%d):\n PATTERN: ",i,cardI);
	// Print one "<pattern><position>" token for each position j that has a pattern string.
        for(n=0,j=1; j <= Length; j++){
	  char *str=GetPatternFromSST(SST[i][j]);
	  if(str){ fprintf(fp,"%s%d ",str,j); free(str); n++; }
	} if(n > 0) fprintf(fp,"(%d positions)\n",n); else fprintf(fp,"(none) \n");

	fprintf(fp," SUPERSETS: "); 	//************** is j a superset of i? ******************
        for(N=0,n=0,j=1; j <= NumSets; j++){
	  if(i==j) continue;
	  // cardJ and cardIJ are not initialized before this call, so the call must set them;
	  // the returned value in d is overwritten below before it is used.
	  d=GetBasicIntersectInfo(i,j,ratio,cardI,cardJ,cardIJ);
	  Overlap[i][j]=cardIJ; Overlap[j][i]=cardIJ;	// recorded for every pair, before any filtering.
	  if(cardJ <= cardI) continue; // implies that j can't be a superset of i. 
	  if(cardIJ == 0) continue; 	// the sets are distinct.
	  d = (double) cardIJ/(double)cardI; if(d < MinSubIntersectSuperRatio) continue; 
	  // >= 50% of i must be in j (per the original note; the actual cutoff is MinSubIntersectSuperRatio).
	  d=(double) cardJ/(double) (cardIJ); if(d < SuperSetMinRatio) continue;	
	  // j must be >= 33% larger than i intersect j (the actual cutoff is SuperSetMinRatio).
	  // e.g., d = 0.5 (1/2 intersection) & ratio = 4 (superset 4x larger).

	  // check to ensure that FG vs BG LPR is significant.
	  sst_typ *isst= GetOptPttrnLPR(0,Set[i],Set[j],FALSE,lprIJ,20,'L');	 // leaf check mode...
	  // Get SetI vs (SetJ - SetI) optimum PttrnI.
	  if(lprIJ < MinLPRforEdge) { free(isst); continue; }	// Might happen if sets are very similar.

#if 0	  // this should not be necessary...defined isst to mismatch SetJ - SetI.
	  // Want to make sure that patterns I & J are quite distinct.
	  lprJI=CalcSetvsPttrnLPR(0,SetJmI,Set[i],isst,TRUE);	// SetJ-I vs Not SetI should not match PttrnI!
	  // lprJI=CalcSetvsPttrnLPR(0,Set[j],Set[i],isst,TRUE);	// SetJ vs Not SetI should not match PttrnI!
	  if(lprJI >= 0){ free(isst); continue; }	//  this needs to be negative.
#endif
	  // see whether SetI contributes at least 80% of its fair share to superfamily lpr.
	  // NOTE(review): WtCardFG_BG_Sets() is called directly after each LPR computation;
	  // presumably it reports the weighted FG/BG counts of the most recent computation,
	  // so the order of the next four calls must not be changed (TODO confirm).
	  double dummy,WtCntsFG,WtCntsSfFG;
	  sst_typ *jsst=GetOptPttrnLPR(0,Set[j],Set[j],TRUE,lprJ,20,'M');	// SetJ vs not SetJ.
	  WtCardFG_BG_Sets(WtCntsSfFG,dummy);	// Superfamily FG weighted counts.

	  lprI=CalcSetvsPttrnLPR(0,Set[i],Set[j],jsst,TRUE,'M');  	// SetI vs not SetJ should match PttrnJ.
	  WtCardFG_BG_Sets(WtCntsFG,dummy);
	  // min_lpr = 80% of lprJ scaled by SetI's share of the superfamily's weighted FG counts.
	  double min_lpr = lprJ*0.80*((double) WtCntsFG/(double) WtCntsSfFG); free(jsst);
	  if(lprI <  min_lpr){ free(isst); continue; } // does subset contribute >= 80% to total lprJ?
	  //  i(head) <-- j (tail)
	  // isst is freed here only when AddEdge() returns 0.
	  // NOTE(review): presumably the graph keeps the isst pointer otherwise -- confirm.
	  if(gth->AddEdge(lprIJ,isst,j,cardJ,i,cardI,cardIJ) == 0){ free(isst); continue; }
	  // if(SST[i]) free(SST[i]); SST[i]=isst;
	  
	  // Report the accepted superset: id(size;intersection) and the LPRs that were compared.
	  n++; fprintf(fp,"%d(%d;%d) ",SetID[j],cardJ,cardIJ);
	  fprintf(fp,"[lpr: %.1f vs %.1f (%.1f)]; ",lprJ,lprI,min_lpr); 
	} if(n > 0){ fprintf(fp,"(%d sets)\n",n);  } else fprintf(fp," (none)\n");
	// if(MemberSet(i,Reject)) continue;

	fprintf(fp," DISTINCT: "); N+=n;
	// Count the sets j whose intersection value with i (as returned by
	// GetBasicIntersectInfo) is <= MaxDistinctRatio; only those with a value > 0 are printed.
        for(n=0,j=1; j <= NumSets; j++){
	  if(i==j) continue;
	  d=GetBasicIntersectInfo(i,j,ratio,cardI,cardJ,cardIJ);
	  if(d <= MaxDistinctRatio){ n++; if(d > 0.0) fprintf(fp,"%d(%d;%d) ",SetID[j],cardJ,cardIJ); }
	} if(n== NumSets-1) gth->MkDistinct(i); // does not intersect with anything but itself...
	if(n > 0) fprintf(fp,"(%d/%d sets)\n",n,NumSets-1); else fprintf(fp," (none)\n");
	N+=n; 
     }
   } if(SetJmI) NilSet(SetJmI);
 {
   // Offer every set a link to the Root set.
   // NOTE(review): unlike the loop above, this loop does not skip a null Set[i].
   for(i=1; i <= NumSets; i++){
	unsigned char *csq;	// not initialized here; presumably set by GetOptPttrnLPR() below (it is freed or stored afterwards).
	sst_typ *xsst= GetOptPttrnLPR(0,Set[i],Set[Root],FALSE,lpr,20,csq,'L'); // SetI vs the Root set: optimum PttrnI.
	// When the link is rejected (return value 0) the new pattern and csq are discarded;
	// otherwise they replace the ones previously stored for set i.
	if(gth->LinkToRoot(lpr,xsst,i) == 0){ free(xsst); free(csq); }
	else {
		if(SST[i]) free(SST[i]); SST[i]=xsst; 
		if(CSQ[i]) free(CSQ[i]); CSQ[i]=csq; 
	}	// use optimized pattern versus RootSet.
	// create a CopySST() routine to avoid doubly freed up memory and/or lost memory.
   }
 }
  gth->PutSetsGraph(fp);
  NilSet(Set[Root]); Set[Root]=0;	// the Root set is temporary; its slot is cleared again.
  fflush(fp);
  return gth;
}

grf_typ	*scc_typ::MkGraphOfSimilarSets(FILE *fp, FILE *ptrn_fp)
// Build a graph with an edge between every pair of sets (i < j) that passes, in this
// order: (1) the set-overlap and relative-size tests, (2) the pattern-intersection
// tests and (3) the reciprocal LPR tests, in which each set is scored against the
// other set's pattern.  Edges are added using SetID[i] and SetID[j].
//
// fp is referenced only by the disabled (#if 0) printouts.  When ptrn_fp is non-null
// two histograms are created and printed to it at the end; nothing in this routine
// adds values to them.  Returns the graph allocated with 'new'.
{
	h_type	HGz,HGz0;
	if(ptrn_fp){
	   HGz=Histogram("Z-scores for mean differences", 0,30,1.0);
	   HGz0=Histogram("Z-scores for mean differences", 0,1,0.05);
	}
	grf_typ	*graph=new grf_typ(MaxSetID+2);

	for(Int4 i=1; i <= NumSets; i++){
	   set_typ	setI=Set[i];
	   Int4	sizeI=CardSet(setI);
	   Int4	sizeJ,shared;	// not initialized here; read only after GetBasicIntersectInfo() below.

	   for(Int4 j=i+1; j <= NumSets; j++){
		set_typ	setJ=Set[j];
		assert(setJ && setI);

		//---------- (1) Do the two sequence sets overlap enough? ----------
		double	ratio;
		GetBasicIntersectInfo(i,j,ratio,sizeI,sizeJ,shared);	// the returned value is not needed here.
		if(shared == 0) continue;
		const double fractI=static_cast<double>(shared)/static_cast<double>(sizeI);
		const double fractJ=static_cast<double>(shared)/static_cast<double>(sizeJ);
		// The intersection must be a large enough fraction of both sets.
		if(fractI < MinIntersect || fractJ < MinIntersect) continue;
		ratio=MINIMUM(double,ratio,1.0/ratio);	// fold the ratio so that it is <= 1.0.
		if(ratio < MinSetSizeRatio) continue;	// the sets must be about the same size.

		//---------- (2) Do the two signature patterns intersect enough? ----------
		Int4	numShared;
		sst_typ	*pttrns[4];	// only slots 1 and 2 are assigned.
		pttrns[1]=SST[i];
		pttrns[2]=SST[j];
		sst_typ	*consensus=ConsensusPattern(0,2,pttrns,numShared);
		free(consensus);	// only the length (numShared) is used.
		if(numShared < MinPttrnIntersect) continue;

		Int4	longest=MAXIMUM(Int4,LengthPattern(SST[i]),LengthPattern(SST[j]));
		const double fractPttrn=static_cast<double>(numShared)/static_cast<double>(longest);
		if(fractPttrn < MinFractPttrnIntersect) continue;

		// if(!ConsistentSSTs(SST[i],SST[j])) continue;

		//---------- (3) Does each set match the other set's pattern? ----------
		// The second LPR is computed only when the first one passes.
		double	lprJI=CalcSetvsPttrnLPR(0,Set[j], Set[i],SST[i],TRUE);	// SetJ scored with pattern I.
		if(lprJI < MinLPRforEdge) continue;
		double	lprIJ=CalcSetvsPttrnLPR(0,Set[i], Set[j],SST[j],TRUE);	// SetI scored with pattern J.
		if(lprIJ < MinLPRforEdge) continue;

		// The two LPRs must be roughly the same: the smaller one has to reach
		// 80% of the larger one (an earlier version used 0.90).
		const double lowLPR=MINIMUM(double,lprJI,lprIJ);
		const double highLPR=MAXIMUM(double,lprJI,lprIJ);
		if(lowLPR < 0.80 * highLPR) continue;
#if 0		
		fprintf(fp," sets i=%d & j=%d merged! (lprIJ=%.1f [II=%.1f] vs lprJI=%.1f [JJ=%.1f])\n",
						SetID[i],SetID[j],
						lprIJ, CalcSetvsPttrnLPR(0,Set[i], Set[j],SST[i],TRUE),
						lprJI, CalcSetvsPttrnLPR(0,Set[j], Set[i],SST[j],TRUE));
#endif
#if 0		// this is for print out only; no need to recompute...
		set_typ SetU=CopySet(Set[i]);  UnionSet(SetU, Set[j]);
		double lprUJ=CalcSetvsPttrnLPR(0,SetU, Set[j],SST[j],TRUE);
		double lprUI=CalcSetvsPttrnLPR(0,SetU, Set[j],SST[i],TRUE);
		fprintf(fp," union sets %d & %d! (lprUI=%.1f; lprUJ=%.1f) =====\n",
							SetID[i],SetID[j],lprUI,lprUJ);
		NilSet(SetU);
#endif
#if 0
		fprintf(fp,"===== sets %d & %d merged! (lprJI=%.1f; lprIJ=%.1f) =====\n",
							SetID[i],SetID[j],lprJI,lprIJ);
		if(lprIJ >= MinLPRtoMerge && lprJI >= MinLPRtoMerge
					&& ConsistentSSTs(SST[i],SST[j]))
			MergeSSTs(SST[i],SST[j]);	// Merge the pattern sets.
#endif
		// Every test passed: connect the two sets.
		grf_typ	*target=graph;
		target->AddEdge(SetID[i],SetID[j]);
	   }
	}

	if(ptrn_fp){
	   PutHist(ptrn_fp,60,HGz0); NilHist(HGz0);
	   PutHist(ptrn_fp,60,HGz); NilHist(HGz);
	}
	return graph;
}

//*********************************** SuperSet Routines *********************************

#if 0	// junk files that are not being used...
grf_typ	*scc_typ::MkGraphOfSetOverlaps(FILE *fp, Int4 num_sets, set_typ *set, sst_typ **sst, Int4 *set_id)
// Create a graph with edges between similar sequence sets.
//
// DISABLED: this routine sits inside an '#if 0' region and is not compiled.
// As written it adds an edge between every pair of sets with a non-empty intersection,
// weighted by the percentage of each set that the intersection covers (wtI, wtJ).
// NOTE(review), to be dealt with before re-enabling:
//   - IncdHist() is applied to HG unconditionally, but HG and HG0 are created only when
//     fp is non-null;
//   - HG0 ("LPRs") is printed but nothing adds values to it;
//   - the 'sst' argument and the locals ratio, lprJI, lprIJ and d are never used;
//   - GetBasicIntersectInfo() is given the indices i and j rather than set[i] and set[j];
//     presumably it reads the member sets instead of the 'set' argument -- confirm.
{
	h_type	HG,HG0;
	Int4	i,max_set_id=0;
	// The graph is sized from the largest set id that was passed in.
	for(i=1; i<=num_sets; i++) max_set_id=MAXIMUM(Int4,max_set_id,set_id[i]);
	if(fp){
	  HG=Histogram("percent edge overlap", 0,100,2.0);
	  HG0=Histogram("LPRs", 0,1,0.05);
	} grf_typ *grf=new grf_typ(max_set_id+2);
	// The nested block below is a disabled disjoint-sets sketch; seti, setj, dset,
	// Cardinality and num_dsets are not declared in this function.
#if 0
	ds_type sets=DSets(NumSets);
	seti=findDSets(i,sets);
	setj=findDSets(j,sets);
	seti=linkDSets(seti,setj,sets);  
	dset=AssignDSets(sets, &Cardinality, &num_dsets);
	NilDSets(sets);
#endif
	for(i=1; i <= num_sets; i++){		// skip 1st set = MainSet...
	     set_typ SetI=set[i];
	     Int4 cardsetI=CardSet(SetI),cardsetJ,overlap;
	     for(Int4 j=i+1; j <= num_sets; j++){	// skip 1st set = MainSet...
		set_typ SetJ=set[j]; assert(SetJ && SetI);

		//============== Do their sequence sets sufficiently overlap? ================
	        double ratio,lprJI,lprIJ;
		double d=GetBasicIntersectInfo(i,j,ratio,cardsetI,cardsetJ,overlap);
		if(overlap == 0) continue;
		// Edge weights are percentages rounded up, so a non-empty overlap gives 1..100.
		char wtI = (char) ceil(100.0*((double) overlap/(double) cardsetI));	// % of setI overlap.
		assert(wtI <= 100 && wtI > 0);
		char wtJ = (char) ceil(100.0*((double) overlap/(double) cardsetJ));	// % of setJ overlap.
		assert(wtJ <= 100 && wtJ > 0);
		grf->AddEdge(set_id[i],set_id[j],wtI,wtJ); 
		IncdHist((double)wtI,HG); IncdHist((double)wtJ,HG);
	    }
	}
	if(fp){
	  PutHist(fp,60,HG0); NilHist(HG0);
	  PutHist(fp,60,HG); NilHist(HG);
	} return grf;
}

#endif


