/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#if !defined (_SCC_TYP_)
#define _SCC_TYP_

#include "alphabet.h"
#include "dheap.h"
#include "cmsa.h"
#include "sset.h"
#include "ptrn_typ.h"
#include "clique.h"
#include "wdigraph.h"
#include "tax_typ.h"
#include "tree2hpt.h"
#include "probability.h"
#include "bpps_typ.h"
#include "swt_typ.h"
#include "dsets.h"
#include "rst_typ.h"
#include "gth_typ.h"
#include "lpr_typ.h"
#include "rsq_typ.h"

//============================ scc_cma.cc ===========================
// Free function; declaration only (the banner above places the definition in scc_cma.cc).
// Takes an output stream, an alignment (cma), a sequence set (sqset), a pattern array
// (Pattern) and a pattern-position count (NumPttrnPos), and returns an h_type.
// NOTE(review): judging by the name and return type, the result is presumably a histogram
// of how many pattern positions each sequence in sqset matches; ownership of the returned
// h_type is not visible from this header -- confirm against the definition.
h_type	NumPartlyMatchingSeqCMSA(FILE *ofp, cma_typ cma, set_typ sqset, sst_typ *Pattern,
                Int4 NumPttrnPos);

// void    PutBestRepsCMSA(FILE *fp, Int4 num_phyla, BooLean IgnoreGaps, cma_typ cma);

/*****************************************************************************
 scc_typ: set clique clustering type.

 Holds a collection of input sequence sets (Set[1..NumSets]) with their
 associated patterns (SST), the alignment they refer to (cma), and the
 derived "optimized" sets, clustered (UC) sets and tree (OptTree).
 Most member functions are only declared here; the banner comments name the
 source file that implements each group.

 Ownership notes that can be read off this header:
   - RtnUCSets(), RtnUCSSTs(), RtnUCSetIDs() and RtnOptTree() hand the stored
     pointer/handle to the caller and zero the member, so each yields its
     value only once.
   - The destructor calls Free().
 *****************************************************************************/
class scc_typ {         // set clique clustering type.
public:
	// Not a usable constructor: it only trips an assertion.
	// NOTE(review): when assert() is compiled out (NDEBUG) this leaves every
	// member uninitialized while the destructor still calls Free() -- confirm
	// that no build relies on default construction.
	scc_typ( ){ assert(!"Illegal constructor"); }
	scc_typ(Int4 , Int4 *, set_typ **,sst_typ ***,cma_typ ,swt_typ *);
	scc_typ(Int4 ,set_typ *, sst_typ **, cma_typ , Int4 *,swt_typ *);

	scc_typ(char *argv1,cma_typ in_cma,swt_typ *SWT);

	~scc_typ( ){ Free( ); }
	//============================ scc_typ.cc ===========================
	set_typ	*CreateTree(FILE *,Int4 &,Int4);
	BooLean RemoveMisfitNodes(FILE *fp,wdg_typ &Tree, Int4 LeafTrimCutoff);

	//============================ scc_pttrn.cc ===========================
	void	PutOptPattern(FILE *fp, Int4 st);
	void	PutOptSST(FILE *fp);
	//============================ scc_init.cc ===========================
	// Parameter setters; each forwards to SetParameters() with a one-character mode code.
	void	SetSuperSetMinRatio(double d){ SetParameters(d,'S'); }
	void	SetMaxDistinctRatio(double d){ SetParameters(d,'D'); }
	void	SetMinSetSizeRatio(double d){ SetParameters(d,'M'); }
	void	SetMinIntersect(double d){ SetParameters(d,'i'); }

	//============================ scc_sets.cc ===========================
	// Returns RtnOptSetID[set] for 1 <= set <= NumRtnOptSet.
	// Any other value is reported through print_error().
	Int4	MapOptSetToInSet(Int4 set){
			if(set > 0 && set <= NumRtnOptSet) return RtnOptSetID[set];
			print_error("MapOptSetToInSet( ) input error");
			// Only reached if print_error() returns; without this statement the
			// function would fall off its end without a value (undefined behavior).
			return 0;
		}
	// Hands the UCSet array to the caller: nuc receives NumClust, then both
	// UCSet and NumClust are zeroed (so RtnNewNumSets() returns 0 afterwards).
	set_typ	*	RtnUCSets(Int4 &nuc){ set_typ *rtn=UCSet; nuc=NumClust; NumClust=0; UCSet=0; return rtn; }
	// Hands the UCSST array to the caller and zeroes the member.
	sst_typ	**RtnUCSSTs( ){ sst_typ **rtn=UCSST; UCSST=0; return rtn; }
	// Hands the UCSetID array to the caller and zeroes the member.
	Int4	*RtnUCSetIDs( ){ Int4 *rtn=UCSetID; UCSetID=0; return rtn; }
	Int4	RtnNumSets(){ return NumSets; }
	Int4	RtnNewNumSets(){ return NumClust; }
	swt_typ	*RtnSWT(){ return swt; }
	cma_typ	RtnCMA(){ return cma; }
	BooLean	MergeSimilarSets(FILE*,FILE*,BooLean);
	Int4	FindSuperSets(FILE*, double , Int4,double);
	Int4	CreateSuperSets(FILE*, double , Int4,double);
	void    ClusterSets(double ratio_cutoff);
	// BooLean	CreateSuperSets(FILE *fp, FILE *ptrn_fp, BooLean WithClustering);
	// ============================== output =============================
	void	PutVerboseReport(FILE *fp);
	void    PrintSMA(FILE *smafp,Int4 num_phyla);
	// Hands OptTree to the caller (root receives Root) and zeroes the member;
	// the two-argument PrintNewickTree() below asserts if called after this.
	wdg_typ	RtnOptTree(Int4 &root){ root= Root; wdg_typ rtn=OptTree; OptTree=0; return rtn; }
	// Prints OptTree twice via the private four-argument overload:
	// to phfp with mode 'x' and to nwfp with mode 'I'. Requires OptTree != 0.
	void    PrintNewickTree(FILE *phfp,FILE *nwfp){
			assert(OptTree != 0); PrintNewickTree(phfp,Root,'x',OptTree);
			PrintNewickTree(nwfp,Root,'I',OptTree);
		}
	BooLean	WillFreeInput(){ return OwnInput; }
	set_typ	RtnRandomSet( ){ return RandomSet; }
	// for code that may someday be useful see scc_junk.cc
private: //============================ scc_typ.cc ===========================
	BooLean	RefineTree(FILE *fp,wdg_typ &Tree, Int4 LeafTrimCutoff);
	BooLean MakeNodeSetsDisjoint(wdg_typ &Tree);
	BooLean OptimizePatternsPartitions(FILE *fp, wdg_typ &Tree);
	BooLean RemoveBadNodes(FILE *fp,wdg_typ &Tree, Int4 LeafTrimCutoff);
	// Tens-and-above part of the identifier of set i (SetID[i]/10); i must be in 1..NumSets.
	Int4	Iteration(Int4 i){ assert(i > 0 && i <= NumSets); return SetID[i]/10; }

	//============================ scc_init.cc ===========================
	void	SetParameters(double d, char mode );
	void	InitializeParameters( );
	void    InitA(Int4 NumIters, Int4 *nsets, set_typ **set, sst_typ ***sst);
	void    InitB( );
	Int4    ReadPttrnSets(FILE *fp,set_typ *&set, sst_typ **&sst, Int4 &Len);
	void	CheckSets(Int4 nsets,set_typ *set,Int4 iter=0);
	void    Free( );

	//============================ scc_grph.cc ===========================
	gth_typ *RtnDiGraph(FILE *fp);
	grf_typ *MkGraphOfSetOverlaps(FILE *fp, Int4 num_sets, set_typ *set, sst_typ **sst, Int4 *set_id);
	grf_typ *MkGraphOfSimilarSets(FILE *fp, FILE *ptrn_fp);
	// grf_typ *MkGraphOfSuperSets(FILE *fp, FILE *ptrn_fp);

	//============================ scc_tree.cc ===========================
	// move these to wdg_typ eventually...?
	wdg_typ	TrimLeaves(Int4 Root, Int4 LeafTrimCutoff,wdg_typ T);
	wdg_typ RewireMisfitNodes(Int4 Root, Int4 LeafTrimCutoff,wdg_typ T,BooLean &IsRewired);
	wdg_typ RmBadInternalNode(Int4 Root, Int4 LeafTrimCutoff,wdg_typ T,BooLean &IsRemoved);
	set_typ	RtnLeafSet(wdg_typ T);
	set_typ	GetRewiredSet(Int4 parent, wdg_typ T);
	Int4	FindEdge(Int4 tail, Int4 head, wdg_typ T);
	wdg_typ	RmBadEdge(Int4 bad_edge, Int4 &ParentNode, wdg_typ T);
	wdg_typ	RmSingleChildNodes(wdg_typ T,Int4 LeafTrimCutoff);
	BooLean	IsLeafNode(Int4 v, wdg_typ T);
	Int4	ParentNode(Int4 v, wdg_typ T);
	BooLean	IsRootNode(Int4 v, wdg_typ T);
	void	PrintNewickTree(FILE *fp,Int4 root, char mode, wdg_typ T);
	void	TreeDFS(FILE *fp, Int4 v, char mode, wdg_typ T);
	Int4	*RtnSubTreeNodes(Int4 subroot, Int4 &size, wdg_typ T);
	set_typ	RtnSubTreeSeqSet(Int4 subroot, set_typ *set,wdg_typ T);
	Int4	TreeDFS(Int4 *SubTreeNodes, Int4 &number, Int4 v, wdg_typ T);

	//============================ scc_sets.cc ===========================
	set_typ	ConsensusSet(set_typ *inSet,set_typ ISet, set_typ USet, Int4 num);
	double	GetBasicIntersectInfo(Int4 i, Int4 j,double &ratio, Int4 cardI,Int4 &cardJ, Int4 &cardIJ);
	void	DeleteOptimizedSet(Int4 node);
	double  CalcSetSimilarity(FILE *, cma_typ,set_typ,set_typ,sst_typ*,Int4,double &,double &);
	void    PutSetCluster(FILE *fp,Int4 Size,Int4 *set_id,set_typ *inset,set_typ cns_set);
	set_typ SetsInCliques(Int4 maxsetid, Int4 NC, vst_typ **clique);

	//============================ scc_lpr.cc ===========================
	// The wrappers below forward their arguments unchanged to the lpr object.
	sst_typ *GetOptPttrnLPR(FILE *f,set_typ S1, set_typ S2,BooLean B,double &L,Int4 x)
		{ return lpr->GetOptPttrnLPR(f,S1, S2,B,L,x); }
	sst_typ *GetOptPttrnLPR(FILE *fp,set_typ SetFG, set_typ SetBG,BooLean Negate,double &llr,
					Int4 MaxCols,unsigned char *&rtn_csq)
		{ return lpr->GetOptPttrnLPR(fp,SetFG,SetBG,Negate,llr,MaxCols,rtn_csq); }
	sst_typ *GetOptPttrnLPR(FILE *f,set_typ S1, set_typ S2,BooLean B,double &L,Int4 x,char typ)
		{ return lpr->GetOptPttrnLPR(f,S1,S2,B,L,x,typ); }
	sst_typ *GetOptPttrnLPR(FILE *fp,set_typ SetFG, set_typ SetBG,BooLean Negate, double &llr,
			Int4 MaxCols, unsigned char *&rtn_csq, char Type)
		{ return lpr->GetOptPttrnLPR(fp,SetFG, SetBG,Negate, llr, MaxCols, rtn_csq, Type); }
	double	WtCardFG_BG_Sets(double &WtCntsFG, double &WtCntsBG)
		{ return lpr->WtCardFG_BG_Sets(WtCntsFG,WtCntsBG); }
	double  CalcSetvsPttrnLPR(FILE *fp,set_typ SetFG, set_typ SetBG,sst_typ *qsst,BooLean neg)
		{ return lpr->CalcSetvsPttrnLPR(fp,SetFG, SetBG,qsst,neg); }
	double  CalcSetvsPttrnLPR(FILE *fp,set_typ SetFG, set_typ SetBG,sst_typ *qsst,BooLean neg,char typ)
		{ return lpr->CalcSetvsPttrnLPR(fp,SetFG, SetBG,qsst,neg,typ); }

	lpr_typ	*lpr;		// target of the forwarding wrappers above.

	//============================ scc_misc.cc ===========================
	double	*ConvertHGtoProb(h_type HG, Int4 &numbins,char mode);
	double	ComputeJSD(h_type HG1,h_type HG2);
	set_typ RmWeakMatches(FILE *ofp, Int4 II, double NumStdev);
	double	ComputeRelativeEntropy(h_type HG1,h_type HG2);
	long double	DoExactTest(set_typ SetI, set_typ SetJ, long double &onetail);

	Int4	LabelMoreNodes(set_typ Labeled, wdg_typ G);

	// Writes sq[1..Length] to fp as characters of alphabet AB, followed by a newline.
	void    PrintSeq(FILE *fp, unsigned char *sq) {
	     for(Int4 j=1; j <= Length; j++){ fprintf(fp,"%c",AlphaChar(sq[j],AB)); } fprintf(fp,"\n"); }
	// Last decimal digit of the identifier of set i (SetID[i]%10); i must be in 1..NumSets.
	Int4	RowID(Int4 i){ assert(i > 0 && i <= NumSets); return SetID[i]%10; }

	//============================ scc_pttrn.cc ===========================
	Int4	PatternLength(sst_typ *sst);
	Int4	LengthPattern(sst_typ *sst);
	sst_typ *UnionOfSSTs(sst_typ *sst1,sst_typ *sst2);
	void	MergeSSTs(sst_typ *sst1,sst_typ *sst2);
	BooLean ConsistentSSTs(sst_typ *sst1, sst_typ *sst2);
	sst_typ	*ConsensusPattern(FILE *fp,Int4 N,sst_typ **in_sst,Int4 &Score);
	sst_typ	*UnionizePatterns(FILE *fp,Int4 N,sst_typ **in_sst,Int4 &Score);
	sst_typ	*PatternIntersection(FILE *fp,Int4 N,sst_typ **in_sst,Int4 &Score);
	double  PatternIntersection(FILE *fp, Int4 Length, double *lprI, double *lprJ,
                                  sst_typ *sstI, sst_typ *sstJ,a_type AB);
	char    *GetPatternFromSST(sst_typ sstI);
	void	PutPatternFromSST(FILE *fp,sst_typ *xsst);
	Int4	GetPatternScore(sst_typ *sst, Int4 sq);
	double  SimilarPatterns(FILE *fp,double cutoff,Int4 I,Int4 J);
	void	PrintPattern(FILE *fp, sst_typ *xsst);
	void    PrintOptimizedCSQ(FILE *fp, Int4 i,Int4 id);

	//*********************** class variables *****************************************
	//==================== Parameters settings =======================
	double	MinIntersect;
	double	MinSetSizeRatio;
	double	MaxDistinctRatio;
	double	SuperSetMinRatio;
	double	SubSetMaxRatio;
	double	MinSubIntersectSuperRatio;
	Int4	MinPttrnIntersect;
	double	MinFractPttrnIntersect;
	double	MinLPRtoMerge;
	double	MinLPRforEdge;

	//****************************************************************
	BooLean	OwnInput;	// value reported by WillFreeInput().
	a_type	AB;		// alphabet used by PrintSeq().
	//---------------------------------------------------------------
	set_typ	*OptimizedSet;
	Int4	NumOptimizedSets;
	Int4	*InSetToOptimizedSet;
	Int4	*OptimizedSetToInSet;
	sst_typ	**OptimizedSST;
	unsigned char **OptimizedCSQ;
	wdg_typ OptTree;	// released to the caller by RtnOptTree().

	Int4	NumRtnOptSet;	// upper index bound checked by MapOptSetToInSet().
	Int4	*RtnOptSetID;	// lookup table read by MapOptSetToInSet().
	set_typ *RtnOptSets;
	//--------------------- Returned Sets ---------------------------
	set_typ	*UCSet;		// The union for each clique set.
	sst_typ	**UCSST;		// The pattern union for each clique.
	sst_typ	**ICSST;		// The pattern intersection for each clique.
	Int4	*UCSetID;	// NOTE(review): presumably the identifier of each UCSet entry;
				// the previous comment duplicated UCSet's -- confirm.
	Int4	NumClust;	// count reported alongside UCSet by RtnUCSets().
	//---------------------------------------------------------------
	Int4	Length;		// Alignment length.
	cma_typ	cma;		// Input alignment.
	//---------------------------------------------------------------
	Int4	NumCols;
	sst_typ **SST;		// Hpt columns. make sure that NumCols == NumSets...
	unsigned char **CSQ;	// consensus sequences corresponding to pattern.
	double	*LPR;
	Int4	NumBPPS;	// Number of BPPS analyses...
	//---------------------------------------------------------------
	Int4	*SetID;		// set identifiers... 10,11,etc. == Set1_0,Set1_1,etc
	// set_typ SetID;	// represent as sets and take unions; use for *.sma files? 
	Int4	*SetID2index;	// NOTE(review): presumably maps a set identifier back to its
				// index in SetID/Set -- confirm against scc_init.cc.
	Int4	MaxSetID;	// max value of set id.
	Int4	NumSets;	// = total number of rows.
	set_typ	*Set;		// Hpt rows.
	Int4    SetSize;	// maximum size common to all input sets.
	Int4	NumRandom;	// number of random sequences counted in sets.
	set_typ	RandomSet;	// Set containing only random sequences.
	//---------------------------------------------------------------
	char	SCC_MODE;	// tells whether to create supersets.
	//-------------------- Tree structures --------------------------
	Int4	Root;		// Root node of graph.
	//---------------------------------------------------------------
	Int4	**Overlap;	// overlap between setI and setJ.
	Int4	*NodeSize;	// 
	//---------------------------------------------------------------
	swt_typ	*swt;		// returned by RtnSWT().
};

// Free function; declaration only (its definition is not in this header).
// Takes an output stream, two numeric thresholds (cutoff, MinLPR) and an existing
// scc_typ, and returns a pointer to an scc_typ.
// NOTE(review): presumably drives superset creation on in_scc and returns a new object;
// whether the result is freshly allocated and whether in_scc is consumed or freed is not
// visible here -- confirm against the definition before relying on ownership.
scc_typ *CreateSuperSetsDriver(FILE *fp, double cutoff,double MinLPR, scc_typ *in_scc);


#endif

