/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#if !defined (_CMC_TYP_)
#define _CMC_TYP_

#include "dsets.h"
#include "clique.h"
#include "probability.h"
#include "cmsa.h"
#include "chn_typ.h"
#include "sqd_typ.h"
#include "che_typ.h"
#include "hpt_typ.h"
#include "pch_typ.h"
#include "mheap.h"
#include "wdigraph.h"
#include "tax_typ.h"
#include "rsq_typ.h"
#include "rst_typ.h"
#include "swt_typ.h"

// #include "my_ncbimath.h"
#include "bpcp_typ.h"

// Leading text of the fatal error message reported for a parameter-setting
// syntax error in <infile_prefix>.hpt; it lists the per-group options.
// (The option list previously read "[options}" with a mismatched bracket.)
#define CMC_USAGE_START "FATAL: parameter setting syntax error within <infile_prefix>.hpt:\n\
   USAGE: GroupName [options]\n\
     -A<real>:<real> - alpha hyperparameters A0:B0 (default: A0=B0=1.0)\n\
     -verbose      - Create extra output files (for public version)\n\
     -col=<int>:<int>  - Specify the min and max number of columns allowed\n\
     -N=<int>      - Maximum number of significant pattern positions to highlight\n\
                   - (This sets the contrast for the alignment.)\n\
     -P=<str>      - seed pattern string\n\
     -Ri=<real>    - Set prior probability that a row (seq) is in the foreground (default: 0.5)\n\
     -rho=<real>   - set prior probability (rho) that a column is a pattern position (default: 0.5)\n\
                        input is log(rho); e.g., rho=100 --> rho=e^-100 = column 'penalty' of -100 nats\n\
   \n\n"

/*******************************************************************************

Four states (only three are listed below):
	FG 	foreground only.
	BG 	background only.
	OM	removed from analysis (these sampled to BG at higher levels of the hierarchy).

  Gold Standard Sequences:
	GoldStd FG sequences:
	implicit GoldStd BG sequences: (keep these in the BG!);
	 - based on how subgroups are defined, some FG goldstd sequences will need to stay in BG

  Gold Standard Patterns:
	Define patterns for each subgroup in *.arg input file.

  Gold Standard subgroups:
	Define subgroups in *.arg input file.

 *******************************************************************************/

#if 1
// Checkpoint record: bundles the main alignment, its weights, the
// hyperpartition, the sequence sets and the pattern (sst) arrays listed below.
// The per-field notes name the cmc_typ input or routine each field stands in for.
// NOTE(review): who allocates and frees the pointed-to objects is not visible
// in this header -- confirm in cmc_chkpt.cc.
typedef struct {
	cma_typ	main_cma;	// passed_in_main_cma vs PartitionByInputSetCMSA()
	hsw_typ	hsw;		// passed_in_hsw within GetChnFiles() 
	hpt_typ	*hpt;		// passed in hyperpartition.
	Int4	num_sets;	// passed_in_sets vs PartitionByInputSetCMSA()
	set_typ	*set;		// passed in sets vs PartitionByInputSetCMSA()
	set_typ	labeled;	// use instead of creating new within Init()
	Int4	num_ssts;	// should == hpt->NumBPPS().
	sst_typ	**sst;		// use to replace FindSeedPattern(n)
} cmc_checkpt_type;

// Pointer handle to a checkpoint record; cmc_typ keeps one as its private
// `checkpoint` member.
typedef cmc_checkpt_type *cmc_chk_typ;

#endif

// cmc_typ: comprehensive multiple category partitioning with pattern selection.
//
// The declarations are grouped under banners naming the .cc file each group is
// attributed to.  Only the small inline members are defined here.
//
// NOTE(review): the class holds many raw pointers and its destructor releases
// resources, but copy construction/assignment are left to the compiler.
// Copying an instance would presumably double-free -- confirm no caller copies.
class cmc_typ {         // comprehensive multiple category partitioning with pattern selection
public:
	// Default construction only prints the usage text to stderr.
	// NOTE(review): no member is initialized on this path, yet the destructor
	// frees SetsMode and calls Free(); this is safe only if PrintUsage() never
	// returns -- confirm in cmc_arg.cc.
	cmc_typ( ){ PrintUsage(stderr); }
	//=============== cmc_typ.cc =============
	// Constructors taking progressively more pre-built inputs (alignment,
	// main alignment + weights, input sets, hyperpartition + seed alignments)
	// in addition to the command-line arguments.
	cmc_typ(Int4 argc, char *argv[]); 
	cmc_typ(cma_typ in_cma,Int4 argc, char *argv[]);
	cmc_typ(cma_typ in_cma,cma_typ in_mcma, hsw_typ hsw, Int4 argc, char *argv[]);
	cmc_typ(cma_typ in_cma,Int4 NumInSets, set_typ *InSet, Int4 argc, char *argv[]);
	cmc_typ(cma_typ, cma_typ, hsw_typ, Int4, set_typ*, Int4 argc, char *argv[]);
	cmc_typ(cma_typ in_cma, cma_typ in_mcma, hsw_typ hsw, Int4 NumInSets, set_typ *InSet,
                        hpt_typ *in_hpt, cma_typ *in_sma, Int4 argc, char *argv[]);
	// Releases SetsMode, then everything handled by Free().
	// free() accepts a null pointer, so no guard is needed.
	~cmc_typ( ){ free(SetsMode); Free( ); }

	void    Free();         // free memory.
	BooLean	SaveSets;

public:
	//=============== cmc_put.cc ======================
	void    PutHyperPartition(FILE *fp=0);
	// Calls Put(TRUE) and always returns 0.0; the value returned by
	// Put(BooLean) is discarded.  NOTE(review): confirm that is intended.
	double  Put( ){ Put(TRUE); return 0.0; }
	double  Put(BooLean put_rtf);
	void	PutMapContributions(FILE *);
	void	PutSeqContrib(FILE *cfp,FILE *hfp,FILE *sfp=0,FILE *pfp=0);
	BooLean	PutHptPttrns(FILE *fp);
	// Forwards to Hpt->Put(fp).
	void	PutHpt(FILE *fp){ Hpt->Put(fp); }
	void    PutRTF(BooLean SaveChnFiles);
private:
	// For every category i in 1..Hpt->NumBPPS(), asserts che[i] is non-null
	// and calls its PutSubLPRs(fp).
	void    PutAllSubLPRs(FILE *fp){
		   for(Int4 i=1; i<=Hpt->NumBPPS(); i++){
			assert(che[i]); che[i]->PutSubLPRs(fp);  
		   }
		} 
	// Forwards to Hpt->PutSorted(fp).
	void	PutSortedHpt(FILE *fp){ Hpt->PutSorted(fp); }
	BooLean	PrintEachRTF;
public:
	//================= cmc_sample.cc =================
	// Runs Sample(1,NumberRounds,PruneIter) using the stored settings.
	BooLean	Sample( ){ return Sample(1,NumberRounds,PruneIter); }
	BooLean	Sample(Int4 IterStart,Int4 NumRounds,Int4 pruneIter);
	// Sets PruneIter; i must be positive (asserted).
	void	ResetPruneIter(Int4 i){ assert(i > 0); PruneIter=i; }
	Int4	MaxIterations() { return MaximumIter; }
	// Convenience overloads: each forwards to the four-argument SampleHpt(),
	// passing 0 for the arguments it omits and a local null wdg_typ for Tree.
	BooLean	SampleHpt(){ wdg_typ X=0; return SampleHpt(0,0,0,X); }
	BooLean	SampleHpt(FILE *fp){ wdg_typ X=0; return SampleHpt(fp,0,0,X); }
	BooLean	SampleHpt(FILE *fp,Int4 Root, wdg_typ &Tree){
			return SampleHpt(fp,0,Root,Tree); }
	BooLean	SampleHpt(FILE *fp,Int4 SampledCol){
			wdg_typ X=0; return SampleHpt(fp,SampledCol,0,X); }
	BooLean	SampleHpt(FILE *fp,Int4 SampledCol, Int4 Root, wdg_typ &Tree);
	Int4	RemoveSimilarSets( ); // Just for testing right now.
private:
	double	SampleSeq(Int4 sq,double lpr);
	double  SampleColumns(BooLean);
	void	ReSetRelations(FILE *fp=0);
	Int4    RemoveFailed( );
	Int4    RemoveInternalNode(FILE *fp,Int4 nd);
	Int4    RemoveLeafNode(FILE *fp,Int4 nd);
	BooLean IsConflict(Int4 k, Int4 better, Int4 worse);
public:
	//=================== cmc_chkpt.cc ===================
	void	ReadCheckpoint(char *outfilename);
	BooLean	WriteCheckpoint(char *outfilename);
	// Public flag; note it differs from the method above only in the
	// capital 'P' of "Point".
	BooLean	WriteCheckPoint;
private:
	cmc_chk_typ checkpoint;
	void    FWriteSST(FILE *fp);
	sst_typ	**FReadSST(FILE *fp,Int4 &numBPPS,Int4 &len);

public:
	//================ in cmc_misc.cc =================
	// Returns the internal Hpt pointer itself (not a copy).
	hpt_typ	*GetHpt( ){ return Hpt; }
	// Returns IsTreeHpt.  Declared BooLean to match that member and the
	// other predicates of this class.
	BooLean	HptIsTree(){ return IsTreeHpt; }
	// Public entry point that forwards to CopyOfSeqSets_Private().
	set_typ	*CopyOfSeqSets(){ return CopyOfSeqSets_Private(); }
	// Returns Hpt->NumSets().
	Int4	RtnNumElmntSetCMA( ){ return Hpt->NumSets(); }
	cmc_typ *OptimizeDisplaySet(hsw_typ HSW,Int4 argc, char *argv[]);
	void	PutSARPHpt(FILE *fp);
private:
	hpt_typ *MkSARPHpt( );
	set_typ *CopyOfBestTreeSets( );
	set_typ	*CopyOfPartitionSets();
	set_typ	*CopyOfSeqSets_Private();
	sst_typ	**RtnCopyOfSSTs( );
	double  **RtnCopyOfLPRs( );	// to be called after sampling is completed.
	void    WritePttrnSets(FILE *fp);	// for scc_typ to read.
	Int4	ReadPttrnSets(FILE *fp,set_typ *&set, sst_typ **&sst, Int4 &Len);
	BooLean	ChecksOut();  // check to see whether anything worthwhile was found.
	set_typ	CopyOfSeedSets();
	void	TransferAllSeqs(Int4 from, Int4 to);
	char    **RtnCopyOfPttrns( );
	cma_typ RtnCsqAsCMSA(Int4 n, char *name,sst_typ *xsst=0);

private:
	//=================== cmc_debug.cc ==================== 
	set_typ	DiffSets(set_typ set1, set_typ set2, set_typ setI, set_typ setJ);
	BooLean	TheSame(cmc_typ *cmc2);
	BooLean ConsistencyCheck();
	void	PutSetRelations(FILE *fp);
	void	PutFixedSeqs(FILE *fp);

	double	Temperature( ){ return temperature; }
	Int4	GetNumRandom(){ return NumRandom; }
	void    PutContinueFile(FILE *fp,BooLean Label);
	// Returns LengthCMSA(1,MainCMA).
	Int4	RtnLengthMainCMSA( ){ return LengthCMSA(1,MainCMA); }
	// Returns Hpt->NumBPPS().
	Int4	RtnNumCategories( ){ return Hpt->NumBPPS(); }
	// Debug aid: stores then restores the best state, prints the
	// hyperpartition and the total LPR to stderr, and returns 0.
	Int4	Debugger(){
		  // this->PutHpt(stderr);  // saves hpt settings.
		  this->StoreBest();
		  this->RestoreBest();
		  this->PutHyperPartition(stderr);
		  // char *junk=AllocString("junk");
		  // this->WriteCheckpoint(junk); free(junk);
		  double dd=this->CalcTotalLPR();
		  fprintf(stderr,"LPR = %.3lf\n",dd);
		  return 0;
		}

	//=================== cmcBPPS.cc ========================
	// cma_typ	MkMainFileCMSA(cma_typ cma, Int4 num_random);

	//================= cmc_lpr.cc ==================== 
public:
	// Total LPR with no output file; StoreBestOK is TRUE exactly when
	// SaveBest is set.
	double  CalcTotalLPR( )	
		   { if(SaveBest) return CalcTotalLPR(0,TRUE);
			else return CalcTotalLPR(0,FALSE); }
	// As above, passing fp through to the two-argument form.
	double  CalcTotalLPR(FILE *fp)
		   { if(SaveBest) return CalcTotalLPR(fp,TRUE);
			else return CalcTotalLPR(fp,FALSE); }
	double	RestoreBest();	
	Int4    RtnNumFailed();
private:
	double  **GetResEvals(Int4 n);
		// ^For passing into tmpchn() for printing rtf.
	BooLean CheckValue(double x);
	double  TreeCalcTotalLPR(FILE *fp,BooLean StoreBestOK);
	double  NotTreeCalcTotalLPR(FILE *fp,BooLean StoreBestOK);
	// Dispatches on IsTreeHpt to the tree or non-tree LPR routine.
	double  CalcTotalLPR(FILE *fp,BooLean StoreBestOK){
		   if(IsTreeHpt){
			return TreeCalcTotalLPR(fp,StoreBestOK);
		   } else {
			return NotTreeCalcTotalLPR(fp,StoreBestOK);
		   }
		}
	double  NthLPR(Int4 n, FILE *fp=0);
	void	StoreBest();

	//****** variables for saving the Optimum 
	set_typ BestSet[MAX_NUM_ELMENTARY_SETS];
	sst_typ	*best_sst[MAX_NUM_ELMENTARY_SETS];
	double	BestLPR;
	BooLean	SaveBest;	// selects StoreBestOK in the public CalcTotalLPR() overloads.
	BooLean	ShowIndels;	// for rtf contrast alignments.
	BooLean	DidRestoreBest;
	hpt_typ	*BestHpt;
	BooLean	IsFailedBestSet[MAX_NUM_ELMENTARY_SETS];
	BooLean	IsFailedBestBPPS[MAX_NUM_ELMENTARY_SETS];

	//==================== cmc_arg.cc ============================
	void	ReadMainArg(Int4 argc, char *argv[]);
	void    PrintUsage(FILE *fp);

	//==================== cmc_init.cc =====================
	void	GetMainCMA();
	void	GetSeedAlns();
	char    GetSetsMode(Int4 Row);
	void    Init(Int4 argc, char *argv[]);
	void    InitAsNull();
	void	InitFlags();
	void	InitDefaults();

	set_typ	*PttrnPos;
	//==================== cmc_fill.cc =====================
	void    FillUpHpt( );
	void	PartitionBySeedAlnCMSA(Int4 NumSeedAln, char *TypeOfSet);
	void    PartitionByInputSetCMSA(Int4 NumSeedAln);

	Int4	ReadSeedPttrns( );
	void	GetChnFiles();
	Int4    SetUpNthSrch(Int4 n, Int4 argc,char *argv[]);
	void    RmAbsentSeqs();

	char    **GetSetRelations(const char *Title,Int4 *nGrpsX, Int4 **GrpsX,
			set_typ **RtnSetX);

	//==================== cmc_score.cc =====================
	void	ComputeMinSeed2CsqScores();
	Int4    *ComputeMinScore(FILE *ofp,Int4 NumSeedAln,FILE *efp=0);
	Int4    *ComputeMaxScore(FILE *ofp,Int4 NumSeedAln,
					Int4 *MinScoreInSet,FILE *efp=0);

	//==================== cmc_sort.cc =====================
	Int4    SortDisplaySets( );
	Int4    *SortByScoreCMSA(FILE *fp, char mode, Int4 &first_best,
					cma_typ cma, Int4 set);
	Int4	Seq2SeedCsqScore(Int4 sq,Int4 set);

	//==================== data arrays =====================
	Int4	*WorstToBest[MAX_NUM_ELMENTARY_SETS];
	Int4	Index1stBest[MAX_NUM_ELMENTARY_SETS];

	cma_typ	SeedCMA[MAX_NUM_ELMENTARY_SETS];
	Int4	MinSeed2CsqScore[MAX_NUM_ELMENTARY_SETS];

	//======================== cmc_pttrn.cc ==========================
	void	PartitionSingleSeedAlnCMSA(Int4 NumSeedAln, cma_typ *seed_cma,
			char *TypeOfSet);
	char    *FindSeedPattern(Int4);
	
	//==================== private inline code =======================
	// Sum of che[n]->NumColumns() over n in 1..Hpt->NumBPPS().
	// Unlike PutAllSubLPRs(), che[n] is not asserted non-null here.
	Int4	TotalColumns( ){
			Int4 n,nCol=0;
			for(n=1; n<= Hpt->NumBPPS(); n++) nCol += che[n]->NumColumns( );
			return nCol; 
		}
#if 1	// new code for improved LPR
	// Membership test of sequence sq in GrpSet[st].  Asserts that
	// 1 <= st <= Hpt->NumSets() and 1 <= sq <= NumSeqsCMSA(TrueMainCMA).
	BooLean IsInSet(Int4 sq, Int4 st){
                   assert(st > 0 && st <= Hpt->NumSets());
                   assert(sq > 0 && sq <= NumSeqsCMSA(TrueMainCMA));
                   return MemberSet(sq,GrpSet[st]);
        }
#endif

	//========================= VARIABLES ===============================
	BooLean	IsTreeHpt; // Does the input FD-table correspond to a Newick tree?
	BooLean	IsTreePMC; // Was the input FD-table generated by the pmcBPPS program?
	Int4	PruneIter;	// default pruneIter for Sample(); set via ResetPruneIter().

	UInt4   RandomSeed;
	char	*SetsMode;	// released with free() in the destructor.
	static const Int4	MaxNumNodes=1000;
	static const Int4	MaxNumNodesPlus=1005;

	char	set_mode[MAX_NUM_ELMENTARY_SETS];
	cma_typ	passed_in_cma;
	cma_typ	passed_in_mcma;
	hsw_typ	passed_in_hsw;
#if 1	// new afn 11/2/12.
	hpt_typ	*passed_in_hpt;
	cma_typ	*passed_in_sma;
	Int4	num_passed_in_sma;
	Int4	SetSize;
#endif
	char	*program_name;
	char	*infile;
	a_type  AB;
	BooLean	ownAB;
	hpt_typ	*Hpt;	// hyperpartition type. 
	hpt_typ	*InitHpt;	// hyperpartition type. Input Hyperpartition.
	double  *SubLPR[MAX_NUM_ELMENTARY_SETS];
	double  *SubRawLPR[MAX_NUM_ELMENTARY_SETS];

	//=============== hpt_typ redundant information ================= 
	char	*HyperPartition[MAX_NUM_ELMENTARY_SETS];
	// ^ HyperPartition[grp][bpps]: FG=+; BG=-; RM=0.
	char	*sst_str[MAX_NUM_ELMENTARY_SETS];
	//************* End redundant with Hpt ********************

	Int4	SemiConvergedState;		// decide when to give up on failed sets.
	BooLean	IsFailedSet[MAX_NUM_ELMENTARY_SETS];
	BooLean	IsFailedBPPS[MAX_NUM_ELMENTARY_SETS];
	Int4	RandomSet;

	//==================== InputSets =========================
	Int4	num_passed_in_sets;
	set_typ	*passed_in_sets;
	set_typ InitSet[MAX_NUM_ELMENTARY_SETS];
	//==================== InputSets =========================

	set_typ	GrpSet[MAX_NUM_ELMENTARY_SETS];	// queried by IsInSet().
	cma_typ	*DisplayCMA;
	Int4	NumDisplayCMA;

	cma_typ *IN_CMA;
	cma_typ	dummyCMA;
	cma_typ	*QryCMAs;
	cma_typ	MainCMA;	// input alignment with random sequences added.
	cma_typ	TrueMainCMA;	// input alignment.
	hsw_typ	MainHSW;
	Int4	StartCMA[MAX_NUM_ELMENTARY_SETS];
	// cma_typ *GetCMAs4CHN(cma_typ,cma_typ);

	chn_typ	**chn;

	che_typ	**che;	// indexed from 1 to Hpt->NumBPPS() by the inline code above.
	sqd_typ **sqd;
	sst_typ ***SST;
	char    *SFBG;
	BooLean	PutIntermediateFiles;
	Int4	NumRandom;
	//=============== GLOBAL settings. ===================
	double	GlobalA0,GlobalB0,GlobalRi,RejectRi;
	double	MiscGlobalA0,MiscGlobalB0,MiscGlobalRi;
	Int4	GlobalN;
	double	Global_rho;
	BooLean	NoFailureMode;
	// Int4	NthContrast[MAX_NUM_ELMENTARY_SETS];
	double	TotalLPR;
	double	TotalRawLPR;
	double	Map[MAX_NUM_ELMENTARY_SETS];
	//=============== Set information ===================
	char	**RelateFGs;	// subset[n1][n2]='<',superset='>',Intersect='+',Disjoint='0'.
	// Calls GetSetRelations() on the hyperpartition's FG groups; SetFG is
	// passed by address as the RtnSetX argument.
	char    **SetRelationsFG( ){
		  return GetSetRelations("FG",Hpt->nGrpsFG(),Hpt->GrpsFG(),&SetFG); 
		}
	// Same as SetRelationsFG() but for the BG groups and SetBG.
	char    **SetRelationsBG( ){
		  return GetSetRelations("BG",Hpt->nGrpsBG(),Hpt->GrpsBG(),&SetBG); 
		}
	set_typ *SetFG,*SetBG;
	char	**RelateBGs;	// subset[n1][n2]='<',superset='>',Intersect='+',Disjoint='0'.
	set_typ	Labeled;		// these are labeled as belonging to a fixed set.
	Int4	*MaxNumCol;
	Int4	SeedPttrnLen;
	BooLean	NoSeeds;	// TRUE -> call FindSeedPatterns heuristic.
	BooLean	AddCSQ;		// TRUE -> add a consensus sequence to seed alignments.
	double	temperature,MinTemperature;
	UInt4	ppb_increase;	// limit for
	Int4	NumberRounds;	// default NumRounds for Sample().
	Int4	MaximumIter;	// returned by MaxIterations().
	Int4	fixed_cutoff;
	FILE	*cfp;		// convergence file pointer (iteration, LPR, temperature).
	FILE	*ifp;		// sampling iteration file pointer with cardinality of sets.
	FILE	*efp;		// stderr file pointer.
	FILE	*outfp;		// infile.out file pointer.
	UInt8	Iteration;
	BooLean	StrictIndepend;
	Int4	DefaultMaxCol,DefaultMinCol;
};

#if 0	// new routines to be created (only stubs for now):
	// Everything in this region is excluded by the preprocessor; these are
	// planned declarations only and nothing here is compiled.
	BooLean EmptyCMSA(); 	// allow empty cmsa files as bookmarks.
	BooLean	CheckPttrnIntegrity( ); // make sure display set is consistent with input patterns.
	void	CreateRandomSet( );	// As BG for entire class.
#endif

// Free-function entry point taking the command-line arguments; defined outside
// this header.  HptIs is a non-const reference, so the callee can write a
// character back to the caller.
// NOTE(review): the meaning of the HptIs values and of the Int4 return code is
// not documented here -- confirm at the definition.
Int4    RuncmcBPPS(Int4 argc,char *argv[],char &HptIs);

#endif

