/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "lps_typ.h"
#include "hmmer.h"

#if 0
	The HMMER code was modified by moving jackhmmer.c to run_jackhmmer.c,
	renaming main() to RunJackHMMer(), and creating a simple driver
	file called jackhmmer.c that merely calls RunJackHMMer() to build
	the jackhmmer executable.  The src/Makefile.in file was modified
	accordingly.
References:
Hidden Markov model speed heuristic and iterative HMM search procedure.
Johnson LS, Eddy SR, Portugaly E.
BMC Bioinformatics. 2010 Aug 18;11:431. doi: 10.1186/1471-2105-11-431.
PMID: 20718988

HMMER web server: 2015 update.
Finn RD, Clements J, Arndt W, Miller BL, Wheeler TJ, Schreiber F, Bateman A, Eddy SR.
Nucleic Acids Res. 2015 Jul 1;43(W1):W30-8. doi: 10.1093/nar/gkv397. Epub 2015 May 5.
PMID: 25943547 

and CD-hit as well...
#endif

// C-linkage entry points that take a main()-style argument vector.
// RunJackHMMer() is the renamed main() of HMMER's jackhmmer (see the note above);
// RunReformat() is presumably the analogous wrapper for the reformat tool -- TODO confirm.
extern "C" int RunJackHMMer(int argc, char **argv);
extern "C" int RunReformat(int argc, char **argv);
// Defined in another translation unit; called below with the argument vector
// {program name, <query>.afa, <query>.cma} plus the residue alphabet.
extern int convert_msa(int argc, char *argv[],a_type AB);

/*************************************************************************
 Read the alignment(s) in <filename>.cma, merge them, drop poorly aligned
 and duplicate sequences, and write the result using the suffix 'matstr'.

 filename : file name prefix; the input is opened with the ".cma" suffix.
 matstr   : suffix passed to open_file() for the output file.
 MinCol   : minimum fraction (0..1) of the aligned columns in which a
            sequence must have a defined residue in order to be kept.
 AB       : alphabet used for reading the alignments.

 A histogram of the per-sequence aligned fractions is printed to stdout.

 Returns the number of sequences that passed the MinCol filter (counted
 before duplicates are removed).  Returns 0, without writing an output
 file, when no sequence passes.
 *************************************************************************/
Int4	lps_typ::jhs_typ::PutUniqueMergedMinColCMSA(char *filename, char *matstr, double MinCol, a_type AB)
{
	Int4	Number,file;
	FILE	*fp=open_file(filename,".cma","r");
	cma_typ	cma,*IN_CMA=MultiReadCMSA(fp,&Number,AB); fclose(fp);

	//============ step 1 ("-m" option): merge the input alignments ============
	// Only single-block alignments are supported; IN_CMA is indexed from 1.
	assert(nBlksCMSA(IN_CMA[1]) == 1);
	if(Number > 1){
	   // Round-trip through a temporary file: write the merged alignment,
	   // release the inputs, then read the merged alignment back in.
	   fp=tmpfile(); assert(fp != NULL);
	   PutMergedCMSA(fp,Number,IN_CMA);
	   for(file=1; file <= Number; file++) TotalNilCMSA(IN_CMA[file]);
	   free(IN_CMA);
	   rewind(fp); cma=ReadCMSA(fp,AB); fclose(fp);
	} else { cma=IN_CMA[1]; free(IN_CMA); }

	//============ step 2 ("-mincol" option): drop poorly aligned sequences ============
	h_type	HG=Histogram("fraction aligned",0,1,0.05);
	Int4	i,j,s,J,n,na;
	Int4	N=NumSeqsCMSA(cma);
	Int4	Len=LengthCMSA(1,cma);
	BooLean	*skip; NEW(skip,N+3,BooLean);
	for(n=0,J=1; J <= N; J++){
		skip[J]=TRUE;			// skipped unless it passes the test below.
		// na = number of columns in which sequence J has a defined residue.
		for(na=0,s=1; s <= Len; s++){
			if(ResidueCMSA(1,J,s,cma) != UndefAlpha(AB)) na++;
		}
		double fr=(double)na/(double)Len;
		if(fr >= MinCol){ skip[J]=FALSE; n++; }
		IncdHist(fr,HG);
	}
	if(n == 0){	// nothing passed: report the histogram, clean up and quit.
		PutHist(stdout,60,HG); NilHist(HG); fflush(stdout);
		free(skip); TotalNilCMSA(cma);
		return 0;
	}
	// Same temporary-file round trip, this time keeping only the unflagged sequences.
	fp=tmpfile(); assert(fp != NULL);
	PutSelectCMSA(fp,skip,cma);
	TotalNilCMSA(cma); rewind(fp); cma=ReadCMSA(fp,AB); fclose(fp);
	PutHist(stdout,60,HG); NilHist(HG); fflush(stdout); free(skip);

	//============ step 3 ("-U" option): drop duplicate sequences ============
	ss_type	data=TrueDataCMSA(cma);
	// NOTE: the loop reads skip[] right after NEW(), so NEW() is presumably zero-filling.
	N=NSeqsSeqSet(data); NEW(skip,N+3,BooLean);
	for(i=1; i < N; i++){
	   if(skip[i]) continue;	// already flagged as a copy of an earlier sequence.
	   e_type  qE=SeqSetE(i,data);
	   if(i % 1000 == 0) fprintf(stderr,"\r%.1f",100.0*((double)i/(double)N));
	   // All-against-all comparison: the first occurrence is kept, later ones flagged.
	   for(j=i+1; j <= N; j++){
		if(!skip[j] && IdentSeqs(qE,SeqSetE(j,data))) skip[j]=TRUE;
	   }
	} fprintf(stderr,"\n");

	//============ step 4: write the surviving sequences ============
	fp=open_file(filename,matstr,"w");
	PutSelectCMSA(fp,skip,cma); free(skip); fclose(fp);
	TotalNilCMSA(cma);
	return n;
}

int	lps_typ::jhs_typ::run_lapis_JH(int argC, char *argV[])
{
	double	incE=0.001;
	if(argC < 4) print_error(LAPIS_USAGE_PLUS);
	char	*ArgV[20],str[5002];
	int	i,j,J,ArgC=0;
	ArgV[ArgC]=AllocString("lapis"); ArgC++;
	ArgV[ArgC]=AllocString("-A"); ArgC++;
	sprintf(str,"%s.sto",argV[1]);
	ArgV[ArgC]=AllocString(str); ArgC++;
	ArgV[ArgC]=AllocString("-o"); ArgC++;
	ArgV[ArgC]=AllocString("/dev/null"); ArgC++;
	ArgV[ArgC]=AllocString("--cpu"); ArgC++;
	ArgV[ArgC]=AllocString(argV[3]); ArgC++;
	sprintf(str,"%0.5lf",incE);
	ArgV[ArgC]=AllocString("--incE"); ArgC++;
	ArgV[ArgC]=AllocString(str); ArgC++;
	ArgV[ArgC]=AllocString("-E"); ArgC++;
	ArgV[ArgC]=AllocString("0.1"); ArgC++;
	ArgV[ArgC]=AllocString(argV[1]); ArgC++;
	ArgV[ArgC]=AllocString(argV[2]); ArgC++;
	ArgV[ArgC]=0;
	for(i=0; ArgV[i]!= 0; i++) fprintf(stderr,"%s ",ArgV[i]); fprintf(stderr,"\n");
	RunJackHMMer(ArgC,ArgV);
	for(i=1; ArgV[i] != 0; i++) free(ArgV[i]); ArgC=1;

	ArgV[ArgC]=AllocString("-o"); ArgC++;
	sprintf(str,"%s_fa",argV[1]);
	ArgV[ArgC]=AllocString(str); ArgC++;
	ArgV[ArgC]=AllocString("--informat"); ArgC++;
	ArgV[ArgC]=AllocString("stockholm"); ArgC++;
	ArgV[ArgC]=AllocString("afa"); ArgC++;
	sprintf(str,"%s.sto",argV[1]);
	ArgV[ArgC]=AllocString(str); ArgC++;
	ArgV[ArgC]=0;
	for(i=0; ArgV[i]!= 0; i++) fprintf(stderr,"%s ",ArgV[i]); fprintf(stderr,"\n");
	RunReformat(ArgC,ArgV);
	for(i=1; ArgV[i] != 0; i++) free(ArgV[i]); ArgC=1;
	sprintf(str,"%s.sto",argV[1]); std::remove(str);

	FILE *ifp=open_file(argV[1],"_fa","r");
	FILE *ofp=open_file(argV[1],".afa","w");
	for(i=0; fgets(str,5000,ifp) != NULL; i++){
	   if(str[0]=='>'){
	      for(j=1; isdigit(str[j]); j++) ;
	      if(str[j]=='|'){ 
		j++; J=j;
	        fprintf(ofp,">");
		for( ; str[j] != '/' && ! isspace(str[j]); j++){
			 fprintf(ofp,"%c",str[j]);
		}
		if(str[j]=='/'){
			char *st=strstr(str,"[subseq from]");
			assert(st != NULL);
			st = st + 13;
			fprintf(ofp,"%s",st);
		} else {	// str[j] == ' ';
		  for( ; str[j] != 0; j++) fprintf(ofp,"%c",str[j]);
		}
	      } else fprintf(ofp,"%s",str);
	   } else fprintf(ofp,"%s",str);
	} fclose(ofp); fclose(ifp);
	// sprintf(str,"%s_fa",argV[1]); std::remove(str);
	for(i=1; ArgV[i] != 0; i++) free(ArgV[i]); ArgC=1;

	a_type AB = MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);
	sprintf(str,"%s.afa",argV[1]); ArgC=1;
	ArgV[ArgC]=AllocString(str); ArgC++;
	sprintf(str,"%s.cma",argV[1]);
	ArgV[ArgC]=AllocString(str); ArgC++;
	ArgV[ArgC]=0;
	convert_msa(ArgC, ArgV,AB);
	for(i=1; ArgV[i] != 0; i++) free(ArgV[i]); ArgC=1;

	double MinCol=0.75;
	char matstr[50];
	sprintf(matstr,"_m%.0lf.cma",100.0*MinCol);
	Int4 left=PutUniqueMergedMinColCMSA(argV[1],matstr,MinCol,AB);
	return 0;
}

// lapis <dbs> --> gismo/mapgaps mode
// lapis <query> <dbs> --> jackhmmer mode
//
// Parse the options that follow <query> <dbs> in this object's argC/argV and
// call run_lapis_JH() with the argument vector {lapis, <query>, <dbs>, <#threads>}.
// The only option recognized is -thrds=<int>; the thread count defaults to "1".
// Any other option is rejected via print_error(LAPIS_USAGE_JH).
// Returns whatever run_lapis_JH() returns.
int	lps_typ::jhs_typ::RunJH()
{
	Int4	arg,i,Argc=0;
	int	x;			// plain int, to match the %d conversions below.
	char	**Argv,str[20];
	for(i=0; argV[i]!= 0; i++) fprintf(stderr,"%s ",argV[i]);
	fprintf(stderr,"\n");
	// argV[1] and argV[2] are copied unconditionally below.
	if(argC < 3) print_error(LAPIS_USAGE_PLUS);
	NEWP(Argv,argC+20,char); Argc=0;
	Argv[Argc]=argV[Argc]; Argc++;     // argv[0] = lapis;   Argc==1;
	Argv[Argc]=argV[Argc]; Argc++;     // argv[1] = <query>; Argc==2
	Argv[Argc]=argV[Argc]; Argc++;     // argv[2] = <dbs>;   Argc==3;
	for(arg=3; arg < argC; arg++){
	    // Stop at the first non-option argument; the rest is not examined here.
	    if(argV[arg][0] != '-') break;
	    if(argV[arg][1] == 0) print_error(LAPIS_USAGE_PLUS);
	    switch(argV[arg][1]) {
	     case 't':
		if(sscanf(argV[arg],"-thrds=%d",&x)==1){
		   sprintf(str,"%d",x);
		   Argv[Argc]=AllocString(str); Argc++;     // argv[3] = #thrds
		} else print_error(LAPIS_USAGE_JH);
		break;
	     default:
		print_error(LAPIS_USAGE_JH);
		break;
	    }
	}
	if(Argc==3){ Argv[Argc]=AllocString("1"); Argc++; }	// default: one thread.
	int rtn=this->run_lapis_JH(Argc,Argv);
	// Slots 0..2 are borrowed from argV; every slot from 3 on was allocated here
	// (more than one when -thrds= is repeated), so release all of them.
	for(i=3; i < Argc; i++) free(Argv[i]);
	free(Argv);
	return rtn;
}

