/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

// # using namespace std;

#include "drc_typ.h"	// extern "C" int run_ccmpred(int argc, char **argv);

/*************************************************************************
  ReSizeForCCM(): copy a random subset of the lines in <infile> to <outfile>.

  The first line of <infile> is always kept; additional lines are chosen
  at random (using Random() keys in a heap) until Size lines are selected
  or the input is exhausted.  Kept lines are written in their original order.

    Size     target number of lines to keep (values <= 1 keep only line 1).
    infile   name of the input file (one sequence per line is assumed;
             TODO confirm against the CCMpred/PSICOV aln format).
    outfile  name of the output file (created or overwritten).

  Returns 0.  NOTE(review): open_file() is presumed to abort on failure, 
  as its return value is used unchecked throughout this file.
 *************************************************************************/
Int4    ReSizeForCCM(Int4 Size, char *infile,char *outfile)
{
        Int4    m,sq,N;
	char	str[5003];
	size_t	len;
	BooLean	partial=FALSE;

        FILE *fp=open_file(infile,"","r");

	// Pass 1: count logical lines.  A line longer than the buffer arrives 
	// as several fgets() chunks; only the chunk ending in '\n' completes it.
        for(N=0; fgets(str,sizeof(str),fp) != NULL; ){
		len=strlen(str);
		if(len > 0 && str[len-1] == '\n'){ N++; partial=FALSE; }
		else partial=TRUE;
	}
	if(partial) N++;	// final line lacking a terminating newline.

	// Give lines 2..N random keys; removing minima yields a random ordering.
        dh_type dH=dheap(N+2,4);
        for(sq=2; sq <= N; sq++){ insrtHeap(sq,(keytyp) Random(),dH); }

	// keep[] is presumed zero-initialized by NEW() -- the code relies on 
	// unselected entries being FALSE.
        BooLean *keep; NEW(keep,N+3,BooLean);
        keep[1]=TRUE;		// the first line is always retained.
	// Test the limit BEFORE extracting so that Size <= 1 keeps only line 1.
        for(m=1; m < Size && (sq=delminHeap(dH)) != 0; m++) keep[sq]=TRUE;
        Nildheap(dH);

	// Pass 2: copy every chunk belonging to a kept logical line.
	FILE *ofp=open_file(outfile,"","w"); rewind(fp);
        for(sq=1; fgets(str,sizeof(str),fp) != NULL; ){
                if(keep[sq]) fprintf(ofp,"%s",str);
		len=strlen(str);
		if(len > 0 && str[len-1] == '\n') sq++;	// logical line finished.
        }
	free(keep); fclose(fp); fclose(ofp); return 0;
}

/*************************************************************************
  USAGE: help text passed to print_error() by main() whenever the command
  line is malformed (wrong argument count, bad <PDBID>_<CHN> length, or an
  unrecognized/ill-formed option).

  NOTE(review): the banner line names the program 'subsmpl_dca' whereas the
    Usage line (and the argv[0] string built in main()) use 'sbsmpl_dca';
    confirm which spelling is intended.
  NOTE(review): main() opens its result files as open_file(argv[1],".sbsmpl","w")
    and open_file(<infile>_<pdbid>,"_ss.pml","w"), which suggests names based
    on <infile> rather than the '<pdbid>.sbsmpl' and '<pdbid>_ss.pml' listed
    below; main() also removes <infile>.in, <infile>_X.mst, <infile>_X.dca and
    <infile>_X before exiting.  Verify the 'Output:' section against this.
 *************************************************************************/
#define USAGE "\
  ----------------------------------------------------\n\
     subsmpl_dca v1.0.3 (November 14, 2019)\n\
     www.igs.umaryland.edu/labs/neuwald/software/darc/\n\
  ----------------------------------------------------\n\
  SubSample Direct Coupling Analysis\n\
  Usage: sbsmpl_dca <infile> <PDBID>_<CHN> <pdb_dir> <int1> <int2> [options]\n\
	 Input: \n\
	   <infile> 	  	Multiple sequence alignment (MSA) in cma format.\n\
	   <PDBID>_<CHN>	Map results to this sequence (e.g., 3GLF_B) in cma file.\n\
	   <pdb_dir>		Directory containing the pdb coordinates for <PDBID>\n\
				   named using the syntax <pdbid>_H.pdb; (e.g., '3glf_H.pdb')\n\
	   <int1>		Number of sequences sampled from the MSA.\n\
	   <int2>		Number of samplings (range: 2 to 1000).\n\
	 Output: \n\
	   <infile>.in  	MSA in PSICOV aln format.\n\
	   <infile>_X.mst	PDB sequences in aln format.\n\
	   <infile>_X.dca	DCA output file in PSICOV format.\n\
	   <infile>_X		DCA output file in EVcouplings format.\n\
	   <pdbid>.sbsmpl 	Table of the number of samples for which DC-pairs\n\
				             are among the top 20, 10,5 and 2.\n\
	   <pdbid>_ss.pml 	PyMOL script (when -pml option is used).\n\
         Options:\n\
	   -pml			Create a pymol script showing top scoring couplings\n\
	   -seed=<int>		Provide a seed for the random number generator\n\
	   -thrds=<int>		Number of threads to be used for CCMpred (default: 1)\n\
	   -dev=<int>		GPU device number to be used for CUDA CCMpred (default: -1 = none)\n\
   References:\n\
     1.Tondnevis F., Dudenhausen E.E., Miller A.M., McKenna R., Altschul S.F., Bloom L.B. & Neuwald A.F.\n\
        2019. Deep Analysis of Residue Constraints (DARC): identifying determinants of protein \n\
        functional specificity. Submitted.\n\
     2.Seemayer S., Gruber M. & Söding J. CCMpred--fast and precise prediction of protein\n\
        residue-residue contacts from correlated mutations. Bioinformatics. 2014. 30(21):3128-30.\n\
	\n"

/*************************************************************************
  main(): driver for the subsampling direct coupling analysis.

  Command line: <infile> <PDBID>_<CHN> <pdb_dir> <int1> <int2> [options]
  (see USAGE).  The steps performed are:
    0. parse the arguments and seed the random number generator;
    1. locate the <PDBID>_<CHN> sequence in the cma file and compute the
       offset needed to map alignment positions onto pdb numbering;
    2. run cma2aln on <infile>;
    3. run CCMpred on a 250-line random subset of <infile>.in;
    4. build a drc_typ object and run its key analysis for the pdb chain;
    5. perform the subsampling, write <infile>.sbsmpl (and, with -pml, 
       a PyMOL script), then remove the intermediate files.

  Returns 0 on completion.  NOTE(review): print_error() is presumed not to
  return (it is used here as a fatal-exit routine).
 *************************************************************************/
int	main(Int4 argc,char *argv[])
{ 
	enum { BufSize = 1024 };	// size of every file-name/scratch buffer below.
	Int4	i,Cut,Size,NN,Num,thrds=1,x,A,arg;
	char    *Argv[20],str[BufSize],chn=0;
	int     Argc=0;
	UInt4   Seed=0;
	BooLean	PutPyMOL=FALSE,SeedGiven=FALSE;
	FILE	*pmlfp=0;
	Int4	time1=time(NULL);
	Int4	gpu_num=-1;

	// Disabled stand-alone filter: re-numbers residue positions in a table 
	// read from stdin by a hard-coded offset (OS).
#if 0
   {
	char Str[200];
	Int4 len_str=200;
	Int4    i,x,y,ii,jj,sum,top20,top10,top5,top2;
	Int4	OS=69;
OS=124;
        char    cI,cJ;
        double	d,D,score;
	while(fgets(Str,len_str,stdin) != NULL){
	  if(sscanf(Str,"%d\t%c%d\t%c%d\t%d\t%d\t%lf\t%d\t%d\t%d\t%d\t%d\t%lf\n",
                &i,&cI,&ii,&cJ,&jj,&x,&y,&D,&top20,&top10,&top5,&top2,&sum,&score) ==14){
	      ii -= OS; jj -= OS;
	      fprintf(stdout,"%d\t%c%d\t%c%d\t%d\t%d\t%.2lf\t%d\t%d\t%d\t%d\t%d\t%.3f\n",
                i,cI,ii,cJ,jj,x,y,D,top20,top10,top5,top2,sum,score);
	  }
	} exit(1);
   }
#endif
	// 0. Get parameters for the analysis.
	// Five positional arguments plus up to four options (-pml, -seed, -thrds, -dev).
	if(argc < 6 || argc > 10) print_error(USAGE);
	if(strlen(argv[2]) != 6) print_error(USAGE);
	// File names are assembled in fixed-size buffers from argv[1] and argv[3];
	// reject names that cannot fit (with room for the longest suffix added below).
	if(strlen(argv[1]) + 32 >= sizeof(str)) print_error("FATAL: <infile> name is too long");
	if(strlen(argv[3]) + 16 >= sizeof(str)) print_error("FATAL: <pdb_dir> name is too long");
	Cut=1; Size=atol(argv[4]); Num=atol(argv[5]);
	for(arg=6; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE);
           switch(argv[arg][1]) {
               case 'd':
		  if(sscanf(argv[arg],"-dev=%d",&x) != 1) print_error(USAGE);
		  else if(x < 0) print_error(USAGE); else gpu_num=x;
                  break;
               case 'p':
		  if(strcmp("-pml",argv[arg]) == 0) PutPyMOL=TRUE; 
		  else print_error(USAGE); 
                  break;
               case 't':
		  if(sscanf(argv[arg],"-thrds=%d",&x) != 1) print_error(USAGE);
		  else if(x < 1) print_error(USAGE); else thrds=x;
                  break;
               case 's':
                  if(sscanf(argv[arg],"-seed=%u",&Seed) != 1) print_error(USAGE);
		  SeedGiven=TRUE;	// honor any explicit seed value.
                  break;
               default : print_error(USAGE); break;
	   }
        }
        if(!SeedGiven){		// no -seed option: derive one and report it for reproducibility.
             Seed = (UInt4) time(NULL);
             fprintf(stderr,"-seed=%u\n",Seed);
        } sRandom(Seed);
#ifndef CUDA
	// NOTE(review): this check is deliberately disabled by the leading '0 &&'.
	if(0 && gpu_num >= 0){
	   print_error("FATAL: code not compiled with CUDA; -dev option invalid");
	}
#endif
	
#if 1	// 1. Find offset needed to map DC-pairs to pdb coordinates.
	Int4	sq,Start=0,ncol=0,OffSet=0;
	e_type keyE=0;
	a_type  AB=MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);
	{
	   FILE *fp=open_file(argv[1],"","r");
	   cma_typ cma=ReadCMSA(fp,AB); fclose(fp);
	   if(cma == NULL) print_error("FATAL: invalid input file");
	   assert(nBlksCMSA(cma) == 1);
	   ncol=LengthCMSA(1,cma); 
	   // Look for a sequence whose identifier has the form 'NXXX_C' and equals argv[2].
	   for(sq=1; sq <= NumSeqsCMSA(cma); sq++){
		e_type E=TrueSeqCMSA(sq,cma);
		StrSeqID(str,10,E);
		if(strlen(str) != 6) continue;
                if(str[4] != '_') continue;
                if(!isdigit((unsigned char) str[0])) continue;
		str[6]=0;
		if(strcmp(str,argv[2]) == 0){ keyE=CopySeq(E); break; }
	   } TotalNilCMSA(cma);
	} 
	if(keyE){
	   // pdb file name = <pdb_dir>/<lower-case pdbid>_H.pdb
	   for(i=0; i < 4; i++) str[i]=tolower((unsigned char) argv[2][i]);
	   str[i]=0;
	   char filename[BufSize];
	   snprintf(filename,sizeof(filename),"%s/%s_H.pdb",argv[3],str);
	   pdb_typ pdb=MakePDB(filename);
	   Int4 pdb_chn=GetChainNumberPDB(pdb,argv[2][5]);
	   e_type pdbE=GetPDBSeq(pdb_chn,pdb);
	   NilPDB(pdb);
	   // Require the two sequences to overlap over at least half the aligned columns;
	   // OffSet stays 0 when no such overlap is found.
	   Int4 minol=ncol/2;
	   if(OverlappingSeq(Start,keyE,pdbE,minol,0)){
		Int4 os=OffSetSeq(pdbE); Int4 kos=OffSetSeq(keyE); 
		OffSet=(Start + kos) -os;
	   }
	   NilSeq(keyE); NilSeq(pdbE);
	} else print_error("pdb sequence absent from the alignment");
#endif
	// 2. Create the *_X.mst and *.in files.
	Argv[0]=AllocString("cma2aln"); Argc=1; Argv[Argc]=argv[1]; Argc++;
        char **pdbID=run_cma2aln(Argc,Argv,NN); // pdbID=pdbids in *_fg and in *.mst files.
	free(Argv[0]);

	// 3. Create an initial DCA file in PSICOV format = <infile>_X.dca
	// 	don't need a large file for this...
	char str0[BufSize];
        snprintf(str,sizeof(str),"%s.in",argv[1]);
	snprintf(str0,sizeof(str0),"%s_tmp.in",argv[1]);
        ReSizeForCCM(250,str,str0);
	Argv[3]=Argv[4]=0; 
	Argv[0]=AllocString("ccmpred"); Argc=1;
        Argv[Argc]=AllocString("-t"); Argc++;
        snprintf(str,sizeof(str),"%d",thrds); Argv[Argc]=AllocString(str); Argc++;
	// NOTE(review): '-d <n>' is passed as ONE argv element containing a space;
	// confirm that run_ccmpred() parses it as intended.
	if(gpu_num >= 0){
	   snprintf(str,sizeof(str),"-d %d",gpu_num); Argv[Argc]=AllocString(str); Argc++;
	}
        snprintf(str,sizeof(str),"%s_tmp.in",argv[1]); Argv[Argc]=AllocString(str); Argc++;
        snprintf(str,sizeof(str),"%s_X.dca",argv[1]); Argv[Argc]=AllocString(str); Argc++;
        run_ccmpred(Argc,Argv);  fflush(stdout); fflush(stderr);
	for(A=0; A < Argc; A++){ free(Argv[A]); Argv[A]=0; } Argc=0;
        snprintf(str,sizeof(str),"%s_tmp.in",argv[1]); remove(str);

	// 4. Create a DCA file in EV-couplings format mapped to <pdbid> = <infile>_X
	Argv[Argc]=AllocString("sbsmpl_dca"); Argc++; 
	Argv[Argc]=AllocString(argv[1]); Argc++; 
	Argv[Argc]=AllocString(argv[3]); Argc++; 
	if(thrds != 1){
	   snprintf(str,sizeof(str),"-thrds=%d",thrds); Argv[Argc]=AllocString(str); Argc++;
	}
	if(gpu_num >= 0){
	   snprintf(str,sizeof(str),"-dev=%d",gpu_num); Argv[Argc]=AllocString(str); Argc++; 
	}
	drc_typ *drc = new drc_typ(Argc,Argv,'B');
	drc->run_key_starc(argv[2],argv[3]);

	// 5. Perform subsampling analysis = <infile>.sbsmpl
	pml_typ *pml=0;
	if(PutPyMOL){ 
	   Int4	wire_width=60;
	   for(i=0; i < 4; i++) str[i]=tolower((unsigned char) argv[2][i]);
	   str[i]=0;
	   char filename[BufSize];
	   snprintf(filename,sizeof(filename),"%s/%s_H.pdb",argv[3],str);
	   snprintf(str0,sizeof(str0),"%s_%s",argv[1],str);
	   pmlfp=open_file(str0,"_ss.pml","w"); chn=argv[2][5]; 
	   pml= new pml_typ(filename,FALSE);
	   pml->PrintHEADER(pmlfp,FALSE,chn,wire_width);
	   fprintf(pmlfp,"cmd.show(\"cartoon\",\"resi 1-10000 and backbone\")\n");
	}
	FILE *xfp=open_file(argv[1],".sbsmpl","w");
	lsd_typ *lsd=drc->SubSampleDCA(Cut,Size,Num,xfp); 
	fprintf(xfp," Reference pdb = '%s'\n\n",argv[2]);
        Int4 **colpair=lsd->Put(xfp,pmlfp,chn,OffSet); delete lsd; 
	{
	  // colpair[1][] and colpair[2][] hold the paired columns, terminated by a 0 entry.
          rdc_typ *rdc = new rdc_typ(argv[1]);
          for(i=1; colpair[1][i] != 0; i++){
                Int4 x=colpair[1][i],y=colpair[2][i];
                assert(y > 0 && x > 0); rdc->Run(xfp,x,y,i);
          } delete rdc; 
        } free(colpair[1]); free(colpair[2]); free(colpair); fclose(xfp);
	for(i=0; i < Argc; i++) free(Argv[i]);
	if(pmlfp){ 
	   pml->PrintTAIL(pmlfp); 
	   fprintf(pmlfp,"cmd.center(\"chain %c\")\n",chn);
	   fclose(pmlfp); delete pml; 
	} delete drc;
	if(pdbID){ for(i=1; pdbID[i]; i++) free(pdbID[i]); free(pdbID); }
	NilAlpha(AB);
#if 1   // remove temporary files.
        snprintf(str,sizeof(str),"%s_X.mst",argv[1]); remove(str);
        snprintf(str,sizeof(str),"%s_X",argv[1]); remove(str);
        snprintf(str,sizeof(str),"%s_X.dca",argv[1]); remove(str);
        snprintf(str,sizeof(str),"%s.in",argv[1]); remove(str);
#endif
	fprintf(stderr,"\nSubsampling completed successfully.\n");
	// Convert the elapsed time to int explicitly so that it matches '%d'
	// (time_t is typically wider than int).
	int elapsed=(int)(time(NULL)-time1);
	fprintf(stderr,"\trun time: %d seconds (%0.2f minutes)\n\n",
		elapsed,(double)elapsed/60.0);
	return 0;
}

