/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "sprc_typ.h"

// Usage text handed to print_error() by sprc_typ::Init() when the command line is malformed.
// The input prefix placeholder is spelled <mstarc_prefix> throughout.
// NOTE(review): the text below advertises a -D default of 5.0, whereas sprc_typ::Init()
// initializes Dmax to 3.0 -- confirm which value is intended before changing either.
#define USAGE "   *** Multiple Statistical Analyses of Residue Direct Couplings***\n\
   Usage: mstarc <pdb_directory> <mstarc_prefix> [options]\n\
   Input:  <pdb_directory> location of pdb files\n\
           <mstarc_prefix>.mst file of NCBI pdb ids and aln files obtained from cma2aln\n\
           <mstarc_prefix> file of DCA scores in EVC format\n\
              or <mstarc_prefix>.dca file of DCA scores in PSICOV or GaussDCA format\n\
              from which will be created a file of DCA scores in EVcouplings format\n\
      Note: will also take a PSICOV or GaussDCA formatted input file as \n\
            <dca_file>.dca, in which case a corresponding <dca_file>.aln file\n\
            must also be provided.  The latter indicates which residues in <chain>\n\
            are aligned & map to DCA scores (upper case) and which are unaligned\n\
            & thus map to unscored gaps (lower case).  From these an EVcouplings\n\
            formatted file <dca_file> will be generated.\n\
   Output: Adjusted p-value and the corresponding -log10(p) score to stdout.\n\
   Options: \n\
      -A	  Print ICA array to stdout\n\
      -BG=<str>   prefix for background dca file (= <str>.dca).\n\
      -D=<real>   Maximum distance (in Angstroms) between scored residue pairs (default: 5.0).\n\
      -d=<real>   Minimum distance (in Angstroms) between scored residue pairs (default: 0).\n\
      -F          Print full length output.\n\
      -I          Optimize over ICA P-values instead of unified P-values\n\
      -m=<int>    Minimum sequence separation between paired residues (default: 5)\n\
      -M=<int>    Millions of simulations to perform for permutation scores (default: 0)\n\
      -R          Randomize column pair scores (control for ICA p-value calculation).\n\
      -S          Shuffle column pair ranks (control for Ball-in-urn & permutation p-value calculations).\n\
      -seed=<int> Provide a random seed (used with -R & -S options).\n\
      -v          Verbose output.\n\
   References:\n\
      Neuwald, A.F. & Altschul, S.F. 2018. Statistical Investigation of Protein Residue Couplings. \n\
          (submitted)\n\
      Altschul, S.F. & Neuwald, A.F. 2018. Initial Cluster Analysis. Journal of\n\
          Computational Biology 25(2):121-129.\n\
\n\n"

/**************************************************************************
 Screen a pdb structure before chain 'c' is analyzed.

 Returns a status code (NOT a boolean):
   0 = no problem detected.
   1 = IsFullProteinChainPDB() rejected the chain (e.g., only alpha carbons).
   2 = fewer than 2 hydrogens per residue over the whole structure
       (missing hydrogens or sidechains), or the structure has no residues.
   3 = reserved for "missing residues"; the IsResGapPDB() test that would
       produce it is currently disabled, so this code is never returned here.

 PDBID is not used by this routine.
 **************************************************************************/
Int4	sprc_typ::IsChainOkayPDB(char c, char PDBID[],pdb_typ PDB)
{
	Int4	numHyd=CountHydrogensPDB(PDB);
	Int4	numRes=CountResPDB(PDB);
	// Avoid dividing by zero: a structure without residues is given a ratio of 0
	// so that it is reported as a problem instead of slipping through as NaN/inf.
	double	hydPerRes=0.0;
	if(numRes > 0) hydPerRes=(double)numHyd/(double)numRes;

	Int4	k=GetChainNumberPDB(PDB,c);	// key chain identifier.
	if(!IsFullProteinChainPDB(k,PDB)) return 1;	// problems (e.g., only alpha carbons)
	if(hydPerRes < 2) return 2;			// missing hydrogens or sidechains.
	// The missing-residue test (IsResGapPDB(k,PDB) --> return 3) is intentionally switched off.
	return 0;
}

/**************************************************************************
 Run the direct-coupling analysis for one pdb chain, first on its own and
 then paired with every other protein chain in the same file whose sequence
 matches it (as judged by IsSubSeqMaxRatioX() or IsSameSeqFast()).

 Steps:
   1. Open <argv[1]>/<lower-case PDBID>_H.pdb and screen it with IsChainOkayPDB().
   2. Build an argument vector for edc_typ:
        Argv[0]="starc", Argv[1]=pdb path, Argv[2]=chain spec (set per run),
        Argv[3..]=argv[2..argc-1], last="-D=<Dmax>".
   3. Collect the matching ("adjacent") chains.
   4. If this->dcaBGfile is set, obtain a background dcm_typ from it.
   5. For the chain alone ("C") and each pair ("C:X") run edc_typ::RunDCA();
      a result whose scores differ from the last single-chain result is
      inserted into this->dch, otherwise it is deleted.

 Parameters:
   mstAln     passed through to the edc_typ constructor.
   PDBID      pdb identifier; the first 4 characters are lower-cased for the file name.
   chain      key chain character.
   values     values[0] = number of pattern positions, values[1..values[0]] = positions;
              only forwarded (copied) for the single-chain run.
   ColorCode  passed to edc_typ::RunDCA().
   mode       passed to edc_typ for the first run; later runs use 'm'.
   pmlfp      if non-NULL, dci_typ::PyMolLines() output is written to it.
   num_dc     passed to RunDCA() (when dcfp is set) and to PyMolLines().
   dcfp       if non-NULL, per-run tables are written to it.

 Returns 0 on completion, 4 if the pdb file cannot be opened, or the non-zero
 code from IsChainOkayPDB().  Unrecoverable conditions call print_error().
 **************************************************************************/
Int4	sprc_typ::RunSTARC(char mstAln[],char PDBID[],char chain, Int4 values[], char ColorCode,
		char mode,FILE *pmlfp, Int4 num_dc, FILE *dcfp)
{
	const int MaxArgs=50;		// capacity of the forwarded argument vector.
	Int4	i,j,k,Rtn=0;
	int	Argc=0,ArgB=0;
	char	*Argv[MaxArgs],Str[100],Path[2048],pdbid[9],pstr[9],chn=0;

	for(j=0; j < 4; j++) pdbid[j]=tolower(PDBID[j]);
	pdbid[j]=0;
	if(pmlfp) { chn=chain; assert(isalpha(chn) || isdigit(chn)); }

	// ======= open and screen the pdb file ======
	// The directory comes from the command line, so format with a bound and
	// refuse a truncated path rather than overrunning the buffer.
	int len=snprintf(Path,sizeof(Path),"%s/%s_H.pdb",argv[1],pdbid);
	if(len < 0 || len >= (int) sizeof(Path))
		print_error("sprc_typ::RunSTARC(): pdb file path is too long");
	FILE *tfp=fopen(Path,"r");
	if(tfp==NULL) return 4;
	pdb_typ PDB=MakePDB(tfp,Path); fclose(tfp);
	Rtn=IsChainOkayPDB(chain,PDBID,PDB);
	if(Rtn > 0){ NilPDB(PDB); return Rtn; }

	// ======= build the argument vector for edc_typ ======
	// Slots used: 0, 1, 2 (filled per run), 3..argc, and argc+1 for "-D=";
	// verify the capacity BEFORE writing anything.
	if(argc+2 >= MaxArgs) print_error("sprc_typ::RunSTARC(): too many command line arguments");
	Argv[0]=AllocString("starc");
	Argv[1]=AllocString(Path);
	for(i=2,j=3; i < argc; i++,j++){
		if(i==argB) ArgB=j;	// argB == bpps option.
		Argv[j]=argv[i];	// shared with argv, not owned here.
	}
	Argc=argc+1;
	snprintf(Str,sizeof(Str),"-D=%.3f",Dmax);
	Argv[Argc]=AllocString(Str); Argc++;	// WARNING: don't change the value of Argc below!!!

	// ======= find adjacent identical chains ======
	k=GetChainNumberPDB(PDB,chain);	// key chain
	e_type kE=GetPDBSeq(k,PDB);
	Int4	NumAdj=0,strt,MinOverlap=100,RtnNumX;
	float	maxRatioX=0.20;
	// AdjChn[0] is a blank placeholder for the single-chain run, AdjChn[1..NumAdj]
	// are the matching chains and AdjChn[NumAdj+1] terminates the list; sizing by
	// the number of chains means large assemblies cannot overflow it.
	char	*AdjChn=(char *) malloc(nChainsPDB(PDB)+2);
	if(AdjChn==NULL) print_error("sprc_typ::RunSTARC(): out of memory");
	AdjChn[0]=' ';
	for(i=1; i <= nChainsPDB(PDB); i++){
		if(i==k || !IsProteinPDB(i,PDB)) continue;
		e_type aE=GetPDBSeq(i,PDB);
		if(IsSubSeqMaxRatioX(aE,kE,strt,maxRatioX)
			|| IsSameSeqFast(aE,kE,&strt,&RtnNumX,MinOverlap)){
				NumAdj++; AdjChn[NumAdj]=ChainCharPDB(i,PDB);
		}
		NilSeq(aE);
	}
	AdjChn[NumAdj+1]=0;
	NilPDB(PDB); NilSeq(kE);

	// ======= optional background DC-scores ======
	dcm_typ *dcmBG=0;
	if(this->dcaBGfile){
	    // starc <pdb_file> <chain> <dca_file> ; dca_file = Argv[3];
	    char *tmp=Argv[3];		// temporarily swap in the background file.
	    snprintf(Str,sizeof(Str),"%c",chain); Argv[2]=AllocString(Str); Argv[3]=this->dcaBGfile;
	    if(ArgB > 0) Argv[ArgB][1]=' ';	// blank out the bpps option for this run.
	    edc_typ *edcBG = new edc_typ(Argc,Argv,mstAln);
	    dcmBG=edcBG->RtnDCM(); delete edcBG;
	    free(Argv[2]); Argv[2]=0; Argv[3]=tmp;
	}

	// ====== Run analysis for all combinations ====
	double DD=0;			// single-chain score handed to later pair runs.
	long double lastS=0,S,lastBIU=0,BIU;
	for(i=0; i <= NumAdj; i++){
	     // i == 0 --> key chain alone (bpps option enabled); i > 0 --> key:adjacent pair.
	     if(i > 0){
		if(ArgB > 0) Argv[ArgB][1]=' ';
		snprintf(Str,sizeof(Str),"%c:%c",chain,AdjChn[i]);
	     } else {
		if(ArgB > 0) Argv[ArgB][1]='b';
		snprintf(Str,sizeof(Str),"%c",chain);
	     }
	     Argv[2]=AllocString(Str);

	     edc_typ *edc=0;
	     if(values[0] > 0 && strlen(Str) == 1){	// don't pass bpps pattern for multiple chains.
		Int4 *bpps=0;
		NEW(bpps,values[0]+3,Int4);	// bpps is freed by edc_typ.
		for(j=0; j <= values[0]; j++) bpps[j]=values[j];
		bpps[j]=0;
		snprintf(pstr,sizeof(pstr),"%4s_%c",PDBID,Str[0]);
		edc= new edc_typ(Argc,Argv,mstAln,mode,bpps,pstr);
	     } else edc= new edc_typ(Argc,Argv,mstAln,mode);

	     dci_typ *dci=0;
	     if(dcfp){
		char *Chns=edc->RtnChains();
		if(Chns[1] == 0){	// headers are printed for single-chain runs only.
		   fprintf(dcfp,"\n-------------- %s chain %s -------------\n",edc->pdbid,Chns);
		   fprintf(dcfp,"rank\tresI\tresJ\tdist\tDC-scr\n");
		}
		fprintf(dcfp,".........................................\n");
		dci=edc->RunDCA(ColorCode,dcfp,num_dc,Chns,dcmBG);
		fprintf(dcfp,".........................................\n");
		if(dci) dci->PutResults(dcfp,Mode,Chns,edc->pdbid,0.0);
		free(Chns);
	     } else dci=edc->RunDCA(ColorCode,0,0,0,dcmBG);

	     if(dci){
		if(pmlfp) dci->PyMolLines(pmlfp,num_dc,chn,AdjChn[i]);
		S=dci->RtnScores(BIU);
		if(S != lastS || BIU != lastBIU){	// differs from the last single-chain result.
		  char *Chns=edc->RtnChains();
		  if(edc->DoShowPairs()){
		    dci->Put(stdout);
		    assert(edc->pdbid[0] != 0);
		    if(i > 0) dci->PutResults(stdout,Mode,Chns,edc->pdbid,DD);
		    else dci->PutResults(stdout,Mode,Chns,edc->pdbid);
		  }
		  if(i==0) DD=0;	// the single-chain entry is stored with a reference score of 0.
		  if(dch->Insert(dci,Chns,pdbid,DD)==0){
		     fprintf(stderr,"dch hpsz = %d; items = %d\n",dch->RtnHpsz(),dch->RtnItemsInMheap());
		     print_error("S-heap size exceeded.");
		  }
		  if(i==0) DD=(double)S;
		  free(Chns);
		} else delete dci;	// not inserted, so it is released here.
		if(strlen(Str) == 1){ lastS=S; lastBIU=BIU; }
	     }
	     delete edc; free(Argv[2]);
	     mode='m';
	}
	// Release only what was allocated here; Argv[3..Argc-2] belong to argv.
	free(AdjChn);
	free(Argv[0]); free(Argv[1]); free(Argv[Argc-1]);
	if(dcmBG) delete dcmBG;
	return 0;
}

void	sprc_typ::Init()
{
	Int4	i,x;
        int	arg;
	char	str[200];

	Dmax=3.0; Mode='M'; num_dc=0; argB=0;
	if(argc < 3) print_error(USAGE);
	// for(Int4 i=0; i < argc; i++) fprintf(stderr,"%d %s\n",i,argv[i]);
	for(arg=3; arg < argc; arg++){
           if(argv[arg][0] != '-') print_error(USAGE);
           switch(argv[arg][1]) {
	      // case 'n': { if(strncmp(argv[arg],"-no_evc_file",12) == 0){ argB=arg; } } break;
	      case 'B': if(sscanf(argv[arg],"-BG=%s",str)==1){
			this->dcaBGfile=AllocString(str); argv[arg][1]=' '; 
		} break;
	      case 'b': if(strncmp(argv[arg],"-bpps=",6) == 0){ argB=arg; } break;
	      case 'D': {
		if(sscanf(argv[arg],"-D=%f",&Dmax)==1){	// maximum distance...
			if(Dmax < 2.0) Dmax=3.0; 	// ignore if value too low.
			// do nothing...save
		} else if(sscanf(argv[arg],"-DC=%d",&x)==1){
		   if(x >= 0 && x <= 100){ num_dc=x;
		     argv[arg][1]=' ';
		     // fprintf(stderr,"x=%d\n",x);
		   } else print_error("-DC option input error");
		} else print_error(USAGE);
	      } break;
	      case 'F': {
		if(argv[arg][2] != 0) print_error(USAGE); 
		else { Mode=0; argv[arg][1]=' '; }
	      } break;
	      default:	// ignore...
		break;
	   }
	} TurnOffLicenseStatement();
}

/**************************************************************************
 *.mst input file syntax:
5FF8_A: IVIIGINPGLMAAYKGHHYPGPGNHFWKCLFMSGLSEVQLNHMDDHTLPGkYGIGFTNMVERTTPGSKDLSSKEFREGGRILVQKLQKYQPRIAVFNGKCIYEIFSKevfgvKVKNLEFGLQPHKIPDTEtlCYVMPSSS
#Y=141,231,138,234,272,273,134,271,135,159,136,155,163,191,137,189,228,215,227,187,267,226,139,142,266
#R=157,152,271,159,145,188,160,151,155,270,153,181,200,272,154,269,229,142,192,273,191,230,162,146,228
#O=31,103,23,28,67,82,83,99,79,106,86,34,36,20,48,42,81,40,57,107,62,63,26,77,29
&5FF8_A;5HF7_A;5T2W_A.
 **************************************************************************/

int     sprc_typ::run_mstarc(FILE *pmlfp, char *pdb_id,FILE *dcfp,FILE *infp)
{
	Int4	f,i,j,k,x;
        char	str[2020],PDBID[9],chn,aln[2009],Str[100],str0[500];
	static Int4 pml_num=0;
	FILE *ifp=0;
	if(infp != 0) ifp=infp; else ifp=open_file(argv[2],".mst","r");
#if 1
	if(dcfp){	
	   pml_num++;
	   strncpy(str,argv[2],100);
	   char *s=strstr(str,"_fg_X");
	   if(s == NULL) print_error("sprc_typ::run_mstarc input error 1"); else *s=0;
	   // if(sscanf(argv[2],"%s_fg_X",str) != 1) print_error("sprc_typ::run_mstarc input error 2");
	   if(0) fprintf(stderr,"\n=============== %s_%d.pml = %s ===============\n", str,pml_num,pdb_id);
	   fprintf(dcfp,"\n================== %s_%d.pml = %s ==================\n", str,pml_num,pdb_id);
	}
#endif
	for(f=1,str[2000]=0; fgets(str,2010,ifp) != NULL; f++){
	   assert(str[2000] == 0); // make sure string is not too long.
	   if(!isprint(str[0])) break;
// fprintf(stderr,"%d.%s",f,str);
	   if(sscanf(str,"%4s_%c: %s\n",PDBID,&chn,aln) != 3){
		fprintf(stderr,"str=%s\n",str); print_error("run_mstarc input error 3");
	   }
	   sprintf(str0,"%s_%c",PDBID,chn);
// fprintf(stderr,"%d.%s_%c (%s)\n",f,PDBID,chn,pdb_id);
	   char x=0,code=0,Code=0,tmpstr[2020],*sptr=0;
	   if(pdb_id && strncmp(pdb_id,str0,6) != 0){
#if 1
	     while((x=fgetc(ifp)) == '#' || x == '&'){ 
		 assert(fgets(tmpstr,2010,ifp) != NULL); 
	     } if(x != EOF) ungetc(x,ifp);
#else
	     if((x=fgetc(ifp)) == '#' || x == '&'){ assert(fgets(tmpstr,2010,ifp) != NULL); }
	     else ungetc(x,ifp);
#endif
	     continue;	// skip over 
	   }
	   // ============== Check for patterns ====================
	   Int4	values[200],NumPttrns=0; values[0]=0;
	   do { 
	     if((x=fgetc(ifp)) == '#'){
		assert(fgets(tmpstr,2010,ifp) != NULL);
		if(sscanf(tmpstr,"%c=%[0-9,]\n",&code,str0) != 2) 
			print_error("FATAL: mst file pattern syntax error 1");
		if(code == 'Y' || code=='R'){
		   Code=code;
		   // fprintf(stderr,"str=%s\n",str0);
		   NumPttrns=ParseIntegers(str0,values, "FATAL: mst file pattern syntax error 2");	
		   values[0]=NumPttrns; // fprintf(stderr,"values[0]=%d\n",values[0]);
		   // need to modify STARC to take string instead of filename. 
		}
	     } else ungetc(x,ifp);
	   } while(x == '#');
	   //=============================================
	   BooLean done=TRUE;
	   if((x=fgetc(ifp)) == '&'){ assert(fgets(tmpstr,2010,ifp) != NULL); sptr=tmpstr; }
	   else { sptr=0; ungetc(x,ifp); }
	   char mode='M',LastPDB[10]; LastPDB[0]=0;
	   char pdbid[8];
	   Int4	Rtn=0;
	   do {
	     if(Rtn > 0 && strncmp(PDBID,LastPDB,4)==0){	// skip this pdb file..
	     } else {
	     	Rtn=this->RunSTARC(str,PDBID,chn,values,Code,mode,pmlfp,num_dc,dcfp); mode='m';
		if(Rtn > 0){
		  for(j=0; j < 4; j++) pdbid[j]=tolower(PDBID[j]); pdbid[j]=0;
		  sprintf(Str,"%s/%s_H.pdb",argv[1],pdbid); 
		  switch(Rtn){
		    case 1:
	               	fprintf(stderr,"%s has problems (e.g., only alpha carbons)...skipped\n",Str);
		       	break;
		    case 2:
	  		fprintf(stderr,"%s is missing hydrogens or sidechains...skipped\n",Str);
		    	break;
		    case 3: fprintf(stderr,"%s is missing residues...skipped\n",Str); break;
		    case 4: fprintf(stderr,"%s not found.\n",Str); break;
		    default:  print_error("Error in RunSTARC"); break;
		  }
		}
// else fprintf(stderr,"%s:%c (%s) = %d\n",PDBID,chn,LastPDB,Rtn);
	     	strncpy(LastPDB,PDBID,4);
	     }
	     // ============= check for other structures for the same protein. =========
	     if(sptr){
		char X=0;
		// fprintf(stderr,"pstr=%s\n",sptr);
		if(sptr[0]=='.'){
		   // ignore these...
		} else if(sscanf(sptr,"%4s_%c%c",PDBID,&chn,&X) == 3){
		   done=FALSE;
#if 1
		   if(chn==X){
		      fprintf(stderr,"%s_%c%c == %s_%c (ignored)\n",PDBID,chn,X,PDBID,tolower(chn));	   
		      X = sptr[7]; chn=tolower(chn);
		      // sprintf(str,"%s_%c: %s\n",PDBID,tolower(chn),aln);
		   } 
#endif
		   if(X==';'){ sptr=strchr(sptr,';'); sptr++; } 
		   else if(X == '.'){ sptr=0; }
		   else if(chn != X) print_error("FATAL: mst file structure syntax error 3");
		   sprintf(str,"%s_%c: %s\n",PDBID,chn,aln);
		   // fprintf(stderr,"pdb=%s_%c;\n",PDBID,chn);
		} else print_error("FATAL: mst file structure syntax error 4");
	     } else done=TRUE; 
	     //====================================================================
	   } while(!done);
	   fflush(stdout);
	} if(ifp != infp) fclose(ifp);
	return 0;
}

