/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "edc_typ.h"

Int4	edc_typ::ChainExistsPDB(char chain,pdb_typ pdb)
// Return the 1-based index of the first chain in pdb whose chain character
// equals 'chain', or 0 if no chain matches.
{
	Int4	idx=1;
	while(idx <= nChainsPDB(pdb)){
		if(ChainCharPDB(idx,pdb)==chain) return idx;
		++idx;
	}
	return 0;
}

#include "stdinc.h"

void	edc_typ::Free()
// Release everything this object owns: the bpps position array, the DCA
// sequence, the three strings allocated in Init() and the alphabet.
// Each member is reset to 0 after release so that calling Free() a second
// time (or re-initializing the object afterwards) cannot free the same
// memory twice.
{
	if(bpps){ free(bpps); bpps=0; }
	// dcaE is released here and only here; don't free it in dcm_typ!!!
	if(dcaE){ NilSeq(dcaE); dcaE=0; }
	if(dca_file){ free(dca_file); dca_file=0; }
	if(pdb_file){ free(pdb_file); pdb_file=0; }
	if(program){ free(program); program=0; }
	if(AB) NilAlpha(AB);
	AB=0;
}

void	edc_typ::PutOverlapSeqs(FILE *fp,Int4 OS, Int4 st1, e_type E1, Int4 st2, e_type E2)
{
	Int4	s1,s2,i,j,end1,end2;
	unsigned char	*sq1,*sq2;
	end1=LenSeq(E1); end2=LenSeq(E2); sq1=SeqPtr(E1); sq2=SeqPtr(E2);
	for(s1=st1,s2=st2; s1 <= end1 && s2 <= end2; s1++,s2++){
		fprintf(fp,"%c",AlphaChar(sq1[s1],AB));
	} fprintf(fp,"\n");
	for(s1=st1,s2=st2; s1 <= end1 && s2 <= end2; s1++,s2++){
		fprintf(fp,"%c",AlphaChar(sq2[s2],AB));
	} fprintf(fp,"\n");
}

#if 0	// use from sequence.cc
BooLean	edc_typ::OverlappingSeqs(Int4 &offset,e_type E1, e_type E2,Int4 MinOverlap,
			Int4 MaxMisMatch)
/*************************************************************
NOTE: this copy is compiled out by the enclosing '#if 0'; per the note on that
line the version in sequence.cc is the one to use.

Return TRUE if E1 & E2 are overlapping fragments of the same sequence.
This sets offset > 0 if E1 starts before E2, else it sets offset <= 0
(the callers in this file pass dca2pdbOS as 'offset').
It also allows for 'X' residues (code 0) in pdb files: positions where
either sequence has an 'X' count neither as a match nor as a mismatch.

Parameters:
  offset      - (output) start1 - start2 of the best ungapped diagonal found;
		left untouched when FALSE is returned.
  E1, E2      - the two sequences to compare.
  MinOverlap  - minimum number of matching positions required.
  MaxMisMatch - scanning of a diagonal stops once this many mismatches
		is exceeded.

Start:					end1=21; end2=24; MinOverlap=5;
	E1  ----+----+----+----+-	
	    |||||||||||||||||||||	start1=start2=1 offset = 1 - 1 = 0
	E2  ----+----+----+----+----
		:	:	:		(offset= start1 - start2 = 17 -1 = 16)
		:	:	:		( add 16 to second seq)
	E1                     ----+----+----+----+-
                               |||||	
	E2  ----+----+----+----+----    start2=stop2 = end2 - MinOverlap +1 = 24 -5 + 1 = 20       
					offset = 1 - 20 = -19 (add 19 to first seq)
		:	:	:	
		:	:	:	
       	E1  ----+----+----+----+-     	start1=stop1=end1-MinOverlap+1 = 21-5+1=17;
                            |||||	start2 = 1;
	E2                  ----+----+----+----+----           

 (move to sequence.cc eventually)
 *************************************************************/
{
	Int4	start1,start2,end1,end2,s1,s2,e1,e2,stop1,stop2,mismatch,match,best=0;
	unsigned char	*sq1,*sq2;

	end1=LenSeq(E1); end2=LenSeq(E2); sq1=SeqPtr(E1); sq2=SeqPtr(E2);
	// stop1/stop2 = last start position that still leaves MinOverlap residues.
	stop1=end1 - MinOverlap +1; stop2=end2 - MinOverlap +1; best=0;
// fprintf(stderr,"end1=%d; end2=%d; stop1=%d; stop2=%d\n",end1,end2,stop1,stop2);
	// Pass 1: E1 fixed at position 1, slide the start within E2 (offset <= 0).
        for(start1=start2=1; start2 <= stop2; start2++){
	   for(match=mismatch=0,s1=start1, s2=start2; s1 <= end1 && s2 <= end2; s1++,s2++){
		if(sq1[s1]==0 || sq2[s2] == 0) continue;  // skip one or two == 'X'.
	       	if(sq1[s1] == sq2[s2]) match++;
		else { mismatch++; if(mismatch > MaxMisMatch) break; }
	   }
	   // Only consider diagonals whose scan reached the end of a sequence.
	   if(s1 >= end1 || s2 >= end2){
	   	// fprintf(stderr,"s1=%d; s2=%d; match=%d; best=%d; offset=%d\n",s1,s2,match,best,offset);
		if(match >= MinOverlap && match > best){ best=match; offset=start1-start2; }
	   }
	}
	// Pass 2: E2 fixed at position 1, slide the start within E1 (offset >= 0).
        for(start1=start2=1; start1 <= stop1; start1++) {
	   for(match=mismatch=0,s1=start1, s2=start2; s1 <= end1 && s2 <= end2; s1++,s2++){
		if(sq1[s1]==0 || sq2[s2] == 0) continue;  // skip one or two == 'X'.
	       	if(sq1[s1] == sq2[s2]) match++;
		else {
if(0) fprintf(stderr,"mismatch: %c%d != %c%d\n",AlphaChar(sq1[s1],AB),s1,
			AlphaChar(sq2[s2],AB),s2);
			mismatch++; if(mismatch > MaxMisMatch) break; 
		}
	   }
	   // Only consider diagonals whose scan reached the end of a sequence.
	   if(s1 >= end1 || s2 >= end2){
	   	// fprintf(stderr,"s1=%d; s2=%d; match=%d; best=%d; offset=%d\n",s1,s2,match,best,offset);
		if(match >= MinOverlap && match > best){ best=match; offset=start1-start2; }
	   }
	}
if(0) fprintf(stderr,"best = %d\n",best);
	// best stays 0 unless some diagonal had at least MinOverlap matches.
	if(best > 0) return TRUE; else return FALSE;
}
#endif

void	edc_typ::InitDefaults( )
// Reset the option and state members listed below to their built-in defaults.
// Called at the top of Init() before the command line is parsed.
{
	// input files, key chain and DCA sequence state.
	dca_file=0;
	dcaE=0;
	dca2pdbOS=0;
	KeyChain=0;
	NumAdjChns=0;
	pdbid[0]=0;

	// optional output streams.
	afptr=0;
	efptr=0;

	// BPPS comparison state.
	bpps=0;
	color_code=0;
	TheOtherMtrx=0;
	OtherNumResC=0;
	TheOtherSet=0;

	// contact definition.
	MaxDist=5.0;
	MinDist=0;
	MinSqSeparation=5;
	UseHydrogens=TRUE;

	// scoring options.
	UseJeffreys=FALSE;
	UsePPV=FALSE;
	ComputeOverRange=FALSE;
	TruncateArray=0;
	ShowPairs=0;
	Split=0;
	UseSide=0;

	// simulation / randomization controls.
	PermuteMillions=0;
	Simulate=0;
	ShuffleRanks=0;
	seed=18364592;
}

dcm_typ	*edc_typ::RtnDCM( )
// Derived from RunDCA; see that for debugging...
// Build and return a dcm_typ for KeyChain of pdb_file.
// Steps: load the pdb file; if the key chain has residue numbers < 1, rebuild
// the structure from residues 1..max of that chain (via a temporary file);
// check that the DCA sequence (dcaE) overlaps the pdb sequence and update
// dca2pdbOS accordingly; then construct the dcm_typ.
// Returns 0 when there is no DCA sequence, when the DCA and pdb sequences are
// inconsistent, or when the dcm_typ fails to initialize.  Chain lookup
// failures are reported through print_error().  The caller owns the result.
{
	pdb_typ	PDB=MakePDB(pdb_file);	// PutPDB(stdout,PDB);
	if(!ChainExistsPDB(KeyChain,PDB)){
	   fprintf(stderr,"KeyChain==%c\n",KeyChain);
	   print_error("chain input error 4");
	}
	Int4	k=GetChainNumberPDB(PDB,KeyChain);
	// RenumberChainPDB(KeyChain,1,PDB); // XXX
	if(k == 0) print_error("chain input error 5");
	if(MinResPDB(k,PDB) < 1){
	  // Drop residue positions < 1 by round-tripping the chain through a temp file.
	  Int4	last=MaxResPDB(k,PDB);
	  FILE	*tfp=tmpfile();
	  if(tfp == NULL) print_error("edc_typ::RtnDCM(): tmpfile() failed");
	  PutSubChainPDB(tfp,1,last,KeyChain,PDB); rewind(tfp);
	  NilPDB(PDB); PDB=MakePDB(tfp); fclose(tfp);
	  // The chain index k refers to the old structure and is not reused below;
	  // the chain is looked up again by its character instead.
	}
	if(!ChainExistsPDB(KeyChain,PDB)) print_error("chain input error 6");
	if(dcaE == 0){ NilPDB(PDB); return 0; }	// nothing to compare against.

	// check for consistency between the DCA and pdb sequences...
	e_type	E=GetPDBSeq(GetChainNumberPDB(PDB,KeyChain),PDB);
	if(E == 0) print_error("chain input error 7"); // may be non-protein
	// AlnSeqSW(stderr,11, 1,dcaE, E, ab);
	if(!OverlappingSeqs(dca2pdbOS,dcaE,E,20,0)){
	  fprintf(stderr,"  dca file is inconsistent with pdb file %s:%c.\n",
				pdb_file,KeyChain);
	  NilPDB(PDB); NilSeq(E); return 0;
	}
	// Fold the stored sequence offsets into the diagonal offset just found.
	Int4	pdbOS=OffSetSeq(E),os=OffSetSeq(dcaE);
	dca2pdbOS=os-pdbOS+dca2pdbOS;
	NilSeq(E);

	// NOTE(review): PDB is released right after construction here, whereas RunDCA
	// keeps it alive until the dcm_typ is deleted -- confirm that dcm_typ does
	// not retain the pdb_typ it is given.
	dcm_typ	*dcm=new dcm_typ(KeyChain,PDB,this); NilPDB(PDB);
	if(!dcm->InitializedOkay()){ delete dcm; return 0; }
	return dcm;
}

#if 0	// needs more work on dcm_typ end.
// NOTE: compiled out by the enclosing '#if 0'.
// Load pdb_file, drop residue positions < 1 from KeyChain if present, check
// that the DCA sequence (if any) overlaps the pdb sequence (fatal error if it
// does not), then build a dcm_typ and return the matrix produced by
// dcm->CalcDistMtrx(FALSE).
// num_resC is not assigned anywhere in this body.
float	**edc_typ::CalcMtrxPDB(Int4 &num_resC)
{
	Int4	i,j,k,n,x,II=0; 
	char	c=' '; 
	pdb_typ	PDB=MakePDB(pdb_file);	// PutPDB(stdout,PDB);
	if(!ChainExistsPDB(KeyChain,PDB)) print_error("chain input error 1");
	// fix problem with negative numbering...
	k=GetChainNumberPDB(PDB,KeyChain);
	if(k == 0) print_error("chain input error 2");
	i=MinResPDB(k,PDB); j=MaxResPDB(k,PDB);
	if(i < 1){
	  if(0) fprintf(stderr,"Eliminating residue positions < 1 from chain '%c'; res=%d..%d\n",KeyChain,i,j);
	  // Round-trip residues 1..j of the key chain through a temporary file.
	  FILE *tfp=tmpfile(); PutSubChainPDB(tfp,1,j,KeyChain,PDB); rewind(tfp);
	  NilPDB(PDB); PDB=MakePDB(tfp); fclose(tfp);
	  // NOTE(review): k was looked up in the old structure -- confirm it is still valid here.
	  i=MinResPDB(k,PDB); j=MaxResPDB(k,PDB);
	  if(0) fprintf(stderr,"New chain '%c' pdb file: res=%d..%d\n",KeyChain,i,j);
	} II++;
	// fprintf(stdout,"\n=========== %d. %s:%c ============\n", II,pdb_file,KeyChain);
	if(dcaE){	// check for consistency...
		e_type E=GetPDBSeq(GetChainNumberPDB(PDB,KeyChain),PDB);
		if(E == 0) print_error("chain input error 3"); // may be non-protein
		a_type ab=AminoAcidAlphabetPDB(PDB); 
        	if(!OverlappingSeqs(dca2pdbOS,dcaE,E,20,0)){
           	  fprintf(stderr,"Sequences mismatch (%s)\n",pdb_file);
		  AlnSeqSW(stderr,11, 1,dcaE, E, ab);
           	  // PutSeq(stderr,dcaE,ab); PutSeq(stderr,E,ab);
		  print_error("dca file is inconsistent with pdb file");
		  // free(pdb_file); NilPDB(PDB); continue; 
        	} else if(efptr) AlnSeqSW(efptr,11, 1,dcaE, E, ab);
		// Fold the stored sequence offsets into the diagonal offset just found.
		Int4 pdbOS=OffSetSeq(E),os=OffSetSeq(dcaE); dca2pdbOS=os-pdbOS+dca2pdbOS;
		if(efptr) fprintf(efptr,"pdbOS= %d; os=%d; dca2pdbOS=%d\n",pdbOS,os,dca2pdbOS);
		if(0) fprintf(stderr,"pdbOS= %d; os=%d; dca2pdbOS=%d\n",pdbOS,os,dca2pdbOS);
		NilSeq(E);
	}

	// Compute the significance of DCA-structural overlap.
	// The assert fires only after the dcm_typ has already been constructed.
	dcm_typ *dcm=new dcm_typ(KeyChain,PDB,this); assert(dcaE);
	float    **MtrxPDB=dcm->CalcDistMtrx(FALSE); 
	delete dcm; NilPDB(PDB);
	return MtrxPDB;
}
#endif

dci_typ	*edc_typ::RunDCA(char ColorCode, FILE *dcfp, Int4 num_dc, char *chns,
		dcm_typ *dcmBG, FILE *lgfp,set_typ SetUsd)
// Run the DCA-versus-structure comparison for KeyChain of pdb_file.
//   ColorCode, dcfp, num_dc, chns, SetUsd - passed through to dcm_typ::PDBvsDCA().
//   dcmBG - if non-null, background matrix subtracted from the foreground
//	     MtrxDCA before the comparison.
//   lgfp  - if non-null, receives log messages about renumbering and about
//	     sequence/structure inconsistencies.
// Returns the dci_typ produced by PDBvsDCA() (owned by the caller), or 0 when
// there is no DCA sequence, when the DCA and pdb sequences are inconsistent,
// or when the dcm_typ fails to initialize.  Chain lookup failures are
// reported through print_error().
{
	pdb_typ	PDB=MakePDB(pdb_file);	// PutPDB(stdout,PDB);
	if(!ChainExistsPDB(KeyChain,PDB)){
	   fprintf(stderr,"pdb_file=%s; KeyChain=='%c'\n",pdb_file,KeyChain);
	   print_error("chain input error 4");
	}

     	// fix problem with numbering...
	Int4	k=GetChainNumberPDB(PDB,KeyChain);
	if(k == 0) print_error("chain input error 5");
	Int4	i=MinResPDB(k,PDB),j=MaxResPDB(k,PDB);
	if(i < 1){
	  if(lgfp) fprintf(lgfp,"Removing residues <= 0 from chain '%c' in '%s'; res=%d..%d\n",
			KeyChain,pdb_file,i,j);
	  // Round-trip residues 1..j of the key chain through a temporary file.
	  FILE	*tfp=tmpfile();
	  if(tfp == NULL) print_error("edc_typ::RunDCA(): tmpfile() failed");
	  PutSubChainPDB(tfp,1,j,KeyChain,PDB); rewind(tfp);
	  NilPDB(PDB); PDB=MakePDB(tfp); fclose(tfp); ReNamePDB(pdb_file,PDB);
	  // The old chain index belongs to the structure just released; look the
	  // chain up again before querying the rebuilt structure.
	  k=GetChainNumberPDB(PDB,KeyChain);
	  if(k == 0) print_error("chain input error 6");
	  i=MinResPDB(k,PDB); j=MaxResPDB(k,PDB);
	  if(lgfp) fprintf(lgfp,"   ...new chain '%c' pdb file: res=%d..%d\n",
				KeyChain,i,j);
	}

	if(!ChainExistsPDB(KeyChain,PDB)) print_error("chain input error 6");
	if(dcaE == 0){ NilPDB(PDB); return 0; }	// nothing to compare against.

	// check for consistency between the DCA and pdb sequences...
	e_type	E=GetPDBSeq(GetChainNumberPDB(PDB,KeyChain),PDB);
	if(E == 0) print_error("chain input error 7"); // may be non-protein
	a_type	ab=AminoAcidAlphabetPDB(PDB); 
	if(!OverlappingSeqs(dca2pdbOS,dcaE,E,20,0)){
	  fprintf(stderr,"===== edc->RunDCA(): %s %c =====\n",
			pdb_file,KeyChain);
	  PutSeq(stderr,E,AB); PutSeq(stderr,dcaE,AB);
	  AlnSeqSW(stderr,11,1,E,dcaE,AB);
	  fprintf(stderr,"=== Sequence & structural files are inconsistent: ===\n");
	  fprintf(stderr,"\tThis may due to residue positions less than 1,\n");
	  fprintf(stderr,"\tin which case delete those positions, or\n");
	  fprintf(stderr,"\tto sequence mismatches between the MSA and pdb files.\n");
	  fprintf(stderr,"\tDelete all output files after editing any input files.\n");
	  if(lgfp){
	     fprintf(lgfp,
		"%s %c: dca & pdb files inconsistent (e.g., disordered regions?).\n",
				pdb_file,KeyChain);
	  }
	  NilPDB(PDB); NilSeq(E); return 0;
	} else if(efptr){	// verbose: show the two sequences along the matched diagonal.
	  PutDiagonalSeq(stderr, dca2pdbOS,dcaE, E, ab);
	}
	// Fold the stored sequence offsets into the diagonal offset just found.
	Int4	pdbOS=OffSetSeq(E),os=OffSetSeq(dcaE);
	dca2pdbOS=os-pdbOS+dca2pdbOS;
	if(efptr) fprintf(efptr,"pdbOS= %d; os=%d; dca2pdbOS=%d\n",pdbOS,os,dca2pdbOS);
	NilSeq(E);

	// Compute the significance of DCA-structural overlap.
	dcm_typ	*dcm=new dcm_typ(KeyChain,PDB,this);
	if(!dcm->InitializedOkay()){ delete dcm; NilPDB(PDB); return 0; }
	if(dcmBG) dcm->MinusBgMtrxDCA(dcmBG); // subtract BG MtrxDCA from foreground MtrxDCA.
	dci_typ	*dci=dcm->PDBvsDCA(stdout,ColorCode,dcfp,num_dc,chns,SetUsd); 
	// if(dci) dci->PutResults(stderr);

	// Fetch the pair set before the dcm_typ is deleted; it is released separately.
	set_typ	RtnSet=dcm->RtnDistPairSet();
	delete dcm; NilPDB(PDB);
	if(RtnSet) NilSet(RtnSet);
	return dci;
}

// Usage text printed by edc_typ::PrintError() when the program name is
// "evalDCA" or "EvalDCA".
// NOTE(review): this text lists -O=<int>, but the option switch in
// edc_typ::Init() has no case for 'O', so that option ends in PrintError().
#define MyUSAGE "   *** Evaluation of Direct Coupling Analyses ***\n\
   Usage: evalDCA <pdb_file> <chain> <dca_file> [options]\n\
        or evalDCA <pdb_file> <chain>:<str> <dca_file> [options]\n\
	   where <str> is a list of adjacent identical chains\n\
   Input:  <pdb_file> protein database structural coordinate file for benchmarking\n\
	   <chain>    protein chain corresponding to the DCA analysis\n\
	   <dca_file> file of DCA scores in EVcouplings format\n\
      Note: will also take a PSICOV or GaussDCA formatted input file as \n\
	    <dca_file>.dca, in which case a corresponding <dca_file>.aln file\n\
	    must also be provided.  The latter indicates which residues in <chain>\n\
	    are aligned & map to DCA scores (upper case) and which are unaligned\n\
	    & thus map to unscored gaps (lower case).  From these an EVcouplings\n\
	    formatted file <dca_file> will be generated.\n\
   Output: Adjusted p-value and the corresponding -log10(p) score to stdout.\n\
   Options: \n\
      -bpps=<file> Read in positions of discriminating residues for a BPPS/DCA comparitive analysis\n\
      -D=<real>   Maximum distance (in Angstroms) between scored residue pairs (default: 5.0).\n\
      -d=<real>   Minimum distance (in Angstroms) between scored residue pairs (default: 0).\n\
      -I          Optimize over ICA P-values instead of unified P-values\n\
      -J          Use Jeffreys priors.\n\
      -m=<int>    Minimum sequence separation between paired residues (default: 5)\n\
      -M=<int>    Millions of simulations to perform for permutation scores (default: 0)\n\
      -O=<int>    User provided offset for printing out residue pairs (default: 0)\n\
      -R          Randomize column pair scores (control for ICA p-value calculation).\n\
      -S          Shuffle column pair ranks (control for Ball-in-urn & permutation p-value calculations).\n\
      -seed=<int> Provide a random seed (used with -R & -S options).\n\
      -v          Verbose output.\n\
   References:\n\
      Neuwald, A.F. & Altschul, S.F. 2018. Statistical Investigations of Protein Residue Direct \n\
          Couplings. PLoS Computationl Biology (in revision)\n\
      Altschul, S.F. & Neuwald, A.F. 2018. Initial Cluster Analysis. Journal of\n\
          Computational Biology 25(2):121-129\n\
\n\n"

// Usage text for starc: printed by edc_typ::PrintError() for every program
// name other than evalDCA/EvalDCA, and passed as the error message to
// RealOption(), IntOption() and ParseIntegers() in edc_typ::Init().
// Init() also accepts -J, -r, -v, -no_evc_file and -z, which are not listed here.
#define DUSAGE "   *** Statistical Tool for Analysis of Residue Couplings ***\n\
   Usage: starc <pdb_file> <chain> <dca_file> [options]\n\
        or starc <pdb_file> <chain>:<str> <dca_file> [options]\n\
	   where <str> is a list of adjacent identical chains\n\
   Input:  <pdb_file> protein database structural coordinate file for benchmarking\n\
	   <chain>    protein chain corresponding to the DCA analysis\n\
	   <dca_file> file of DCA scores in EVcouplings format\n\
      Note: will also take a PSICOV or GaussDCA formatted input file as \n\
	    <dca_file>.dca, in which case a corresponding <dca_file>.aln file\n\
	    must also be provided.  The latter indicates which residues in <chain>\n\
	    are aligned & map to DCA scores (upper case) and which are unaligned\n\
	    & thus map to unscored gaps (lower case).  From these an EVcouplings\n\
	    formatted file <dca_file> will be generated.\n\
   Output: Adjusted p-value and the corresponding -log10(p) score to stdout.\n\
   Options: \n\
      -A          Print the ICA array to stdout.\n\
      -bpps=<file> Read in positions of discriminating residues for a BPPS/DCA comparison\n\
                     Syntax example: R=234,181,174,313,225,295,209,298,...\n\
      -D=<real>   Maximum distance (in Angstroms) defining contacts (default: 5.0).\n\
      -d=<real>   Minimum distance (in Angstroms) defining contacts (default: 0.0).\n\
      -NoH	  Ignore hydrogen atoms in reference structures (if present)\n\
      -I          Optimize over ICA P-values only (i.e., ignore ball-in-urn component)\n\
      -F=<real>   Use fixed cutoff X = <real>*length of DCA sequence (range: 0.1..20)\n\
      -m=<int>    Minimum sequence separation between paired residues (default: 5)\n\
      -M=<int>    Millions of simulations to perform by shuffling DCA rankings (default: 0)\n\
      -P=<int>    Show the <int> highest DCA scoring pairs\n\
      -ppv        Compute PPV score instead of S-score (must be used with -F option only)\n\
      -R          Randomize column pair scores (control for ICA p-value calculation).\n\
      -S          Shuffle column pair ranks (control for Ball-in-urn & permutation p-value calculations).\n\
      -seed=<int> Provide a random seed (used with -R & -S options).\n\
      -split=<int> Look only at residue pairs with res1 <= <int> & res2 > <int>.\n\
                   If <int> < 0 then where not (res1 <= -<int> & res2 > -<int>).\n\
      -side=<int> With -split option: if <int> = -1 or 1 use left or right side only\n\
   References:\n\
      Neuwald, A.F. & Altschul, S.F. 2018. Statistical Investigations of Protein Residue Direct \n\
          Couplings. PLoS Computationl Biology (in revision)\n\
      Altschul, S.F. & Neuwald, A.F. 2018. Initial Cluster Analysis. Journal of Computational \n\
          Biology 25(2):121-129\n\
   Funding for this program:\n\
     National Institutes of Health grant R01GM125878 from the National Institute of \n\
     General Medical Sciences\n\
\n\n"

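/***********************************************************************
  Usage line for the -J option, kept here for reference; it is not part of
  the DUSAGE text, although edc_typ::Init() still accepts -J:
      -J          Don't correct implicit Jeffreys priors for Initial Cluster Analysis.\n\
 *************************************************************************/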


Int4     *edc_typ::ParseIntegers(char *str,const char *msg)
// Parse a string of the form "<letter>=<int>,<int>,..." (from a sipris file),
// e.g. "R=3,5,7,9,11,17".
//   str - the text to parse; parsing stops at the first whitespace character
//	   or at the end of the string.
//   msg - message handed to print_error() on any syntax error.
// Side effect: stores the leading letter in color_code.
// Returns a newly allocated array whose element 0 holds the number of fields
// and whose elements 1..n hold the values, e.g. [6,3,5,7,9,11,17,...] for the
// example above.  The caller releases it with free() (see edc_typ::Free()).
// An empty field (two adjacent commas) keeps whatever NEW() initialized it to.
{
        Int4     n,v=0,i,*values;

	// Validate the "<letter>=" header first; the tests short-circuit so that
	// str[1] is never read when str is empty.  ctype functions are given
	// unsigned char values because negative char values are undefined for them.
	if(str == 0 || !isalpha((unsigned char) str[0]) || str[1] != '=') print_error(msg);
	color_code=str[0];

	// One slot per input character is always enough: each field needs >= 1 char.
	n=strlen(str); NEW(values,n+2,Int4);
        for(n=1,i=2; str[i] != 0; ){
           const unsigned char ch=(unsigned char) str[i];
           if(ch == ',') { n++; i++; }		// next field.
           else if(isdigit(ch)){
                if(sscanf(str+i,"%d",&v) != 1) print_error(msg); 
                else {
		   values[n]=v;
		   while(isdigit((unsigned char) str[i])) i++;	// skip the digits just read.
		}
           } else if(isspace(ch)) break;	// e.g. the trailing newline from fgets().
	   else print_error(msg);
        }
	values[0]=n; return values;
}

void    edc_typ::PrintError(char *program_name)
// Report a usage error via print_error().  When invoked as evalDCA/EvalDCA the
// MyUSAGE text is used; for any other program name the license statement is
// printed first and the DUSAGE text is used.
{
	const bool	invokedAsEvalDCA = !strcmp(program_name,"evalDCA")
					|| !strcmp(program_name,"EvalDCA");
	if(invokedAsEvalDCA){
		print_error(MyUSAGE);
		return;
	}
	PrintLicenseStatement("starc v1.0.2");
	print_error(DUSAGE);
}

void	edc_typ::Init(int argc,char *argv[],char *mAln)
{
	Int4	arg,X;
	char	str[2000],*map_file=0;
	BooLean	openVSI=FALSE,OpenTable=FALSE,MkEVcouplingsFile=TRUE;

	InitDefaults( );
	if(argc < 4) PrintError(argv[0]);
#if 0
	for(arg = 0; arg < argc; arg++) fprintf(stderr,"%s ",argv[arg]);
	fprintf(stderr,"\n");
// exit(1);
#endif
	TurnOffLicenseStatement();
	AB=MkAlpha(AMINO_ACIDS,GBLAST_BLOSUM62);
	program=AllocString(argv[0]);
	pdb_file=AllocString(argv[1]);
	// fprintf(stderr,"pdb = %s (%s)\n",pdb_file,argv[2]);
	KeyChain=argv[2][0]; NumAdjChns=0;
	if(argv[2][1] == ':') {
	   Int4	i;
	   for(i=2; (isalpha(argv[2][i]) || isdigit(argv[2][i])); i++){
		NumAdjChns++; AdjacentChains[NumAdjChns]=argv[2][i];
		if(NumAdjChns > 25) print_error("FATAL: maximum number of adjacent chains exceeded\n");
	   } if(argv[2][i] != 0) PrintError(argv[0]);
	} else if(argv[2][1] != 0) PrintError(argv[0]);
	dca_file=AllocString(argv[3]);
        for(arg = 4; arg < argc; arg++){
           if(argv[arg][0] != '-') PrintError(argv[0]);
	   // fprintf(stderr,"case = '%c'\n",argv[arg][1]);
// fprintf(stderr,"argv[arg] = %s\n",argv[arg]);
           switch(argv[arg][1]) {
                case 'A': if(argv[arg][2] != 0) PrintError(argv[0]); else afptr=stdout; break; 
                case 'b': {
			if(sscanf(argv[arg],"-bpps=%s",str) == 1){
			    // fprintf(stderr,"bpps file = \"%s\"\n",str); 
			    if(this->bpps) print_error("edc_typ->Init(): bpps already set.");
			    FILE *fp=open_file(str,"","r");
			    if(fgets(str,1995,fp) == NULL) PrintError(argv[0]);
			    else if(strlen(str) > 1990) print_error("bpps option file too long");
			    else { fclose(fp); this->bpps=this->ParseIntegers(str,DUSAGE); }
			} else  PrintError(argv[0]); 
		   } break;
                case 'D': { MaxDist=(float) RealOption(argv[arg],'D',1.0,500.0,DUSAGE); } break;
                case 'd': { MinDist=(float) RealOption(argv[arg],'d',0.0,500.0,DUSAGE); } break;
                case 'I': { if(argv[arg][2] != 0) PrintError(argv[0]); else Mode='I'; } break;
                case 'F': {
			    TruncateArray=(float) RealOption(argv[arg],'F',0.0,20.0,DUSAGE); 
			    if(TruncateArray > 0.0 && TruncateArray < 0.1){ 
				print_error("-F=<real> option out of range");
			    }
			  } break;
                case 'J': { if(argv[arg][2] != 0) PrintError(argv[0]); else UseJeffreys=TRUE; } break;
                case 'M': { PermuteMillions=IntOption(argv[arg],'M',1,10000,DUSAGE); } break;
                case 'm': { MinSqSeparation=IntOption(argv[arg],'m',1,50,DUSAGE); } break;
		case 'N': { if(strcmp(argv[arg],"-NoH") == 0){ UseHydrogens=FALSE; }
			    else PrintError(argv[0]); } break;
		case 'n': { if(strncmp(argv[arg],"-no_evc_file",12) == 0){ MkEVcouplingsFile=FALSE; } 
			    else PrintError(argv[0]); } break;
                case 'P': { ShowPairs=IntOption(argv[arg],'P',1,100000,DUSAGE); } break; 
		case 'p': { if(strcmp(argv[arg],"-ppv") == 0){ UsePPV=TRUE; } else PrintError(argv[0]); } break;
                case 'R': if(argv[arg][2] != 0) PrintError(argv[0]); else Simulate=1; break; 
                case 'r': 
		  if(argv[arg][2] != 0) PrintError(argv[0]);
		  else ComputeOverRange=TRUE; break; 
                case 'S':
		  if(argv[arg][2] != 0) PrintError(argv[0]);
		  else ShuffleRanks=1; break; 
		case 's': 
		   if(sscanf(argv[arg],"-split=%d",&Split) == 1){
			// if(Split < 0) PrintError(argv[0]);
		   } else if(sscanf(argv[arg],"-seed=%u",&seed) == 1){
			// PrintError(argv[0]);
		   } else if(sscanf(argv[arg],"-side=%d",&UseSide) == 1){
			// fprintf(stderr,"side= %d\n",UseSide);
			if(UseSide > 1 && UseSide < -1) PrintError(argv[0]);
		  } else PrintError(argv[0]); break;
                case 'v': if(argv[arg][2] != 0) PrintError(argv[0]); 
		  else efptr=stderr; break; 
		case 'z': break; // bummy variable to be ignored (used within scripts)...
		case ' ': break; // ignore these...
                default: 
// fprintf(stderr,"argv[arg] = %s\n",argv[arg]);
		this->PrintError(argv[0]);
            }
        }
if(Split < 0 && UseSide != 0)
  print_error("-split=<int> option with <int> < 0 disallowed with -side!=0");
        FILE	*fp=0,*ifp=0,*tfp=0;
	Int4	i,j,sqlen=0,ii,jj,k,alnlen=0,dm[12];    // dm[] = dummy variables.
	set_typ iSet=0; NumHits=0;
	double Dd,dd;
	if(UsePPV && TruncateArray <= 0.0){ print_error("-ppv option must be used with -F=<real> option"); }
	char	Str[105],cI,cJ,Aln[9009],SeqX[9009]; // Aln == *.aln file; SeqX = query seq.
if(0) fprintf(stderr,"DEBUG ALN '%s' !!!!!!!!\n",dca_file);
	if(mAln || (fp=fopen(dca_file,"r")) == NULL){
	     // then open dca_file.aln & dca_file.dca files; else dca_file == required file.
	     // ************************************************************************
	     // For PSICOV or GaussDCA input obtain true positions within the alignment.
	     // ************************************************************************
	     Int4 col2sq[9009],sq2col[9009]; iSet=MakeSet(9009);
	     if(mAln){ 
		if(sscanf(mAln,"%6s: %s\n",pdbid,Aln) != 2) print_error("mstarc input error"); 
// fprintf(stderr,"pdbid=%s\n",pdbid);
	     } else {
		tfp=open_file(dca_file,".aln","r");
	        if(fgets(Aln,9003,tfp)==NULL) print_error("failed to open *.aln file.");
		fclose(tfp);
	     }
	     if(strlen(Aln) > 9000){
	         fprintf(stderr,"Aln: %s\n",Aln);
		 print_error("FATAL: *.aln file length > 9000 aa");
	     }
	     // for(alnlen=sqlen=j=0; Aln[j] != 0; j++)
	     for(alnlen=sqlen=j=0; isprint(Aln[j]); j++)
	     {
		if(isupper(Aln[j])){
		   SeqX[sqlen]=Aln[j]; alnlen++; sqlen++;
		   col2sq[alnlen]=sqlen; sq2col[sqlen]=alnlen;
	        } else if(Aln[j]=='-'){  alnlen++; col2sq[alnlen]=0; 
		} else if(islower(Aln[j])){
			SeqX[sqlen]=Aln[j]; sqlen++; AddSet(sqlen,iSet);  
			sq2col[sqlen]=0;
		} else {
			fprintf(stderr,"Aln[%d] = %c = %d\n",j,Aln[j],Aln[j]); 
			fprintf(stderr,"Aln: %s\n",Aln); fprintf(stderr,"Seq: %s\n",SeqX); 
			print_error("edc_typ::Init() input error");
		}
	     } SeqX[sqlen]=0;
	     e_type  qE=StringToSeq(SeqX, "input aln seq",1,AB);
if(0){ fprintf(stderr,"Aln: %s\n",Aln); fprintf(stderr,"Seq: %s\n",SeqX); PutSeq(stderr,qE,AB);
	exit(1);
}

	     // ************************************************************************
	     // Find maximum aligned seq position & truncate dca file at that point.
	     // -------------- Output in GaussDCA format. -----------------
	     // ************************************************************************
	     ifp=open_file(dca_file,".dca","r");
             // while(fgets(Str,100,ifp) != NULL){ fprintf(stderr,"%s",Str); } rewind(ifp);
	     Int4 MaxAlnd=0,R1,R2,C1,C2;
             for(tfp=tmpfile();fgets(Str,100,ifp) != NULL; ){	 
		if(Str[0]=='#') break;	// last line of ccmpred output.
	     	if(sscanf(Str,"%d %d %d %d %lf",&R1,&R2,&dm[0],&dm[1],&dd)==5){ // PSICOV format. 
// fprintf(stderr,"PSICOV format\n");
		    // Convention modified on 10/12/2018 to fix problems - AFN.
                    if(R2 <= sqlen && (sq2col[R1]==0 || sq2col[R2]==0 )) continue; // query deletion
                    if(R2 <= sqlen){
			NumHits++;
			fprintf(tfp,"%d %d %.8lf\n",sq2col[R1],sq2col[R2],dd);
		    }
		} else if(sscanf(Str,"%d %d %lf",&C1,&C2,&dd) == 3){
// fprintf(stderr,"GaussDCA & new CCM format.\n");
		  // WARNING: original CCM format uses residues in seq. not columns as for darc!!!
// fprintf(stderr,"GaussDCA format\n");
		  if(C1 >= C2){
			fprintf(stderr,"dca_file = %s; C1 = %d; C2 = %d\nStr=%s\n",
				dca_file,C1,C2,Str);
			print_error("dca file format error 0");
		  }
		  if(C2 <= alnlen && (col2sq[C1]==0 || col2sq[C2]==0 )) continue;  // query deletion
		  if(C2 > MaxAlnd) MaxAlnd=C2;
		  if(C2 <= alnlen){
			NumHits++;
			fprintf(tfp,"%d %d %.8lf\n",C1,C2,dd); // GaussDCA format.
		  }
		}
	     } fclose(ifp); rewind(tfp); ifp=tfp;
// fprintf(stderr,"alnlen=%d; sqlen=%d; MaxAlnd=%d\n",alnlen,sqlen,MaxAlnd);

	     // ************************************************************************
	     // ================== Create EVC format input file.  ======================
	     // ************************************************************************
FILE *cfp=0; // cfp=open_file(dca_file,".cnvt","w"); 
	     // make dca_file or tmpfile() as output...adjusts pos and adds residues.
// fprintf(stderr,"NumHits=%d\n",NumHits);
if(NumHits < 10){ NilSeq(qE); return; }
	     if(MkEVcouplingsFile) fp=open_file(dca_file,"","w"); else fp=tmpfile(); 
             while(fgets(Str,100,ifp) != NULL){
	      // =============== read in GaussDCA format as input =================
              if(sscanf(Str,"%d %d %lf",&i,&j,&Dd) == 3){
// fprintf(stderr,"%s\t%s\t%d\n",Str,dca_file,MkEVcouplingsFile);
		ii=col2sq[i]; jj=col2sq[j]; assert(ii != 0 &&  jj != 0);
		if(!(ii > 0 && ii <= LenSeq(qE))) print_error("aln file input error 1");; 
		if(!(jj > 0 && jj <= LenSeq(qE))) print_error("aln file input error 1");; 
		cI=AlphaChar(ResSeq(ii,qE),AB); cJ=AlphaChar(ResSeq(jj,qE),AB);
                fprintf(fp,"%d,%d,%.5lf,%d,%d,0,0,0,0,0,%c,%c\n",ii,jj,Dd,i,j,cI,cJ);
		if(cfp && abs(ii-jj) > 5)
			fprintf(cfp,"%d,%d,%.5lf,%c%d,%c%d\n",i,j,Dd,cI,ii,cJ,jj);
	      } else print_error("conversion to EVC format failed!");
	     } fclose(ifp); 
	     if(cfp) fclose(cfp);
	     if(MkEVcouplingsFile){ fclose(fp); fp=open_file(dca_file,"","r"); 
	       // while(fgets(Str,100,fp) != NULL) fprintf(stdout,"%s",Str); rewind(fp);
	     } else {
		rewind(fp); print_error("-no_evc_file option needs more coding to work\n"); 
		// need to pass fp into dcm_typ::GetMtrxDCA(); may need other changes.
	     } NilSeq(qE);
	}
	// ****************************************************************************
	// =============== Read in EVcouplings-formatted input file. ==================
	// ****************************************************************************
        Int4  max_ij=0,min_ij=9999,NumLines=0;
        unsigned char *seq;     NEW(seq,9005,unsigned char); // 30-500 is evfold max...
        while(fgets(Str,100,fp) != NULL){
// fprintf(stderr,"-->%s\t%d\n",dca_file,NumLines);
            if(sscanf(Str,"%d,%d,%lf,%d,%d,%d,%d,%d,%d,%d,%c,%c",
                   &i,&j,&Dd,&dm[1],&dm[2],&dm[3],&dm[4],&dm[5],&dm[6],&dm[7],&cI,&cJ) != 12){
			fprintf(stderr,"input: %s\n",Str);
                        print_error("DCA score file input error.");
	    }
	    if(!std::isfinite(Dd)) print_error("Fatal: STARC input error");
            if(i > 9000 || j > 9000) print_error("FATAL: site in EVC file > 9000!");
            if(i < 1 || j < 1) print_error("FATAL: non-positive site in EVC file!");
            if(i > max_ij) max_ij=i; if(j > max_ij) max_ij=j;
            if(i < min_ij) min_ij=i; if(j < min_ij) min_ij=j;
	    if(iSet && (MemberSet(i,iSet) || MemberSet(j,iSet))){
		    print_error("FATAL: site within insert region.");
	    }
            seq[i]=AlphaCode(cI,AB); seq[j]=AlphaCode(cJ,AB); NumLines++;
if(0 && i < 8 && j < 20) fprintf(stderr,"%s\t%s\t%d\n",Str,dca_file,NumLines);
        }
// rewind(fp); while(fgets(Str,100,fp) != NULL) fprintf(stderr,"%s",Str); 
	fclose(fp);
	if(iSet) NilSet(iSet);
// fprintf(stderr,"-->%s\t%d\n",dca_file,NumLines);
	if(NumLines < 10){
	   fprintf(stderr,"Number of pairs= %d\n",NumLines);
	   print_error("DCA score file input error: too few pairs.");
	}
        dcaE=MkSeq("DCA seq",max_ij-min_ij + 1,seq + min_ij-1); free(seq);
        SetOffSetSeq(min_ij - 1,dcaE);
	// PutSeq(stderr,dcaE,AB);  exit(1);
	time1=time(NULL);
#if 0
        for(arg=0; arg < argc; arg++) fprintf(stdout,"%s ",argv[arg]); 
        if(seed != 18364592) fprintf(stdout,"\n");
	else {
	   seed = (unsigned int) (time(NULL)/2); 
	   if(Simulate || ShuffleRanks) fprintf(stdout,"-seed=%u\n",seed); else fprintf(stdout,"\n");
	} sRandom(seed);
#endif
        if(seed == 18364592) seed = (unsigned int) (time(NULL)/2); 
	sRandom(seed);
}


