/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu
 ******************************************************************************************/

#include "goscan.h"
#include "random.h"
#include "purge.h"
#include "gsq_typ.h"
#include "histogram.h"
#include "oscan2msa.h"
// #include "gibbs.h"
#include "blosum62.h"

/*
 * USAGE_START: usage/help text for the goscan command line.
 *
 * main() passes this string to print_error() when fewer than two positional
 * arguments are given, when an option does not begin with '-', when the option
 * letter is not recognized, or when one of the hand-parsed options (-B, -b,
 * -g, -I, -i=, -m, -s, -u) fails its own check.  It is also handed to
 * IntOption()/RealOption() together with the permitted bounds.
 *
 * NOTE(review): some entries below disagree with the parser in main():
 *   -M  text gives a default of 1,000,000; main() initializes the limit
 *       to 50000000.
 *   -R  text gives 1-100%; main() passes bounds 1 and 1000 to IntOption().
 *   -t  text gives 1-10; main() passes bounds 1 and 100 and its switch also
 *       handles 12, 17, 18 and 22.
 *   -I and -i are each listed several times; main() distinguishes the forms
 *       by the character that follows the option letter.
 * The text is left untouched here because it is program output.
 */
#define	USAGE_START	"USAGE: goscan database snfile [options]\n\
   snfile = file with aligned segments\n\
   options:\n\
     -A         - realign the detected sequences after scanning\n\
     -a         - create selex alignment file for viewing\n\
     -B<char>   - define residue frequencies to use for background model\n\
                  m = use multiple alignment background counts.\n\
                  d = use database background counts.\n\
                  s = use standard background counts.\n\
                  (default = use standard background counts).\n\
     -b<int:int>- output fasta files of conserved blocks with flanking regions\n\
     -C<float>  - minimum field map Cutoff for individual blocks (0.0)\n\
     -c         - create sequence file for output\n\
     -D         - Don't mask low complexity regions (will mask by default)\n\
     -d<float>  - maximum E-value for repeat domain detection\n\
     -e<float>  - maximum E-value for printing alignments\n\
     -E<float>  - maximum E-value for single motif block\n\
     -F         - print output in *.msa format\n\
     -f<int>    - number of failed blocks tolerated (default: 0)\n\
     -H         - create histogram .gaps file to report gap lengths\n\
     -G<real>   - get gapped alignments <real> stdev above the mean\n\
     -g<int>,<int> - specify gap opening and extension penalties (default: 18,2)\n\
     -I<int>:<int> - create fasta file of internal repeats with flanking regions\n\
     -I         - exhaustive search allowing full insertions and deletions\n\
                   (output as fasta sequences)\n\
     -i         - exhaustive search allowing full insertions and deletions\n\
                   (fancy output to show motif alignments)\n\
     -iC        - exhaustive search allowing full insertions and deletions\n\
                   (cma output to allow tweaking of multiple alignments)\n\
     -i<real>   - create *.crs file with info cutoff <real>\n\
     -i=<string> - input color string for *.crs file (e.g., 'XROYGBM')\n\
     -L<int>    - set cutoff score for gapped alignments\n\
     -M<int>    - maximum number of input sequences (default: 1,000,000)\n\
     -m<char>   - method = <char> (lowercase = use informative columns only)\n\
                  M or m = modified Gribskov method.\n\
                  F or f = modified Gribskov method with off columns=blosum45.\n\
                  D or d = Dirichlet Mixtures priors.\n\
                  R or r = product multinomial.\n\
                  H or h = Henikoff's method\n\
                  B or b = Henikoff's method with motif residue background\n\
     -N<int>    - maximum number of repeats to look for (default: 10)\n\
     -n         - mask potential nonglobular regions using 'seg x 45 3.4 3.75'\n\
     -O         - open database (*.dbs) file\n\
     -P<int>    - create PHYLIP infile of scanned hits with purge cutoff <int>\n\
     -p<float>  - pseudo counts for product multinomial model\n\
     -R<int>    - remove redundant sequences at <int> % identity level (1-100%)\n\
     -r<int>    - minimum number of repeats to output a hit (1-1000:1)\n\
     -S         - shuffle input sequences\n\
     -s<int>    - seed for random number generator\n\
     -T<int>    - print out top <int> scoring sequences only\n\
     -t<int>    - test options (1-10:0)\n\
     -u<char>   - scan method with or without gap function (default 'O')\n\
                  [inactivated]'a' = alternative global with gap function\n\
                  [inactivated]'g' = global with gap function\n\
                  'G' = global without gap function\n\
                  [inactivated]'d' = local with gap function\n\
                  [inactivated]'D' = local without gap function\n\
                  [inactivated]'o' = global overlap with gap function\n\
                  'O' = global overlap without gaps\n\
                  [inactivated]'c' = local core with gap function\n\
                  [inactivated]'C' = local core without gaps\n\
     -v         - create a rasmol file to visualize conserved blocks\n\
     -w         - DON'T apply weights to sequences in scan file\n\
     -X         - mask out sequence regions not matching motif(s)\n\
     -x         - mask out sequence regions matching motif(s)\n\
     -z         - output scoring matrix(s)\n\n"

/**************************** Global Variables ******************************/

/*
 * goscan entry point.
 *
 *   argv[1]  sequence database read with GetFastaInfo()/ReadSeq()
 *   argv[2]  aligned-segment file handed to MakeGOScan()
 *   argv[3+] options, see USAGE_START
 *
 * After option parsing exactly one of five modes runs, chosen in this order:
 *   -z            print the scoring matrices
 *   -G            gapped scan with a cutoff derived from shuffled sequences
 *   -t<n>         one of the numbered test/diagnostic routines
 *   -I / -i...    full Smith-Waterman style output (CRS, cma, alignment or
 *                 fasta repeats)
 *   otherwise     the ordinary scan, followed by the requested output files
 *
 * Returns 1 when the ordinary scan produces no hits, 0 otherwise.  Invalid
 * command lines are reported through print_error(USAGE_START).
 */
int	main(Int4 argc,char *argv[])
{ 
	Int4	m,n,i,j,k,arg,number,maxrpts=0;
	Int4	*counts,time1,total;
	Int4	MAX_IN_SEQS=50000000,*mtfcnts=NULL;
	Int4	a=18,b=2;  // pernats is not relevant here (using blosum62).
	unsigned short	*nsize;
	char	method='H';	/** default: Henikoff's method **/
	float	domEval=0.01,expect=0.01,pseudo=0.5,singleEval=1.0;
	double	*freq;
	BooLean	rasmol=FALSE;
	char	mode='O';
	int	return_status=0,test=0;
	a_type	A;
	FILE	*fptr,*ofp;
	e_type	E,*listE;
	Int4	phylip=-200,right=0,left=0;
	BooLean seqfile=FALSE,open_dbs = FALSE,report_gaps=FALSE;
	BooLean	mask=FALSE,shuffle=FALSE,neg_mask=FALSE;
	BooLean	realign=FALSE,put_align=FALSE,put_repeats=FALSE;
	BooLean	fullSWalign=FALSE;
	BooLean	FAfullSWalign=FALSE;
	UInt4 seed=18364592;	// sentinel: replaced by a time-based seed unless -s is given
	UInt4 min_rpt=1;
	gsn_typ F;
	snh_typ	sH;
	h_type	H,HG1,HG2;
	cma_typ	msa=NULL;
	BooLean	segmask=TRUE, mask_nonglobular=FALSE;
	BooLean	weights=TRUE,dogaps=FALSE;
	double	stdev_cut=0.0;
	Int4	cutoff=0,lowscore=0;
	BooLean	fa_blocks=FALSE;
	Int4	print_top=0;
	Int4	Nflank=0,Cflank=0;
	char	aafreq='s';
	float	minmap=0.0;
	BooLean	see_smatrix=FALSE;
	sma_typ	MA;
	BooLean	msaformat=FALSE,cma_output=FALSE;
	double info_cutoff=-1.0; // in bits
	Int4	MaxBadBlks=0;
	char	*InputColors=0;

	time1=time(NULL);
	if(argc < 3) print_error(USAGE_START);
	sRandom((UInt4) time(NULL)/2);
	A=MkAlpha(AMINO_ACIDS,PROT_BLOSUM62);

	/************************* parse the options *************************/
	// <ctype.h> classifiers need an unsigned char value, hence the casts.
	for(arg = 3; arg < argc; arg++){
	   if(argv[arg][0] != '-') print_error(USAGE_START);
	   switch(argv[arg][1]) {
	     case 'A': realign = TRUE; break;
	     case 'a': put_align= TRUE; break;
	     case 'B': aafreq=argv[arg][2];
		     if(!isalpha((unsigned char)aafreq)) print_error(USAGE_START);
		     break;
	     case 'b': fa_blocks= TRUE; 
		     if(sscanf(argv[arg],"-b%d:%d",&Nflank,&Cflank) != 2)
			print_error(USAGE_START); 
		     break;
	     case 'C': minmap=RealOption(argv[arg],'C',-1000,+5000,USAGE_START); break;
	     case 'c': seqfile = TRUE; break;
	     case 'D': segmask= FALSE; break;
	     case 'd': domEval=RealOption(argv[arg],'d',0,500000,USAGE_START); break;
	     case 'E': singleEval=RealOption(argv[arg],'E',0,10000,USAGE_START); 
			break;
	     case 'e': expect=RealOption(argv[arg],'e',0,500000,USAGE_START); break;
	     case 'F': msaformat=TRUE; break;
	     case 'f': MaxBadBlks=IntOption(argv[arg],'f',0,1000,USAGE_START); break;
	     case 'H': report_gaps = TRUE; break;
	     case 'G': stdev_cut=RealOption(argv[arg],'G',0,500,USAGE_START); 
			dogaps = TRUE; break;
	     case 'g': if(sscanf(argv[arg],"-g%d,%d",&a,&b) != 2)
					print_error(USAGE_START); 
		     if(a < 0 || b < 0) print_error(USAGE_START);
		     break;
	     case 'I': 
		// "-I<int>:<int>" requests repeats with flanks; a bare "-I"
		// requests the exhaustive search with fasta output.
		if(isdigit((unsigned char)argv[arg][2])){
		     put_repeats= TRUE; 
		     if(sscanf(argv[arg],"-I%d:%d",&left,&right) != 2)
			print_error(USAGE_START); 
		} else {
		   fullSWalign=TRUE; FAfullSWalign=TRUE;
		} break;
	     case 'i': 
		     if(argv[arg][2] == '='){
			// color string only; does not by itself select a mode.
			if(!isalpha((unsigned char)argv[arg][3])) print_error(USAGE_START);
			InputColors=argv[arg]+3;
		     } else if(argv[arg][2] == 'C'){
			cma_output=TRUE; fullSWalign = TRUE;
		     } else {
			if(isdigit((unsigned char)argv[arg][2])){
				info_cutoff=RealOption(argv[arg],'i',0,50,USAGE_START); 
			} fullSWalign = TRUE; 
		     } break;
	     case 'L': lowscore=IntOption(argv[arg],'L',-5000,500000,USAGE_START); 
		     break;
	     case 'M': MAX_IN_SEQS=IntOption(argv[arg],'M',1000,2000000000,USAGE_START); break;
	     case 'm': method=argv[arg][2];
		     if(!isalpha((unsigned char)method)) print_error(USAGE_START);
		     break;
	     case 'N': maxrpts=IntOption(argv[arg],'N',1,1000,USAGE_START); break;
	     case 'n': mask_nonglobular=TRUE; break;
	     case 'O': open_dbs = TRUE; break;
	     case 'P': phylip=IntOption(argv[arg],'P',-1,5000,USAGE_START); break;
	     case 'p': pseudo=RealOption(argv[arg],'p',0,500,USAGE_START); break;
	     case 'R': cutoff=IntOption(argv[arg],'R',1,1000,USAGE_START); break;
	     case 'r': min_rpt=IntOption(argv[arg],'r',1,1000,USAGE_START); break;
	     case 'T': print_top=IntOption(argv[arg],'T',1,1000,USAGE_START); break;
	     case 't': test = IntOption(argv[arg],'t',1,100,USAGE_START); 
		if(test == 1 || test == 5 || test == 6) mode= 'g';
		break;
	     case 'S': shuffle = TRUE; break;
	     case 's': if(sscanf(argv[arg],"-s%d",&seed)!=1)
				print_error(USAGE_START); break;
	     case 'u': mode=argv[arg][2];
		     if(!isalpha((unsigned char)mode)) print_error(USAGE_START);
		     break;
	     case 'v': rasmol=TRUE; break;
	     case 'w': weights = FALSE; break;
	     case 'x': mask = TRUE; neg_mask = FALSE; break;
	     case 'X': mask = TRUE; neg_mask = TRUE; break;
	     case 'z': see_smatrix= TRUE; break;
	     default: print_error(USAGE_START);
	   }
	}
	if(maxrpts==0) { if(fullSWalign && !FAfullSWalign) maxrpts=1; else maxrpts=10; }
	if(seed == 18364592)  seed = (UInt4) time(NULL)/2;
	sRandom(seed);

	/********************* load database statistics **********************/
	number = GetFastaInfo(argv[1], MAX_IN_SEQS, &counts, &nsize, A);
	for(total=0, i=0; i<=nAlpha(A); i++) total += counts[i];
	if(!dogaps && !mask && test==0 && print_top < 1 && !fullSWalign){
	   // echo the command line ahead of the ordinary scan report.
	   for(arg = 0; arg < argc; arg++) printf("%s ",argv[arg]);
	   printf("\n");
	}
	fptr = open_file(argv[1],"","r");

	/******************* background residue frequencies ******************/
	NEW(freq,nAlpha(A)+2,double);
	if(aafreq == 'm') {		// counts taken from the motif alignment
	  NEW(mtfcnts, nAlpha(A)+2, Int4);
	  MA=ReadSMA(argv[2]); CountsSMA(mtfcnts, MA); NilSMA(MA);
	  for(n=0, i=0; i<=nAlpha(A); i++) n += mtfcnts[i];
	  for(i=0; i<=nAlpha(A); i++) freq[i]= (double)mtfcnts[i]/(double)n;
	  free(mtfcnts);
	} else if(aafreq == 'd') {	// counts taken from the database
	  for(i=0; i<=nAlpha(A); i++) freq[i]= (double)counts[i]/(double)total;
	} else {			// standard table
	  for(i=0; i<= nAlpha(A); i++) freq[i] = (double) blosum62freq[i];
	}

	/************************* build the scanner *************************/
	for(m=0,i=1; i<= number; i++) m = MAXIMUM(Int4,nsize[i],m);	// longest sequence
	F=MakeGOScan(argv[2],A,maxrpts,method,mode,expect,singleEval,m,weights,
		minmap,pseudo,freq);
	SetRptEvalGOScan(domEval,F);
	if(shuffle) ShuffleSegsGOScan(F);
	if(!segmask) NoMaskGOScan(F);
	if(mask_nonglobular){ MaskNonGlobularGOScan(F); }
	if(open_dbs) OpenDatabaseGOScan(F);

   if(see_smatrix) PutSmxGOScan(stdout,F);
   else if(dogaps){
      /********** gapped scan with a shuffle-derived score cutoff **********/
      Int4 J,score,start,end;
      ptm_typ PM=PrtnModelGOScan(F);
      smx_typ *smx=SMatricesPrtnModel(PM);
      char *operation;

	   H = Histogram("random scores",0,1000,2.0);
	   // first compute mean and sd for shuffles
	   e_type *EList,rE;
	   NEW(EList,number+3,e_type);
	   for(J=1; J <= number; J++){
	     E = ReadSeq(fptr,J,nsize[J],A);
	     EList[J] = E; rE = CopySeq(E);
	     ShuffleSeq(rE);
	     operation=GapAlnTraceSMatrix2(a,b,LenSeq(rE),SeqPtr(rE),
		NumModelsPrtnModel(PM),smx,GapScoresPrtnModel(PM),&start,&score);
	     IncdHist(score,H);
	     free(operation);
	     NilSeq(rE);	// the shuffled copy is only needed for its score
	   }
	   PutHist(stderr,60,H);
	   // Read the statistics BEFORE the histogram is released.
	   double mean = MeanHist(H);
	   double stdev = sqrt(VarianceHist(H));
	   NilHist(H);
	   Int4 cutscore = (Int4)((mean + stdev_cut*stdev)+0.5);
	   fprintf(stderr,"cutoff score = %d\n",cutscore);
	   H = Histogram("true scores",0,1000,2.0);
	   for(J=1; J <= number; J++){
	     E = EList[J];
	     operation=GapAlnTraceSMatrix2(a,b,LenSeq(E),SeqPtr(E),
		NumModelsPrtnModel(PM),smx,GapScoresPrtnModel(PM),&start,&score);
	     IncdHist(score,H);
	     if(score >= cutscore){ 
		// skip leading 'i' operations, walk to the terminating 'E',
		// then back up over trailing 'i' operations to find the end.
		for(i=1; operation[i] != 'E'; i++) if(operation[i] != 'i') break;
		for(end=start; operation[i] != 'E'; i++) end++;
		for(i--,end--; operation[i] == 'i'; i--) end--;	// back up to end...
		PutSubSeq(stdout, start-left, end+right,E,PrtnModelA(PM));
	     }
	     free(operation);
	     NilSeq(E);
	   } PutHist(stderr,60,H); NilHist(H);
	   free(EList);
   } else if(test){
      /********************* numbered test routines ************************/
      Int4 J,score,start,end;
      ptm_typ PM=PrtnModelGOScan(F);
      double	avesd,AveSD,bestavesd;
      smx_typ *smx=SMatricesPrtnModel(PM);
      char *operation;
      FILE	*fp;
      switch(test){
	case 12:   // NEW: find gapped alignment region....
	 {
	   H = Histogram("smatrix scores",0,1000,10.0);
	   smx_typ *multsmx;
	   Int4	**mgs,bestscore,bestrpts,beststart=0,topscore;
	   Int4 toprpts=0,topstart=0;
	   Int4 rpt,nmod = NumModelsPrtnModel(PM);
	   e_type TopE=0;
	   Int4 **gapscore=GapScoresPrtnModel(PM);
	   char *bestoperation,*topoperation;
	   // Replicate the model (and gap scores) maxrpts times end to end.
	   NEW(multsmx,nmod*maxrpts + 3,smx_typ);
	   NEWP(mgs,nmod*maxrpts + 3,Int4);
	   for(j=1,J=1; J <= maxrpts; J++){
		for(i=1; i <= nmod; i++){
		   multsmx[j] = smx[i];
		   if(gapscore) mgs[j] = gapscore[i];
		   j++;
		} 
	   }
	   bestoperation=topoperation=0;
	   for(topscore=0,J=1; J <= number; J++){
	     E = ReadSeq(fptr,J,nsize[J],A);
	     if(shuffle) ShuffleSeq(E);
	     else if(segmask) ProcessSeqPSeg(17,2.2,2.5,100,E,A);
	     // Keep adding repeats while each one raises the score by lowscore.
	     for(bestscore=bestrpts=0,rpt=1; rpt <=maxrpts; rpt++){
	       if(gapscore) operation=GapAlnTraceSMatrix2(a,b,LenSeq(E),
		   XSeqPtr(E), nmod*rpt,multsmx,mgs,&start,&score);
	       else operation=GapAlnTraceSMatrix2(a,b,LenSeq(E),
		   XSeqPtr(E), nmod*rpt,multsmx,NULL,&start,&score);
	       if((score-bestscore) >= lowscore){ // if next motif sign...
		  if(bestoperation) free(bestoperation);
		  bestoperation=operation; beststart=start;
		  bestscore=score; bestrpts=rpt;
	       } else { free(operation); break; }
	     }
	     PutSeqInfo2(stderr,E); 
	     fprintf(stderr,"%d repeats; score = %d\n",bestrpts,bestscore);
	     fprintf(stderr,"\n");
	     IncdHist(bestscore,H);
	     // TODO: per-sequence output of repeat regions is not implemented;
	     // only the top-scoring sequence is retained.
	     if(topscore < bestscore){
		if(TopE) NilSeq(TopE);
		TopE = E; topscore=bestscore; 
		toprpts=bestrpts; topstart=beststart;
		if(topoperation) free(topoperation);
		topoperation=bestoperation; 
	     } else { free(bestoperation); NilSeq(E); }
	     bestoperation=0;
	   }
	   PutHist(stderr,60,H); NilHist(H);
	   if(topoperation && topscore >= lowscore){ 
		fprintf(stderr,"top: %d repeats; score = %d\n",toprpts,topscore);
		PutGappedSeqAlnSMatrix(stderr,topoperation,OffSetSeq(TopE)+topstart-1,
			LenSeq(TopE)-topstart+1,SeqPtr(TopE)+topstart-1,
			nmod*toprpts,multsmx);
		std::cerr << std::endl; PutSeq(stdout,TopE,PrtnModelA(PM));
	   }
	   // Release the retained top hit whether or not it was printed.
	   if(TopE) NilSeq(TopE);
	   free(topoperation);
	   free(multsmx); free(mgs);
	  } break;
	case 1:   // find gapped alignment region....
	   H = Histogram("smatrix scores",0,1000,2.0);
	   for(J=1; J <= number; J++){
	     E = ReadSeq(fptr,J,nsize[J],A);
	     if(shuffle) ShuffleSeq(E);
	     operation=GapAlnTraceSMatrix2(a,b,LenSeq(E),SeqPtr(E),
		NumModelsPrtnModel(PM),smx,GapScoresPrtnModel(PM),&start,&score);
	     IncdHist(score,H);
	     if(score >= lowscore){ 
		for(i=1; operation[i] != 'E'; i++) if(operation[i] != 'i') break;
		for(end=start; operation[i] != 'E'; i++) end++;
		for(i--,end--; operation[i] == 'i'; i--) end--;	// back up to end...
		PutSubSeq(stdout, start-left, end+right,E,PrtnModelA(PM));
	     }
	     free(operation);
	     NilSeq(E);
	   }
	   PutHist(stderr,60,H); NilHist(H);
	 break;
	case 22: {	// brackets to keep these in scope.
	    a_type  A = PrtnModelA(PM);
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		printf("*****************************************\n");
		PutSeqInfo2(stdout,E); printf("\n");
		operation=gapped_aln_seq_smatrixSW(a,b,
				LenSeq(E),SeqPtr(E),NmaxPrtnModel(PM),
				SMatricesPrtnModel(PM),
				GapScoresPrtnModel(PM),&start,&score);
		put_seqaln_smatrixSW(stdout,operation, LenSeq(E),SeqPtr(E), 0, start,
			NmaxPrtnModel(PM),SMatricesPrtnModel(PM));
		fprintf(stderr,"operations = %s\n",operation);
		put_cmaseq_smatrixSW(stdout,operation, LenSeq(E),SeqPtr(E), 0, start,
			NmaxPrtnModel(PM),SMatricesPrtnModel(PM));
		free(operation);
		printf("score = %d\n",score);
		printf("=========================================\n");
		NilSeq(E);	// release every sequence, not just the last one
	    }
	 } break;
	case 2: {	// brackets to keep these in scope.
	    a_type  A = PrtnModelA(PM);
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		if(shuffle) { 
			ShuffleSeq(E);
			if(number == 1){
			  // single sequence: score 25 successive shuffles.
			  H = Histogram("smatrix scores",0,1000,10.0);
			  for(Int4 x=1; x <=25; x++){
			    score = PutSWAlnPrtnModel(stdout,a,b,E,PM);
			    printf("score = %d\n",score);
			    IncdHist(score,H);
			    ShuffleSeq(E);
			  }
			  PutHist(stdout,60,H); NilHist(H);
			} else PutSWAlnPrtnModel(stdout,a,b,E,PM);
		} else {
		  printf("*****************************************\n");
		  PutSeqInfo2(stdout,E); printf("\n");
		  score = PutSWAlnPrtnModel(stdout,a,b,E,PM);
		  printf("score = %d\n",score);
		  printf("=========================================\n");
		}
		NilSeq(E);
	    }
	  } break;
	 case 10: {	// brackets to keep variables in scope.
	    a_type  A = PrtnModelA(PM);
	    printf("seed = %d\n",seed);
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		if(shuffle) ShuffleSeq(E);
		printf("*****************************************\n");
		PutSeqID(stdout,E); printf("\n");
		for(Int4 x=1; x <=3; x++)
			PutSampledSWAlnPrtnModel(stdout,a,b,E,PM);
		printf("=========================================\n");
		NilSeq(E);
	    }
	  } break;
	 case 17: {	// brackets to keep these in scope.
	    a_type  A = PrtnModelA(PM);
	    printf("seed = %d\n",seed);
	    for(J=1; J <= number; J++){
		H = Histogram("smatrix scores",0,500,1.0);
		E = ReadSeq(fptr,J,nsize[J],A);
		if(shuffle) ShuffleSeq(E);
		printf("*****************************************\n");
		PutSeqID(stdout,E); printf("\n");
		for(Int4 x=1; x <=200; x++)
			IncdHist(PutSampledSWAlnPrtnModel(stdout,a,b,E,PM),H);
		printf("=========================================\n");
		NilSeq(E);
		PutHist(stdout,60,H); NilHist(H);
	    }
	  } break;
	case 3:
	    AveSD=0.;
	    // several sequences: collect a histogram; one sequence: print to stdout.
	    if(number > 1){
		H = Histogram("smatrix stdev scores",0,50,0.5);
		fp = NULL; 
	    } else { fp = stdout; H = NULL; }
	    for(J=1; J <= number; J++){
	      E = ReadSeq(fptr,J,nsize[J],A);
	      if(shuffle){
		bestavesd=-9999;
		for(Int4 iter=1; iter <=1000; iter++){
		    ShuffleSeq(E);
		    if(mode=='D' || mode == 'd')
			avesd=LocalHistPrtnModel(NULL,E, PM);
		    else avesd=HistPrtnModel(NULL,E, PM);
		    if(bestavesd < avesd) bestavesd = avesd;
		}
	      } else {
		if(mode=='D' || mode == 'd')
			avesd=LocalHistPrtnModel(fp,E, PM);
		else {
			PutSeqID(stderr,E); std::cerr << std::endl;
			avesd=HistPrtnModel(fp,E, PM);
		}
	      }
	      // NOTE(review): with -S this accumulates the LAST shuffle's value,
	      // not the best one that is reported below - confirm intent.
	      AveSD+=avesd;
	      NilSeq(E);
	      // bestavesd is only assigned when shuffling; report avesd otherwise.
	      double reported = shuffle ? bestavesd : avesd;
	      if(fp!=NULL) fprintf(fp,"average stdev score = %.2f\n",reported);
	      else IncdHist(reported, H);
	    }
	    if(H != NULL){
	      PutHist(stdout,60,H); NilHist(H);
	      fprintf(stdout,"Overall average stdev = %.2f\n",AveSD/(double)number);
	    }
	 break;
	case 4:
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		if(shuffle) ShuffleSeq(E);
		SWHistPrtnModel(stdout,10,5,E, PM);
		NilSeq(E);
	    }
	 break;
	case 5:
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		PutSWAlnPrtnModel(stdout,10,5,E,PM);
		NilSeq(E);
	    }
	 break;
	case 6:
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		CompareScoresPrtnModel(a,b,E,PM);
		NilSeq(E);
	    }
	 break;
	case 7: {	// brackets to keep these in scope.
	    a_type  A = PrtnModelA(PM);
	    for(J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		if(shuffle) ShuffleSeq(E);
		printf("*****************************************\n");
		PutSeqID(stdout,E); printf("\n");
		PutFullSWAlnPrtnModel(stdout,a,b,E,PM);
		printf("=========================================\n");
		NilSeq(E);
	    }
	  } break;
	case 18:
	    if(number > 1){ H = Histogram("gapped scores",-5000,5000,10); fp = NULL; }
	    else { fp = stdout; H = NULL; }
	    for(J=1; J <= number; J++){
	      E = ReadSeq(fptr,J,nsize[J],A);
	      if(shuffle) ShuffleSeq(E);
	      score = ScoreSWAlnPrtnModel(a,b,E,PM);
	      // No histogram exists for a single sequence; print its score
	      // instead of passing a NULL histogram to IncdHist().
	      if(H != NULL) IncdHist(score, H);
	      else fprintf(fp,"score = %d\n",score);
	      NilSeq(E);
	    }
	    if(H != NULL){ PutHist(stdout,60,H); NilHist(H); }
	 break;
	case 8:
	  { Int4 score1,score2; h_type HG; Int4 sum;
	    assert(number > 1);
	    HG1 = Histogram("gapped scores",-500,5000,10); 
	    HG2 = Histogram("reverse gapped scores",-500,5000,10); 
	    H = Histogram("diff gapped scores",-500,5000,10); 
	    HG = Histogram("both gapped scores",-500,5000,10); 
	    for(sum=0,J=1; J <= number; J++){
	      E = ReadSeq(fptr,J,nsize[J],A);
	      score1 = ScoreSWAlnPrtnModel(a,b,E,PM);
	      IncdHist(score1, HG1); 
	      if(shuffle) ShuffleSeq(E); else ReverseSeq(E);
	      score2 = ScoreSWAlnPrtnModel(a,b,E,PM);
	      IncdHist(score1-score2, H); sum+=score1-score2;
	      IncdHist(score2, HG2); NilSeq(E);
	      IncdHist(score1, HG); IncdHist(score2, HG); 
	    }
	    PutHist(stdout,60,HG1); NilHist(HG1); 
	    PutHist(stdout,60,HG2); NilHist(HG2); 
	    PutHist(stdout,60,H); NilHist(H); 
	    PutHist(stdout,60,HG); NilHist(HG);
	    fprintf(stdout,"sum diff = %d\n",sum);
	  }
	 break;
	case 9:
	  { Int4 score1,score2; h_type HG;
	    // Histograms exist only when there is more than one sequence;
	    // all four handles are NULL otherwise so nothing dangles.
	    if(number > 1){
		HG1 = Histogram("gapped scores",-500,5000,10); 
		HG2 = Histogram("reversed gapped scores",-500,5000,10); 
		H = Histogram("diff gapped scores",-500,5000,10); 
		HG = Histogram("both gapped scores",-500,5000,10); 
	    } else { HG1 = NULL; HG2 = NULL; H = NULL; HG = NULL; }
	    for(J=1; J <= number; J++){
	      E = ReadSeq(fptr,J,nsize[J],A);
	      score1 = LocalScoreSWAlnPrtnModel(a,b,E,PM);
	      if(shuffle) ShuffleSeq(E); else ReverseSeq(E);
	      score2 = LocalScoreSWAlnPrtnModel(a,b,E,PM);
	      NilSeq(E);
	      if(H != NULL){
		IncdHist(score1, HG1); 
		IncdHist(score1-score2, H); 
		IncdHist(score2, HG2);
		IncdHist(score1, HG); IncdHist(score2, HG); 
	      }
	    }
	    if(H != NULL){ 
		PutHist(stdout,60,HG1); NilHist(HG1);
		PutHist(stdout,60,HG2); NilHist(HG2);
		PutHist(stdout,60,H); NilHist(H);
		PutHist(stdout,60,HG); NilHist(HG);
	    }
	  }
	 break;
	 default: print_error(USAGE_START);
      }
   } else if(fullSWalign){
      if(info_cutoff >= 0.0){	//*********** Create a CRS file ************
	cma_typ cma=0;
	char tmp_str[300];
	FILE *tfp=0;
	fm_type *FM=0;

	// If the segment file name contains ".msa", replace its last four
	// characters with ".cma" and load that alignment when it exists.
	// Names too long for tmp_str are skipped rather than overflowing it.
	j = strlen(argv[2]); 
	if(j > 4 && j < (Int4)sizeof(tmp_str) && strstr(argv[2],".msa") != NULL){
		strncpy(tmp_str,argv[2],j-4); tmp_str[j-4]=0;
		strcat(tmp_str,".cma");
		if((tfp= fopen(tmp_str,"r")) != NULL) {
			cma=ReadCMSA(tfp,A); fclose(tfp); 
			if(cma) FM=ModelsCMSA(cma); 
		}
	} 
	ptm_typ PM=PrtnModelGOScan(F);
	Int4 open=a,extend=b;
	if(cma) { 
		assert(nBlksCMSA(cma) == NumModelsPrtnModel(PM));
	}
	for(Int4 J=1; J <= number; J++){
		E = ReadSeq(fptr,J,nsize[J],A);
		// NOTE(review): E is not released here; confirm whether
		// PlotCRSPrtnModelSW() takes ownership of the sequence.
		PlotCRSPrtnModelSW(stdout,open,extend,info_cutoff,E,FM,PM,InputColors);
	}
	if(cma) TotalNilCMSA(cma);
      } else if(cma_output){	//************** cma output ************
	GOScanSWScan(fptr,number,total,nsize,a,b,F,'C'); // purify complaining?
      } else if(!FAfullSWalign){	//************** Output alignment only ************
	GOScanSWScan(fptr,number,total,nsize,a,b,F); // purify complaining?
      } else {			//************** fasta output of hits ************
	e_type **ListE=GapSeqGOScanScan(fptr,a,b,left,right,min_rpt,number,total,nsize,F);
	for(Int4 s=1; ListE[s]; s++){
	   for(Int4 r=1; ListE[s][r]; r++){
		PutSeq(stdout,ListE[s][r],A);
		NilSeq(ListE[s][r]);
	   } free(ListE[s]);
	} free(ListE);
      }
   } else {
	/************************** ordinary scan ****************************/
	sH=GOScanScan(fptr,number, total, nsize, min_rpt,MaxBadBlks,F);

	if(cutoff > 0){ 	// TEST PURGE SCANINFO: NEEDS WORK 
	     fprintf(stderr,"%s\n",argv[2]);
	     listE=PurgeScanInfo(cutoff,left,right,sH,A);
	     if(listE != NULL){
		ofp = open_file(argv[2],".rsq","w");
		for(k = 1; listE[k] != NULL; k++){
			PutSeq(ofp,listE[k],A);
			NilSeq(listE[k]);
		} fclose(ofp); free(listE); 
	     }
	} 			// END TEST PURGE SCANINFO.
	if(nScanHeap(sH) > 0){
	   if(print_top > 0){
		if(put_repeats){
		   PutTopRptsScanHeap(stdout,left,right, print_top,sH, A);
		} else {
		   k=PutTopSeqScanHeap(stdout, print_top,sH,A);
		   fprintf(stderr,"%d hits (%s)\n",k,argv[1]);
		}
	   } else {
		if(!mask && nScanHeap(sH) > 10){
		   H=HistScanHeap(sH);
		   PutHist(stdout,60,H);
		}
		if(mask) PutMaskSeqScanHeap(stdout, sH, neg_mask, A);
		else {
		   if(msaformat) {
			ofp = open_file(argv[2],"2.msa","w");
			PutMSAScanHeap(ofp, sH, A);
			fclose(ofp);
			PutInfoScanHeap(stdout, sH, A);
		   } else PutInfoScanHeap(stdout, sH, A);
		}
		if(report_gaps){
		   ofp = open_file(argv[2],".gaps","w");
		   PutGapsScanHeap(ofp, sH);
		   fclose(ofp); 
		}
		if(fa_blocks){
		   PutSeqBlkScanHeap(argv[2], sH, Nflank, Cflank, A);
		}
		if(put_repeats){
		   // written next to the database, not the segment file.
		   ofp = open_file(argv[1],".rpts","w");
		   if(min_rpt < 2) PutRptsScanHeap(ofp,left,right,sH,A);
		   else PutMinRptsScanHeap(ofp,left,right,min_rpt,sH,A);
		   fclose(ofp); 
		}
		if(rasmol){
		   ofp = open_file(argv[2],".ras","w");
		   PutRasmolScanHeap(ofp, sH);
		   fclose(ofp); 
		}
		if(put_align){
		   ofp = open_file(argv[2],".aln","w");
		   PutSelexAlnScanHeap(ofp, sH, A);
		   // PutFAAlnScanHeap(ofp, sH, A);
		   fclose(ofp); 
		}
		if(seqfile){
		   // both files are written next to the database.
		   ofp = open_file(argv[1],".fsq","w");
		   PutFailSeqScanHeap(ofp, sH, A);
		   fclose(ofp); 
		   ofp = open_file(argv[1],".seq","w");
		   k=PutSeqScanHeap(ofp, sH, A);
		   fprintf(stderr,"%d hits\n",k);
		   fclose(ofp); 
		}
	   }
	} else {
		return_status=1;
		fprintf(stderr,"no hits\n");
	}
	if(!mask && nScanHeap(sH) > 10 && print_top < 1){
	   for(i=1; i<=nblksScanHeap(sH); i++){
		H=histScanHeap(i,sH); PutHist(stdout,60,H);
	   }
	}
/**************************** create PHYLIP file ********************/
	if(realign){
	    if(maxrpts > 1) print_error("realign not yet implemented for repeats");
	    msa = ScanHeap2CMSA(sH, 0, argv[1], 
			MkAlpha(AMINO_ACIDS,PROT_BLOSUM62),FALSE);
	    sH = NULL;		/** WARNING: ScanHeap2CMSA( ) destroys sH ***/
	    if(msa != NULL){ WriteMtfCMSA(argv[2], msa,NULL); } 
	} else if(phylip > -2){
	    if(maxrpts > 1) print_error("phylip not yet implemented for repeats");
	    msa = ScanHeap2CMSA(sH, phylip, argv[1],
			MkAlpha(AMINO_ACIDS,PROT_BLOSUM62),FALSE);
	    sH = NULL;		/** WARNING: ScanHeap2CMSA( ) destroys sH ***/
	    if(msa != NULL){ WritePhylipCMSA(argv[2], msa); } 
	}
	if(msa != NULL) NilCMSA(msa);
	if(sH != NULL) NilScanHeap(sH);
/**************************** report elapsed time *******************/
	if(!mask && print_top < 1){
	    // time_t may be wider than int; cast so the argument matches %d.
	    int elapsed = (int)(time(NULL)-time1);
	    fprintf(stdout,"\ttime: %d seconds (%0.2f minutes)\n",
			elapsed,(float)elapsed/60.0);
	}
    }// end if(fullSWalign)
	NilGOScan(F); fclose(fptr); NilAlpha(A); free(counts); free(nsize);
	free(freq);
	return return_status;
}

