/******************************************************************************************
    Copyright (C) 1997-2014 Andrew F. Neuwald, Cold Spring Harbor Laboratory
    and the University of Maryland School of Medicine.

    Permission is hereby granted, free of charge, to any person obtaining a copy of 
    this software and associated documentation files (the "Software"), to deal in the 
    Software without restriction, including without limitation the rights to use, copy, 
    modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so, subject to the 
    following conditions:

    The above copyright notice and this permission notice shall be included in all 
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
    PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
    OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
    OTHER DEALINGS IN THE SOFTWARE.

    For further information contact:
         Andrew F. Neuwald
         Institute for Genome Sciences and
         Department of Biochemistry & Molecular Biology
         University of Maryland School of Medicine
         801 West Baltimore St.
         BioPark II, Room 617
         Baltimore, MD 21201
         Tel: 410-706-6724; Fax: 410-706-1482; E-mail: aneuwald@som.umaryland.edu

===================== Uniprot (U,V,W,...) ====================
UniProtKB accession numbers consist of 6 or 10 alphanumerical characters in the format:

1          2	 3	  4         5         6      7     8        9    10
[O,P,Q]	  [0-9]	[A-Z,0-9] [A-Z,0-9] [A-Z,0-9] [0-9]
[A-N,R-Z] [0-9]	[A-Z]     [A-Z,0-9] [A-Z,0-9] [0-9]
[A-N,R-Z] [0-9]	[A-Z]     [A-Z,0-9] [A-Z,0-9] [0-9] [A-Z] [A-Z,0-9] [A-Z,0-9] [0-9]

See: http://www.uniprot.org/help/accession_numbers 

==================================== RefSeq NCBI (R) =========================================

RefSeq formats:

AP_	Protein	Annotated on AC_ alternate assembly
NP_	Protein	Associated with an NM_ or NC_ accession
YP_	Protein	Annotated on genomic molecules without an instantiated transcript record
XP_	Protein	Predicted model, associated with an XM_ accession
WP_	Protein	Non-redundant across multiple strains and species

AP_	Annotated on AC_ alternate assembly
NP_	Associated with an NM_ or NC_ accession
YP_	
XP_	Predicted model, associated with an XM_ accession
ZP_	Predicted model, annotated on NZ_ genomic records

See: https://www.ncbi.nlm.nih.gov/books/NBK21091/

==================================== NCBI (A) ==============================================
Protein Accession Prefixes
Prefix	Database	Type	
BAA-BZZ	DDBJ	Protein ID	
CAA-CZZ,
SAA-SZZ,
VAA-VZZ	EMBL	Protein ID	
AAA-AZZ,
QAA-QZZ	GenBank	Protein ID	
AAE	GenBank	Protein ID for Patents (also some patent proteins with AAA and AAC)	
FAA-FZZ	DDBJ	TPA Protein ID	
DAA-DZZ	GenBank	TPA or TPA WGS Protein ID	
GAA-GZZ	DDBJ	WGS Protein ID	
EAA-EZZ,
KAA-KZZ,
OAA-OZZ,
PAA-PZZ,
RAA-RZZ,
TAA-TZZ	GenBank	WGS Protein ID	
HAA-HZZ	GenBank	TPA WGS/TSA Protein ID	
IAA-IZZ	DDBJ	TPA WGS Protein ID	
LAA-LZZ	DDBJ	TSA or Targeted Gene Project Protein ID	
JAA-JZZ	GenBank	TSA Protein ID	
MAA-MZZ,
NAA-NZZ	GenBank	WGS/TSA Protein ID

See: https://www.ncbi.nlm.nih.gov/Sequin/acc.html

UniProt:

[OPQ][0-9][A-Z0-9]{3}[0-9]  or
[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}

 ******************************************************************************************/

%p 2000
%p 2000
%e 2000
%n 2000
%k 2000
%a 2000
%o 2000
%START NODES NAMES PDBID ACCESS SEQS DEFLINE 
I       (0|[1-9][0-9]*)
D       [0-9]
P       [A-Z0-9]
CH      [A-Za-z0-9+]
PDB     ({D}[A-Z0-9]{3}"_"{CH}{1,4})
L       [A-Za-z]
C       {L}+[ ]+{L}+
H       {L}+[/]+{L}+
T	({C}|{H}|{L}+)+
S       [ \t]+
RS      ([ANYXW]P"_"[0-9]{6})
R       ([ANYXW]P"_"[0-9]{9})
A       ([A-Z][A-Z][0-9A-Z][0-9]{5})
B       ([A-Z][A-Z][0-9A-Z][0-9]{7})
U       ([A-Z][0-9][0-9A-Z]{3}[0-9])
W	([A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9][A-Z][A-Z0-9]{2}[0-9])
PRF	([0-9]{6,7}[A-Z]{1,2})
%{

#include "stdinc.h"
#include "alphabet.h"
#include "sequence.h"
#include "seqset.h"
#include "residues.h"
#include "addphylum_usage.h"

// #define MAX_NUM_GROUPS	500
#define MAX_SEQ_LENG	100000
#define NIL 		-1

#if defined (YYLMAX)
#undef YYLMAX
#endif

#define YYLMAX  10000

// #define MYLMAX  2000000
// #define MYLMAX_PLUS  2000010
#define MYLMAX  300
#define MYLMAX_PLUS  310

#define MAX_SHORT_ID	100003
#define MAX_LSHORT_ID	1000003
#define MAX_LONG_ID	10000003
#define MAX_XLONG_ID	1000000003
#define MAX_SP_ID	20
// UInt4 max = 4294967296.
// 100,000,000 x 4 = 400 Mb
#define MAX_TAX_NODES	25000000	// 25 million
#define MAX_TAX_NAMES	10
#define MAX_ACC_CHAR	127

// One node of the taxonomy tree.  Nodes are stored in the global tax_node
// array at the index equal to their tax_id (see the integer rule, which
// sets tax_node[Node].tax_id=Node while reading nodes.dmp).
typedef struct {
	UInt4   tax_id;         // node id in GenBank taxonomy database
        char            *rank;          // rank string (superkingdom, kingdom, ...); 0 when not kept or pruned
        UInt4   parent;         // parent node id in GenBank taxonomy database.
        char            *name;         // first name stored for the node; 0 when none or pruned
        unsigned char   gc_code;        // genetic code id.
} taxonomic_node_type;
typedef taxonomic_node_type *tnd_typ;	// pointer to a node / node array

tnd_typ	tax_node;		// taxonomy nodes indexed by tax id (allocated outside this view)
BooLean	UsePDBaa;		// TRUE: accession file and deflines are handled as PDB ids
Int4	NumSkippedLargePDB;	// count of PDB accession lines skipped for multi-character chain ids

// Accession --> tax id lookup tables.  The leading dimensions are indexed by
// the accession's characters, the last one by its numeric part.  Sub-arrays
// are allocated on demand by the <ACCESS>/<PDBID> rules; a 0 pointer or a 0
// entry means "not loaded".
UInt4	*Rep6Seq2Node[MAX_ACC_CHAR+3];		// Rep6Seq2Node[a][MAX_LSHORT_ID+3];
UInt4	*Rep9Seq2Node[MAX_ACC_CHAR+3];		// Rep9Seq2Node[a][MAX_XLONG_ID+3];
UInt4	***SeqAcc2Node[MAX_ACC_CHAR+3];		// SeqAcc2Node[a][b][c][MAX_SHORT_ID+3];
UInt4	***SeqAcc2NodeL[MAX_ACC_CHAR+3];	// SeqAcc2NodeL[a][b][c][MAX_LONG_ID+3];
UInt4	*****SwissAcc2Node[MAX_ACC_CHAR+3];	// SwissAcc2Node[a][b][c][d][e][MAX_SP_ID+3];
UInt4	*********UniPrtW[MAX_ACC_CHAR+3];	// UniPrtW[a][b][c][d][e][f][g][h][j][MAX_ACC_CHAR+3];

UInt4	****PDBID2Node[MAX_ACC_CHAR+3];		// PDBID2Node[a][b][c][d][MAX_ACC_CHAR+3]; last index is the chain character

void	print_tax_node(FILE *fp,taxonomic_node_type tnd)
// Write a one-line summary of tnd to fp: tax id, rank, parent id and genetic
// code, followed by the node's name or " no name!" when it has none.
{
	const char *label=tnd.name;
	fprintf(fp,"node: %d | %s | %d | %d |",
		tnd.tax_id,tnd.rank,tnd.parent,tnd.gc_code);
	if(label == 0) fprintf(fp," no name!");
	else fprintf(fp," %s ",label);
	fprintf(fp,"\n");
}

BooLean	assign_pseudophyla_tax_node(UInt4 n,taxonomic_node_type *tnd)
// Promote node n to rank "pseudophylum" when it is not itself a kingdom- or
// phylum-level node but its direct parent has a rank containing "kingdom".
// Returns TRUE when the rank was replaced, FALSE otherwise.
// The global tax_node array is used; the tnd argument is not read.
{
	char *rank=tax_node[n].rank;
	// The original test read "rank && strstr(kingdom) || strstr(phylum)",
	// which calls strstr() on a NULL rank; group the two rank tests.
	if(rank && (strstr(rank,"kingdom") || strstr(rank,"phylum"))) return FALSE;
	UInt4 p=tax_node[n].parent;
	if(tax_node[p].rank && strstr(tax_node[p].rank,"kingdom")){
		if(rank) free(rank);
		tax_node[n].rank = AllocString("pseudophylum");
		return TRUE;
	} return FALSE;
}

char	find_kingdom_tax_node(UInt4 n,taxonomic_node_type *tnd)
// Starting at node n and walking up through the parents, find the first node
// whose rank contains "kingdom" but not "subkingdom" and return the first
// character of its name.  Returns 0 when the walk reaches a parent id <= 1
// without finding one, or when the node found has no name.
// The global tax_node array is used; the tnd argument is not read.
{
	for(;;){
		char *rank=tax_node[n].rank;
		if(rank && strstr(rank,"kingdom") &&
			strstr(rank,"subkingdom") == NULL) { // skip subkingdoms
			char *name=tax_node[n].name;
			// a ranked node may lack a name; report "unknown" instead of
			// dereferencing a NULL pointer.
			return name ? name[0] : 0;
		}
		UInt4 p=tax_node[n].parent;
		if(p <= 1) return 0;	// reached the root without a hit
		n = p;			// look at a higher level...
	}
}

char	*RmCandidatus(Int4 n)
// For a node whose rank contains "phylum" or "class" and whose name starts
// with "Candidatus ", return a newly allocated copy of the name without that
// prefix.  Returns 0 in every other case (the caller keeps the name as is).
// An alternative that cut names at the first space character was switched
// off in the original (afn: 5_8_2023) and is not reproduced here.
{
	assert(n < MAX_TAX_NODES);
	char *rank=tax_node[n].rank;
	if(rank == 0) return 0;
	if(strstr(rank,"phylum") == NULL && strstr(rank,"class") == NULL) return 0;
	if(strncmp(tax_node[n].name,"Candidatus ",11) != 0) return 0;
	// the prefix is 11 characters long, including its trailing space.
	return AllocString(tax_node[n].name + 11);
}

#define dbgX	0
char	*find_class_tax_node(UInt4 num,taxonomic_node_type *tnd)
// Return the name of the class that node num belongs to, or 0 if none.
// Pass 1 looks at num and then its ancestors for a rank equal to "class";
// pass 2 repeats the walk accepting any rank containing "class"
// (infraclass, superclass, ...).  A walk stops at a parent id <= 1.
// The global tax_node array is used; the tnd argument is not read.
{
	UInt4 cur=num,up;
	assert(cur < MAX_TAX_NODES);
	//=========== pass 1: exact rank "class" ======
	if(tax_node[cur].rank && strcmp(tax_node[cur].rank,"class")==0)
		return tax_node[cur].name;
#if dbgX
	fprintf(stderr,"==============\n");
	print_tax_node(stderr,tax_node[cur]);
#endif
	for(up=tax_node[cur].parent; up > 1; up=tax_node[cur].parent){
		cur = up;
#if dbgX
		print_tax_node(stderr,tax_node[cur]);
#endif
		if(tax_node[cur].rank && strcmp(tax_node[cur].rank,"class")==0)
			return tax_node[cur].name;
	}
	//=========== pass 2: look for an infraclass, superclass, etc... ======
	cur=num;
	if(tax_node[cur].rank && strstr(tax_node[cur].rank,"class"))
		return tax_node[cur].name;
	for(up=tax_node[cur].parent; up > 1; up=tax_node[cur].parent){
		cur = up;
		if(tax_node[cur].rank && strstr(tax_node[cur].rank,"class"))
			return tax_node[cur].name;
	}
	return 0;
}

char	*find_phylum_tax_node(UInt4 n,taxonomic_node_type *tnd)
// Return the name of the phylum that node n belongs to, or 0 if none.
// The first walk (n, then its ancestors) accepts only a rank equal to
// "phylum"; if that fails a second walk accepts any rank containing
// "phylum".  A walk stops at a parent id <= 1.
// The global tax_node array is used; the tnd argument is not read.
{
	assert(n < MAX_TAX_NODES);
	for(int exact=1; exact >= 0; exact--){
		UInt4 cur=n;
		for(;;){
			char *rk=tax_node[cur].rank;
			if(rk != 0){
			   int hit = exact ? (strcmp(rk,"phylum")==0)
					   : (strstr(rk,"phylum") != NULL);
			   if(hit) return tax_node[cur].name;
			}
			UInt4 up=tax_node[cur].parent;
			if(up <= 1) break;
			cur=up;
		}
	}
	return 0;
}

static void	clear_tax_node(UInt4 n)
// Remove node n from consideration: free its name and rank and zero both.
{
	free(tax_node[n].name); tax_node[n].name=0;
	free(tax_node[n].rank); tax_node[n].rank=0;
}

void	prune_othertax_nodes(taxonomic_node_type *tnd)
// These seem to be viruses and are not of interest in CHAIN analysis.
// Clears (name and rank set to 0) every ranked node n >= 2 that is
//   - named "other" directly under the root, or whose name contains
//     "environmental samples", "uncultured bacteria" or "candidate division";
//   - of rank "no rank" and either named "unclassified ...", a child of a
//     phylum-level node, a root child named Viruses/Viroids/unclassified,
//     or located below a phylum;
//   - a subphylum whose parent rank contains "phylum".
// Nodes are visited in increasing id order, so a node cleared earlier in the
// sweep is already seen as unranked by later nodes.
// NOTE(review): the inclusive bound assumes tax_node has at least
// MAX_TAX_NODES+1 entries -- confirm at the allocation site.
{
	for(UInt4 n=2; n <= MAX_TAX_NODES; n++){
	   char *rank=tax_node[n].rank;
	   if(rank == 0) continue;
	   UInt4 p = tax_node[n].parent;
	   char *prank = tax_node[p].rank;
	   // A ranked node can lack a name; match against "" instead of passing
	   // NULL to strcmp()/strstr().
	   const char *name = tax_node[n].name ? tax_node[n].name : "";
	   BooLean drop=FALSE;
	   if((p==1 && strcmp(name,"other")==0) ||
		     strstr(name,"environmental samples") ||
		     strstr(name,"uncultured bacteria") ||
		     strstr(name,"candidate division")){
		drop=TRUE;
	   } else if(strstr(rank,"no rank")){
		if(strstr(name,"unclassified ")) drop=TRUE;
		else if(prank && strstr(prank,"phylum")) drop=TRUE;
		else if(p==1 && 	// root node as parent...
			(strstr(name,"Viruses") ||
			 strstr(name,"Viroids") ||
			 strstr(name,"unclassified"))) drop=TRUE;
		else if(find_phylum_tax_node(n,tnd)) drop=TRUE;
	   } else if(strstr(rank,"subphylum") &&
			prank && strstr(prank,"phylum")){
		drop=TRUE;	// subphylum of a known phylum.
	   }
	   if(drop) clear_tax_node(n);
	}
}

#define dbgY	0

UInt4	prune_tax_nodes(taxonomic_node_type *tnd)
// One pruning sweep: every ranked node (ids 2..MAX_TAX_NODES; 1 is the root)
// whose parent has no rank loses its own name and rank, unless its rank
// contains "class".  Returns how many nodes were cleared in this sweep.
// The global tax_node array is used; the tnd argument is not read.
{
	UInt4	removed=0;
	for(UInt4 n=2; n <= MAX_TAX_NODES; n++){
	  char *rank=tax_node[n].rank;
	  if(rank == 0) continue;
#if dbgY
	  char hit=0;
	  if(strstr(rank,"class")){
		fprintf(stderr,"!!! ---> %d.rank = %s\n",n,rank); hit=1;
	  }
#endif
	  UInt4 up = tax_node[n].parent;
	  if(tax_node[up].rank == 0 && strstr(rank,"class") == NULL){
		removed++;
		free(tax_node[n].name); tax_node[n].name=0;
		free(tax_node[n].rank); tax_node[n].rank=0;
	  }
#if dbgY
	  if(hit) fprintf(stderr,"$$$ ---> %d.rank = %s\n",n,tax_node[n].rank);
#endif
	}
	return removed;
}

a_type	aaAB,dnaAB,AB;		// alphabets; not referenced in the part of the file shown here
BooLean	ISDNA=FALSE;		// TRUE: deflines get the DNA-style tag (kingdom + genetic code)
BooLean	PRINTSEQ=FALSE;		// set per entry by the defline rule; gates echoing of sequence lines
BooLean	SKIPCHORDATES=FALSE;	// TRUE: suppress entries whose phylum contains "Chordata"
BooLean	CHORDATES_ONLY=FALSE;	// TRUE: print only entries whose phylum contains "Chordata"
BooLean	SKIPUNKNOWN=FALSE;	// TRUE: suppress entries with no phylum

char	*ArgV1=0;		// base name used by PrintMissing() for the ".log" file
FILE	*mfp=0;			// log stream for missing accessions (opened on first use)
UInt4	Field=0;		// '|'-separated field counter within the current .dmp line
UInt4	File=0;			// input being scanned: 0 = nodes.dmp, 1 = names.dmp, 2 = accession table
UInt4	Node=0;			// tax id of the node whose line is being read
UInt4	LineNum=1;		// current line number in the .dmp / accession input
UInt8	SeqId=0;		// not referenced in the part of the file shown here
UInt4	myleng=0;		// number of characters currently stored in mytext
char	Format=0;		// accession format code of the current defline ('P','r','R','A','B','U','W','J')
char	mytext[MYLMAX_PLUS];	// current defline, truncated to MYLMAX characters
void	CopyMyText(char *text)
// Start a new defline in mytext: copy text into it, keeping at most MYLMAX
// characters, NUL-terminate it and record the stored length in myleng.
{
	const char *src;
	myleng=0;
	for(src=text; *src != 0; src++){
		if(myleng >= MYLMAX) break;	// buffer full: drop the rest
		mytext[myleng++]=*src;
	}
	mytext[myleng]=0;
}
void	PrintMissing(char *text, const char frmt)
// Report an accession that could not be mapped to a taxonomy node.
//   text: the defline; at most its first 50 characters are reported.
//   frmt: one-letter accession format code included in the message.
// The message goes to the "<ArgV1>.log" file, which is opened on first use
// when ArgV1 is set, and to stderr otherwise.
{
	if(mfp==0 && ArgV1) mfp=open_file(ArgV1,".log","w");
	// "%.50s" limits the output to 50 characters.  The former
	// strncpy(str,text,50) left str without a terminating NUL whenever
	// text held 50 or more characters.
	if(mfp) fprintf(mfp,"Missing accession code %c: %.50s\n",frmt,text);
	else fprintf(stderr,"Missing accession code %c: %.50s\n",frmt,text);
}
%}
%%

^">"{D}{P}{P}{P}_{CH}{S} { /* PDB id plus one chain character */ BEGIN DEFLINE; Format='P'; CopyMyText(yytext);  }

^">"{RS}("."{I})?{S} { /* RefSeq protein, 6-digit number */ BEGIN DEFLINE; Format='r'; CopyMyText(yytext);  }

^">"{R}("."{I})?{S} { /* RefSeq protein, 9-digit number */ BEGIN DEFLINE; Format='R'; CopyMyText(yytext);  }

^">"{A}("."{I})?{S}	{ /* 3 characters + 5 digits */ BEGIN DEFLINE; Format='A'; CopyMyText(yytext); }

^">"{B}("."{I})?{S}	{ /* 3 characters + 7 digits */ BEGIN DEFLINE; Format='B'; CopyMyText(yytext); }

^">"{U}("."{I})?{S}	{ /* 6-character UniProt accession */ BEGIN DEFLINE; Format='U'; CopyMyText(yytext); }

^">"{W}("."{I})?{S}	{ /* 10-character UniProt accession */ BEGIN DEFLINE; Format='W'; CopyMyText(yytext); }

^">"{PRF}{S}		{ /* PRF id: 6-7 digits + 1-2 letters */ BEGIN DEFLINE; Format='J'; CopyMyText(yytext); }

<DEFLINE>.	{ /* append the rest of the defline to mytext, dropping anything beyond MYLMAX characters */ if(myleng < MYLMAX){ mytext[myleng]=yytext[0]; myleng++; } }

<DEFLINE>[\n]	{
		   // End of a FASTA defline.  mytext holds the defline (from '>' up to, not
		   // including, the newline; at most MYLMAX characters).  The accession at
		   // its start is looked up in the table selected by UsePDBaa or Format to
		   // get a taxonomy node n; the defline is then printed with a
		   // "{...<phylum(kingdom)>}" tag inserted after the identifier, or the
		   // whole entry is suppressed (PRINTSEQ=FALSE) by the SKIPUNKNOWN /
		   // SKIPCHORDATES / CHORDATES_ONLY filters.
		   BEGIN SEQS;
		   // n==0 means "no taxonomy node".  n and Class are initialised so that
		   // formats without a lookup (case 'J') never read indeterminate values.
		   UInt4 i=0,n=0,x;
		   char *phylum=0,*Class=0;
		   char a,b,c,d,e,f,g,h,j;
		   BooLean IsMissing=FALSE;
		   mytext[myleng]=0;
		   if(myleng > MYLMAX) print_error("myleng exceeds limit");
		   if(UsePDBaa){
			if(sscanf(mytext,">%c%c%c%c_%c",&a,&b,&c,&d,&e) != 5) {
				fprintf(stderr,"--> '%s'\n",mytext);
				print_error("Accession error 'P'"); 
			} else if(sscanf(mytext,">%c%c%c%c_%c%c",&a,&b,&c,&d,&e,&f) == 6 && 
				!isspace((unsigned char)f)) {
			   IsMissing=TRUE;	// chain id longer than one character
			} else if(PDBID2Node[a]==0) IsMissing=TRUE;
			else if(PDBID2Node[a][b]==0) IsMissing=TRUE;
			else if(PDBID2Node[a][b][c]==0) IsMissing=TRUE;
			else if(PDBID2Node[a][b][c][d]==0) IsMissing=TRUE;
			else if(PDBID2Node[a][b][c][d][e]==0) IsMissing=TRUE;
			if(IsMissing){ n=0; phylum=0; PrintMissing(mytext,'U'); }
			else {
			  n=PDBID2Node[a][b][c][d][e]; 
			  phylum=find_phylum_tax_node(n,tax_node); 
			  Class=find_class_tax_node(n,tax_node); 
			}
		   } else switch(Format){
		   case 'J': {	// PRF format (Japan): no lookup table, left unknown
		   } break;
		   case 'A': {
			   if(sscanf(mytext,">%c%c%c%u",&a,&b,&c,&i) != 4){
				print_error("Accession error 'A'"); 
			   }
			   if(i >= MAX_SHORT_ID){
				fprintf(stderr,"%c%c%c i = %u >= %d (%c) %s\n",
					a,b,c,i,MAX_SHORT_ID,Format,mytext);
				print_error("Accession error 'A'");
			   } 
			   if(SeqAcc2Node[a]==0) IsMissing=TRUE;
			   else if(SeqAcc2Node[a][b]==0) IsMissing=TRUE;
			   else if(SeqAcc2Node[a][b][c]==0) IsMissing=TRUE;
			   if(IsMissing){ n=0; phylum=0; PrintMissing(mytext,'A'); }
			   else {
				n = SeqAcc2Node[a][b][c][i];
				phylum=find_phylum_tax_node(n,tax_node); 
				Class=find_class_tax_node(n,tax_node); 
			   }
		   } break;
		   case 'B': { 
			   if(sscanf(mytext,">%c%c%c%u",&a,&b,&c,&i) != 4){
				print_error("Accession error 'B'"); 
			   }
			   if(i >= MAX_LONG_ID){
			 	fprintf(stderr,"%c%c%c i = %u (%c) %s\n",
				       a,b,c,i,Format,mytext);
				print_error("Accession error 'B'"); 
			   } 
			   if(SeqAcc2NodeL[a]==0) IsMissing=TRUE;
			   else if(SeqAcc2NodeL[a][b]==0) IsMissing=TRUE;
			   else if(SeqAcc2NodeL[a][b][c]==0) IsMissing=TRUE;
			   if(IsMissing){ n=0; phylum=0; PrintMissing(mytext,'B'); }
			   else {
				n = SeqAcc2NodeL[a][b][c][i];
				phylum=find_phylum_tax_node(n,tax_node); 
				Class=find_class_tax_node(n,tax_node); 
			   }
		   } break;
		   case 'r': {	// refseq...short
			   if(sscanf(mytext,">%cP_%u",&a,&i) != 2){
				print_error("Accession error 'r'"); 
			   }
			   if(i >= MAX_LSHORT_ID){
				fprintf(stderr,"%cP_%u (%c): %s\n",a,i,Format,mytext);
				print_error("Accession error 'r'"); 
			   }
			   if(Rep6Seq2Node[a]==0 || Rep6Seq2Node[a][i]==0){
				n=0; phylum=0; PrintMissing(mytext,'r'); 
			   } else {
				n = Rep6Seq2Node[a][i];
				phylum=find_phylum_tax_node(n,tax_node); 
				Class=find_class_tax_node(n,tax_node); 
			   }
		   } break;
		   case 'R': {	// refseq...
			   if(sscanf(mytext,">%cP_%u",&a,&i) != 2){
				print_error("Accession error 'R'"); 
			   }
			   if(i >= MAX_XLONG_ID){
				// only the prefix letter and number are parsed for this
				// format, so only those are reported.
				fprintf(stderr,"%cP_%u >= %d (%c) %s\n",
					a,i,MAX_XLONG_ID,Format,mytext);
				print_error("Accession error 'R'"); 
			   }
			   if(Rep9Seq2Node[a]==0) IsMissing=TRUE;
			   else if(Rep9Seq2Node[a][i]==0) IsMissing=TRUE;
			   if(IsMissing){ n=0; phylum=0; PrintMissing(mytext,'R'); }
			   else {
				n = Rep9Seq2Node[a][i];
				phylum=find_phylum_tax_node(n,tax_node); 
				Class=find_class_tax_node(n,tax_node); 
			   }
		   } break;
		   case 'U': {	// swissprot short accession...
			   if(sscanf(mytext,">%c%c%c%c%c%u",&a,&b,&c,&d,&e,&i) != 6){
				print_error("Accession error 'U'"); 
			   }
			   if(i > 9) fprintf(stderr,"i = %u; %s\n",i,mytext);
			   assert(i < 10);
			   if(SwissAcc2Node[a]==0) IsMissing=TRUE;
			   else if(SwissAcc2Node[a][b]==0) IsMissing=TRUE;
			   else if(SwissAcc2Node[a][b][c]==0) IsMissing=TRUE;
			   else if(SwissAcc2Node[a][b][c][d]==0) IsMissing=TRUE;
			   else if(SwissAcc2Node[a][b][c][d][e]==0) IsMissing=TRUE;
			   if(IsMissing){ n=0; phylum=0; PrintMissing(mytext,'U'); }
			   else {
				n = SwissAcc2Node[a][b][c][d][e][i];
				phylum=find_phylum_tax_node(n,tax_node);
				Class=find_class_tax_node(n,tax_node); 
			   }
		   } break;
		   case 'W': {	// swissprot A0A0 accessions...
			   UInt4	*********SpAcc=0;
			   if(sscanf(mytext,">%c%c%c%c%c%c%c%c%c%u",&a,&b,&c,&d,&e,&f,&g,&h,&j,&i) == 10){
				SpAcc=UniPrtW[a];
			   } else print_error("Swiss accession error 'W'"); 
			   if(i > 9) fprintf(stderr,"i = %u; %s\n",i,mytext);
			   assert(i < 10);
			   if(SpAcc==0) IsMissing=TRUE;
			   else if(SpAcc[b]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c][d]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c][d][e]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c][d][e][f]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c][d][e][f][g]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c][d][e][f][g][h]==0) IsMissing=TRUE;
			   else if(SpAcc[b][c][d][e][f][g][h][j]==0) IsMissing=TRUE;
			   if(IsMissing){ n=0; phylum=0; PrintMissing(mytext,'W'); }
			   else {
				n=(UInt4) SpAcc[b][c][d][e][f][g][h][j][i];
				phylum=find_phylum_tax_node(n,tax_node); 
				Class=find_class_tax_node(n,tax_node); 
			   }
		   } break;
		   default: { phylum=0; n=0; PrintMissing(mytext,'?'); } break;
		   } // end of switch
		   //================= print out sequence =======================
		   BooLean skip;
		   if(CHORDATES_ONLY) skip=(phylum==0 || !strstr(phylum,"Chordata"));
		   else skip=((SKIPUNKNOWN && phylum==0) ||
				(phylum && SKIPCHORDATES && strstr(phylum,"Chordata")));
		   if(skip){
			PRINTSEQ=FALSE;
		   } else {
			PRINTSEQ=TRUE;
			char kingdom=0;
			if(n > 0) kingdom=find_kingdom_tax_node(n,tax_node);
			if(kingdom==0) kingdom='X';	// unknown kingdom
			// identifier: everything up to the first whitespace character.
			for(x=0; mytext[x] && !isspace((unsigned char)mytext[x]); x++){
				fprintf(stdout,"%c",mytext[x]);
			}
			// Description: the text after that separator.  It used to be
			// located with strstr(mytext," "), which returns NULL when the
			// separator is a tab and no space follows.
			char *str2 = mytext[x] ? mytext+x+1 : mytext+x;
			if(phylum==0) { 
			   if(ISDNA) fprintf(stdout," {<unknown(%c1)>}%s",kingdom,str2);
			   else fprintf(stdout," {<unknown(%c)>}%s",kingdom,str2);
			} else if(ISDNA){
			   fprintf(stdout," {<%s(%c%d)>}%s",
				phylum,kingdom,tax_node[n].gc_code,str2);
			} else if(Class){
			   // NOTE(review): the kingdom letter is lower-cased only when a
			   // class is known, as in the original -- confirm this is intended.
			   fprintf(stdout," {|0(%u)|<%s;%s(%c)>}%s",
				tax_node[n].tax_id,phylum,Class,tolower(kingdom),str2);
			} else {
			   fprintf(stdout," {|0(%u)|<%s(%c)>}%s",
				tax_node[n].tax_id,phylum,kingdom,str2);
			}
			printf("\n");
		   }
		   Format=' ';
	   }

<SEQS>.		{ /* sequence data: copied to the output only for entries being printed */ if(PRINTSEQ) ECHO; }

<SEQS>[\n]	{ /* line ends inside sequence data follow the same PRINTSEQ gate */ if(PRINTSEQ) ECHO; }

^accession{S}accession"."version{S}taxid{S}gi |	// afn4_13_22
^accession"."version{S}taxid	{ 
		   // Column-header line of an accession table (two- or four-column
		   // form): restart line counting and switch to the start state that
		   // parses its data lines (PDB ids when UsePDBaa is set).
		   LineNum=1; 
		   if(UsePDBaa){ BEGIN PDBID;  }
		   else { BEGIN ACCESS; }
	        }

<PDBID>^{D}{P}{P}{P}"_"{CH}{CH}+{S}{D}{P}{P}{P}"_"{CH}{CH}+{S}{I}{S}{I}$	{	// pdb code.
			// PDB entry whose chain id has two or more characters: it is
			// not stored in PDBID2Node, only counted.
			// fprintf(stderr," skipping large pdb file --> %s\n",yytext);
			NumSkippedLargePDB++;
			
		}

<PDBID>^{D}{P}{P}{P}"_"{CH}{S}{D}{P}{P}{P}"_"{CH}{S}{I}{S}{I}$	{	// pdb code.
		   // Four-column PDB line with a one-character chain id: the second
		   // column is skipped and the third is stored in PDBID2Node under the
		   // four id characters and the chain character.
		   char  p1,p2,p3,p4,chain;
		   UInt4 taxid;
		   if(sscanf(yytext,"%c%c%c%c_%c %*s %u",&p1,&p2,&p3,&p4,&chain,&taxid) != 6){
			fprintf(stderr," --> %s\n",yytext);
			print_error("PDB accession error 'U'");
		   }
		   if(File != 2) print_error("addphylum input error 1!");
		   else {
			 // allocate the levels of the lookup tree on first use.
			 if(PDBID2Node[p1] == 0)
				NEWP3(PDBID2Node[p1],MAX_ACC_CHAR+3,UInt4);
			 if(PDBID2Node[p1][p2] == 0)
				NEWPP(PDBID2Node[p1][p2],MAX_ACC_CHAR+3,UInt4);
			 if(PDBID2Node[p1][p2][p3] == 0)
				NEWP(PDBID2Node[p1][p2][p3],MAX_ACC_CHAR+3,UInt4);
			 if(PDBID2Node[p1][p2][p3][p4] == 0)
				NEW(PDBID2Node[p1][p2][p3][p4],MAX_ACC_CHAR+3,UInt4);
			 PDBID2Node[p1][p2][p3][p4][chain]=taxid;
		   }
		}

<ACCESS>^{P}{4,6}{S}{I}$	{
		   // 4-6 character code followed by one integer: ignored.
		}

<ACCESS>^{PDB}(":PDB".+)?{S}{I}$ {	// PDB code
		   // PDB ids are ignored in the ACCESS state.
		}

<ACCESS>^{PRF}(":PDB".+)?{S}{I}$	{	// PRF code
		   // PRF ids are ignored (no lookup table is kept for them).
		   // fprintf(stderr,"%s (%d)\n",yytext,LineNum);
		}

<ACCESS>^{PRF}("."{I})?{S}{I}$	{	// PRF code
		   // PRF id with an optional ".version": ignored as well.
		   // fprintf(stderr,"%s (%d)\n",yytext,LineNum);
		}

<ACCESS>^{RS}{S}{RS}("."{I})?{S}{I}{S}{I}$    |	// afn4_13_22
<ACCESS>^{RS}("."{I})?{S}{I}$	{	// refseq code.
		   // RefSeq protein accession with a 6-digit number.  The line is
		   // parsed as "acc.version taxid", then as the four-column form
		   // (second column skipped), then as "acc taxid"; the tax id is
		   // stored in Rep6Seq2Node[prefix letter][number].
		   char  prefix;
		   UInt4 num,taxid;
		   if(sscanf(yytext,"%cP_%u.%*u %u",&prefix,&num,&taxid) != 3
		      && sscanf(yytext,"%cP_%u %*s %u",&prefix,&num,&taxid) != 3 // afn4_13_22
		      && sscanf(yytext,"%cP_%u %u",&prefix,&num,&taxid) != 3){
			print_error("Accession error 'r'");
		   }
		   if(num >= MAX_LSHORT_ID) print_error("Accession error 'r'");
		   if(File != 2) print_error("addphylum input error 'r'!");
		   else {
			assert(taxid < MAX_TAX_NODES);
			if(Rep6Seq2Node[prefix] == 0){
			    NEW(Rep6Seq2Node[prefix],MAX_LSHORT_ID+3,UInt4);
			}
			Rep6Seq2Node[prefix][num]=taxid;
		   }
		}

<ACCESS>^{R}{S}{R}("."{I})?{S}{I}{S}{I}$  | // afn4_13_22
<ACCESS>^{R}("."{I})?{S}{I}$	{	// refseq code.
		   // RefSeq protein accession with a 9-digit number.  Parsed like the
		   // 6-digit form; the tax id is stored in
		   // Rep9Seq2Node[prefix letter][number].
		   char  prefix;
		   UInt4 num,taxid;
		   if(sscanf(yytext,"%cP_%u.%*u %u",&prefix,&num,&taxid) != 3
		      && sscanf(yytext,"%cP_%u %*s %u",&prefix,&num,&taxid) != 3  // afn4_13_22
		      && sscanf(yytext,"%cP_%u %u",&prefix,&num,&taxid) != 3){
			print_error("RefSeq accession error R");
		   }
		   if(num >= MAX_XLONG_ID) print_error("Accession error R");
		   if(File != 2) print_error("addphylum input error 1!");
		   else {
			 assert(taxid < MAX_TAX_NODES);
			 if(Rep9Seq2Node[prefix] == 0){
				NEW(Rep9Seq2Node[prefix],MAX_XLONG_ID+3,UInt4);
			 }
			 Rep9Seq2Node[prefix][num]=taxid;
		   }
		}

<ACCESS>^{B}{S}{B}("."{I})?{S}{I}{S}{I}$  | // afn4_13_22
<ACCESS>^{B}("."{I})?{S}{I}$	{	// standard NCBI long accession code.
		   // Accession of three characters + 7 digits.  The four-column form
		   // is tried first here, then "acc.version taxid", then "acc taxid";
		   // the tax id is stored in SeqAcc2NodeL[c1][c2][c3][number].
		   char  c1,c2,c3;
		   UInt4 num,taxid;
		   if(sscanf(yytext,"%c%c%c%u %*s %u",&c1,&c2,&c3,&num,&taxid) != 5 // afn4_13_22
		      && sscanf(yytext,"%c%c%c%u.%*u %u",&c1,&c2,&c3,&num,&taxid) != 5
		      && sscanf(yytext,"%c%c%c%u %u",&c1,&c2,&c3,&num,&taxid) != 5){
			 print_error("Accession error B");
		   }
		   if(File != 2) print_error("addphylum input error 1!");
		   else {
			 assert(taxid < MAX_TAX_NODES);
			 if(SeqAcc2NodeL[c1] == 0){
				NEWPP(SeqAcc2NodeL[c1],MAX_ACC_CHAR+3,UInt4);
			 }
			 if(SeqAcc2NodeL[c1][c2] == 0){
				NEWP(SeqAcc2NodeL[c1][c2],MAX_ACC_CHAR+3,UInt4);
			 }
			 if(SeqAcc2NodeL[c1][c2][c3] == 0){
				NEW(SeqAcc2NodeL[c1][c2][c3],MAX_LONG_ID+3,UInt4);
			 }
			 SeqAcc2NodeL[c1][c2][c3][num]=taxid;
		   }
	}

<ACCESS>^{A}{S}{A}("."{I})?{S}{I}{S}{I}$   | // afn4_13_22
<ACCESS>^{A}("."{I})?{S}{I}$	{	// standard NCBI accession code.
		   // Accession of three characters + 5 digits.  Parsed as
		   // "acc.version taxid", then as the four-column form, then as
		   // "acc taxid"; the tax id is stored in
		   // SeqAcc2Node[c1][c2][c3][number].
		   char  c1,c2,c3;
		   UInt4 num,taxid;
		   if(sscanf(yytext,"%c%c%c%u.%*u %u",&c1,&c2,&c3,&num,&taxid) != 5
		      && sscanf(yytext,"%c%c%c%u %*s %u",&c1,&c2,&c3,&num,&taxid) != 5 // afn4_13_22
		      && sscanf(yytext,"%c%c%c%u %u",&c1,&c2,&c3,&num,&taxid) != 5){
			 print_error("Accession error 'A'");
		   }
		   if(File != 2) print_error("addphylum input error 1!");
		   else {
			 assert(taxid < MAX_TAX_NODES);
			 if(SeqAcc2Node[c1] == 0){
				NEWPP(SeqAcc2Node[c1],MAX_ACC_CHAR+3,UInt4);
			 }
			 if(SeqAcc2Node[c1][c2] == 0){
				NEWP(SeqAcc2Node[c1][c2],MAX_ACC_CHAR+3,UInt4);
			 }
			 if(SeqAcc2Node[c1][c2][c3] == 0){
				NEW(SeqAcc2Node[c1][c2][c3],MAX_SHORT_ID+3,UInt4);
			 }
			 SeqAcc2Node[c1][c2][c3][num]=taxid;
		   }
	}

<ACCESS>^{U}{S}{U}("."{I})?{S}{I}{S}{I}$   | // afn4_13_22
<ACCESS>^{U}("."{I})?{S}{I}$	{	// swiss/uniprot code.
		   // 6-character UniProt accession: five characters plus a trailing
		   // number read with %u.  Parsed as "acc.version taxid", then as the
		   // four-column form, then as "acc taxid"; the tax id is stored in
		   // SwissAcc2Node[c1][c2][c3][c4][c5][number].
		   char  c1,c2,c3,c4,c5;
		   UInt4 num,taxid;
		   if(sscanf(yytext,"%c%c%c%c%c%u.%*u %u",&c1,&c2,&c3,&c4,&c5,&num,&taxid) != 7
		      && sscanf(yytext,"%c%c%c%c%c%u %*s %u",&c1,&c2,&c3,&c4,&c5,&num,&taxid) != 7 // afn4_13_22
		      && sscanf(yytext,"%c%c%c%c%c%u %u",&c1,&c2,&c3,&c4,&c5,&num,&taxid) != 7){
		   	fprintf(stderr,"%c%c%c%c%c%u %u\n",c1,c2,c3,c4,c5,num,taxid);
			fprintf(stderr," --> %s\n",yytext);
			print_error("Swiss accession error 'U'");
		   }
		   if(File != 2) print_error("addphylum input error 1!");
		   else {
			 assert(taxid < MAX_TAX_NODES);
			 // allocate the levels of the lookup tree on first use.
			 if(SwissAcc2Node[c1] == 0)
				NEWP3(SwissAcc2Node[c1],MAX_ACC_CHAR+3,UInt4*);
			 if(SwissAcc2Node[c1][c2] == 0)
				NEWP3(SwissAcc2Node[c1][c2],MAX_ACC_CHAR+3,UInt4);
			 if(SwissAcc2Node[c1][c2][c3] == 0)
				NEWPP(SwissAcc2Node[c1][c2][c3],MAX_ACC_CHAR+3,UInt4);
			 if(SwissAcc2Node[c1][c2][c3][c4] == 0)
				NEWP(SwissAcc2Node[c1][c2][c3][c4],MAX_ACC_CHAR+3,UInt4);
			 if(SwissAcc2Node[c1][c2][c3][c4][c5] == 0)
				NEW(SwissAcc2Node[c1][c2][c3][c4][c5],MAX_SP_ID +3,UInt4);
			 SwissAcc2Node[c1][c2][c3][c4][c5][num]=taxid;
		   }
		}

<ACCESS>^{W}{S}{W}("."{I})?{S}{I}{S}{I}$  | // afn_4_13_22
<ACCESS>^{W}("."{I})?{S}{I}$	{	// swiss/uniprot 9 %c + one digit
		   // 10-character UniProt accession: nine characters plus a trailing
		   // number read with %u (checked to be < 10).  Parsed as
		   // "acc.version taxid", then as the four-column form, then as
		   // "acc taxid"; the tax id is stored in
		   // UniPrtW[a][b][c][d][e][f][g][h][j][number].
		   UInt4 i,n;
		   char  a,b,c,d,e,f,g,h,j;
		   if(sscanf(yytext,"%c%c%c%c%c%c%c%c%c%u.%*u %u",
				&a,&b,&c,&d,&e,&f,&g,&h,&j,&i,&n) != 11){
		     if(sscanf(yytext,"%c%c%c%c%c%c%c%c%c%u %*s %u",
				&a,&b,&c,&d,&e,&f,&g,&h,&j,&i,&n) != 11) // afn4_13_22
		      if(sscanf(yytext,"%c%c%c%c%c%c%c%c%c%u %u",
				  &a,&b,&c,&d,&e,&f,&g,&h,&j,&i,&n) != 11){
			fprintf(stderr,"%c%c%c%c%c%c%c%c%c%u %u\n",a,b,c,d,e,f,g,h,j,i,n);
			fprintf(stderr," --> %s\n",yytext);
			print_error("Uniprot 1 accession error 'W' ");
		      }
		   }
		   if(i >= 10){
			fprintf(stderr," --> %s\n",yytext);
			print_error("UniProt 2 accession error 'W'");
		   }
		   if(File==2){
			 assert(n < MAX_TAX_NODES);
			 // allocate the levels of the lookup tree on first use.
			 if(UniPrtW[a] == 0) NEWP3(UniPrtW[a],MAX_ACC_CHAR+3,UInt4*****);
			 if(UniPrtW[a][b] == 0) NEWP3(UniPrtW[a][b],MAX_ACC_CHAR+3,UInt4****);
			 if(UniPrtW[a][b][c] == 0) NEWP3(UniPrtW[a][b][c],MAX_ACC_CHAR+3,UInt4***);
			 if(UniPrtW[a][b][c][d] == 0) NEWP3(UniPrtW[a][b][c][d],MAX_ACC_CHAR+3,UInt4**);
			 if(UniPrtW[a][b][c][d][e] == 0) NEWP3(UniPrtW[a][b][c][d][e],MAX_ACC_CHAR+3,UInt4*);
			 if(UniPrtW[a][b][c][d][e][f] == 0)
				NEWP3(UniPrtW[a][b][c][d][e][f],MAX_ACC_CHAR+3,UInt4);
			 if(UniPrtW[a][b][c][d][e][f][g] == 0)
				NEWPP(UniPrtW[a][b][c][d][e][f][g],MAX_ACC_CHAR+3,UInt4);
			 if(UniPrtW[a][b][c][d][e][f][g][h] == 0)
				NEWP(UniPrtW[a][b][c][d][e][f][g][h],MAX_ACC_CHAR+3,UInt4);
			 if(UniPrtW[a][b][c][d][e][f][g][h][j] == 0)
				NEW(UniPrtW[a][b][c][d][e][f][g][h][j],MAX_ACC_CHAR+3,UInt4);
			 // Store the full tax id: the slot is a UInt4 and is read back
			 // as a UInt4 by the defline rule.  The former (UInt1) cast
			 // narrowed the id before it was stored.
			 UniPrtW[a][b][c][d][e][f][g][h][j][i]=n;
		   } else print_error("addphylum input error 1!");
		}

<NODES>{I}	|
<NAMES>{I}	|
<INITIAL>{I}	{
		   // An integer token in a taxonomy dump file.
		   //  File==0 (nodes.dmp): field 0 starts a new node, field 1 is its
		   //	parent id, field 6 its genetic code.
		   //  File==1 (names.dmp): field 0 selects the node being named.
		   //  Any other File value is an input error: up to 200 following
		   //	characters are dumped for diagnosis.
		   UInt4 i;
		   // i is a UInt4, scanned with %u as everywhere else in this file
		   // (the former "%lu" expects an unsigned long).
		   sscanf(yytext,"%u",&i);
		   if(File==0){		// == nodes.dmp
			BEGIN NODES;
			if(Field == 0){ 
				// check the id before it is used as an index.
				assert(i > 0 && i < MAX_TAX_NODES);
				Node=i;
				tax_node[Node].tax_id=i;
			   	tax_node[Node].name=0;
				tax_node[Node].rank=0;
			} else {
			  assert(Node > 0 && Node < MAX_TAX_NODES);
			  if(Field == 1){	// Node is Field == 0
				assert(tax_node[Node].tax_id == Node);
				tax_node[Node].parent=i;
			  } else if(Field == 6){
				tax_node[Node].gc_code=i;
			  }
			}
		   } else if(File==1){	// == names.dmp
			   BEGIN NAMES;
			   if(Field == 0){ 
				Node=i;
				assert(Node < MAX_TAX_NODES);
			   }
		   } else {
			fprintf(stderr,"1: Node = %u; line = %u\n",Node,LineNum);
			fprintf(stderr,"yytext = \"%s\"\n",yytext);
			for(Int4 k=1; k <=200; k++){
				int ch=fgetc(yyin);	// int, so EOF is distinguishable from data
				if(ch==EOF) break;
				fputc(ch,stderr);
			} fprintf(stderr,"\n"); print_error("addphylum input error 2!");
		   }
		}

<NODES>[|][^|;]*[;]		|
<NODES>[|][^|;]*[;][^|]+	{ /* text running from a '|' through a ';' is consumed here, so Field is not advanced for that '|' */ ; }

<NODES>[|]		{ /* field separator */ Field++; }

<NODES>{L}+	|
<NODES>{C}	{
			// One- or two-word text in nodes.dmp.  Only field 2 (the rank) is
			// used, and the rank is stored only when it contains one of the
			// substrings listed below; other ranks leave the node unranked.
			if(Field == 2){
			   static const char *kept[]={ "phylum","clade","class",
						"order","no rank","kingdom" };
			   for(int k=0; k < 6; k++){
				if(strstr(yytext,kept[k])){
				   tax_node[Node].rank = AllocString(yytext);
				   break;
				}
			   }
			}
		}

<NAMES>[|]		{
				/* Column separator in NAMES: move on to the next field. */
				++Field;
			}

<NAMES>{T}+	{
		  /* Text token in NAMES.  In the second column (Field == 1) the token
		     becomes the name of the current Node, but only when that node
		     already has a rank and does not have a name yet. */
		  if(Field == 1 && tax_node[Node].rank != 0 && tax_node[Node].name == 0){
			tax_node[Node].name = AllocString(yytext);
			/* RmCandidatus() is defined elsewhere; presumably it returns a new
			   string without the Candidatus label, or 0 when there is nothing
			   to strip -- confirm against its definition. */
			char *stripped = RmCandidatus(Node);
			if(stripped != 0){
				free(tax_node[Node].name);
				tax_node[Node].name = stripped;
			}
		  }
		}

<NAMES>.	{ /* any other single character in NAMES is ignored */ }

<PDBID>[\n]	|
<ACCESS>[\n]	|
<NODES>[\n]	|
<NAMES>[\n]	{
			/* End of an input line: count it and restart the column index. */
			LineNum++;
			Field=0;
		}

{S}		{ /* text matching the S definition is discarded */ }

.		{
			/* Fallback rule: a character that no other rule accepts is a
			   fatal parse error.  The current file number, node and line
			   are reported, followed by characters read directly from yyin
			   up to the next newline, then the offending text.
			   NOTE(review): the scanner buffers its input, so the echoed
			   characters may come from further ahead than the rest of the
			   offending line. */
			int c;	/* int, not char: otherwise EOF cannot be told apart from a data byte */
			fprintf(stderr,"2: File=%d; Node=%d; line=%ld\n",
				File,Node,LineNum);
			while((c=fgetc(yyin)) != '\n' && c != EOF){
				fputc(c,stderr);
			}
			fprintf(stderr,"\n");
			fprintf(stderr,"PARSE ERROR!: '%s'\n",yytext);
			exit(1);
		}

%%

int	yywrap()
{
    if(File==0){
	BEGIN INITIAL;
    } else if(File==1){
	BEGIN INITIAL;
	prune_othertax_nodes(tax_node);
	while(prune_tax_nodes(tax_node) > 0);
	for(Int8 n=2; n <= MAX_TAX_NODES; n++){
		if(tax_node[n].rank){
			assign_pseudophyla_tax_node(n,tax_node);
			// print_tax_node(stdout,tax_node[n]);
			if(strstr(tax_node[n].rank,"phylum") ||
			      strstr(tax_node[n].rank,"class") ||
				strstr(tax_node[n].rank,"kingdom")){
				print_tax_node(stderr,tax_node[n]);
			}
		}
	}
    } return 1;
}

/* Scoring-matrix text passed to MkAlpha() together with the alphabet string
 * "NACGT" in main().  It holds five rows of five integers: 5 on the diagonal
 * of the last four rows and -4 everywhere else, including the entire first
 * row and first column.
 * NOTE(review): rows/columns presumably follow the order of the alphabet
 * string (N, A, C, G, T) -- confirm against MkAlpha().
 */
#define DNA_MTRX "-4  -4  -4  -4  -4 \
                  -4   5  -4  -4  -4 \
                  -4  -4   5  -4  -4 \
                  -4  -4  -4   5  -4 \
                  -4  -4  -4  -4   5 "

#if 0
       // -V          output Vertebrate (Chordate) sequences only\n\

#endif

/* Write "<dir>/<file>" into dst (capacity size).  A result that does not fit
 * is reported through print_error(), which this file uses for fatal errors. */
static void	taxdump_path(char *dst, size_t size, const char *dir, const char *file)
{
	int n = snprintf(dst,size,"%s/%s",dir,file);
	if(n < 0 || (size_t) n >= size) print_error("TAXDUMPDIR path is too long.");
}

/* Copy every line of 'in' that contains "scientific name" to 'out'.
 * Lines are assembled in a growing buffer, so a line of any length is tested
 * and copied as a whole instead of being split into fixed-size fragments. */
static void	copy_scientific_name_lines(FILE *in, FILE *out)
{
	size_t	cap=1024,len=0;
	char	*line=(char *) malloc(cap);	// cast keeps this valid as C and as C++

	if(line == NULL) print_error("addphylum: out of memory.");
	while(fgets(line+len,(int)(cap-len),in) != NULL){
		len += strlen(line+len);
		if(len > 0 && line[len-1] != '\n' && len+1 >= cap){
			// Buffer is full and the line has not ended: enlarge and read on.
			char *bigger=(char *) realloc(line,2*cap);
			if(bigger == NULL){ free(line); print_error("addphylum: out of memory."); }
			line=bigger; cap *= 2;
			continue;
		}
		if(strstr(line,"scientific name") != NULL) fputs(line,out);
		len=0;
	}
	// A final line without a newline that exactly filled the buffer ends up here.
	if(len > 0 && strstr(line,"scientific name") != NULL) fputs(line,out);
	free(line);
}

/* Program entry point.
 *
 * argv[1] is the fasta input ("stdin" selects standard input); the remaining
 * arguments are single-dash options.  The environment variable TAXDUMPDIR
 * must name the directory holding nodes.dmp, names.dmp and the
 * accession2taxid file.  The scanner is run four times in a row:
 *   1. nodes.dmp                         (File=0)
 *   2. names.dmp, scientific names only  (File=1)
 *   3. pdb- or prot.accession2taxid      (File=2)
 *   4. the fasta file                    (File=3)
 * Returns 0 on success; fatal problems are reported through print_error().
 */
int	main(int argc, char *argv[])
{
	char	str[300];
	Int4	i,arg;		// i is referenced only by the disabled -T option below
	UsePDBaa=FALSE;
	NumSkippedLargePDB=0;

	aaAB = MkAlpha(AMINO_ACIDS,GBLAST_BLOSUM62);
	dnaAB = MkAlpha("NACGT",DNA_MTRX);
	if(argc < 2){ fprintf(stderr,"%s\n",ADDPHYLUM_VERSION); print_error(USAGE_ADDPHYLUM); }
	for(arg = 2; arg < argc; arg++){
          if(argv[arg][0] != '-'){	// everything after argv[1] must be an option
		print_error(USAGE_ADDPHYLUM);
		continue;
	  }
          switch(argv[arg][1]) {
             case 'n': ISDNA=TRUE; print_error("DNA version not yet implemented."); break;
             case 'C': SKIPCHORDATES=TRUE; break;
#if 0	// need to add cma_typ to this...
             case 'T':  { // addphyla to a cma file from a corresponding seq file.
        	sprintf(str,"%s.cma",argv[1]); cma=ReadCMSA2(str,aaAB);
        	ss_type Data=TrueDataCMSA(cma);
        	sprintf(str,"%s.seq",argv[1]); data=MkSeqSet(str,AB);
        	for(i=1,j=2; i <= NSeqsSeqSet(data); i++,j++){
                   e_type E=SeqSetE(i,data);
                   char *phylum=PhylumSeq(E);
                   char king=KingdomSeq(E);
                   e_type Sq=SeqSetE(j,Data);
                   if(!IsSameSeqID(E,Sq)){
			print_error("addphylum -T option input error");
		   } StrSeqInfo(str,Sq);
                   //fprintf(stderr,"%d. %s\n",j,str);
                   Int4    s,e;
                   if(sscanf(str,"%*s %d-%d",&s,&e) == 2){
                        if(s > 0) SetOffSetSeq(s-1,Sq);
                   }
                   if(phylum) TaxAssignSeq(phylum,king,Sq);
                   // PutSeq(stderr,Sq,AB);
                } PutCMSA(stdout,cma); TotalNilCMSA(cma); return 0;
	       } break;
#endif
             case 'V': CHORDATES_ONLY=TRUE; break;
             case 'u': SKIPUNKNOWN=TRUE; break;
             case 'p':
		  if(strcmp(argv[arg],"-pdb") == 0) UsePDBaa=TRUE;
		  else print_error(USAGE_ADDPHYLUM);
		break;
             case 'x': break;
             default : print_error(USAGE_ADDPHYLUM);
          }
	}

	// Look the directory up once; every path below is built from it with a
	// bounds check instead of an unchecked strcpy()/strcat() into str.
	const char *taxdir=getenv("TAXDUMPDIR");
	if(taxdir == 0){
		fprintf(stderr,"Define the path to NCBI taxdump directory ");
		fprintf(stderr,"via the environmental variable 'TAXDUMPDIR'.\n");
		print_error("   Fatal error.");
	}

	NEW(tax_node,MAX_TAX_NODES+2,taxonomic_node_type);
	fprintf(stderr,"********** 1. Reading nodes.dmp ***********\n");
	File=0; LineNum=0;
	taxdump_path(str,sizeof str,taxdir,"nodes.dmp");
	yyin=open_file(str,"","r");
	while(yylex()); fclose(yyin);

	fprintf(stderr,"********** 2. Reading names.dmp ***********\n");
	File=1; LineNum=0;
	taxdump_path(str,sizeof str,taxdir,"names.dmp");
	FILE *fp=open_file(str,"","r");
	// Only the "scientific name" lines are scanned; they are staged in a
	// temporary file that then serves as the scanner input.
	yyin=tmpfile();
	if(yyin == NULL) print_error("addphylum: unable to create a temporary file.");
	copy_scientific_name_lines(fp,yyin);
	fclose(fp);		// names.dmp itself is no longer needed
	rewind(yyin);
	while(yylex()); fclose(yyin);

	if(UsePDBaa) fprintf(stderr,"********** 3. Reading  pdb.accession2taxid file ***********\n");
	else fprintf(stderr,"********** 3. Reading  prot.accession2taxid file ***********\n");
	File=2; LineNum=0;
	// The lookup tables are cleared before the accession file is scanned.
	if(UsePDBaa){
	  BEGIN PDBID;
	  taxdump_path(str,sizeof str,taxdir,"pdb.accession2taxid");
	  for(UInt4 s=0; s <= MAX_ACC_CHAR; s++) PDBID2Node[s]=0;
	} else {
	  BEGIN ACCESS;
	  taxdump_path(str,sizeof str,taxdir,"prot.accession2taxid");
	  for(UInt4 s=0; s <= MAX_ACC_CHAR; s++){
		Rep6Seq2Node[s]=0; Rep9Seq2Node[s]=0; SeqAcc2Node[s]=0;
		SeqAcc2NodeL[s]=0; SwissAcc2Node[s]=0; UniPrtW[s]=0;
	  }
	}
	yyin=open_file(str,"","r");
	while(yylex()); fclose(yyin);
	fprintf(stderr,"==== %d very large files skipped ====\n",NumSkippedLargePDB);

	fprintf(stderr,"********** 4. Reading fasta file  ***********\n");
	BEGIN DEFLINE;
	File=3; LineNum=0;
	if(strcmp(argv[1],"stdin")== 0) yyin=stdin; else yyin=open_file(argv[1],"","r");
	ArgV1=argv[1];
	while(yylex()); fclose(yyin);
	if(mfp) fclose(mfp);
	NilAlpha(aaAB); NilAlpha(dnaAB);
	return 0;
}

