/* Name definitions used by the rules below:
 *   I  a single 0, or a run of digits that does not start with 0
 *   L  one upper-case letter or one digit 1-9
 *   A  one upper-case letter or one digit 0-9
 * NOTE(review): I can match more than one digit, while the rule actions read
 * yytext at fixed offsets; that presumably assumes a one-digit match --
 * confirm against the expected input.
 * The start conditions IDENT, OTHER and END are declared here; none of the
 * rules visible in this file refer to them. */
%START IDENT OTHER END
I       (0|[1-9][0-9]*)
L       [A-Z1-9]
A       [A-Z0-9]
%{
#include "get_pdb.h"
// Number of identifiers the scanner rules have stored in pdb0[]; entries are
// 1-based because every rule increments count0 before storing.
long count0=0;
// Number of entries in pdb[]; written by GetUniquePDB().
long count=0;
// Selects which branch of yywrap() produces output.
char mode='i';
// Printed by yywrap() modes 'v', 's' and 'p'; never assigned in the code
// visible in this file.
char subdirectory[10000];
// Printed by yywrap() modes 'd' and 'p'; never assigned in the code visible
// in this file.
char pdb4vsi_query[10000];
// Identifiers in input order, filled by the rule actions.
char pdb0[10003][9];
// First four characters of each distinct pdb0[] entry (see GetUniquePDB()).
char pdb[10003][9];
// Name of the fasta file; set in main() and appended to by the blastdbcmd
// commands that yywrap() runs in mode 'i'.
char pdbsq_file[200];
// 1-based list of lower-cased four-character ids allocated by yywrap() in
// mode 'm' and consumed (then freed) by main().
char **pdbid;

/**
 * Fill pdb[1..count] with the distinct four-character prefixes of
 * pdb0[1..count0], keeping the order of first occurrence.  Only the first
 * four characters are compared and copied, so any chain suffix is ignored.
 *
 * Sets the global count to the number of distinct prefixes.  With no input
 * (count0 == 0) count is now 0; previously it was reported as 1 with an
 * empty entry.
 */
void	GetUniquePDB( )
{
	Int4	i,j;
	count=0;
	for(i=1; i <= count0; i++){
	     for(j=1; j <= count; j++){
		if(strncmp(pdb[j],pdb0[i],4) == 0) break; // already listed
	     }
	     if(j > count){	// did not find pdb0[i] in current list.
		count++;
		strncpy(pdb[count],pdb0[i],4);
		pdb[count][4]=0;	// strncpy does not terminate on its own
	     }
	}
}

%}
%%

[ ]>{I}{A}{A}{A}[_]{L}	{
				   /* Matches a blank, a greater-than sign, an id, an
				    * underscore and a chain character.  The six characters
				    * starting at yytext[2] are stored as the next pdb0 entry
				    * (id, underscore, chain when I matched one digit).
				    * The program exits once 10000 entries are reached. */
				   count0++;
				   if(count0 >= 10000){
					/* count0 is a long, so print it with %ld */
					fprintf(stderr,"==> %ld seqs\n",count0);
					fprintf(stderr,"Too many input pdb sequences\n"); exit(1);
				   }
				   pdb0[count0][0]=yytext[2]; pdb0[count0][1]=yytext[3];
				   pdb0[count0][2]=yytext[4]; pdb0[count0][3]=yytext[5];
				   pdb0[count0][4]=yytext[6]; pdb0[count0][5]=yytext[7];
				   pdb0[count0][6]=0;
			}

^>{I}{A}{A}{A}[_]{L}	|
[\001]{I}{A}{A}{A}[_]{L}	{ 
				   /* Matches an id, an underscore and a chain character
				    * that follow either a greater-than sign at the start of
				    * a line or a control-A byte.  The six characters starting
				    * at yytext[1] are stored as the next pdb0 entry.
				    * The program exits once 10000 entries are reached. */
				   count0++;
				   if(count0 >= 10000){
					/* count0 is a long, so print it with %ld */
					fprintf(stderr,"==> %ld seqs\n",count0);
					fprintf(stderr,"Too many input pdb sequences\n"); exit(1);
				   }
				   pdb0[count0][0]=yytext[1]; pdb0[count0][1]=yytext[2];
				   pdb0[count0][2]=yytext[3]; pdb0[count0][3]=yytext[4];
				   pdb0[count0][4]=yytext[5]; pdb0[count0][5]=yytext[6]; 
				   pdb0[count0][6]=0;
			}

pdb[|]{I}{A}{A}{A}[|]{L}	{ 
				   /* Matches pdb, a bar, an id, a bar and a chain
				    * character.  The id is read from yytext[4..7] and the
				    * chain from yytext[9]; they are stored joined by an
				    * underscore so the entry has the same layout as the
				    * underscore rules produce.
				    * The program exits once 10000 entries are reached. */
				   count0++;
				   if(count0 >= 10000){
					/* count0 is a long, so print it with %ld */
					fprintf(stderr,"==> %ld seqs\n",count0);
					fprintf(stderr,"Too many input pdb sequences\n"); exit(1);
				   }
				   pdb0[count0][0]=yytext[4]; pdb0[count0][1]=yytext[5];
				   pdb0[count0][2]=yytext[6]; pdb0[count0][3]=yytext[7];
				   pdb0[count0][4]='_'; 
				   pdb0[count0][5]=yytext[9]; pdb0[count0][6]=0;
				}

pdb[|]{I}{A}{A}{A}		{
				   /* Matches pdb, a bar and an id with no chain.  The
				    * id is read from yytext[4..7] and stored as the next
				    * pdb0 entry without a chain suffix.
				    * The program exits once 10000 entries are reached. */
				   count0++;
				   if(count0 >= 10000){
					/* count0 is a long, so print it with %ld */
					fprintf(stderr,"==> %ld seqs\n",count0);
					fprintf(stderr,"Too many input pdb sequences\n"); exit(1);
				   }
				   pdb0[count0][0]=yytext[4]; pdb0[count0][1]=yytext[5];
				   pdb0[count0][2]=yytext[6]; pdb0[count0][3]=yytext[7];
				   pdb0[count0][4]=0; pdb0[count0][5]=0;
				}
.		{ /* discard any other single character except newline */ ; }
[ \t\n]		{ /* discard white space; blank and tab are already taken by the rule above, so in effect this one handles newline */ ; };
%%
int yywrap()
{ 
    long i,j,f,d;
    char tmp_pdb[9];
    if(count0 > 0){
      // if(mode != 'c'){ GetUniquePDB( ); }
      switch(mode){
       case 'L': 	// ********** CE structural alignment... *********
	{
	for(i=1; i < count0; i++){ printf("%s ",pdb0[i]); }
	// printf("%s > seq_file\n\n",pdb0[count0]);
	printf("%s\n",pdb0[count0]);
	} break;
       case 'i': 	// ********** CE structural alignment... *********
	{
	fprintf(yyout,"#!/bin/csh\n\n");
	fprintf(yyout,"\ngetpdb ");
	for(i=1; i <= count0; i++){
	   char Str[200];
	   sprintf(Str,"blastdbcmd -db $FASTADIR/pdbaa -dbtype prot -entry %s >> %s\n",
		pdb0[i],pdbsq_file);
	   fprintf(stderr,"%s",Str);
	   Int4 status=system(Str);
	   fprintf(yyout,"%s ",pdb0[i]);
	} fprintf(yyout,"\n\n");
	} break;

      case 'm':
	{
	char str[10];
	fprintf(yyout,"#!/bin/csh\n\n");
	fprintf(yyout,"\nget_pdb c ");
	NEWP(pdbid,count0 +3, char);
	for(i=1; i <= count0; i++){
		for(j=0; j < 4; j++) fprintf(yyout,"%c",tolower(pdb0[i][j]));
		for(j=0; j < 4; j++) str[j]=tolower(pdb0[i][j]);
		if(i < count0) fprintf(yyout," ");
		else fprintf(yyout,"\n\n");
		pdbid[i]=AllocString(str);
	}

#if 0
	printf("\nmreduce ");
	for(i=1; i < count0; i++){
		for(j=0; j < 4; j++) printf("%c",tolower(pdb0[i][j]));
		printf(" ");
	}
	for(j=0; j < 4; j++) printf("%c",tolower(pdb0[count0][j]));
	printf("\n\n");
#endif
        }

#if 0
	printf("\nce2cma seq_file ");
	for(i=1; i < count0; i++){
		printf("pdb");
		for(j=0; j < 4; j++) printf("%c",tolower(pdb0[i][j]));
		if(pdb0[i][4] ==0) printf(".ent:- ");
		else printf(".ent:%c ",pdb0[i][4]);
	}
	printf("pdb");
	for(j=0; j < 4; j++) printf("%c",tolower(pdb0[count0][j]));
	if(pdb0[count0][4] ==0) printf(".ent:- ");
	else printf(".ent:%c ",pdb0[count0][4]);
	printf("\n");
#endif
	break;

       case 'v': 	// *********** 'File1=' lines for .vsi file ***********
	printf("\n");
	for(f=1,d=0,i=1; i <= count0; i++){
		printf("File%d=%s/",f,subdirectory);
		for(j=0; j < 4; j++) printf("%c",tolower(pdb0[i][j]));
		if(pdb0[i][4] ==0) printf("_XH.pdb:X   // \n");
		else printf("_%cH.pdb:%c    // \n",toupper(pdb0[i][4]),toupper(pdb0[i][4]));
		if(i < count0 && strncmp(pdb0[i],pdb0[i+1],4)==0) f++;
		else { printf("\n"); d+=10; f=d+1; }
	}
	break;

       case 's': 	// ******** 'chn_pdb -S ' csh script for .vsi analysis ********
	printf("#!/bin/csh\n\n");
	for(i=1; i <= count; i++){
		printf("chn_pdb $CHN_RAS_DIR/%s/",subdirectory);
		for(j=0; j < 4; j++) printf("%c",tolower(pdb[i][j]));
		printf("_H -S ");
		printf("\n");
	} printf("\n");
	break;

       case 'd': 	// ******** make directory for 'create_vsi' files  ********
	{
	  printf("#!/bin/csh\n\n");
	  char str[10];
	  printf("mkdir x%s_DIR\n",pdb4vsi_query);
	  printf("mkdir x%s_DIR/pdb\n",pdb4vsi_query);
	  printf("\\mv -f %s* x%s_DIR/\n",pdb4vsi_query, pdb4vsi_query);
	  for(i=1; i <= count; i++){
		sprintf(str,"%c%c%c%c",tolower(pdb[i][0]),tolower(pdb[i][1]),
					tolower(pdb[i][2]),tolower(pdb[i][3]));
		printf("\\mv -f *%s* x%s_DIR/pdb/\n",str,pdb4vsi_query);
	  } printf("\n");
	} break;

       case 'p': 	// ******** pdb4vsi csh script for .vsi analysis ********
	{ 
	  char str[10];
	  printf("#!/bin/csh\n\n");
	  for(d=0,i=1; i <= count; i++,d+=10){
		sprintf(str,"%c%c%c%c",tolower(pdb[i][0]),tolower(pdb[i][1]),
					tolower(pdb[i][2]),tolower(pdb[i][3]));
		// printf("echo '************ %s_H.pdb ************'\n",str);
		printf("pdb4vsi %s $CHN_RAS_DIR/%s/",pdb4vsi_query,subdirectory);
		printf("%s_H -A%d -p=%s -subdir=%s",str,d,str,subdirectory);
		printf("\n");
	  } printf("\n");
	} break;
       default:
	print_error("mode option input error");
	break;
     }
    }
        return 1; 
}

// Usage text printed by main() when no input file is given.  It lists both
// environment variables that RunMReduce() requires.
#define USAGE "\
Usage 1: get_pdb <infile>\n\
     Starting with a cma-formatted MSA or a fasta <infile> it performs two steps:\n\
     step 1:\n\
	Retrieves full deflines for NCBI pdbaa fasta sequences using the command\n\
	    blastdbcmd -db $FASTADIR/pdbaa -dbtype prot -entry <pdb_id>\n\
	This requires formatting pdbaa using the command:\n\
          makeblastdb -in $FASTADIR/pdbaa -input_type fasta -dbtype prot -parse_seqids\n\
	Obtain makeblastdb, blastdbcmd and pdbaa from the NCBI via anonymous ftp\n\
            at ftp.ncbi.nlm.nih.gov.\n\
     step 2:\n\
	Retrieve pdb coordinate files corresponding to the pdb_ids listed in the\n\
          retrieved fasta deflines and then add modeled hydrogen atoms.\n\
        You need to put the script batch_download.sh on your path and\n\
          to set the environmental variable 'REDUCE_PRGM' to the reduce program\n\
            (e.g., 'setenv REDUCE_PRGM reduce_3.3') and 'REDUCE_HET_DICT' to\n\
            the reduce het dictionary file (reduce_het_dict.txt).\n\
	The batch_download script and reduce program are available at:\n\
	  https://www.rcsb.org/docs/programmatic-access/batch-downloads-with-shell-script\n\
	  and https://github.com/rlabduke/reduce, respectively.\n\
\n"

// Earlier usage text; not referenced by any code visible in this file.
#define USAGE_OLD "\
Usage 1: get_pdb <mode> <fasta_file>\n\
   modes: \n\
     'i' Output NCBI pdb_id command line to retrieve fasta files\n\
	  The following command line retrieves the fasta sequences\n\
	    blastdbcmd -db $FASTADIR/pdbaa -dbtype prot -entry <pdb_id>\n\
	  note: This requires formatting pdbaa using the command:\n\
          makeblastdb -in $FASTADIR/pdbaa -input_type fasta -dbtype prot -parse_seqids\n\
	  The programs makeblastdb, blastdbcmd and pdbaa are available from the NCBI\n\
	    via anonymous ftp at ftp.ncbi.nlm.nih.gov.\n\
     'm' Output 'get_pdb c' command line with pdb_id arguments\n\
Usage 2: get_pdb c <pdb_id> [<pdb_id> ...]\n\
     'c' Retrieve pdb coordinate files corresponding to the listed pdb_ids and then\n\
          add modeled hydrogen atoms. You need to put the batch_download.sh script\n\
          on your path and to set the environmental variable 'REDUCE_PRGM' to the\n\
	  reduce program (e.g., 'setenv REDUCE_PRGM reduce_3.3'), which are available at:\n\
	  https://www.rcsb.org/docs/programmatic-access/batch-downloads-with-shell-script\n\
	  and https://github.com/rlabduke/reduce, respectively.\n\
\n"

// Longer usage text covering every mode letter; not referenced by any code
// visible in this file.
#define USAGE_FULL "\
Usage 1: get_pdb <mode> <fasta_file>\n\
   modes: \n\
     'i' Output NCBI pdb_id command line to retrieve fasta files\n\
	  The following command line retrieves the fasta sequences\n\
	    blastdbcmd -db $FASTADIR/pdbaa -dbtype prot -entry <pdb_id>\n\
	  note: This requires formatting pdbaa using the command:\n\
          makeblastdb -in $FASTADIR/pdbaa -input_type fasta -dbtype prot -parse_seqids\n\
	  The programs makeblastdb, blastdbcmd and pdbaa are available from the NCBI\n\
	    via anonymous ftp at ftp.ncbi.nlm.nih.gov.\n\
     'L' Extract pdb_ids for each subunit\n\
     'm' Output 'get_pdb r' and 'mreduce' commands with pdb_id arguments\n\
Usage 2: get_pdb c <pdb_id> <pdb_id> ...\n\
     'c' Retrieve pdb coordinate files corresponding to list of pdb_ids\n\
	  and then add modeled hydrogen atoms. Perl needs to be on your path\n\
          and you need to set the environmental variable 'REDUCE_PRGM' to the\n\
          reduce program (e.g., 'setenv REDUCE_PRGM reduce_3.3'),\n\
	  which is available at https://github.com/rlabduke/reduce.\n\
Usage 3: get_pdb <mode> [options]\n\
   modes: \n\
     'd' create 'create_vsi' output file directory\n\
     'v' 'File1=' lines for .vsi analysis\n\
   options:\n\
     -dir=<str>	pdb subdirectory for vsi output (default: 'ZZZ')\n\
     -query=<str>	query for pdb4vsi command (default: '1XYZ')\n\
\n\n"
 
#include "get_pdb.h"
#include "twp_typ.h"
#include "afnio.h"
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <cstdint>
// #include <filesystem>

/**
 * Add hydrogens to downloaded coordinate files.
 *
 * For every argv[i], i >= 1, up to the terminating null entry, unless skip[i]
 * is set:
 *   - does nothing if pdb<name>.ent is missing or <name>_H.pdb already exists;
 *   - runs the program named by $REDUCE_PRGM on pdb<name>.ent (first with
 *     -BUILD, then without it if that returned a non-zero status), using
 *     $REDUCE_HET_DICT as its -DB argument;
 *   - passes the result through twp_typ with the -C option, relabels and
 *     renames it, and writes <name>_H.pdb;
 *   - removes the intermediate <name>_H.reduce.pdb.
 *
 * argc is not used; the list is taken to end at the first null argv entry.
 * Reports through print_error() when either environment variable is unset.
 * Always returns 0.
 */
static int RunMReduce(int argc,char *argv[],BooLean *skip)
{
	Int4	i,j;
	int	status,len;
	char	str[200],Str[4096],*Name;	// Str holds whole command lines built
						// from environment-supplied paths
	FILE	*fp;
	char *reduce=getenv("REDUCE_PRGM");
	if(reduce==0) print_error("setenv REDUCE_PRGM to the reduce program");
	char *dictionary=getenv("REDUCE_HET_DICT");
	if(dictionary==0) print_error("setenv REDUCE_HET_DICT to reduce_het_dict.txt");
	for(i=1; argv[i]; i++){
	   if(skip[i]) continue;
	   //========= see whether input and output files exist ===================
	   Name=argv[i];
	   snprintf(str,sizeof(str),"pdb%s.ent",Name);
	   fp=fopen(str,"r");
	   if(fp == NULL) continue; else fclose(fp);
	   snprintf(str,sizeof(str),"%s_H.pdb",Name);
	   fp=fopen(str,"r");
	   if(fp != NULL){
		fprintf(stderr,"%s_H.pdb file already exists;skipping\n",Name);
		fclose(fp); continue; 
	   }

	   //========= Run reduce with build ===================
	   len=snprintf(Str,sizeof(Str),"%s -BUILD pdb%s.ent -DB \"%s\" > %s_H.reduce.pdb",
			reduce,Name,dictionary,Name);
	   if(len < 0 || (size_t)len >= sizeof(Str)){
		// never hand a truncated command line to the shell
		fprintf(stderr,"reduce command line for %s is too long; skipping\n",Name);
		continue;
	   }
	   fprintf(stderr,"%d. %s\n",(int)i,Str);
	   status=system(Str); 
	   if(status != 0){	//==== if failed; try another option ====
	        // shorter than the command above, so it fits as well
	        snprintf(Str,sizeof(Str),"%s pdb%s.ent -DB \"%s\" > %s_H.reduce.pdb",
			reduce,Name,dictionary,Name);
	   	status=system(Str); 
	   }

	   int	Argc=0;
	   char	*Argv[20];

	   //=========== chn_pdb ${name}_H.reduce -C > new.pdb =======
	   Argv[Argc]=AllocString("tweakPDB"); Argc++;
	   snprintf(str,sizeof(str),"%s_H.reduce",Name);
	   Argv[Argc]=AllocString(str); Argc++;
	   Argv[Argc]=AllocString("-C"); Argc++;
	   Argv[Argc]=0;
	   snprintf(str,sizeof(str),"%s_H.reduce.pdb",Name); 
	   pdb_typ pdb=MakePDB(str);

	   twp_typ *twp= new twp_typ(Argc,Argv,pdb);
	   for(j=0; j < Argc; j++) fprintf(stderr,"%s ",Argv[j]);
	   fprintf(stderr,"\n");
	   FILE *ofp=tmpfile();
	   if(ofp == NULL) print_error("unable to create a temporary file");
	   twp->Run(ofp); rewind(ofp); delete twp;
	   NilPDB(pdb); pdb=MakePDB(ofp); fclose(ofp); 

	   ReLabelPDB(pdb); 
	   ReNamePDB(Name,pdb);
	   snprintf(str,sizeof(str),"%s_H.pdb",Name);
	   fp=open_file(str,"","w"); PutPDB(fp,pdb); fclose(fp);
	   snprintf(str,sizeof(str),"%s_H.reduce.pdb",Name); remove(str);
	   if(pdb) NilPDB(pdb);
	   // Use j here: reusing i would reset the outer loop position and make
	   // it skip or revisit entries.
	   for(j=0; Argv[j]; j++) free(Argv[j]); 
	}
	return 0;
}

// Report whether `file` can be opened for reading: TRUE if fopen() succeeds
// (the stream is closed again straight away), FALSE otherwise.
static BooLean	FileExists(const char file[])
{
	FILE *probe=fopen(file,"r");
	if(probe == NULL) return FALSE;
	fclose(probe);
	return TRUE;
}

// Run the scanner once over `file` with the global mode set to `scan_mode`.
// Whatever yywrap() writes to yyout is collected in a temporary file and
// echoed to stderr between two separator lines.  count0 is reset first and
// holds the number of identifiers found when the function returns.
static void	ScanFile(char scan_mode,char *file)
{
	int	c;	// int, not char, so that EOF is distinguishable from data

	mode=scan_mode; count0=0;
	yyout = tmpfile();
	if(yyout == NULL) print_error("unable to create a temporary file");
	yyin=open_file(file,"","r"); while(yylex());
	fclose(yyin); 
	rewind(yyout); 
	fprintf(stderr,"--------------\n");
	while((c=fgetc(yyout)) != EOF) fputc(c,stderr);
	fprintf(stderr,"--------------\n");
	fclose(yyout);
}

/**
 * get_pdb <infile>
 *
 *  1. Scans <infile> in mode 'i'; yywrap() runs blastdbcmd for every
 *     identifier found and appends the output to <infile>.fasta_sq.
 *  2. Scans <infile>.fasta_sq in mode 'm', which fills pdbid[] with the
 *     lower-cased four-character ids.
 *  3. Removes duplicate ids, writes the ids that have no local file yet to
 *     <argv[0]>.in and runs batch_download.sh on that list.
 *  4. Unpacks each <id>.pdb.gz to pdb<id>.ent and calls RunMReduce() to
 *     produce <id>_H.pdb.
 *
 * Returns 0; failures are reported through print_error().
 */
int	main(int argc, char *argv[]) 
{
	Int4	i,j,x,Argc;
	char	str[200],cmd[4096],**Argv=0;
	FILE	*fp=0;
	int	status,len;

	if(argc < 2) print_error(USAGE);
	len=snprintf(pdbsq_file,sizeof(pdbsq_file),"%s.fasta_sq",argv[1]);
	if(len < 0 || (size_t)len >= sizeof(pdbsq_file)){
	   print_error("input file name is too long");
	}
	// mode 'i' appends to this file, so start without a stale copy
	if(FileExists(pdbsq_file)) std::remove(pdbsq_file);

	ScanFile('i',argv[1]);
	ScanFile('m',pdbsq_file);
	
	mode='c';  //==== don't call yylex(); ====
	// yywrap() allocates pdbid only when the second scan found identifiers
	if(pdbid == 0) print_error("no pdb identifiers found in the retrieved fasta deflines");
	BooLean *skip;
	NEW(skip,count0+5,BooLean); NEWP(Argv,count0+5,char);
	//==== keep the first occurrence of every id ====
	for(i=1,x=0; pdbid[i]; i++){
		if(skip[i]) continue;
		x++; Argv[x]=pdbid[i];
		fprintf(stderr,"%d: %s\n",(int)x,pdbid[i]);
	  	for(j=i+1; pdbid[j]; j++){
		   if(strcmp(pdbid[i],pdbid[j])==0){ skip[j]=TRUE; }
		}
	} Argc=x; free(skip);
	//==== list the ids for which no local file exists yet ====
	for(i=1,j=0; Argv[i]; i++){
	     snprintf(str,sizeof(str),"%s.pdb.gz",Argv[i]);
	     if(FileExists(str)) continue;
	     snprintf(str,sizeof(str),"pdb%s.ent",Argv[i]);
	     if(FileExists(str)) continue;
	     snprintf(str,sizeof(str),"%s_H.pdb",Argv[i]);
	     if(FileExists(str)) continue;
	     if(j==0) fp=open_file(argv[0],".in","w"); else  fprintf(fp,",");
	     j++;
	     fprintf(fp,"%s",Argv[i]);
	} 
	if(j > 0){
	    fprintf(fp,"\n"); fclose(fp);
	    fprintf(stderr,"retrieving pdb files");
	    len=snprintf(cmd,sizeof(cmd),"batch_download.sh -f %s.in -p \n",argv[0]);
	    if(len < 0 || (size_t)len >= sizeof(cmd)){
	     print_error("batch_download.sh command line is too long");
	    }
	    status=system(cmd);
	    snprintf(cmd,sizeof(cmd),"%s.in",argv[0]); remove(cmd);
	    if(status != 0){
	     print_error("Need RCSB batch_download.sh script on your path\n");
	    }
	}
	//==== unpack downloads; flag the ids RunMReduce() has to leave alone ====
	NEW(skip,Argc + 5, BooLean);
	for(i=1; Argv[i]; i++){
	     snprintf(str,sizeof(str),"%s_H.pdb",Argv[i]);
	     if(FileExists(str)){ skip[i]=TRUE; continue; }
	     snprintf(str,sizeof(str),"pdb%s.ent",Argv[i]);
	     if(FileExists(str)){ continue; }
	     snprintf(str,sizeof(str),"gunzip -f %s.pdb.gz",Argv[i]);
	     status=system(str);
	     if(status != 0) skip[i]=TRUE;
	     else {
	        // leading backslash as in the original command string
	        snprintf(str,sizeof(str),"\\mv -f %s.pdb pdb%s.ent",Argv[i],Argv[i]);
	        status=system(str);
	     }
	}
	RunMReduce(Argc,Argv,skip); free(skip); free(Argv);
	for(i=1; pdbid[i]; i++) free(pdbid[i]);
	free(pdbid);
	return 0;
}


