#!/bin/sh

# Extracts files from a concatenated pdb file generated by rosetta using the
# -concatenate_pdbs -concatenate_pdbs_filename <out> switches.
# The first parameter is the out file name. If no other parameter is provided, all
# pdbs are extracted; otherwise the list of pdbs provided is extracted.
# If the -uniq_seqs flag is provided, only a set of pdbs with unique sequences is output.
# If -low or -high is provided (followed by a field name and a number), only the pdbs
# with the lowest or highest values of that field are output.
# These modes also output a list of the sequence and the queried field for each pdb.
# SJF

### find the beginning and end of $curr_pdb in the concatenated file
# file_end and file_begin delineate the file
function find_file_in_concat {
  idx=`awk '$1=="'$curr_pdb'"{ print NR; }' lines.TMP`
  if [ -e $idx ] ; then
    echo "$curr_pdb not found. Exiting"
	 exit 1;
  fi

  name=`awk 'NR=='$idx'{print $1}' lines.TMP`
  file_begin=`awk 'NR=='$idx'{print $2}' lines.TMP`
  if [ $idx -eq $file_num ]; then #last pdb entry in the concatenated file
    file_end=`wc $concat_file | awk '{print $1}'`
  else
    file_end=`awk 'NR=='$idx'+1 {print $2-1}' lines.TMP`
  fi
}

### prints a line with the file name, sequence and a field of $curr_pdb
function get_seq_field {
  find_file_in_concat
  printf $curr_pdb" "
  awk 'NR>='$file_begin' && NR<='$file_end' { 
    if( $1=="'$field':" ) { printf " "$2; } # use field to print out different energy values (e.g,. bk_tot)
    if( $1=="ATOM" && $3=="CA" ) { # no switch statements in awk ....
		if ( $4=="ALA" ) { printf "A"; }
		else if ( $4=="CYS" ) { printf "C"; }
		else if ( $4=="ASP" ) { printf "D"; }
		else if ( $4=="GLU" ) { printf "E"; }
		else if ( $4=="PHE" ) { printf "F"; }
		else if ( $4=="GLY" ) { printf "G"; }
		else if ( $4=="HIS" ) { printf "H"; }
		else if ( $4=="ILE" ) { printf "I"; }
		else if ( $4=="LYS" ) { printf "K"; }
		else if ( $4=="LEU" ) { printf "L"; }
		else if ( $4=="MET" ) { printf "M"; }
		else if ( $4=="ASN" ) { printf "N"; }
		else if ( $4=="PRO" ) { printf "P"; }
		else if ( $4=="GLN" ) { printf "Q"; }
		else if ( $4=="ARG" ) { printf "R"; }
		else if ( $4=="SER" ) { printf "S"; }
		else if ( $4=="THR" ) { printf "T"; }
		else if ( $4=="VAL" ) { printf "V"; }
		else if ( $4=="TRP" ) { printf "W"; }
		else if ( $4=="TYR" ) { printf "Y"; }
	 } #sequence
  } END {printf "\n"; }' $concat_file
}  

#extract a single file from the concatenated file
function extract_single {
  find_file_in_concat
  awk 'NR>'$file_begin' && NR<='$file_end'' $concat_file > $name 
  echo "extracted $name"
}


#### MAIN ####
if [ $# -lt 2 ]; then
  echo "usage: extract_from_concat_file.sh < out file > ..."
  echo "the second parameter can be -uniq_seqs or -low or -high"
  echo "-uniq_seqs will extract only unique sequences"
  echo "-low and -high expect two more fields, a field and a number. The field specifies the"
  echo "sorting criterion, and the number specifies how many to extract from the sorted list"
  echo "If no second parameter is given, all files are extracted."
  echo "If a list of files is provided, rather than the switches then they are output."
  exit 0;
fi

concat_file=$1

#unzip the file if needed
gz=`echo $concat_file | awk '{ 
  if ( substr($1,length($1)-2,3) == ".gz" ) { print 1; } 
  else { print 0; } 
}'`
if [ $gz -eq 1 ]; then #unzip the file
  gunzip $concat_file;
  concat_file=`echo $concat_file | sed 's|.gz||'`
fi

basen=`basename $concat_file`
path=`echo $concat_file | sed 's|\/'$basen'||'`

#grab the header lines that separate individual pdb entries
awk '$1=="HEADER" && $2=="CURRENT" {print $4,NR}' $concat_file > lines.TMP
all_pdbs=`awk '{print $1}' lines.TMP`
file_num=`wc lines.TMP | awk '{print $1}'`

####different extraction modes start here
#unique sequences
if [ "$2" = "-uniq_seqs" ]; then #extract only unique sequences
  rm -f list.seqs_bktot list.to_save
  field="bk_tot"
  for curr_pdb in $all_pdbs; do
    get_seq_field >> list.seqs_bktot
  done
  sort -k2 list.seqs_bktot | awk '{print $2}' | uniq > uniq_seqs.TMP
  for s in `cat uniq_seqs.TMP`; do
    awk '{ if ( $2=="'$s'" ) { print $0; exit 0; } }' list.seqs_bktot >> list.to_save
  done

  mv list.to_save $path/list.seqs_bktot

  for curr_pdb in `awk '{print $1}' $path/list.seqs_bktot`; do
    extract_single
  done
  exit 0;
fi # unique sequences

#extract by field
if [ "$2" = "-low" ] || [ "$2" = "-high" ]; then #extract by field
  if [ $# -lt 4 ]; then
    echo "expecting a field name (e.g., bk_tot, fa_rep) and number. Exiting."
	 exit 1;
  fi
  field=$3
  number=$4
  rm -f list.seqs_bktot list.to_save
  
  for curr_pdb in $all_pdbs; do
    get_seq_field >> list.to_save
  done
  sort -k3 -n list.to_save > $path/list.seqs_$field
  if [ "$2" = "-low" ]; then #take lowest
    head -$number $path/list.seqs_$field | awk '{print $1}' > list.to_save
  else # take the highest
    tail -$number $path/list.seqs_$field | awk '{print $1}' > list.to_save
  fi
  for curr_pdb in `cat list.to_save`; do
    extract_single
  done
else #extract subsets of files
  if [ $# -lt 2 ]; then #extract all
    for curr_pdb in $all_pdbs; do
      extract_single
    done
  else # extract list of files from command line
    for curr_pdb in $@; do
      if [ "$curr_pdb" != "$1" ]; then # ugly way to iterate over all BUT the first parameter
        extract_single
	   fi
    done
  fi
fi

### Cover up the changes
if [ $gz -eq 1 ]; then #rezip the file
  gzip $concat_file
fi
rm -f lines.TMP uniq_seqs.TMP list.seqs_bktot
