#!/usr/bin/perl -w
use strict;

###############################################################################
##
## FIND HOMOLOGUES
##
## Implements a modified version of Rich Bonneau's script to choose homologues
## for Rosetta ab initio modeling and clustering.
##
## Copyright 2002, University of Washington
##   This document contains private and confidential information and
##   its disclosure does not constitute publication.  All rights are
##   reserved by University of Washington and the Baker Lab
##   except those specifically granted by license.
##
##  Initial Author: Richard Bonneau
##                  David E. Kim (dekim@u.washington.edu)
##  $Revision: 6156 $
##  $Date: 2005-04-29 04:59:10 -0400 (Fri, 29 Apr 2005) $
##  $Authors: David Kim $
##
###############################################################################
# conf
###############################################################################
 
$| = 1;                  # disable stdout buffering
 
my $debug = 0;           # 0=off, 1=on

my $Blastpgp	= "/net/local/blast/blastpgp";
my $NRDatabase  = "/scratch/shared/genomes/nr";

###############################################################################
# init
###############################################################################

## get command line options
my %opts           = &getCommandLineOptions ();
my $id             = $opts{id};
my $fasta          = $opts{fasta};
my $blast          = $opts{blast};
my $outfile        = $opts{outfile};
my $max_nalign     = $opts{maxalignments};
my $minlen         = $opts{minlen};
my $maxidentity    = $opts{maxidentity};
my $minavgidentity = $opts{minavgidentity};
my $maxinsert      = $opts{maxinsert};
my $maxgap         = $opts{maxgap};
my $sortby         = $opts{sortby};
my $maxhoms        = $opts{maxhoms};
my $rminserts      = $opts{rminserts};
my $verbose        = $opts{verbose};


###############################################################################
# main
###############################################################################

my ($queryheader, $queryseq, $querylen, @hsp, @distMatrix);
my ($listCount, @hspSorted, @bestList, @finalList);

## get query sequence
print "Reading fasta: $fasta\n" if ($debug || $verbose);
($queryheader, $queryseq) = &readFasta( $fasta );

$querylen = length($queryseq);
print "Query length: $querylen\n" if ($debug || $verbose);
($querylen) or die "ERROR - query length too short\n";

print "Minimum length of query: ".int($querylen*($minlen/100))." ($minlen%)\n" if ($debug || $verbose);
if (defined $maxinsert) {
	my $maxinsertpctg = $maxinsert;
	$maxinsert = sprintf("%.0f", $querylen*($maxinsert/100));
	print "Maximum length sum of inserts: $maxinsert ($maxinsertpctg%)\n" if ($debug || $verbose);
}
if (defined $maxgap) {
	my $maxgappctg = $maxgap;
	$maxgap    = sprintf("%.0f", $querylen*($maxgap/100));
	print "Maximum length sum of gaps: $maxgap ($maxgappctg%)\n" if ($debug || $verbose);
}

## run psi blast
if (!$blast || !-s $blast) {
	my $attempts     = 3;
	my $shell;
	my @cleanupfiles = ( $blast, 'core', "$id.check", "$id.msa_scores" );
	print "Running Psi-Blast (max attempts: $attempts)\n" if ($debug || $verbose);
	$blast  = $id."_NR.blast";
	$shell  = "( $Blastpgp -i $fasta -F F -j 3 -o $blast -d $NRDatabase ";
	$shell .= "-v 10000 -b 10000 -K 10000 -h 0.0009 -e 0.0009 -C $id.check -Q $id.msa_scores ) 2>&1";
	TRY: foreach my $attempt (1 .. $attempts) {
		my $exit_status = &_runCmd( cmd => $shell );
		last TRY if (!$exit_status && -s $blast);
		## cleanup and try again
		print "Attempt $attempt failed\n" if ($debug || $verbose);
		if ($attempt < $attempts) {
			print "Cleaning up for next attempt\n" if ($debug || $verbose);
			foreach my $f (@cleanupfiles) {
				if (-f $f) {
					print "Removing $f\n" if ($debug || $verbose);
					unlink($f);
				}
			}
		}
	}
	(-s $blast) or die "ERROR - blast of $fasta failed\n";
}

## read blast file (after last Searching..)
print "Reading blast output: $blast\n" if ($debug || $verbose);
print "Maximum alignments: $max_nalign\n" if ($debug || $verbose);
print "Removing inserts\n" if ($rminserts && ($debug || $verbose));

if ($blast =~ /\.gz|\.Z/) {
  $blast = "gzip -dc $blast |";
}

open(BLAST, $blast) or die "ERROR - cannot open blast file '$blast': $!\n";
while (<BLAST>) {
	if (/^\s*Searching\.\.\./) {
		# initialize w/ query
		@hsp  = ( { 'id'  => $id,
			    'eval' => '*****',
			    'identity' => $querylen,
			    'alignlen' => $querylen,
			    'len' => $querylen, 
			    '_Query' => { 'begin' => 1, 'end' => $querylen, 'data' => uc $queryseq },
			    '_Sbjct' => { 'begin' => 1, 'end' => $querylen, 'data' => uc $queryseq } } );
		next;
	} elsif ( /^>(\S+)\s*/ && $#hsp < $max_nalign - 1 ) {
		my $hit  = { id => $1 };
		my $prev = { line => $_ };
		HIT: while (<BLAST>) {
			if ( /Length\s*=\s*([\d,]+)/ ) {
				$hit->{len} = $1;
                   		$hit->{len} =~ s/\,//g;
			} elsif ( /Score\s*=\s*(\S+)\s*bits\s*\((\d+)\),\s*Expect(\(\d+\))?\s*=\s*(\S+)/) {
				$hit->{score} = $2;
				$hit->{bitscore} = $1;
				$hit->{eval} = $4;
			} elsif ( /Identities\s*=\s*(\d+)\s*\/\s*(\d+)\s*[\d\%\(\)]+\s*(,\s*Positives\s*=\s*(\d+)\/(\d+)\s*[\d\%\(\)]+\s*)?(\,\s*Gaps\s*=\s*(\d+)\/(\d+))?/i ) {
				$hit->{identity} = $1;
				$hit->{alignlen} = $2;
				$hit->{positive} = $4 if (defined $3);
				$hit->{gaps}     = $7 if (defined $6);
				$hit->{_Query}   = { 'begin' => 0, 'end' => 0, 'data' => '' };
				$hit->{_Sbjct}   = { 'begin' => 0, 'end' => 0, 'data' => '' };
			} elsif ( /^((Query|Sbjct):\s+(\d+)\s*)(\S+)\s+(\d+)/ ) {
				if ($prev->{"\_$2"} && $3 != $prev->{"\_$2"}->{'end'} + 1) {
					warn "WARNING - skipping $hit->{id}: numbering is not continuous\n" if ($debug);
					last HIT;
				}
				$hit->{"\_$2"}->{'data'} .= $4;
				$hit->{"\_$2"}->{'begin'} = $3 unless $hit->{"_$2"}->{'begin'};
           			$hit->{"\_$2"}->{'end'}   = $prev->{"\_$2"}->{'end'} = $5;
			} elsif ( /^\s*$/ && $prev->{line} =~ /^\s*$/ ) {
                		## switch B to N
                		my $BtoN_cnt = 0;
                		while ( $hit->{'_Sbjct'}->{'data'} =~ s/B/N/is ) { $BtoN_cnt++; }
                		warn "WARNING - switched B's to N's in $hit->{id}: $BtoN_cnt total\n" if ($debug && $BtoN_cnt); 
                		## switch Z to Q
                		my $ZtoQ_cnt = 0;
                		while ( $hit->{'_Sbjct'}->{'data'} =~ s/Z/Q/is ) { $ZtoQ_cnt++; }
                		warn "WARNING - switched Z's to Q's in $hit->{id}: $ZtoQ_cnt total\n" if ($debug && $ZtoQ_cnt);
                		## skip homolog if it has any bad aa's
                		my $non_aa_cnt = 0;
                		my @non_aa;
                		while ( $hit->{'_Sbjct'}->{'data'} =~ /([^ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy\.\-])/isg ) {
                		        push(@non_aa, $1);
                		}
                		if (@non_aa) {
                        		warn "WARNING - skipping $hit->{id}: sequence has non-aa characters - ".join("",@non_aa)."\n" if ($debug);
                        		last HIT;
                		}
				## find gaps and insertions and change to lowercase
				my $qlen   = length($hit->{'_Query'}->{'data'});
				my $slen   = 0; # number of positions mapped to query
				my $inlen  = 0; # length of inserts
				my $gaplen = 0; # length of gaps
				my $stmp   = "";
				my $qtmp   = "";
				## make sure lengths of query and sbjct are equal
				($qlen == length($hit->{'_Sbjct'}->{'data'})) or do {
					warn "WARNING - skipping $hit->{id}: query and sbjct lengths do not match\n" if ($debug);
					last HIT;
				};
				## ucase mapped positions and lcase inserts
				for (my $i=0;$i<$qlen;$i++) {
					my $qc = substr($hit->{'_Query'}->{'data'}, $i, 1);
					my $sc = substr($hit->{'_Sbjct'}->{'data'}, $i, 1);
					if ($qc eq '-') {
						$stmp .= lc $sc;
						$inlen++;
					} else {
						$stmp .= uc $sc;
						$slen++ if ($sc =~ /[A-Z]/);
					}
					if ($sc eq '-') {
						$qtmp .= lc $qc;
						$gaplen++;
					} else {
						$qtmp .= uc $qc;
					}
				}
				## get rid of hsp's whose inserts are too long
				if (defined $maxinsert && $inlen > $maxinsert) {
					warn "WARNING - skipping $hit->{id}: sum of inserts is too long\n" if ($debug);
					last HIT;
				}
				## get rid of hsp's whose gaps are too long
				if (defined $maxgap && $gaplen > $maxgap) {
					warn "WARNING - skipping $hit->{id}: sum of gaps is too long\n" if ($debug);
					last HIT;
				}
				$hit->{insertlen} = $inlen;
				$hit->{gaplen}    = $gaplen;
				## make sure sbjct has enough positions mapped to query sequence
				if ( $slen < int($querylen*($minlen/100)) ) {
					warn "WARNING - skipping $hit->{id}: too few positions mapped to query seq\n" if ($debug);
					last HIT;
				}
				$hit->{mappedlen} = $hit->{qlen} = $slen;
                                if ($rminserts) {
					## remove inserts
                                        $qtmp =~ s/\-+//g;
                                        $stmp =~ s/([^a-z])[a-z]+([^a-z])/lc($1).lc($2)/eg;
                                } else {
					$hit->{qlen} += $inlen;
				}
				## fill-in missing front and end positions w/ '-'
				$hit->{'_Query'}->{'data'} = '-' x ($hit->{"_Query"}->{'begin'}-1).$qtmp.'-' x ($querylen-$hit->{"_Query"}->{'end'});
				$hit->{'_Sbjct'}->{'data'} = '-' x ($hit->{"_Query"}->{'begin'}-1).$stmp.'-' x ($querylen-$hit->{"_Query"}->{'end'});
				push(@hsp, $hit);
				last HIT;
			}
			$prev->{line} = $_;
		}
	}
}
close(BLAST);
print scalar@hsp." alignments read\n" if ($debug || $verbose);

{
	## calculate the distance matrix to decide which protein fasta's to choose to represent ie, pick which ones to fold 
	## save all positions into $master[position][hsp] (ignoring insertions to allow all against all mapping)
	my @master;
	for (my $i=0; $i<=$#hsp; $i++) {
		my $seq = $hsp[$i]->{'_Sbjct'}->{'data'};
		$seq    =~ s/[a-z]+//g if (!$rminserts); # remove insertions
		my @tmpchars = split(//, $seq);
		# make sure the lengths match
		die "ERROR - lengths between query and sbjct do not agree\n" if ($#tmpchars+1 != $querylen);
		for (my $pos=0; $pos <= $#tmpchars; $pos++) {
			$master[$pos][$i] = uc $tmpchars[$pos];
		}
	}
	print "Calculating distance matrix\n" if ($debug || $verbose);
	for (my $i=0; $i<=$#hsp; $i++) {
		$distMatrix[$i][$i] = 1.0;
		for (my $j=$i+1; $j <= $#hsp; $j++) {
			my $idCount=0;
			for (my $pos=0; $pos < $querylen; $pos++) {
				$idCount++ if ( $master[$pos][$i] =~ $master[$pos][$j] );
	                }
	                $distMatrix[$i][$j]=$idCount/$querylen;
	                $distMatrix[$j][$i]=$idCount/$querylen;
	        }
		print '.' if ($debug || $verbose);
	}

	# calculate average identity to all hsps for each hsp
	print "\nCalculating average identity for all hsps\n" if ($debug || $verbose);
	my @sumId;
	for (my $i=0; $i <= $#hsp; $i++) {
		$sumId[$i] = 0;
		$hspSorted[$i] = $i; # initialize @hspSorted
		for (my $j=0; $j <= $#hsp; $j++) {
			$sumId[$i] += $distMatrix[$i][$j];
		}
		$hsp[$i]->{avg_identity} = $sumId[$i]/($#hsp+1);
	}
}



# sort hsps
shift @hspSorted; # get rid of query
if ($sortby =~ /identity/i) {
        print "Sorting hsps by avg_identity -> insertlen -> gaplen -> mappedlen\n" if ($debug || $verbose);
        @hspSorted = sort {    $hsp[$a]->{avg_identity} <=> $hsp[$b]->{avg_identity}
                                                ||
                              $hsp[$a]->{insertlen} <=> $hsp[$b]->{insertlen}
                                                ||
                              $hsp[$a]->{gaplen} <=> $hsp[$b]->{gaplen}
                                                ||
                              $hsp[$b]->{mappedlen} <=> $hsp[$a]->{mappedlen}
                          } @hspSorted;
} elsif ($sortby =~ /size/i) {
        print "Sorting hsps by qlen -> avg_identity\n" if ($debug || $verbose);
        @hspSorted = sort {   $hsp[$a]->{qlen} <=> $hsp[$b]->{qlen}
                                                ||
                              $hsp[$a]->{avg_identity} <=> $hsp[$b]->{avg_identity}
                          } @hspSorted;
} elsif ($sortby =~ /insertlen/i) {
        print "Sorting hsps by insertlen -> gaplen -> avg_identity -> mappedlen\n" if ($debug || $verbose);
        @hspSorted = sort {    $hsp[$a]->{insertlen} <=> $hsp[$b]->{insertlen}
                                                ||
                              $hsp[$a]->{gaplen} <=> $hsp[$b]->{gaplen}
                                                ||
                              $hsp[$a]->{avg_identity} <=> $hsp[$b]->{avg_identity}
                                                ||
                              $hsp[$b]->{mappedlen} <=> $hsp[$a]->{mappedlen}
                          } @hspSorted;
}


## NOW LETS CHOOSE THE HOMOLOGUES

if ($debug || $verbose) {
	print "Getting best list\n";
	print "  maximum identity: $maxidentity\n";
	print "  minimum average identity: $minavgidentity\n";
}
BESTOUT: foreach my $i (@hspSorted) {
	my $skip = 0;
	if ($distMatrix[$i][0] > $maxidentity/100) {
		printf("DROP %-20.20s  ID:%4.2f too high to query\n", $hsp[$i]->{id}, $distMatrix[$i][0]) if ($debug);
		next BESTOUT;
	}
	for (my $j=0;$j<=$#bestList;$j++) {
		if ($distMatrix[$i][$bestList[$j]] > $maxidentity/100) {
			printf("DROP %-20.20s  ID:%4.2f too high to best list $hsp[$bestList[$j]]->{id}\n", $hsp[$i]->{id}, $distMatrix[$i][$bestList[$j]]) if ($debug);
			next BESTOUT;
		}
	}
	if ($hsp[$i]->{avg_identity} < $minavgidentity/100 ) {
		printf("DROP %-20.20s  AvgID:%4.2f too low\n", $hsp[$i]->{id}, $hsp[$i]->{avg_identity}) if ($debug);
		next BESTOUT;
	}
	printf("ADD  %-20.20s  ID:%4.2f AvgID:%4.2f QLen:%4d\n", $hsp[$i]->{id}, $distMatrix[$i][0], $hsp[$i]->{avg_identity}, $hsp[$i]->{qlen}) if ($debug || $verbose);
	push(@bestList, $i);
}
@distMatrix = (); #empty
@hspSorted  = ();

# get desired number of homologues
$#bestList = $maxhoms-1 if (defined $maxhoms && scalar@bestList > $maxhoms);

# add query to top of list
unshift(@bestList, 0);

# initialize final alignment
for (my $i=0; $i<=$#bestList; $i++) {
	$hsp[$bestList[$i]]->{alignment} = $hsp[$bestList[$i]]->{_Sbjct}->{data};
}

if (!$rminserts) {
	## map all to all (add gaps) for final alignment
	print "Mapping all positions\n" if ($debug || $verbose);
	for (my $i=0; $i<=$#bestList; $i++) {
		## get inserts and their start position in the query seq
		my @inserts = &getInserts($hsp[$bestList[$i]]->{alignment});
		## add gaps to all other seqs foreach insert
		J: for (my $j=0; $j<=$#bestList; $j++) {
			next J if ($i == $j); # skip self
			foreach (@inserts) {
				my $insert    = $_->{insert};
				my $querypos  = $_->{querypos};
				my $insertlen = length($insert);
				my $jquerypos = 0;
				my $added     = 0;
				my $newj      = '';
				my @jcars     = split(//, $hsp[$bestList[$j]]->{alignment});
				for (my $k=0;$k<=$#jcars;$k++) {
					## get query position (A-Z and - map to query)
					$jquerypos++ if ($jcars[$k] =~ /[A-Z\-]/);
					$newj .= $jcars[$k]; ## append char to new seq
					## add gap after this position
					if (!$added && $jquerypos == $querypos) {
						$added    = 1;
						my $shift = 0;
						OFFSET: foreach my $offset (1 .. $insertlen) {
							if ($k+$offset > $#jcars || $jcars[$k+$offset] =~ /[A-Z\-]/) {
								# offset position is mapped to query
								$newj .= '.' x ($insertlen - $offset + 1);
								last OFFSET;
							} else {
								# offset position is already an insert/gap
								$newj .= $jcars[$k+$offset];
								$shift = $offset;
							}
						}
						$k += $shift;
					}
				}
				$hsp[$bestList[$j]]->{alignment} = $newj; # save new gapped seq
			}
		}
	}
}

## clean up alignment
foreach (@bestList) {
	if ($debug) {
		print "$hsp[$_]->{id}\n";
		print "$hsp[$_]->{'_Query'}->{'data'}\n";
		print "$hsp[$_]->{'_Sbjct'}->{'data'}\n";
		print "$hsp[0]->{alignment}\n";
		print "$hsp[$_]->{alignment}\n\n";
        } 
	$hsp[$_]->{alignment} =~ s/\./\-/g;
	#$hsp[$_]->{alignment} = uc $hsp[$_]->{alignment};
	(length($hsp[$_]->{alignment}) == length($hsp[0]->{alignment})) or
		die "ERROR - lengths do not agree in final alignment\n";
}

if ($outfile) {
    open   (OUTFILE, '>'.$outfile) or die "ERROR - cannot open outfile $outfile: $!\n";
    select (OUTFILE);
}

## write out alignment
printf("%-20.20s %4.4s %4.4s %5.5s %6.6s %8.8s ALIGNMENT\n", 'CODE', 'LEN', 'ALEN', 'IDENT', 'E-VAL', 'AvgIDENT');
foreach (@bestList) {
	printf("%-20.20s %4d %4d %5d %6.6s %8.2f $hsp[$_]->{alignment}\n", $hsp[$_]->{id}, 
		$hsp[$_]->{len}, $hsp[$_]->{alignlen}, ($hsp[$_]->{identity}/$hsp[$_]->{alignlen})*100, 
		$hsp[$_]->{eval}, $hsp[$_]->{avg_identity}*100 );
}

if ($outfile) {
    close  (OUTFILE);
    select (STDOUT);
}

## finished
exit(0);



## SUBS ####################################################################


sub readFasta {
        my $fasta = shift;
        my $header = "";
        my $sequence = "";
        (-s $fasta) or die "ERROR - cannot read fasta file $fasta: $!\n";
        open(FASTA, $fasta) or die "ERROR - cannot read fasta file $fasta: $!\n";
	while (my $line = <FASTA>) {
		if ($line =~ /^>(.+)$/) {
			$header .= $1;
			next;
		}
		$sequence .= $line;
	}
        close(FASTA);
        $header         =~ s/\n+/ /gs;
        $sequence       =~ s/\s+//gs;
        return $header, $sequence;
}


sub getInserts {
        my $seq = shift;
        my @inserts;
        while ($seq =~ /([a-z]+)/g) {
                my $insert = $1;
                my $prevseq = $`;
                my $querypos = 0;
                while ($prevseq =~ /[A-Z\-]/g) { $querypos++; }
                push(@inserts, { insert => $insert, querypos => $querypos });
        }
        return @inserts;
}

sub _runCmd {
        my %params = ( @_ );
 
        my $cmd                 = $params{cmd};
        my $catch_output        = $params{catch_output};
        my $print_output        = $params{print_output};
        my $save_output         = $params{save_output};
        my $overwrite           = $params{overwrite};
        my $verbose             = $params{verbose};
 
        my ($exit_status, $output, $pid, $cmd_pid);
 
        ## print output as default
        $print_output ||= 1 if ( !defined($print_output) || $print_output != 0 );
 
        print "RUN COMMAND ".localtime().": $cmd\n" if $verbose;
 
        $cmd_pid = open(CMD, "$cmd |") or die "ERROR: cannot run command $cmd: $!\n";
        print "COMMAND PID: ".$cmd_pid."\n" if $verbose;
 
        my $printtofile = 0;
        if ($save_output) {
                $printtofile = 1 unless( -f $save_output && !$overwrite );
        }
 
        if ($printtofile) {
                open(FILE, ">$save_output") or die "ERROR: cannot open file $save_output: $!\n";
        }
 
        $|=1;   # disable output buffering
 
        while (<CMD>) {
                print $_ if ($print_output && $verbose);
                $output .= $_ if ($catch_output);
                print FILE $_ if ($printtofile);
        }
        close(CMD);
        close(FILE) if ($printtofile);
 
        $exit_status = ($? >> 8);       ## doesn't handle negative exit codes
 
        ($catch_output) ? return $exit_status, $output : return $exit_status;
}


# getCommandLineOptions()
#
#  rets: \%opts  pointer to hash of kv pairs of command line options
#
sub getCommandLineOptions {
    use Getopt::Long;
    my $usage = qq{usage: $0
\t -id               <queryid>
\t -fasta            <fastafile>
\t[-blast            <blastfile -m0 alignment>]
\t[-outfile          <outfile>]                       (def: STDOUT)
\t[-maxalignments    <max alignments>]                (def: 1000)
\t[-maxidentity      <max % identity>]                (def: 75)
\t[-minavgidentity   <min % avg identity>]            (def: 20)
\t[-minlen           <min len (% query seq)>]         (def: 80)
\t[-maxinsert        <max insert len (% query seq)>]  (def: none)
\t[-maxgap           <max gap len (% query seq)>]     (def: none)
\t[-maxhoms          <max homologues>]                (def: 2)
\t[-sortby           <identity | size | insertlen>]   (def: identity)
\t[-rminserts]
\t[-verbose]
};

    # Get args
    my %opts = ();
    &GetOptions ( \%opts, "id=s", "fasta=s", "blast=s", "outfile=s", "maxalignments=i", "minlen=i", "maxidentity=i", 
	          "minavgidentity=i", "maxinsert=i", "maxgap=i", "maxhoms=i", "sortby=s", "rminserts", "verbose");
 
    # Check for legal invocation
    if (! defined $opts{id} ||
        ! defined $opts{fasta} 
        ) {
        print STDERR "$usage\n";
        exit(1);
    }

    # defaults
    #
    $opts{minlen} ||= 80;
    $opts{maxidentity} ||= 75;
    $opts{maxalignments} ||= 1000;
    $opts{minavgidentity} ||= 20;
    $opts{maxhoms} ||= 2;
    $opts{sortby} ||= 'identity';
    $opts{sortby} = 'identity' if ($opts{sortby} !~ /(identity|size|insertlen)/i);
    $opts{verbose} = 1 if (exists $opts{verbose});
    $opts{rminserts} = 1 if (exists $opts{rminserts});

    # existence checks
    #
    (-s $opts{fasta}) or die "ERROR: fasta file $opts{fasta} does not exist\n";
 
    return %opts;
}



