#!/usr/bin/perl -w

## REQUIRED PACKAGES
use FindBin qw($Bin);   ## find directory of this script
use lib "$Bin/../lib";  ## must point to directory where Clustering.pm resides 
use Clustering;

###########################################
## CONFIGURATION ##########################

my $CLUSTERER = "$Bin/../src/rosetta_cluster/rosetta_cluster"; ## must point to rosetta_cluster
my $EXTRACTOR = "$Bin/reconstruct_PDB_by_index";               ## must point to reconstruct_PDB_by_index

###########################################
###########################################
$| = 1;   ## autoflush the currently selected output handle

my (%opts, $pwd, $out_dir, $getcenters, $silentfile, $aaseq, $base_name, $clusters_file);

## get opts (getCommandLineOptions prints the usage and exits when -silentfile is missing)
%opts = getCommandLineOptions();
$silentfile     = $opts{silentfile};
$base_name      = $opts{base_name};
$out_dir        = $opts{out_dir};
$getcenters     = $opts{get_centers};

## default to the current directory and make sure the directory ends with a slash
$out_dir ||= "./";
$out_dir .= "/" if ($out_dir !~ /\/$/);

## let's get the full path to the silent file and to out_dir
$pwd = `pwd`;
die "ERROR: cannot determine current working directory\n" if (!defined($pwd) || $pwd !~ /\S/);
chomp $pwd;
$pwd .= "/" if ($pwd !~ /\/$/);
$silentfile = $pwd.$silentfile if ($silentfile !~ /^\//);
$out_dir = $pwd.$out_dir if ($out_dir !~ /^\//);

## create the output directory if needed (one level only)
(-d $out_dir || mkdir($out_dir)) || die "ERROR: cannot mkdir $out_dir: $!\n";

## get base name from silent mode file name (last path component)
if (!$base_name && $silentfile =~ /\/?([^\/]+)$/) {
    $base_name = $1;
}


## get aaseq from silent mode file: the first SEQUENCE: line wins.
## Three-argument open is used so that a file name with leading/trailing
## whitespace or mode characters ('>', '|') is always treated as a plain path.
open(my $silent_fh, '<', $silentfile) or die "ERROR: cannot open $silentfile: $!\n";
while (my $line = <$silent_fh>) {
    if ($line =~ /^SEQUENCE:\s+(\w+)\s*$/) { $aaseq = $1; last; }
}
close($silent_fh);
die "ERROR: cannot get sequence from silent mode file\n" if (!$aaseq);

## initialize Clustering object
my $clustering_obj = Clustering->new( executable => $CLUSTERER,
                                      base_name  => $base_name,
                                      run_dir    => $out_dir );
## add target
$clustering_obj->add_target( infile => $silentfile, seq => uc $aaseq );

## execute
$clusters_file = $base_name.".clusters";
if (-s $out_dir.$clusters_file) {
    print "Clusters file $out_dir$clusters_file exists so skipping clustering\n";
    ## make sure we are in the out directory for decoy extraction
    chdir($out_dir) or die "ERROR: cannot chdir $out_dir: $!\n";
} else {
    ## NOTE(review): the relative -s test assumes execute() leaves the process
    ## in run_dir ($out_dir) - confirm against Clustering.pm
    unless ( $clustering_obj->execute() && -s $clusters_file ) {
        die "ERROR: cannot execute clustering object\n";
    }
}


## shall we get and extract the cluster centers?
if ($getcenters) {
    ## get decoy count: number of lines containing SCORE: minus one
    ## (presumably the one being subtracted is the score header line).
    ## Counted in Perl so the file name never passes through a shell.
    my $decoys = 0;
    open(my $count_fh, '<', $silentfile) or die "ERROR: cannot open $silentfile: $!\n";
    while (my $line = <$count_fh>) {
        $decoys++ if ($line =~ /SCORE:/);
    }
    close($count_fh);
    $decoys--;
    print "$decoys decoys found in $silentfile\n";

    ## quote a string so /bin/sh sees it as exactly one word
    my $shell_quote = sub {
        my $word = shift;
        $word =~ s/'/'\\''/g;
        return "'$word'";
    };

    ## only the list of centers is needed here; the second return value
    ## (all members of the used clusters) is ignored
    my ($centers_array_ref) = findClusterCenters( cluster_file => $clusters_file,
                                                  max_centers  => $getcenters,
                                                  last_decoy   => $decoys );
    my $modelnumber = 0;

    MODEL: foreach my $model_ref (@{$centers_array_ref}) {

        ## target count
        $modelnumber++;

        ## name of extracted cluster center: <base_name>.NNNN.pdb
        my $modelpdb = sprintf("%s.%04d.pdb", $base_name, $modelnumber);

        my $model   = $model_ref->{decoy_number};
        my $modelid = $model_ref->{decoy_id};

        ## file the extractor is expected to write into the current directory:
        ## silent file base name without ".out", plus "__decoy_NNNN.pdb".
        ## The file name is appended to, never used as a sprintf format,
        ## so a '%' in the name cannot corrupt the result.
        my $decoy = $silentfile;
        $decoy =~ s/\.out$//i;
        $decoy =~ s/^.+\/([^\/]+)/$1/;
        $decoy .= sprintf("__decoy_%04d.pdb", $model);

        ## extract model from silent mode file
        printf( "Extracting model %5s decoy %6s %s%s\n", $modelnumber, $model, $out_dir, $modelpdb );
        ## $EXTRACTOR is trusted configuration and is left unquoted; the
        ## user-supplied path and the decoy index are quoted for the shell
        my $shell = "( $EXTRACTOR ".$shell_quote->($silentfile)." ".$shell_quote->($model)." ) 2>&1";
        my ( undef, $out ) = _runCmd( cmd => $shell, catch_output => 1 );
        $out = '' unless defined($out);   ## extractor printed nothing

        ## check if extraction was successful
        (-s $decoy && $out !~ /error:/s) or
                die "ERROR: decoy extraction error \n$out\n\n";

        ## make sure the index is the same, then collect the SCORE: lines
        my @scorelines = ();
        if ($out =~ /in\s+decoy\s+$model\s+Identifier:\s+(\S+)/) {
            my $found_id = $1;
            if ($modelid && $modelid ne $found_id) {
                die "ERROR: decoy extraction error, identifier $found_id does not match $modelid\n";
            }
            $modelid = $found_id;
            foreach my $line ( split(/\n/, $out) ) {
                push(@scorelines, $1) if ($line =~ /^SCORE:\s*(.+)\s*$/);
            }
        } else {
            die "ERROR: decoy extraction error, cannot parse identifier from output\n";
        }
        (scalar(@scorelines) == 2) or
                die "ERROR: cannot extract scores from silent mode file, 2 score lines must exist\n";

        ## add source information to pdb for reference
        my $pdbstr = "SOURCE    DECOY $model IDENTIFIER $modelid\n";
        $pdbstr .= "SOURCE    $model_ref->{source}\n" if ($model_ref->{source});

        ## add score lines from silent file (first line: headers, second: values)
        my @scoreheaders = split(/\s+/, trim($scorelines[0]));
        my @scorevals    = split(/\s+/, trim($scorelines[1]));
        (scalar(@scoreheaders) == scalar(@scorevals)) or
                die "ERROR: cannot extract scores from silent mode file, headers do not match scores\n";
        foreach my $i (0 .. $#scoreheaders) {
            $pdbstr .= sprintf("REMARK    SCORE %15s: %15s\n", $scoreheaders[$i], $scorevals[$i]);
        }

        ## repack extracted pdb
        #$repacked_decoy = &runRepacker( pdb => $decoy );
        #($repacked_decoy && -s $repacked_decoy) or
        #        die "ERROR: cannot repack decoy\n";

        ## move repacked decoy to models dir
        #print "Moving repacked decoy $repacked_decoy to $modelpdb\n" if $this->{verbose};
        #print "SHELL: mv $repacked_decoy $modelpdb\n" if $this->{verbose};
        #(system("mv", $repacked_decoy, $modelpdb) == 0) or
        #        die "ERROR: cannot 'mv $repacked_decoy $modelpdb'\n";

        ## slurp the extracted decoy; 'local' restores $/ even on an early exit
        {
            local $/;
            open(my $decoy_fh, '<', $decoy) or die "ERROR: cannot open extracted decoy $decoy: $!\n";
            my $decoy_text = <$decoy_fh>;
            close($decoy_fh);
            $pdbstr .= $decoy_text if (defined($decoy_text));
        }

        ## write annotated model; buffered write errors surface at close
        open(my $model_fh, '>', $modelpdb) or die "ERROR: cannot open PDB $modelpdb: $!\n";
        print {$model_fh} $pdbstr;
        close($model_fh) or die "ERROR: cannot write PDB $modelpdb: $!\n";

        unlink( $decoy );  ## remove extracted decoy
    }
}

## Done!
exit(0);



## subs

## findClusterCenters( cluster_file => FILE, last_decoy => N,
##                     [max_centers => M], [members_to_avoid => ARRAYREF] )
##
## Reads the cluster file via readClusterFile and walks the clusters in order.
## For every cluster the first member whose decoy number is <= last_decoy is
## picked as its representative. Members are "number,id" strings.
##
## Stops as soon as max_centers (default 10) representatives are collected or
## a cluster with fewer than 10 members is met. A cluster that contains any
## entry of members_to_avoid (matched on number and id) is skipped.
##
## Returns two array refs: the centers (hashes with decoy_number, decoy_id and
## source keys) and the flat list of all members of the accepted clusters.
## Dies when last_decoy is false.
sub findClusterCenters {
        my %args = ( @_ );

        my $clusters_file    = $args{cluster_file};
        my $last_decoy       = $args{last_decoy};
        my $members_to_avoid = $args{members_to_avoid};
        my $max_centers      = $args{max_centers} || 10;

        my $minimum_members  = 10;

        ## file name without directory part, used in the source annotation
        my $clusters_filename = $clusters_file;
        $clusters_filename    =~ s/^.+\/([^\/]+)$/$1/;

        my (@centers, @usedclustermembers);

        print "Finding cluster centers and members\n";

        die "ERROR: cannot find cluster centers, decoy count undefined\n" unless ($last_decoy);

        ## get clusters
        my $clusters_array_ref = &readClusterFile( cluster_file => $clusters_file );

        my $cluster_center_cnt = 0;

        CLUSTER: foreach my $cluster ( @{$clusters_array_ref} ) {
                $cluster_center_cnt++;

                ## enough centers collected
                last CLUSTER if ( scalar(@centers) >= $max_centers );

                ## too small a cluster ends the search altogether
                if ( scalar(@{$cluster}) < $minimum_members ) {
                        warn "WARNING - skipping cluster centers starting from $cluster_center_cnt: less than $minimum_members members\n";
                        last CLUSTER;
                }

                ## first member (1-based position) that belongs to the target,
                ## i.e. whose decoy number is within the first $last_decoy decoys
                my ($center_number, $center_id, $target_member);
                POSITION: foreach my $idx ( 0 .. $#{$cluster} ) {
                        my ($number, $id) = split(',', $cluster->[$idx]);
                        next POSITION unless ($number <= $last_decoy);
                        $center_number = $number;
                        $center_id     = $id;
                        $target_member = $idx + 1;
                        last POSITION;
                }

                ## nothing usable in this cluster
                next CLUSTER unless ($center_number);

                ## reject the whole cluster if it holds a member to avoid
                if ($members_to_avoid) {
                        foreach my $avoid (@{$members_to_avoid}) {
                                my ($avoid_number, $avoidid) = split(',', $avoid);
                                foreach my $candidate (@{$cluster}) {
                                        my ($number, $id) = split(',', $candidate);
                                        next unless ($number == $avoid_number && $id eq $avoidid);
                                        warn "WARNING - skipping cluster center $cluster_center_cnt: $number $id is already represented\n";
                                        next CLUSTER;
                                }
                        }
                }

                ## remember every member of the accepted cluster
                push( @usedclustermembers, @{$cluster} );

                print "Found member $target_member of cluster $cluster_center_cnt (decoy $center_number $center_id)\n";

                my %center = (
                        decoy_number => $center_number,
                        decoy_id     => $center_id,
                        source       => "CLUSTER $cluster_center_cnt MEMBER $target_member FILE $clusters_filename",
                );
                push( @centers, \%center );
        }

        return (\@centers, \@usedclustermembers);
}

## readClusterFile( cluster_file => FILE )
##
## Parses a cluster file and returns an array ref indexed by cluster number;
## each element is an array ref of that cluster's whitespace-separated members
## (1st member is the center).
##
## Everything up to the first line starting with "C" is ignored. After it,
## lines whose first field looks like "<number>:" define cluster <number>;
## blank and non-matching lines are skipped.
##
## Dies when the file cannot be opened.
sub readClusterFile {
        my %params = ( @_ );

        my $cluster_file = $params{cluster_file};

        my @clusters     = ();
        my $read_members = 0;

        print "Reading cluster file $cluster_file\n";

        ## three-argument open: the name is always taken literally, even with
        ## surrounding whitespace or mode characters; lexical handle avoids
        ## clobbering a global bareword handle
        open(my $clusters_fh, '<', $cluster_file)
                or die "ERROR: cannot open cluster file $cluster_file, $!\n";

        ## READ AND PARSE CLUSTER FILE (line by line, no need to hold it all)
        while (my $line = <$clusters_fh>) {
                next if ($line =~ /^\s*$/);
                if ($line =~ /^C/) {
                        ## header line: member lines follow
                        $read_members = 1;
                        next;
                }
                next if (!$read_members);
                my @rows = split(/\s+/, $line);

                ## get cluster members (1st member is the center)
                if (@rows && $rows[0] =~ /^(\d+):/) {
                        my $cluster_index = $1;
                        shift @rows;    ## drop the "<number>:" label
                        $clusters[$cluster_index] = \@rows;
                }
        }
        close($clusters_fh);

        return \@clusters;
}

## trim( STRING )
##
## Returns STRING without leading and trailing whitespace.
## An undefined argument yields the empty string.
sub trim {
        my $str = shift;

        ## test definedness, not truth: the string "0" is a legitimate value
        return "" if (!defined($str));

        $str =~ s/^\s+//;
        $str =~ s/\s+$//;
        return $str;
}

## _runCmd( cmd => COMMAND, [catch_output => BOOL], [print_output => BOOL],
##          [save_output => FILE], [overwrite => BOOL], [verbose => BOOL] )
##
## Runs COMMAND through a pipe and reads its standard output line by line.
##   catch_output  collect the output and return it as second value
##   print_output  echo the output (default on; only effective with verbose)
##   save_output   also write the output to FILE; an existing FILE is left
##                 untouched unless overwrite is set
##   verbose       print the command, its pid and (with print_output) its output
##
## Returns the exit status, or (exit status, output) with catch_output.
## The output is the empty string when the command printed nothing.
## Dies when the command cannot be started or FILE cannot be opened.
sub _runCmd {
        my %params = ( @_ );

        my $cmd          = $params{cmd};
        my $catch_output = $params{catch_output};
        my $print_output = $params{print_output};
        my $save_output  = $params{save_output};
        my $overwrite    = $params{overwrite};
        my $verbose      = $params{verbose};

        ## print output as default
        $print_output = 1 unless (defined($print_output));

        print "RUN COMMAND ".localtime().": $cmd\n" if $verbose;

        ## lexical handles: no global CMD/FILE barewords shared with callers
        my $cmd_pid = open(my $cmd_fh, '-|', $cmd) or die "ERROR: cannot run command $cmd: $!\n";
        print "COMMAND PID: ".$cmd_pid."\n" if $verbose;

        ## write to the save file only if it is new or may be overwritten
        my $save_fh;
        if ($save_output && !( -f $save_output && !$overwrite )) {
                open($save_fh, '>', $save_output) or die "ERROR: cannot open file $save_output: $!\n";
        }

        $|=1;   # disable output buffering

        my $output = '';
        while (my $line = <$cmd_fh>) {
                print $line if ($print_output && $verbose);
                $output .= $line if ($catch_output);
                print {$save_fh} $line if ($save_fh);
        }

        ## closing the pipe waits for the command and sets $?; read it right
        ## away, before any other close can touch it
        close($cmd_fh);
        my $exit_status = ($? >> 8);       ## doesn't handle negative exit codes

        if ($save_fh) {
                close($save_fh) or warn "WARNING: cannot close file $save_output: $!\n";
        }

        return ($exit_status, $output) if ($catch_output);
        return $exit_status;
}

## getCommandLineOptions()
##
## Parses @ARGV with Getopt::Long and returns the options as a hash with the
## keys silentfile, base_name, out_dir and get_centers (only those given).
## Prints the usage to STDERR and exits with -1 when the command line cannot
## be parsed or the mandatory -silentfile option is missing.
sub getCommandLineOptions {
    use Getopt::Long;
    my $usage =<<"USAGE";

       USAGE: $0 -silentfile <rosetta silent mode file>

               [-out_dir        <output directory>]
               [-base_name      <base name for output files> def: silentfile name]
               [-get_centers	<number of top cluster centers> def: none]

 DESCRIPTION: Clusters decoys from a rosetta silent mode file.
              If '-get_centers N' option is used, N top cluster centers will get extracted.

     EXAMPLE: $0 -silentfile aa1ubq.out -base_name ubiquitin -get_centers 10

              Clusters decoys from aa1ubq.out and extracts top 10 cluster centers.

              Files generated:  ubiquitin.clustercmd
                                ubiquitin.clusters
                                ubiquitin.0001.pdb
                                ubiquitin.0002.pdb
                                ..
                                ubiquitin.0010.pdb
USAGE

    # Get args; GetOptions returns false (after warning on STDERR) when it
    # meets an unknown option or a malformed value
    #
    my %opts = ();
    my $parsed_ok = GetOptions(\%opts, "silentfile=s", "base_name=s", "out_dir=s", "get_centers=i");

    # Check for legal invocation: parse must succeed and a non-empty
    # silent file name is mandatory (a file called "0" is acceptable)
    #
    my $have_silentfile = ( defined($opts{silentfile}) && $opts{silentfile} ne '' );

    if (!$parsed_ok || !$have_silentfile) {
        print STDERR "$usage\n";
        exit -1;
    }

    return %opts;
}

