// -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
// vi: set ts=2 noet;
//
// (c) Copyright Rosetta Commons Member Institutions.
// (c) This file is part of the Rosetta software suite and is made available under license.
// (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
// (c) For more information, see http://www.rosettacommons.org. Questions about this can be
// (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.

/// @file   apps/pilot/james/cluster_alns.cc
///
/// @brief  Divide input alns into clusters based on gdtmm comparison of partial models.
/// @author James Thompson

#include <utility/exit.hh>
#include <utility/string_util.hh>
#include <utility/file/FileName.hh>
#include <utility/file/file_sys_util.hh>
#include <utility/io/ozstream.hh>
#include <ObjexxFCL/format.hh>

#include <protocols/init.hh>

#include <core/types.hh>

#include <core/pose/Pose.hh>
#include <core/pose/util.hh>
#include <core/util/Tracer.hh>
#include <core/chemical/util.hh>
#include <core/chemical/ResidueTypeSet.fwd.hh>
#include <core/io/pdb/pose_io.hh>
#include <core/sequence/util.hh>
#include <core/sequence/SequenceAlignment.hh>
#include <core/scoring/rms_util.hh>
#include <core/kinematics/FoldTree.hh>

#include <protocols/comparative_modeling/util.hh>
#include <protocols/comparative_modeling/coord_util.hh>
#include <protocols/comparative_modeling/PartialThreadingMover.hh>
#include <protocols/cluster/APCluster.hh>

#include <fstream>
#include <map>
#include <sstream>
#include <algorithm>

// option key includes
#include <core/options/option.hh>
#include <core/options/keys/in.OptionKeys.gen.hh>
#include <core/options/keys/cm.OptionKeys.gen.hh>
#include <core/options/keys/run.OptionKeys.gen.hh>

/// @brief Reads the given PDB files into poses and returns them keyed by a
/// short name.
/// @details Each file name that exists on disk is read with pose_from_pdb,
/// using the residue type set returned by rsd_set_from_cmd_line(). File names
/// that do not exist are skipped without any message. The map key is the
/// first five characters of the file's basename; if two files produce the
/// same key, the later one replaces the earlier one.
/// @param fn_list names of the PDB files to load.
/// @return map from five-character key to the pose read from that file.
std::map< std::string, core::pose::Pose >
poses_from_cmd_line(
	utility::vector1< std::string > const & fn_list
) {
	using namespace core::chemical;

	typedef std::map< std::string, core::pose::Pose > PoseMap;

	ResidueTypeSetCAP residue_set = rsd_set_from_cmd_line();
	PoseMap loaded;

	// vector1 is indexed from 1.
	for ( core::Size idx = 1; idx <= fn_list.size(); ++idx ) {
		std::string const & pdb_fn( fn_list[idx] );
		if ( !utility::file::file_exists( pdb_fn ) ) continue;

		core::pose::Pose pdb_pose;
		core::io::pdb::pose_from_pdb( pdb_pose, *residue_set, pdb_fn );

		// key = first five characters of the basename
		std::string const key( utility::file_basename( pdb_fn ).substr( 0, 5 ) );
		loaded[key] = pdb_pose;
	}

	return loaded;
}

int
main( int argc, char * argv [] ) {
	using core::Size;
	using core::Real;
	using std::map;
	using std::string;
	using core::pose::Pose;
	using core::options::option;
	using protocols::cluster::APCluster;
	using utility::vector1;
	using utility::file::FileName;
	using protocols::comparative_modeling::gather_coords;
	using ObjexxFCL::string_of;

	using namespace protocols;
	using namespace core::chemical;
	using namespace core::options;
	using namespace core::options::OptionKeys;
	using namespace core::sequence;
	using namespace ObjexxFCL::fmt;

	protocols::init(argc, argv);
	core::util::Tracer TR("cluster_alns.main");

	vector1< std::string > align_fns = option[ in::file::alignment ]();
	vector1< SequenceAlignment > alns;
	for ( Size ii = 1; ii <= align_fns.size(); ++ii ) {
		vector1< SequenceAlignment > tmp_alns = core::sequence::read_aln(
			option[ cm::aln_format ](), align_fns[ii]
		);
		for ( Size jj = 1; jj <= tmp_alns.size(); ++jj ) {
			alns.push_back( tmp_alns[jj] );
		}
	}

	map< string, Pose > template_poses = poses_from_cmd_line(
		option[ in::file::template_pdb ]()
	);

	using core::sequence::read_fasta_file;
	string query_sequence (
		read_fasta_file( option[ in::file::fasta ]()[1])[1]->sequence()
	);

	Size max_pose_len(0);
	vector1< Pose >   poses;
	vector1< string > aln_ids;
	for ( Size ii = 1; ii <= alns.size(); ++ii ) {
		string const aln_id( alns[ii].sequence(2)->id() );
		string const template_id( aln_id.substr(0,5) );
		map< string, Pose >::iterator pose_it = template_poses.find( template_id );

		if ( pose_it == template_poses.end() ) {
			string msg( "Error: can't find pose (id = "
				+ template_id + ")"
			);
			//utility_exit_with_message(msg);
			std::cout << msg << std::endl;
			continue;
		} else {
			core::pose::Pose query_pose;
			make_pose_from_sequence(
				query_pose, query_sequence, *(rsd_set_from_cmd_line())
			);
			core::pose::Pose template_pose = pose_it->second;
			using namespace protocols::comparative_modeling;
			std::cout << "building incomplete model with " << std::endl
				<< alns[ii] << std::endl;
			PartialThreadingMover mover( alns[ii], template_pose );
			mover.apply( query_pose );
			//std::cout << "fold_tree(" << aln_id << ") = " << query_pose.fold_tree()
			//	<< std::endl;
			max_pose_len = std::max( max_pose_len, query_pose.total_residue() );
			poses.push_back( query_pose );
			aln_ids.push_back( aln_id );
			if ( option[ run::debug ]() ) {
				query_pose.dump_pdb( aln_id + ".pdb" );
			}
		}
	} // for alns

	Size const max_similarities( poses.size() );
	Size const total_comparisons( ( poses.size() * poses.size() - poses.size() ) / 2 );
	APCluster cluster( poses.size(), max_similarities );
	TR << "Calculating similarity for " << poses.size() << " structures: "
		<< std::endl;

	vector1< vector1< Real > > gdtmms(
		poses.size(), vector1< Real >( poses.size(), 0.0 )
	);
	Size total_comparisons_done(0);
	for ( Size ii = 1; ii <= poses.size(); ++ii ) {
		gdtmms[ii][ii] = 1.0;
		for ( Size jj = ii + 1; jj <= poses.size(); ++jj ) {
			++total_comparisons_done;
			// gdtmm comparison
			SequenceAlignment aln( align_poses_naive( poses[ii], poses[jj] ) );
			int n_atoms;
			ObjexxFCL::FArray2D< Real > p1a, p2a;
			protocols::comparative_modeling::gather_coords(
				poses[ii], poses[jj],
				aln,
				n_atoms, p1a, p2a
			);

			Real const coverage( (Real) n_atoms / (Real) max_pose_len );

			using core::scoring::xyz_gdtmm;
			Real const gdtmm( xyz_gdtmm( p1a, p2a ) );
			Real const gdtmm_adj( coverage * gdtmm );
			cluster.set_sim( ii, jj, gdtmm_adj );
			cluster.set_sim( jj, ii, gdtmm_adj );
			gdtmms[ii][jj] = gdtmm_adj;
			gdtmms[jj][ii] = gdtmm_adj;

			if ( option[ run::debug ]() ) {
				std::cerr << "coverage = " << coverage << ", n_atoms = " << n_atoms
					<< ", max_pose_len = " << max_pose_len << ", gdtmm = " << gdtmm << std::endl;
				std::cerr << "pose1 seq = " << poses[ii].sequence() << std::endl;
				std::cerr << "pose2 seq = " << poses[jj].sequence() << std::endl;
				std::cerr << aln << std::endl;
				std::cerr << "sim(" << aln_ids[ii] << "," << aln_ids[jj] << ") = " << gdtmm_adj << std::endl;
				std::cerr << "--------------------------------------------------------------------------------"
					<< std::endl;
			}

			if ( total_comparisons_done % 50 == 0 ) {
				std::cout << "." << std::flush;
				if ( total_comparisons_done % 1000 == 0 ) {
					std::cout << " finished with "
						<< total_comparisons_done << " / " << total_comparisons << "."
						<< std::endl;
				}
			}
		} // jj
	} // ii

	// set up some thresholds for self-similarities. This implicitly controls
	// the number of clusters (low self-similarities => small number of
	// clusters), and input data points with high self-similarity are more
	// likely to be chosen as exemplars.
	Size const n_steps( 10 );
	vector1< Real > thresholds;
	for ( Size ii = 1; ii <= n_steps; ++ii ) {
		thresholds.push_back( (Real) ii / n_steps );
	}

	for ( Size jj = 1; jj <= thresholds.size(); ++jj ) {
		for ( Size ii = 1; ii <= poses.size(); ++ii ) {
			// don't change gdtmms, we'll need them later ...
			vector1< Real > loc_gdtmms( gdtmms[ii] );

			Size const idx( (Size) (loc_gdtmms.size() * thresholds[jj]) );
			std::sort( loc_gdtmms.begin(), loc_gdtmms.end() );
			Real const self_sim( loc_gdtmms[idx] );
			cluster.set_sim( ii, ii, self_sim );
		}

		TR << "self_sim_threshold = " << thresholds[jj] << std::endl;
		TR << "Clustering poses..." << std::endl;
		cluster.cluster(2000, 100, 0.8);
		TR << cluster.get_num_exemplars() << " clusters selected" << std::endl;

		Size const width(12);
		std::string outfile_name("aln_clusters." + string_of(jj) + ".txt");
		utility::io::ozstream out( outfile_name );

		vector1< Size > exemplars;
		cluster.get_all_exemplars( exemplars );
		out << A( width, "aln_id" ) <<  A( width, "exemplar" )
			<< A( width, "cluster_id" )
			<< A( width, "dist" )
			<< std::endl;
		for ( Size ii = 1; ii <= aln_ids.size(); ++ii ) {
			Size const exemplar_id( cluster.get_exemplar_for(ii) );
			out << A( width, aln_ids[ii] )
				<< I( width, exemplar_id )
				<< A( width, aln_ids[exemplar_id] )
				<< F( width, 3, gdtmms[ii][exemplar_id] )
				<< std::endl;
		}

		out.close();
	}

	return 0;
}
