Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
AlignmentClustering.cc
Go to the documentation of this file.
1 // (c) Copyright Rosetta Commons Member Institutions.
2 // (c) This file is part of the Rosetta software suite and is made available under license.
3 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
4 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
5 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
6 
7 /// @file apps/pilot/brunette/alignmentClustering
8 ///
9 /// @brief Divide input alns into clusters based on gdtmm comparison of partial models using ranked alignments.
10 /// @author TJ Brunettes
11 
12 #include <utility/exit.hh>
13 #include <utility/string_util.hh>
14 #include <utility/file/FileName.hh>
15 #include <utility/file/file_sys_util.hh>
16 #include <utility/io/ozstream.hh>
17 #include <utility/io/izstream.hh>
18 
19 
20 
21 //#include <devel/init.hh>
22 
23 #include <core/types.hh>
24 
25 #include <core/pose/Pose.hh>
26 // AUTO-REMOVED #include <core/pose/util.hh>
27 
28 #include <basic/Tracer.hh>
29 #include <core/chemical/util.hh>
31 // AUTO-REMOVED #include <core/io/pdb/pose_io.hh>
32 #include <core/sequence/util.hh>
34 #include <core/scoring/rms_util.hh>
36 
38 
39 // AUTO-REMOVED #include <protocols/comparative_modeling/util.hh>
42 
43 //utilities
44 
45 // option key includes
46 #include <basic/options/option.hh>
47 #include <basic/options/keys/in.OptionKeys.gen.hh>
48 #include <basic/options/keys/cm.OptionKeys.gen.hh>
49 #include <basic/options/keys/run.OptionKeys.gen.hh>
50 
51 // AUTO-REMOVED #include <ObjexxFCL/format.hh>
52 
53 #include <fstream>
54 #include <map>
55 #include <set>
56 #include <sstream>
57 #include <algorithm>
58 
61 #include <utility/vector1.hh>
62 
63 //Auto using namespaces
64 namespace ObjexxFCL { namespace fmt { } } using namespace ObjexxFCL::fmt; // AUTO USING NS
65 //Auto using namespaces end
66 
67 
68 
69 static basic::Tracer tr("AlignmentClustering");
70 
71 namespace protocols {
72 namespace comparative_modeling {
73 
74 using utility::vector1;
75 using core::Size;
76 using core::Real;
77 using std::string;
78 using std::map;
79 using std::set;
80 using std::multimap;
81 using core::pose::Pose;
82 
83 using namespace core::sequence;
84 ///////////////////////////////////////////////////////////////////////////////
85 /// @detail Creates a AlignmentCluster
86 ///////////////////////////////////////////////////////////////////////////////
87 AlignmentCluster::AlignmentCluster(SequenceAlignment & aln_in){
88  alns.push_back(aln_in);
89 }
90 ///////////////////////////////////////////////////////////////////////////////
91 /// @detail Deletes an AlignmentCluster object
92 ///////////////////////////////////////////////////////////////////////////////
93 AlignmentCluster::~AlignmentCluster(){
94  alns.clear();
95 }
96 ///////////////////////////////////////////////////////////////////////////////
97 /// @detail add alignment
98 ///////////////////////////////////////////////////////////////////////////////
99 void AlignmentCluster::add_aln(SequenceAlignment & aln_in){
100  alns.push_back(aln_in);
101 }
102 ///////////////////////////////////////////////////////////////////////////////
103 /// @detail get aln
104 ///////////////////////////////////////////////////////////////////////////////
105 SequenceAlignment AlignmentCluster::get_aln(Size index){
106  return (alns[index]);
107 }
108 ///////////////////////////////////////////////////////////////////////////////
109 /// @detail get aln
110 ///////////////////////////////////////////////////////////////////////////////
112  return alns.size();
113 }
114 ///////////////////////////////////////////////////////////////////////////////
115 /// @detail get cluster center -- assumes lowest energy point
116 ///////////////////////////////////////////////////////////////////////////////
117 //SequenceAlignment AlignmentCluster::get_clusterCenter(){
118 SequenceAlignment AlignmentCluster::get_clusterCenter(){
119  return(alns[1]);
120 }
121 ///////////////////////////////////////////////////////////////////////////////
122 /// @detail outputs cluster to file
123 ///////////////////////////////////////////////////////////////////////////////
124 void AlignmentCluster::output(std::ostream & alignment_out){
125  for ( Size ii = 1; ii <= alns.size(); ++ii ) {
126  alns[ii].printGrishinFormat(alignment_out);
127  string const aln_id( alns[ii].sequence(2)->id() );
128  tr << aln_id << "," ;
129  }
130  tr << std::endl;
131 }
132 ///////////////////////////////////////////////////////////////////////////////
133 /// @detail merges two clusters making sure the alignments are unique
134 ///////////////////////////////////////////////////////////////////////////////
136  set<string> alignmentsInCluster;
137  set<string>::iterator location;
138  for(Size ii=1; ii<= alns.size(); ++ii){
139  alignmentsInCluster.insert(alns[ii].sequence(2)->id());
140  }
141  for(Size jj= 1; jj <= cluster_in->size(); ++jj){
142  SequenceAlignment tempAln = cluster_in->get_aln(jj);
143  location = alignmentsInCluster.find(tempAln.sequence(2)->id());
144  if (location == alignmentsInCluster.end()){
145  add_aln(tempAln);
146  }
147  }
148 }
149 ///////////////////////////////////////////////////////////////////////////////
150 /// @detail gets the percent of alignments in the input cluster that are in the current cluster
151 ///////////////////////////////////////////////////////////////////////////////
153  set<string> alignmentsInCluster;
154  set<string>::iterator location;
155  for(Size ii=1; ii<= alns.size(); ++ii)
156  alignmentsInCluster.insert(alns[ii].sequence(2)->id());
157  //loop through and see what percent of cluster_in is in the current cluster;
158  Size count_inClust = 0;
159  for(Size ii=1; ii<=cluster_in->size(); ++ii){
160  location = alignmentsInCluster.find(cluster_in->get_aln(ii).sequence(2)->id());
161  if (location != alignmentsInCluster.end())
162  count_inClust++;
163  }
164  return((Real)count_inClust/(Real)cluster_in->size());
165 }
166 ///////////////////////////////////////////////////////////////////////////////
167 /// @detail creates an alignment clustering object, this is in charge of
168 /// clustering
169 ///////////////////////////////////////////////////////////////////////////////
170 AlignmentClustering::AlignmentClustering(){
171  using core::pose::Pose;
172  using basic::options::option;
173 
176  using ObjexxFCL::string_of;
177 
178  using namespace protocols;
179  using namespace core::chemical;
180  using namespace basic::options;
181  using namespace basic::options::OptionKeys;
182  using namespace core::sequence;
183  using namespace ObjexxFCL::fmt;
184  Real THRESHOLD_FOR_E_VAL = 1e-30;
185  Real MIN_GDT = .70;
186  Real MAX_GDT = .90;
187  Real INCREMENT_GDT = .05;
188  Size GOAL_NUMB_CLUSTERS = 5;
189  Real MAX_CLUSTER_OVERLAP = .70;
190  Size MIN_POSE_SIZE = 5;
191 
192  vector1< std::string > align_fns = option[ in::file::alignment ]();
193  //vector1< SequenceAlignment > alns;
194  map<string,SequenceAlignment> alns;
195  map<string,SequenceAlignment>::iterator location_alns;
196  for ( Size ii = 1; ii <= align_fns.size(); ++ii ) {
198  option[ cm::aln_format ](), align_fns[ii]
199  );
200  for ( Size jj = 1; jj <= tmp_alns.size(); ++jj ) {
201  string aln_id = tmp_alns[jj].sequence(2)->id();
202  location_alns = alns.find(aln_id);
203  while(location_alns != alns.end()){//if alignment exists change the name by adding 100000 to the name.
204  string fixed_aln_id = aln_id + "X";
205  tmp_alns[jj].sequence(2)->id(fixed_aln_id);
206  aln_id = tmp_alns[jj].sequence(2)->id();
207  location_alns = alns.find(aln_id);
208  }
209  aln_id = (tmp_alns[jj].sequence(2)->id() );
210  alns.insert(std::pair<string,SequenceAlignment>(aln_id,tmp_alns[jj]));
211  }
212  }
213  //rank alignments
214  vector1<SequenceAlignment> rankedAlignments = generateRankedAlignments(alns,THRESHOLD_FOR_E_VAL);
215  vector1<SequenceAlignment> rankedAlignments_valid;
216  //generate poses
217  map< string, Pose > template_poses = poses_from_cmd_line(
218  option[ in::file::template_pdb ]());
220  string query_sequence (
221  read_fasta_file( option[ in::file::fasta ]()[1])[1]->sequence() );
222  Size max_pose_len(0);
223  vector1< Pose > poses;
224  vector1< string > aln_ids;
225  for ( Size ii = 1; ii <= rankedAlignments.size(); ++ii ) {
226  string const aln_id( rankedAlignments[ii].sequence(2)->id() );
227  string const template_id( aln_id.substr(0,5) );
228  map< string, Pose >::iterator pose_it = template_poses.find( template_id );
229  if ( pose_it == template_poses.end() ) {
230  string msg( "Error: can't find pose (id = "
231  + template_id + ")"
232  );
233  //utility_exit_with_message(msg);
234  std::cout << msg << std::endl;
235  continue;
236  } else {
237  core::pose::Pose query_pose;
239  query_pose, query_sequence, *(rsd_set_from_cmd_line())
240  );
241  core::pose::Pose template_pose = pose_it->second;
242  using namespace protocols::comparative_modeling;
243  std::cout << "building incomplete model with " << std::endl
244  << aln_id << std::endl;
245  PartialThreadingMover mover( rankedAlignments[ii], template_pose );
246  mover.apply( query_pose );
247  max_pose_len = std::max( max_pose_len, query_pose.total_residue() );
248  if(query_pose.total_residue() >= MIN_POSE_SIZE){
249  poses.push_back( query_pose );
250  aln_ids.push_back( aln_id );
251  rankedAlignments_valid.push_back(rankedAlignments[ii]);
252  }
253  else
254  tr << aln_id << "has only "<<query_pose.total_residue() << "which is below size threshold of " << MIN_POSE_SIZE << std::endl;
255  if ( option[ run::debug ]() ) {
256  query_pose.dump_pdb( aln_id + ".pdb" );
257  }
258  }
259  } // for rankedAlignmentsalns
260  Size total_comparisons_done(0);
261  vector1< vector1< Real > > gdtmms(
262  rankedAlignments_valid.size(), vector1< Real >(rankedAlignments_valid.size(), 0.0 )
263  );
264  for ( Size ii = 1; ii <= rankedAlignments_valid.size(); ++ii ) {
265  gdtmms[ii][ii] = 1.0;
266  for ( Size jj = ii + 1; jj <= rankedAlignments_valid.size(); ++jj ) {
267  ++total_comparisons_done;
268  SequenceAlignment aln( align_poses_naive( poses[ii], poses[jj] ) );
269  int n_atoms;
270  ObjexxFCL::FArray2D< Real > p1a, p2a;
272  aln,
273  n_atoms, p1a, p2a
274  );
275  Real const coverage( (Real) n_atoms / (Real) max_pose_len );
276  //Real const gdtmm( xyz_gdtmm( p1a, p2a ) );
278  Real const gdtmm( xyz_gdtmm( p1a, p2a ) );
279  //Real const gdtmm_adj( coverage * gdtmm );
280  //gdtmms[ii][jj] = gdtmm_adj;
281  //gdtmms[jj][ii] = gdtmm_adj;
282  gdtmms[ii][jj] = gdtmm;
283  gdtmms[jj][ii] = gdtmm;
284  if ( option[ run::debug ]() ) {
285  std::cerr << "coverage = " << coverage << ", n_atoms = " << n_atoms
286  << ", max_pose_len = " << max_pose_len << ", gdtmm = " << gdtmm << std::endl;
287  std::cerr << "pose1 seq = " << poses[ii].sequence() << std::endl;
288  std::cerr << "pose2 seq = " << poses[jj].sequence() << std::endl;
289  std::cerr << aln << std::endl;
290  std::cerr << "sim(" << aln_ids[ii] << "," << aln_ids[jj] << ") = " << gdtmm << std::endl;
291  std::cerr << "--------------------------------------------------------------------------------"
292  << std::endl;
293  }
294  if ( total_comparisons_done % 50 == 0 ) {
295  std::cout << "." << std::flush;
296  if ( total_comparisons_done % 1000 == 0 ) {
297  std::cout << " finished with "
298  << total_comparisons_done << "." << std::endl;
299  }
300  }
301  } // jj
302  } // ii
303  Size number_clusters = 999999;
304  vector1<AlignmentClusterOP> cluster_v;
305  Real threshold_gdt = MAX_GDT;
306  set<Size> mergeSet;
307  set<Size>::iterator location_mergeSet;
308  multimap<Size,Size> mergeMap;
309  multimap<Size,Size>::iterator start_mergeMap,stop_mergeMap;
310  while((number_clusters > GOAL_NUMB_CLUSTERS) && (threshold_gdt >= MIN_GDT)){
311  cluster_v.clear();
312  cluster_v = cluster(gdtmms,rankedAlignments_valid,threshold_gdt);
313  //===Merging clusters that have overlap above MAX_CLUSTER_OVERLAP
314  mergeSet.clear();
315  mergeMap.clear();
316  //If cluster has been merged than ignore it.
317  for (Size ii = 1; ii <= cluster_v.size(); ++ii){
318  location_mergeSet = mergeSet.find(ii);
319  if(location_mergeSet == mergeSet.end()){
320  for(Size jj=ii+1; jj<=cluster_v.size(); ++jj){
321  location_mergeSet = mergeSet.find(jj);
322  if(location_mergeSet == mergeSet.end()){
323  if (cluster_v[ii]->overlap(cluster_v[jj])>MAX_CLUSTER_OVERLAP){
324  mergeSet.insert(jj);
325  mergeMap.insert(std::pair<int,int>(ii,jj));
326  }
327  }
328  }
329  }
330  }
331  start_mergeMap = mergeMap.begin();
332  stop_mergeMap = mergeMap.end();
333  while(start_mergeMap != stop_mergeMap){
334  cluster_v[start_mergeMap->first]->merge(cluster_v[start_mergeMap->second]);
335  start_mergeMap++;
336  }
337  //---end merge
338  std::cout << "cluster_v.size" << cluster_v.size() << "merged clusters" << mergeSet.size() << std::endl;
339 
340  number_clusters = cluster_v.size() - mergeSet.size();
341  tr << "threshold_gdt" << threshold_gdt << "number_clusters" << number_clusters << std::endl;
342 
343  if(number_clusters > GOAL_NUMB_CLUSTERS)
344  threshold_gdt = threshold_gdt - INCREMENT_GDT;
345  }
346 
347  //Output final clusters
348  Size numbOutput = 1;
349  for (Size ii = 1; ii <= cluster_v.size(); ++ii){
350  location_mergeSet = mergeSet.find(ii);
351  if(location_mergeSet == mergeSet.end()){
352  std::cout << "--------" << std::endl;
353  std::stringstream convert_to_string;
354  convert_to_string << numbOutput;
355  std::string filename = "alignmentCluster_" + convert_to_string.str() + ".filt";
356  utility::io::ozstream alignment_out(filename );
357  cluster_v[ii]->output(alignment_out);
358  numbOutput++;
359  alignment_out.close();
360  }
361  }
362  //output results of cluster. For now just output to file.
363 }
364 
365 ///////////////////////////////////////////////////////////////////////////////
366 /// @detail Deletes AlignmentClustering object
367 ///////////////////////////////////////////////////////////////////////////////
368 AlignmentClustering::~AlignmentClustering(){
369 }
370 ///////////////////////////////////////////////////////////////////////////////
371 /// @detail Does the clustering
372 ///////////////////////////////////////////////////////////////////////////////
373 vector1<AlignmentClusterOP> AlignmentClustering::cluster(vector1< vector1< Real > > & gdtmms, vector1<SequenceAlignment> & rankedAlignments, Real threshold_gdt){
374  vector1<AlignmentClusterOP> cluster_v;
375  vector1<bool> alignmentInCluster;
376  //the first item is automatically in the first cluster.
377  for ( Size ii = 1; ii <= rankedAlignments.size(); ++ii ) {
378  alignmentInCluster.push_back(false);
379  }
380  bool allClustered = false;
381  Size clusterCenter = 1;
382  //This is written so alignments may end up in multiple clusters.
383  while(!allClustered){
384  AlignmentClusterOP tempCluster = new AlignmentCluster(rankedAlignments[clusterCenter]);
385  alignmentInCluster[clusterCenter] = true;
386  for( Size ii = 1; ii <= rankedAlignments.size(); ++ii) {
387  if(ii != clusterCenter){
388  //string const aln_id(rankedAlignments[ii].sequence(2)->id() );
389  //string const aln_center_id(rankedAlignments[clusterCenter].sequence(2)->id());
390  //tr << "distance from cluster center" << gdtmms[clusterCenter][ii] << " aln_id:" << aln_id << "cluster_center" << aln_center_id << std::endl;
391  if (gdtmms[clusterCenter][ii] >= threshold_gdt){
392  tempCluster->add_aln(rankedAlignments[ii]);
393  alignmentInCluster[ii] = true;
394  }
395  }
396  }
397  cluster_v.push_back(tempCluster);
398  int nxtClusterCenter = -1;
399  for(Size jj = 1; ((jj <= alignmentInCluster.size()) && (nxtClusterCenter==-1)) ; ++jj){
400  if(alignmentInCluster[jj] == false){
401  nxtClusterCenter = jj;
402  }
403  }
404  if(nxtClusterCenter == -1)
405  allClustered = true;
406  else
407  clusterCenter = nxtClusterCenter;
408  }
409  return(cluster_v);
410 }
411 ///////////////////////////////////////////////////////////////////////////////
412 /// @detail gathers the poses
413 ///////////////////////////////////////////////////////////////////////////////
415 
418  using namespace core::chemical;
419 
421  map< string, Pose > poses;
423  for ( iter it = fn_list.begin(), end = fn_list.end(); it != end; ++it ) {
424  if ( file_exists(*it) ) {
425  Pose pose;
426  core::import_pose::pose_from_pdb( pose, *rsd_set, *it );
427  string name = utility::file_basename( *it );
428  name = name.substr( 0, 5 );
429  poses[name] = pose;
430  }
431  }
432  return poses;
433 }
434 ///////////////////////////////////////////////////////////////////////////////
435 /// @detail gets the aligment from file and ranks them. Some models do not have e-vals in ev_map because thy are too far down in ranking. As long as 10 models are in the hh_map or ev_map the ranking comes from there.
436 ///////////////////////////////////////////////////////////////////////////////
437  vector1< SequenceAlignment > AlignmentClustering::generateRankedAlignments(map <string,SequenceAlignment> & alns,Real THRESHOLD_FOR_E_VAL){
438  using namespace basic::options;
439  using namespace basic::options::OptionKeys;
441  vector1<SequenceAlignment> rankedAlignments;
442  multimap<Real,string> rankedTemplate_map; //templates are from ev_map or hh_map
443  map<string,Real> template_map;
444  vector1<SequenceAlignment>::iterator start_rankedAln,stop_rankedAln;
445  map<string,SequenceAlignment>::iterator start_alns, stop_alns;
446  //mjo commenting out 'hhsearch_ranking' because it is unused and causes a warning
447  //bool hhsearch_ranking = 0;
448  //mjo commenting out 'evmap_ranking' because it is unused and causes a warning
449  //bool evmap_ranking = 0;
450  Real low_e_val = 9999;
451  Size minNumbRankedModels = 10;
452  //need to get alignments from file and rank depending on type
453  if(option[ cm::ev_map ].user()){
454  multimap<Real,string>::iterator start_rankedTemplate,stop_rankedTemplate;
455  map<string,Real>::iterator start_template, stop_template,location_template;
456  vector1< FileName > ev_mapFile( option[ cm::ev_map ]() );
457  if (file_exists(ev_mapFile[1])){
458  tr << "ev_map being read in" << std::endl;
459  std::string const & filename = ev_mapFile[1];
460  utility::io::izstream data(filename);
461  if(!data){
462  utility_exit_with_message(" Warning: can't open file" + filename + "!");
463  }
464  string line, pdbid;
465  Real e_val;
466  while (getline(data,line)){
467  std::istringstream line_stream(line);
468  line_stream >> pdbid >> e_val;
469  if(pdbid != "template"){
470  if(pdbid.size()==6){
471  pdbid = pdbid.substr(0,4) + pdbid.substr(5,1);
472  }
473  if(pdbid.size()==7){
474  string temp_string = "X";
475  temp_string[0] = toupper(pdbid[5]);
476  pdbid = pdbid.substr(1,4) + temp_string;
477  }
478  if(e_val < low_e_val)
479  low_e_val = e_val;
480 
481  location_template = template_map.find(pdbid);
482  if (location_template == template_map.end())
483  template_map.insert(std::pair<string,Real>(pdbid,e_val));
484  else{
485  if (location_template->second > e_val)
486  location_template->second = e_val;
487  }
488  }
489  }
490  start_template =template_map.begin();
491  stop_template = template_map.end();
492  while(start_template!=stop_template){
493  rankedTemplate_map.insert(std::pair<Real,string>(start_template->second,start_template->first));
494  start_template++;
495  }
496  if(low_e_val <= THRESHOLD_FOR_E_VAL){
497  start_rankedTemplate = rankedTemplate_map.begin();
498  stop_rankedTemplate = rankedTemplate_map.end();
499  while(start_rankedTemplate != stop_rankedTemplate){
500  start_alns = alns.begin();
501  stop_alns = alns.end();
502  while (start_alns != stop_alns){
503  string aln_id = start_alns->second.sequence(2)->id();
504  string template_id = aln_id.substr(0,5);
505  if(template_id == start_rankedTemplate->second){
506  rankedAlignments.push_back(start_alns->second);
507  }
508  start_alns++;
509  }
510  start_rankedTemplate++;
511  }
512  }
513  }
514  }
515  //get hhsearch ranking
516  if((option[ cm::hh_map ].user()&& !option[ cm::ev_map ].user()) || (option[ cm::hh_map ].user() && (low_e_val >= THRESHOLD_FOR_E_VAL))){
517  rankedTemplate_map.clear();
518  rankedAlignments.clear();
519  vector1< FileName > hh_mapFile( option[ cm::hh_map ]() );
520  if (file_exists(hh_mapFile[1])){
521  multimap<Real,string>::reverse_iterator start_rankedTemplate,stop_rankedTemplate;
522  tr << "hh_map being read in" << std::endl;
523  std::string const & filename = hh_mapFile[1];
524  utility::io::izstream data(filename);
525  if(!data){
526  utility_exit_with_message(" Warning: can't open file" + filename + "!");
527  }
528  string line, pdbid;
529  Real hh_val;
530  while (getline(data,line)){
531  std::istringstream line_stream(line);
532  line_stream >> pdbid >> hh_val;
533  rankedTemplate_map.insert(std::pair<Real,string>(hh_val,pdbid));
534  }
535  start_rankedTemplate =rankedTemplate_map.rbegin();
536  stop_rankedTemplate =rankedTemplate_map.rend();
537  while(start_rankedTemplate != stop_rankedTemplate){
538  start_alns = alns.begin();
539  stop_alns = alns.end();
540  while (start_alns != stop_alns){
541  string const aln_id( start_alns->second.sequence(2)->id() );
542  string const template_id( aln_id.substr(0,5) );
543  if(template_id == start_rankedTemplate->second){
544  rankedAlignments.push_back(start_alns->second);
545  }
546  start_alns++;
547  }
548  start_rankedTemplate++;
549  }
550  }
551  }
552  //If the alignments were not ranked in previous two steps.
553  if((rankedAlignments.size() < alns.size())&&(rankedAlignments.size()<minNumbRankedModels)){
554  rankedAlignments.clear();
555  start_alns = alns.begin();
556  stop_alns = alns.end();
557  while (start_alns != stop_alns){
558  rankedAlignments.push_back(start_alns->second);
559  start_alns++;
560  }
561  }
562  return rankedAlignments;
563  }
564 } // comparative_modeling
565 } // protocols