Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPI_LoopHashRefine_Master.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/loophash/MPI_LoopHashRefine_Master.cc
11 /// @brief
12 /// @author Mike Tyka
13 
14 #define TRDEBUG TR.Debug
15 
16 // MPI headers
17 #ifdef USEMPI
18 #include <mpi.h> //keep this first
19 #endif
20 
26 // AUTO-REMOVED #include <core/io/pdb/pose_io.hh>
27 #include <core/pose/util.hh>
29 // AUTO-REMOVED #include <core/chemical/ResidueTypeSet.hh>
30 
31 // AUTO-REMOVED #include <core/import_pose/pose_stream/MetaPoseInputStream.hh>
32 // AUTO-REMOVED #include <core/import_pose/pose_stream/util.hh>
38 #include <basic/options/keys/in.OptionKeys.gen.hh>
39 // AUTO-REMOVED #include <basic/options/keys/out.OptionKeys.gen.hh>
40 // AUTO-REMOVED #include <basic/options/keys/relax.OptionKeys.gen.hh>
41 #include <basic/options/keys/lh.OptionKeys.gen.hh>
42 #include <basic/options/option.hh>
43 #include <core/pose/Pose.hh>
44 // AUTO-REMOVED #include <core/scoring/ScoreFunctionFactory.hh>
46 #include <basic/Tracer.hh>
48 #include <ObjexxFCL/format.hh>
49 /// ObjexxFCL headers
50 #include <ObjexxFCL/string.functions.hh>
51 
52 #include <numeric/random/random.hh>
53 
54 #ifndef _WIN32 // REQUIRED FOR WINDOWS
55 // AUTO-REMOVED #include <unistd.h>
56 // AUTO-REMOVED #include <cctype>
57 #endif
58 
59 #include <fstream>
60 #include <utility/string_util.hh>
61 #include <boost/algorithm/string.hpp>
62 #include <boost/lexical_cast.hpp>
63 
64 //Auto Headers
67 #include <utility/vector1.hh>
68 
69 #include <numeric/random/random.hh>
70 #include <numeric/random/random_permutation.hh>
71 
72 using namespace ObjexxFCL;
73 using namespace ObjexxFCL::fmt;
74 
75 namespace protocols {
76 namespace loophash {
77 
78 using namespace protocols::wum;
79 
// Tracer for this file's log output, constructed with the channel name "MPI.LHR.Master".
// The TRDEBUG macro defined near the top of the file expands to TR.Debug.
80 static basic::Tracer TR("MPI.LHR.Master");
81 
// NOTE(review): the add_relax_batch() shuffle in this file passes numeric::random::RG, not this
// file-static generator; no other use of this RG is visible in this listing.
82 static numeric::random::RandomGenerator RG(3893251); // <- Magic number, do not change it (and dont try and use it anywhere else)
83 
84 void
85 MPI_LoopHashRefine_Master::set_defaults(){
86  using namespace basic::options;
87  using namespace basic::options::OptionKeys;
88  max_loophash_per_structure_ = option[ OptionKeys::lh::max_loophash_per_structure ]();
89  batch_relax_chunks_ = option[ OptionKeys::lh::mpi_batch_relax_chunks ]();
90  batch_relax_absolute_max_ = option[ OptionKeys::lh::mpi_batch_relax_absolute_max ]();
91  outbound_wu_buffer_size_ = option[ OptionKeys::lh::mpi_outbound_wu_buffer_size ]();
92  loophash_split_size_ = option[ OptionKeys::lh::mpi_loophash_split_size ]();
93  library_expiry_time_ = option[ OptionKeys::lh::library_expiry_time ]();
94  expire_after_rounds_ = option[ OptionKeys::lh::expire_after_rounds ]();
95  mpi_master_save_score_only_ = option[ OptionKeys::lh::mpi_master_save_score_only ]();
96 }
97 
98 
/// @brief Prepare the structure library before the main loop starts.
/// If mpi_resume() is non-empty the saved state is restored with load_state(); otherwise
/// load_structures_from_cmdline_into_library() is called. Afterwards the sample weights are
/// set up and the starting library is printed.
// NOTE(review): this listing skips source line 100, which should hold the function signature
// (presumably MPI_LoopHashRefine_Master::init(), since go() calls init()) - confirm against the real file.
99 void
101  // Are we resuming an old job ?
102  if( mpi_resume() != "" ){
103  TR << "Resuming job from IDENT: " << mpi_resume() << std::endl;
104  load_state( mpi_resume() );
105  } else {
// NOTE(review): the argument is max_lib_size() * master_rank(); what that product means depends on
// load_structures_from_cmdline_into_library(), which is not visible here - confirm.
106  load_structures_from_cmdline_into_library( max_lib_size() * master_rank() );
107  }
108 
109  // sample_weight cannot be initialized until after structures are imported, so we can check size
110  load_sample_weight();
111  TR << "STARTLIB: " << std::endl;
112  print_library();
113 }
114 
115 
116 
117 void
118 MPI_LoopHashRefine_Master::go()
119 {
120  // initialize master (this is a virtual functino call and this function is overloaded by the children of this class)
121  TR << "Init Master: " << mpi_rank() << std::endl;
122  init();
123 
124  TR << "Master Node: Waiting for job requests..." << std::endl;
125  while(true){
126  // process any incoming messages such as incoming
127  TRDEBUG << "Master: processing msgs.." << std::endl;
128  process_incoming_msgs();
129 
130  TRDEBUG << "Master: process incoming" << std::endl;
131  process_inbound_wus();
132 
133  TRDEBUG << "Master: process outbound" << std::endl;
134  process_outbound_wus();
135 
136  // ok, we've done all our work, now wait until we hear from our slaves
137  process_incoming_msgs( true );
138 
139  print_stats_auto();
140  }
141 }
142 
143 
144 
145 /// @brief Drain the inbound queue and decide what to do with each returned WU.
146 /// "waitwu" units are skipped; "loophasher" results are handed to add_relax_batch(); "resultpack"
147 /// and "batchrelax" results are merged into the library; any other type is logged as an error.
/// check_library_expiry_dates() runs before the queue is drained and print_stats() afterwards.
148 void
149 MPI_LoopHashRefine_Master::process_inbound_wus(){
150  using namespace protocols::loops;
151 
152  check_library_expiry_dates();
153  TRDEBUG << "Finished checking library dates"<<std::endl;
154  if( inbound().size() > 0 ){
155  TRDEBUG << "Processing inbound WUs on master.." << std::endl;
156  }
157  while( inbound().size() > 0 )
158  {
159  WorkUnitBaseOP next_wu = inbound().pop_next();
160  runtime_assert( next_wu );
161 
162  // skip returning waiting WUs
163  if ( next_wu->get_wu_type() == "waitwu" ) continue;
164 
165  // Downcast to a WorkUnit_SilentStructStore; the dynamic_cast result is checked for NULL below
166  WorkUnit_SilentStructStoreOP structure_wu = dynamic_cast< WorkUnit_SilentStructStore * > ( next_wu() );
167 
168  // If the downcast was unsuccessful - warn and ignore.
169  if ( structure_wu.get() == NULL ){
170  TR << "Cannot save structural data for WU: " << std::endl;
171  next_wu->print( TR );
172  continue;
173  }
174 
175  // Otherwise extract structures and figure out what to do with them
176  TRDEBUG << "Saving decoy store.. " << std::endl;
177  SilentStructStore &decoys = structure_wu->decoys();
178 
179  if ( structure_wu->get_wu_type() == "loophasher" ){
180  totaltime_loophash() += structure_wu->get_run_time();
181  TR << "LoopHash return: " << decoys.size() << " structs in " << structure_wu->get_run_time() << "s " << " frm " << structure_wu->last_received_from() << std::endl;
182  // Add the node that returned to the blacklist of all WUs with the same ssid and start_ir
// The match below compares extra_data_1 and extra_data_3. check_library_expiry_dates() compares
// extra_data_3 against ssid; extra_data_1 presumably holds start_ir - TODO confirm in WorkUnit_LoopHash.
183  for( WorkUnitQueue::iterator iter = outbound().begin(); iter != outbound().end(); iter++ ) {
184  if( (*iter)->get_wu_type() == "loophasher" ) {
185  /* // Upcast to a StructureModifier WU
186  // Why use dynamic cast when we're sure of type? (copied from above)
187  // So slow!
188  TR << "im here1" << std::endl;
189  WorkUnit_SilentStructStore* i = dynamic_cast< WorkUnit_SilentStructStore * > ( (WorkUnitBase*) (&(*(*iter))) );
190  TR << "im here2" << std::endl;
191  if( ( core::Size )(*i->decoys().begin())->get_energy("ssid") == (core::Size)(*decoys.begin())->get_energy("ssid") ) {
192 // i->extra_data_1() == structure_wu->extra_data_1() ) {
193 // */
194  if( (*iter)->extra_data_1() == structure_wu->extra_data_1() && (*iter)->extra_data_3() == structure_wu->extra_data_3() ) {
195  (*iter)->add_blacklist( structure_wu->last_received_from() );
196  TRDEBUG << "Added node " << structure_wu->last_received_from() << " to blacklist of WU " << (*iter)->id() << std::endl;
197  }
198  }
199  }
200  n_loophash_++;
201  //to_be_relaxed_.add( decoys );
// Only a non-empty loophash result produces relax work and counts towards total_structures_.
202  if( decoys.size() > 0 ){
203  add_relax_batch( decoys );
204  total_structures_ += decoys.size();
205  }
206  } else
207  if ( structure_wu->get_wu_type() == "resultpack" ){
208  decoys.all_sort_silent_scores();
209  // dump structures
210  TR << "Emperor sent: " << decoys.size() << " structs" << std::endl;
211  print_library();
212  add_structures_to_library( decoys, "add_n_limit" );
213  print_library();
214  // dont dump structures that came straight from emperor.
215  //dump_structures( decoys, mpi_master_save_score_only_ );
216  } else
217  if ( structure_wu->get_wu_type() == "batchrelax" ){
218  decoys.all_sort_silent_scores();
219  decoys.all_add_energy("state", 2 ); // mark structures as having just come through batchrelax
220  totaltime_batchrelax_ += structure_wu->get_run_time();
221  n_batchrelax_ ++;
222  TR << "BatchRelax return: " << decoys.size() << " structs in " << structure_wu->get_run_time() << "s " << " frm " << structure_wu->last_received_from() << std::endl;
223  add_structures_to_library( decoys );
224  dump_structures( decoys, mpi_master_save_score_only_ );
225  } else {
226  TR.Error << "Unknown workunit received. " << std::endl;
227  }
228 
229 
230  }
231 
232  print_stats();
233 }
234 
235 
236 
237 void
238 MPI_LoopHashRefine_Master::process_outbound_wus(){
239  TRDEBUG << "Adding loophash WUs if necessary .. " << std::endl;
240  if( outbound().size() < outbound_wu_buffer_size_ ){
241  if ( library_central().size() == 0 ){
242  TR.Error << "FATAL ERROR: library_central_ is empty! " << std::endl;
243  utility_exit_with_message( "FATAL ERROR: library_central_ is empty! " );
244  }
245  // pick a random structure from the library
246 
247  core::Size finished_structures=0;
248  for( SilentStructStore::iterator it = library_central().begin(); it != library_central().end(); it ++ ){
249  if( max_loophash_per_structure_ > (*it)->get_energy("lhcount"))
250  {
251  TRDEBUG << "Adding: " << (*it) << " " << (*it)->get_energy("lhcount") << std::endl;
252  (*it)->add_energy( "lhcount", (*it)->get_energy("lhcount") + 1.0 );
253  create_loophash_WUs( *it );
254  }else{
255  finished_structures += 1;
256  TRDEBUG << "Already done: " << (*it) << " " << (*it)->get_energy("lhcount") << std::endl;
257  }
258  }
259  TR << "WARNING: " << finished_structures << " " << library_central().size() << std::endl;
260  if ( finished_structures >= library_central().size() ){
261  TR << "WARNING: The starting structs exhausted!" << std::endl;
262  }
263  }
264 
265  save_state_auto();
266 }
267 
268 
/// @brief Split one library structure into residue-window loophash WUs and queue them on outbound().
/// The structure is turned into a pose, given secondary structure from phi/psi and a refreshed
/// "sample_weight" comment, and converted back into a silent struct that carries the original
/// scores plus updated "round", "masterid" and "parent_score" energies. Every WU created here
/// receives that same silent struct.
/// @param start_struct library structure to generate work from; asserted to be non-null.
// NOTE(review): this listing skips source lines 275 and 287-288; the latter should hold the two
// branches of the conditional that initializes ss - confirm against the real file.
269 void
270 MPI_LoopHashRefine_Master::create_loophash_WUs( const core::io::silent::SilentStructOP &start_struct ){
271 
272  runtime_assert( start_struct );
273  core::pose::Pose start_pose;
274  start_struct->fill_pose( start_pose );
276  core::pose::set_ss_from_phipsi( start_pose );
277 
278  //refresh the sampling weight comment, as it may have changed
279  // easier to do it here, copy_scores copies comments as well
280  core::pose::delete_comment(start_pose,"sample_weight");
281  core::pose::add_comment(start_pose,"sample_weight", sample_weight_str_);
282 
283  using namespace basic::options;
284  using namespace basic::options::OptionKeys;
285 
286  core::io::silent::SilentStructOP ss = option[ OptionKeys::lh::bss]() ?
// (listing lines 287-288 - the two alternatives selected by the lh::bss option - are missing here)
289  ss->fill_struct( start_pose );
290  ss->copy_scores( *start_struct );
291 
292  // first count up the "round" counter - just counts how many times each structure has been
293  // through the loop hasher
294  core::Size round = (core::Size)ss->get_energy("round");
295  round++;
296  ss->add_energy("round", round );
297  ss->add_energy("masterid", mpi_rank() );
298  ss->add_energy("parent_score", ss->get_energy("score") );
299 
300  core::Size start_ir = 1;
301  core::Size end_ir = 1;
302  core::Size ssid = (core::Size)ss->get_energy("ssid");
303 
304  core::Size count_wus = 0;
// Windows are loophash_split_size_ residues wide. When fewer than loophash_split_size_ residues
// would remain after a window, that window is stretched to the last residue and start_ir is
// pushed to total_residue() at the bottom of the body so the loop ends.
// NOTE(review): a loophash_split_size_ of 0 looks like it would never advance start_ir - confirm
// that the option cannot be 0.
305  for( ;start_ir< start_pose.total_residue(); start_ir+=loophash_split_size_ )
306  {
307  end_ir = std::min( start_ir + loophash_split_size_ - 1, start_pose.total_residue());
308  if( end_ir < start_ir) end_ir = start_ir;
309  if( start_pose.total_residue() - end_ir < loophash_split_size_ ) end_ir = start_pose.total_residue();
310  TRDEBUG << "Adding a new loophash WU: " << start_ir << " - " << end_ir << ", ssid = " << ssid << std::endl;
311 
312  count_wus++;
313  WorkUnit_LoopHashOP new_wu = new WorkUnit_LoopHash( start_ir, end_ir, ssid );
314  // this is unsatisfying.. why can't i use the template ? grrr C++ thou are limited.
315  new_wu->set_wu_type("loophasher");
316  new_wu->decoys().add( ss);
317  new_wu->clear_serial_data();
318  outbound().add( new_wu );
319  if( start_pose.total_residue() - end_ir < loophash_split_size_ ) start_ir = start_pose.total_residue();
320  }
321  TR << "Added " << count_wus << " loophash WUs to queue. ssid=" << ssid << std::endl;
322 
323 }
324 
325 
/// @brief Pack a set of decoys into "batchrelax" WUs and push them to the front of outbound().
/// The decoys are split into 1 + floor( size / batch_relax_chunks_ ) chunks of roughly equal
/// size; each chunk is shuffled and then limited to batch_relax_absolute_max_ structures.
/// @param start_decoys structures to relax; an empty store makes this a no-op.
// NOTE(review): this listing skips source line 339, which should declare new_wu - confirm against the real file.
326 void
327 MPI_LoopHashRefine_Master::add_relax_batch( SilentStructStore &start_decoys ){
328  if( start_decoys.size() == 0 ) return;
329  TR << "Adding relax WUs.." << start_decoys.size() << std::endl;
330 
331  core::Size count_adds = 0;
332  core::Size count_adds_b4_limit = 0;
333  core::Size count_wus = 0;
334 
// NOTE(review): batch_relax_chunks_ is used as a divisor here; presumably the option is never 0 - confirm.
335  core::Size chunks = 1 + core::Size( floor( core::Real(start_decoys.size()) / core::Real( batch_relax_chunks_ ) ) );
// Despite the trailing underscore, batchrelax_batchsize_ is a local variable declared right here.
336  core::Size batchrelax_batchsize_ = (start_decoys.size() / chunks) + 1;
337  core::Size dcount=0;
338  while( dcount < start_decoys.size() ){
// (listing line 339 - the declaration of new_wu - is missing here)
340  new_wu->set_wu_type("batchrelax");
341  core::Size lcount=0;
342 
// dcount advances on every pass, even past the end of start_decoys; the outer while then stops.
343  for(lcount=0; lcount < batchrelax_batchsize_; lcount++ ){
344  if ( dcount < start_decoys.size() ){
345  core::io::silent::SilentStructOP new_relax_structure = start_decoys.get_struct( dcount );
346  TRDEBUG << "AddRelaxStructure: " << format_silent_struct(new_relax_structure) << std::endl;
347  new_wu->decoys().add( new_relax_structure );
348  }
349  dcount++;
350  }
351 
352  // Mix up the order
353  //std::random__shuffle( new_wu->decoys().begin(), new_wu->decoys().end());
// Note: this passes numeric::random::RG, not the file-static RG declared at the top of this file.
354  numeric::random::random_permutation(new_wu->decoys().begin(), new_wu->decoys().end(), numeric::random::RG);
355 
356  // make sure the chunk size doesnt exceed batch_relax_absolute_max_
357  core::Size chunk_size = new_wu->decoys().size();
358  new_wu->decoys().limit( batch_relax_absolute_max_ );
359 
360  total_structures_relax_ += new_wu->decoys().size();
361  new_wu->clear_serial_data();
362 
// count_adds counts structures kept after the limit; count_adds_b4_limit counts them before it.
363  count_adds += new_wu->decoys().size();
364  count_adds_b4_limit += chunk_size;
365  count_wus ++;
366  // Relax work units have a lot of structures and fill up the queue and lead to memory crashes. Thus they get prioritized and added at the beginning of the queue!!
367  outbound().push_front( new_wu );
368  }
369 
370  TR << "Adding " << count_adds << "/" << count_adds_b4_limit << " structs for batchrlx. " << count_wus << " WUs" << std::endl;
371 
372 }
373 
374 
375 // This goes through the library and identifies structures that have not managed to get replaced
376 // for some cutoff amount of time (library_expiry_time_) or that have reached expire_after_rounds_ rounds.
377 // The first expired structure found is sent to the emperor (rank 0), its queued loophash WUs are erased,
// it is removed from the library, and the call then blocks in receive_MPI_workunit( 0 ) for the reply.
// At most one structure is handled per call: the loop breaks right after the receive.
// NOTE(review): this listing skips source line 416, which should declare getnewstruct - confirm against the real file.
378 void
379 MPI_LoopHashRefine_Master::check_library_expiry_dates(){
380  core::Size current_time = time(NULL);
381 
// NOTE(review): jt_last is assigned below but never read anywhere in this function.
382  SilentStructStore::iterator jt_last = library_central().begin();
383 
384  for( SilentStructStore::iterator jt = library_central().begin();
385  jt != library_central().end(); jt ++ )
386  {
387  TR.Debug << "Checking structure.." << std::endl;
388  core::Size struct_time = (core::Size)(*jt)->get_energy("ltime");
389  core::Size ssid = (core::Size)(*jt)->get_energy("ssid");
390  core::Size round = (core::Size)(*jt)->get_energy("round");
391 
392  bool expired = false;
393  // is the structure expired due to time limit ?
394  if( (int(current_time) - int(struct_time)) > (int)library_expiry_time_ ){
395  expired = true;
396  TR << "Structure: " << ssid << " is expired: " << int(current_time) - int(struct_time) << " > " << (int)library_expiry_time_ << std::endl;
397  }
398 
399  // is the structure expired because it has done too many rounds ? (an expire_after_rounds_ of 0 disables this check)
400  if( (expire_after_rounds_ > 0) && ( round >= expire_after_rounds_ ) ){
401  expired = true;
402  TR << "Structure: " << ssid << " Round: is expired: " << round << " >= " << expire_after_rounds_ << std::endl;
403  }
404 
405  if( ! expired ){
406  jt_last = jt;
407  continue;
408  }
409 
410  // ok, so the structure is expired. send it to the emperor and kill it. wait for a new structure to arrive
411 
412  (*jt)->add_energy("expire", (core::Size)(*jt)->get_energy("expire") + 1);
413 
414 
415  // send the expired structure to the emperor (who will in due time send back a new one)
// (listing line 416 - the declaration of getnewstruct - is missing here)
417  getnewstruct->set_wu_type( "getnewstruct" );
418  getnewstruct->decoys().add( (*jt) );
419  send_MPI_workunit( getnewstruct, 0 ); // The 0 is the MPI rank of the emperor (the matching receive below uses it too) - constant would be better here!
420 
421  // clear the queue of loophash WUs from previous struct to avoid false blacklisting
422  // assume that false blacklisting from currently processing loophash WU is unlikely
423 
424  core::Size erase_count = 0;
// The iterator only advances in the else branch; erase() returns the next position.
425  for( WorkUnitQueue::iterator iter = outbound().begin(); iter != outbound().end();) {
426  if( (*iter)->get_wu_type() == "loophasher" && ssid == (*iter)->extra_data_3() ) {
427  TRDEBUG<<"erasing wu" <<std::endl;
428  iter->reset_to_null();
429  TRDEBUG<<"erasing wu from list" <<std::endl;
430  iter = outbound().erase( iter );
431  TRDEBUG<<"erasing done" <<std::endl;
432  erase_count ++;
433  } else {
434  ++iter;
435  }
436  }
437  TR << "Erased " << erase_count << " deprecated WUs from outbound queue" << std::endl;
438 
439  // now delete this expired structure - it is now at the emperor's mercy
440  library_central().erase(jt);
441 
442  TR << "Reported expired structure to emperor: - waiting for new structure" << std::endl;
443  receive_MPI_workunit( 0 ); //receive the reply from the emperor and add it to the normal inbound queue. the 0 here is the emperor's MPIRANK. Better replace with a function or constant
444  TR << "Done. Restarting reporting.." << std::endl;
445  break; // only one at a time.
// NOTE: the statements from here to the end of the loop body are unreachable because of the break above.
446  // reset the iterator to the beginning - we must do that because we could have added the new structure wherever - beginning is the only safe iterator
447  jt=library_central().begin();
448 
449  TRDEBUG << "Library state: " << std::endl;
450  print_library();
451  }
452  TRDEBUG << "end of check_library_expiry_dates" << std::endl;
453 }
454 
455 /// This is a virtual over load of the base class MPI_LoopHashRefine:: add_structure_to_library with an extra behavioural step
456 /// that reports any successful library add-ons to the emperor. This behaviour is master specific and thus should not be in the base class.
457 
458 bool
459 MPI_LoopHashRefine_Master::add_structure_to_library( core::io::silent::SilentStruct &pss, std::string add_algorithm ){
460  bool result = MPI_LoopHashRefine::add_structure_to_library( pss, add_algorithm );
461  TR << "MPI_LoopHashRefine_Master::add_structure_to_library: " << std::endl;
462  if(result) report_structure_to_emperor( pss );
463  return result;
464 }
465 
/// @brief Send one structure to this master's emperor inside a WU of type "resultpack".
/// @param ss owning pointer to the structure; it is added to the WU's decoy store.
// NOTE(review): this listing skips source line 468, which should declare resultpack - confirm against the real file.
466 void
467 MPI_LoopHashRefine_Master::report_structure_to_emperor( core::io::silent::SilentStructOP &ss ) {
469  resultpack->set_wu_type( "resultpack" );
470  resultpack->decoys().add( ss );
// The destination rank comes from my_emperor().
471  send_MPI_workunit( resultpack, my_emperor() );
472  TR << "Reported structure to emperor: " << format_silent_struct( ss ) << std::endl;
473 }
474 
/// @brief Overload taking the structure by reference; otherwise the same steps as the
/// SilentStructOP version: wrap it in a "resultpack" WU and send it to my_emperor().
/// @param pss structure to report; it is added to the WU's decoy store.
// NOTE(review): this listing skips source line 477, which should declare resultpack - confirm against the real file.
475 void
476 MPI_LoopHashRefine_Master::report_structure_to_emperor( core::io::silent::SilentStruct &pss ) {
478  resultpack->set_wu_type( "resultpack" );
479  resultpack->decoys().add( pss );
480  send_MPI_workunit( resultpack, my_emperor() );
481  TR << "Reported structure to emperor: " << format_silent_struct(pss) << std::endl;
482 }
483 
484 
485 void
486 MPI_LoopHashRefine_Master::load_sample_weight() {
487  using namespace basic::options;
488  using namespace basic::options::OptionKeys;
489  // This just loads sample weights from a file
490  // I assume that the optionkeys sanitizes input
491 
492  // using ifstream instead of utility::io::izstream because izstream doesn't return success bool
493  if( option[ OptionKeys::lh::sample_weight_file ].active() ) {
494  std::string pathtofile = option[ OptionKeys::lh::sample_weight_file ]();
495  std::ifstream file( pathtofile.c_str() );
496  if (!file) utility_exit_with_message( "Failed to open sample_weight file. Check path." );
497  std::string line, tmp;
498  while(getline( file, line ) ) {
499 
500  boost::trim(line);
501  std::vector < std::string > r;
502  boost::split(r, line, boost::is_any_of("\t "));
503 
504 
505  core::Real i=0.0;
506  // Check for correct format
507  try {
508  i = boost::lexical_cast<core::Real> (r[1] );
509  } catch( boost::bad_lexical_cast &) {
510  utility_exit_with_message( "Sample weight second column can't be casted to an int.");
511  }
512 
513  if (i < 0) {
514  utility_exit_with_message( "Sample weight second column is not an float larger than 0." );
515  } else {
516  tmp += r[1] + " ";
517  }
518  }
519  // check for correct length
520  boost::trim(tmp);
521  std::list < std::string > t;
522  t = utility::split_to_list(tmp);
523  if( t.size() != (*(library_central().begin()))->nres() )
524  utility_exit_with_message( "Sample weight file either improperly formatted or does not have same number of residues as structure." );
525  TR << "Sample weight file successfully loaded" << std::endl;
526  sample_weight_str_ = tmp;
527  } else {
528  TR << "Using default sample weight of 50 for every residue" << std::endl;
529  std::string t = "50";
530  for( Size i = 0; i < (*(library_central().begin()))->nres() - 1; i++ ) {
531  t += " 50";
532  }
533  sample_weight_str_ = t;
534  }
535 }
536 
537 
538 
539 
540 
541 
542 } // namespace loophash
543 } // namespace protocols
544 
545