Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPIFileBufJobDistributor.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/jd2/MPIFileBufJobDistributor.cc
11 /// @brief implementation of MPIFileBufJobDistributor
12 /// @author Oliver Lange olange@u.washington.edu
13 /// @detail freely based on the MPIWorkPoolJobDistributor from Doug
14 
// MPI headers
#ifdef USEMPI
#include <mpi.h> //keep this first
#endif

// Unit headers
// TODO(review): the unit header line was lost in extraction -- reconstructed; confirm path
#include <protocols/jd2/MPIFileBufJobDistributor.hh>

// Package headers
// TODO(review): these two package headers were lost in extraction -- reconstructed from
// the use of WriteOut_MpiFileBuffer and job_outputter() below; confirm paths
#include <protocols/jd2/MpiFileBuffer.hh>
#include <protocols/jd2/JobOutputter.hh>
#include <protocols/jd2/Job.hh>

#include <protocols/moves/Mover.hh>

#include <utility/io/ozstream.hh> //to toggle MPI rerouting

// Utility headers
#include <basic/Tracer.hh>
#include <basic/options/option.hh>
#include <utility/exit.hh>
// AUTO-REMOVED #include <utility/assert.hh>

// Option headers
#include <basic/options/keys/out.OptionKeys.gen.hh>
#ifdef USEMPI
#include <basic/options/keys/jd2.OptionKeys.gen.hh>
#endif

// C++ headers
#include <string>
#include <basic/prof.hh>
// ObjexxFCL headers
#include <ObjexxFCL/string.functions.hh>
#include <utility/string_util.hh>
#include <utility/vector1.hh>
51 
52 
53 static basic::Tracer tr("protocols.jd2.MPIFileBufJobDistributor");
54 
55 namespace protocols {
56 namespace jd2 {
57 
58 using namespace core;
59 
60 
61 using namespace basic::options;
62 using namespace basic::options::OptionKeys;
63 
64 ///@details constructor. Notice it calls the parent class! It also builds some internal variables for determining
65 ///which processor it is in MPI land.
68  n_rank_( 1 ),
69  rank_( 0 ),
70  slave_current_job_id_( 0 ),
71  slave_current_batch_id_( 0 ),
72  // next_job_to_assign_( 0 ),
73  bad_job_id_( 0 ),
74  repeat_job_( false ),
75  master_rank_( 1 ),
76  file_buf_rank_( 0 ),
77  min_client_rank_( 2 ),
78  cumulated_runtime_( 0.0 ),
79  cumulated_jobs_( 0 )
80 {
81 
82  // set n_rank_ and rank based on whether we are using MPI or not
83 #ifdef USEMPI
84  MPI_Comm_rank( MPI_COMM_WORLD, ( int* )( &rank_ ) );
85  MPI_Comm_size( MPI_COMM_WORLD, ( int* )( &n_rank_ ) );
87 #else
88  utility_exit_with_message( "ERROR ERROR ERROR: The MPIFileBufJobDistributor will not work unless you have compiled using extras=mpi" );
89 #endif
90 }
91 
92 ///@details constructor. Notice it calls the parent class! It also builds some internal variables for determining
93 ///which processor it is in MPI land.
95  core::Size master_rank,
96  core::Size file_buf_rank,
97  core::Size min_client_rank,
98  bool start_empty
99 ) :
100  JobDistributor( start_empty /*call empty c'tor*/ ),
101  n_rank_( 1 ),
102  rank_( 0 ),
103  slave_current_job_id_( 0 ),
104  slave_current_batch_id_( 0 ),
105  // next_job_to_assign_( 0 ),
106  bad_job_id_( 0 ),
107  repeat_job_( false ),
108  master_rank_( master_rank ),
109  file_buf_rank_(file_buf_rank ),
110  min_client_rank_( min_client_rank ),
111  cumulated_runtime_( 0.0 ),
112  cumulated_jobs_( 0 )
113 {
114 
115  // set n_rank_ and rank based on whether we are using MPI or not
116 #ifdef USEMPI
117  //n_rank_ = MPI::COMM_WORLD.Get_size();
118  //rank_ = MPI::COMM_WORLD.Get_rank();
119  MPI_Comm_rank( MPI_COMM_WORLD, ( int* )( &rank_ ) );
120  MPI_Comm_size( MPI_COMM_WORLD, ( int* )( &n_rank_ ) );
122 #else
123  utility_exit_with_message( "ERROR ERROR ERROR: The MPIFileBufJobDistributor will not work unless you have compiled using extras=mpi" );
124 #endif
125 }
126 
127 ///@brief dtor
128 ///WARNING WARNING! SINGLETONS' DESTRUCTORS ARE NEVER CALLED IN MINI! DO NOT TRY TO PUT THINGS IN THIS FUNCTION!
129 ///here's a nice link explaining why: http://www.research.ibm.com/designpatterns/pubs/ph-jun96.txt
131 {}
132 
133 void
135  utility::io::ozstream::enable_MPI_reroute( min_client_rank_, file_buf_rank_ );
137  buffer.run(); //returns immediately if not buffer_rank
138  tr.Debug << "finished call to buffer.run()" << std::endl;
139  if ( rank_ == master_rank_ ) {
140  tr.Debug << "Master JD starts" << std::endl;
141  master_go( mover );
142  tr.Debug << "send STOP to FileBuffer " << std::endl;
143  buffer.stop(); //this communicates to the file_buf_rank_ that it has to stop the run() loop.
144  } else if ( rank_ >= min_client_rank_ ) {
145  go_main( mover );
146  tr.Debug << "Slave JD finished!" << std::endl;
147  }
148 
149  // ideally these would be called in the dtor but the way we have the singleton pattern set up the dtors don't get
150  // called
151 #ifdef USEMPI
152  MPI_Barrier( MPI_COMM_WORLD );
153  MPI_Finalize();
154 #endif
155 }
156 
157 
158 ///@details This is the heart of the MPIFileBufJobDistributor. It consistits of two while loops: the job
159 ///distribution loop (JDL) and the node spin down loop (NSDL). The JDL has three functions. The first is to recieve and
160 ///process messages from the slave nodes requesting new job ids. The second is to recieve and process messages from the
161 ///slave nodes indicating a bad input. The third is to recive and process job_success messages from the slave nodes and
162 ///block while the slave node is writing its output. This is prevent Sizeerleaving of output in score files and silent
163 ///files. The function of the NSDL is to keep the head node alive while there are still slave nodes processing. Without
164 ///the NSDL if a slave node finished its allocated job after the head node had finished handing out all of the jobs and
165 ///exiting (a very likely scenario), it would wait indefinitely for a response from the head node when requesting a new
166 ///job id.
167 void MPIFileBufJobDistributor::send_job_to_slave( Size MPI_ONLY(slave_rank) ) {
168 #ifdef USEMPI
169  int buf[ 2 ];
170  if ( rank_ == master_rank_ ) {
171  buf[ 0 ] = current_job_id();
172  buf[ 1 ] = current_batch_id();
173  tr.Debug << "Master: send new job: " << buf[ 0 ] << " " << buf[ 1 ] << std::endl;
174  MPI_Send( &buf, 2, MPI_INT, slave_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
175  } else {
176  runtime_assert( rank_ == slave_rank );
177  int buf[ 2 ]; MPI_Status status;
178  MPI_Recv( &buf, 2, MPI_INT, master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status );
179  slave_current_job_id_ = buf[ 0 ];
180  slave_current_batch_id_ = buf[ 1 ];
181  tr.Debug << "Slave: receive job: " << buf[ 0 ] << " " << buf[ 1 ] << std::endl;
182  }
183 #endif
184 }
185 
186 ///@details messages are received constantly by Master JobDistributor and then the virtual process_message() method
187 /// is used to assign some action to each message ... this allows child-classes to answer to more messages or change behaviour of already known messages
188 bool
189 MPIFileBufJobDistributor::process_message( Size msg_tag, Size slave_rank, Size slave_job_id, Size slave_batch_id, core::Real runtime ) {
190  switch ( msg_tag ) {
191  case NEW_JOB_ID: //slave requested a new job id ... send new job or spin-down signal
192  tr.Debug << "Master Node: Sending new job id " << current_job_id() << " " << "job: " << current_job()->input_tag() << " to node " << slave_rank << std::endl;
193  send_job_to_slave( slave_rank );
194  if ( current_job_id() ) {
195  ++jobs_assigned_;
196  obtain_new_job();
197  } else {
199  }
200  break;
201  case BAD_INPUT: //slave reports failed job
202  tr.Debug << "Master Node: Received job failure message for job id " << slave_job_id << " from node " << slave_rank << std::endl;
203  ++bad_jobs_;
204  mark_job_as_bad( slave_job_id, slave_batch_id );
205  ++jobs_returned_;
206  break;
207  case JOB_SUCCESS:
208  mark_job_as_completed( slave_job_id, slave_batch_id, runtime );
209  ++jobs_returned_;
210  break;
211  case JOB_FAILED_NO_RETRY :
212  ++jobs_returned_;
213  break;
214  default:
215  tr.Error << "[ERROR] from " << slave_rank << " tag: " << msg_tag << " " << slave_job_id << std::endl;
216  utility_exit_with_message(" unknown tag "+ ObjexxFCL::string_of( msg_tag ) +" in master_loop of MPIFileBufJobDistributor ");
217  return false;
218  }
219  return true;
220 }
221 
222 ///@brief mark job as completed
224  if ( runtime > 0 ) {
225  cumulated_runtime_+=runtime;
226  cumulated_jobs_ += 1;
227  }
228  if ( batch_id == current_batch_id() ) {
229  Parent::mark_job_as_completed( job_id, runtime );
230  }
231 }
232 
233 ///@brief mark job as failed --- remove future versions of same input from list
235  if ( batch_id == current_batch_id() ) {
236  bad_job_id_ = job_id;
238  }
239 }
240 
241 ///@brief receive message of certain type -- and ignore it ... sometimes needed in communication protocol
242 void
243 MPIFileBufJobDistributor::eat_signal( Size msg_tag_in, int MPI_ONLY( source ) ) {
244 #ifdef USEMPI
245  Size const mpi_size( 4 );
246  int mpi_buf[ mpi_size ];
247  while( true ) {
248  Size const mpi_size( 4 );
249  int mpi_buf[ mpi_size ];
250  MPI_Status status;
251  MPI_Recv( &mpi_buf, mpi_size, MPI_INT, source, MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status);
252  Size slave_rank( status.MPI_SOURCE );
253  Size const msg_tag ( mpi_buf[ 0 ] );
254  Size const slave_job_id( mpi_buf[ 1 ] );
255  Size const slave_batch_id( mpi_buf[ 2 ]);
256  Size const runtime( mpi_buf[ 3 ] );
257  if ( msg_tag_in != msg_tag ) {
258  tr.Debug <<" when trying to eat signal " << msg_tag_in <<" I received " << msg_tag << " going to process this now" << std::endl;
259  process_message( msg_tag, slave_rank, slave_job_id, slave_batch_id, runtime );
260  } else {
261  break;
262  }
263  }
264 #else
265  //Size const msg_tag ( 0 );
266 #endif
267 
268  tr.Debug << "eating expected signal " << msg_tag_in << std::endl;
269  // runtime_assert( msg_tag == msg_tag_in );
270 }
271 
272 ///@brief the main message loop --- master cycles thru until all slave nodes have been spun down
273 void
275 {
276 #ifdef USEMPI
277  runtime_assert( rank_ == master_rank_ );
278 
279  Size const mpi_size( 4 );
280  int mpi_buf[ mpi_size ];
281 
282  MPI_Status status;
283  MPI_Request request;
284 
285  // set first job to assign
286  obtain_new_job();
287 
288  // initialize some statistics -- these are member variables, since they are also used in process_messages()
289  jobs_assigned_ = 0;
290  jobs_returned_ = 0;
291  bad_jobs_ = 0;
292  tr.Info << "Starting JobDistribution with " << n_worker() << " worker processes " << std::endl;
293  using namespace basic::options;
294  using namespace basic::options::OptionKeys;
295  n_nodes_left_to_spin_down_ = option[ OptionKeys::jd2::mpi_nowait_for_remaining_jobs ]() ? 0 : ( n_worker() );
296  double timeout_wait_factor = option[ OptionKeys::jd2::mpi_timeout_factor ](); //wait at most X-times the average job-time for stray jobs...
297  // Job Distribution Loop --- receive message and process -- repeat
299  tr.Debug << "current_job_id: " << current_job_id() << " jobs_returned " << jobs_returned_
300  << " jobs_assigned_ " << jobs_assigned_ << " nodes_to_spin_down " << n_nodes_left_to_spin_down_ << std::endl;
301  //receive message
302  tr.Debug << "Master Node: Waiting for job requests..." << std::endl;
303  MPI_Irecv( &mpi_buf, mpi_size, MPI_INT, MPI_ANY_SOURCE, MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &request);
304  int flag=0;
305  bool only_some_nodes_unfinished = ( 1.0 * n_nodes_left_to_spin_down_ / n_worker() ) < 0.2;
306 
307 
308  double timeout=MPI_Wtime();
309  double timeout_limit = ( cumulated_jobs_ < 1 || timeout_wait_factor <=0 ) ? 1e9 : cumulated_runtime_/cumulated_jobs_ * timeout_wait_factor ;
310  tr.Debug << "set timeout_limit to " << timeout_limit << std::endl;
311  while ( flag==0 ) {
312  if ( MPI_Wtime() - timeout > timeout_limit ) break;
313  MPI_Test(&request, &flag, &status);
314  }
315  if ( flag == 0 ) { //timeout
316  utility_exit_with_message("quick exit from job-distributor due to flag jd2::mpi_nowait_for_remaining_jobs and timeout of "+
317  utility::to_string( MPI_Wtime()-timeout )+" seconds\n"+"increase time-out by using -jd2:mpi_timeout_factor" );
318  }
319  Size slave_rank( status.MPI_SOURCE );
320  Size const msg_tag ( mpi_buf[ 0 ] );
321  Size const slave_job_id( mpi_buf[ 1 ] );
322  Size const slave_batch_id( mpi_buf[ 2 ]);
323  Real const runtime( mpi_buf[ 3 ]);
324  tr.Debug << "Master Node: Recieved message from " << slave_rank << " with tag "
325  << msg_tag << " slave_jobid " << slave_job_id << " slave batchid " << slave_batch_id << std::endl;
326 
327  //process message
328  process_message( msg_tag, slave_rank, slave_job_id, slave_batch_id, runtime);
329 
330  }
331 
332  //finished
333  tr.Info << "Master Node: Finished sending spin down signals to slaves" << std::endl;
334  tr.Info << "Master Node stats: jobs-send out: " << jobs_assigned_ << " returned: " << jobs_returned_ << " bad jobs: " << bad_jobs_ << std::endl;
335 
336  if ( option[ OptionKeys::jd2::mpi_nowait_for_remaining_jobs ]() ) {
337  utility_exit_with_message("quick exit from job-distributor due to flag jd2::mpi_nowait_for_remaining_jobs --- this is not an error " );
338  }
339 
340 #endif
341 }
342 
343 ///@brief dummy for master/slave version
346 {
347  if ( rank_ == master_rank_ ) {
348  return master_get_new_job_id();
349  } else {
350  return slave_get_new_job_id();
351  }
352  return 0;
353 }
354 
355 
356 ///@brief work out what next job is
359 {
360  using namespace basic::options;
361  using namespace basic::options::OptionKeys;
362 
363  Jobs const & jobs( get_jobs() );
364  JobOutputterOP outputter = job_outputter();
365 
366  core::Size next_job_to_assign = current_job_id() + 1;
367  basic::show_time( tr, "assign job "+ObjexxFCL::string_of(next_job_to_assign)+" batch: "+ObjexxFCL::lead_zero_string_of( current_batch_id(),5 ) );
368  //increase job-id until a new job is found
369  while( next_job_to_assign <= jobs.size()) {
370  if ( jobs[ next_job_to_assign ]->bad() ) { //don't start jobs with known bad input
371  continue;
372  } else if ( !outputter->job_has_completed( jobs[ next_job_to_assign ] ) ) { //don't start jobs with have been completed ( in previous runs )
373  tr.Debug << "Master Node: Getting next job to assign from list id " << next_job_to_assign << " of " << jobs.size() << std::endl;
374  return next_job_to_assign;
375  } else if ( outputter->job_has_completed( jobs[ next_job_to_assign ] ) && option[ out::overwrite ].value() ) { //ignore what I just said -- we ignore previous data
376  tr.Debug << "Master Node: Getting next job to assign from list, overwriting id " << next_job_to_assign << " of " << jobs.size() << std::endl;
377  return next_job_to_assign;
378  }
379  //arrives here only if job has already been completed on the file-system
380  mark_job_as_completed( next_job_to_assign, current_batch_id(), -1.0 ); //need this for the MPIArchiveJobDistributor
381  ++next_job_to_assign;
382  }
383  tr.Debug << "Master Node: No more jobs to assign, setting next job id to zero" << std::endl;
384  return 0;
385 }
386 
387 //overloaded so that slave-nodes never automatically switch to next_batch when spinning down.
388 bool
390  if ( rank_ != master_rank_ ) return false; //slave answer
391  cumulated_jobs_ = 0;
392  cumulated_runtime_ = 0;
393  return Parent::next_batch();
394 }
395 
398 {
399 #ifdef USEMPI
400  runtime_assert( rank_ != master_rank_ );
401 
402  if ( repeat_job_ == true ) {
403  tr.Debug << "Slave Node " << rank_ << ": Repeating job id " << slave_current_job_id_ <<std::endl;
404  repeat_job_ = false;
405  } else {
406  tr.Debug << "Slave Node " << rank_ << ": Requesting new job id from master" <<std::endl;
410  tr.Debug << "Slave Node " << rank_ << ": Received job id " << slave_current_job_id_
411  << ( slave_current_batch_id_ ? " batch: "+ get_current_batch() : "" ) << " from master" << std::endl;
412 
413  }
414 #endif
415  return slave_current_job_id_;
416 }
417 
418 ///@brief dummy for master/slave version
419 void
421 {
422  if ( rank_ == master_rank_ ) {
424  } else {
427  }
428 }
429 
430 void
432 {
433  runtime_assert( rank_ == master_rank_ );
434  tr.Debug << "Master Node: Mark current job for repetition" << std::endl;
435  utility_exit_with_message( "Master Node: master_mark_current_job_id_for_repetition() should never be called" );
436 }
437 
438 void
440 {
441  runtime_assert( rank_ != master_rank_ );
442  tr.Debug << "Slave Node " << rank_ << ": Mark current job for repetition, id " << current_job_id() << std::endl;
443  repeat_job_ = true;
444 }
445 
446 ///@brief dummy for master/slave version
447 void
449 {
450  if ( rank_ == master_rank_ ) {
452  } else {
454  }
455 }
456 
457 void
459 {
460  runtime_assert( rank_ == master_rank_ );
461 
462  if ( tr.Debug.visible() ) {
463  Jobs const& jobs( get_jobs() );
464  std::string const & bad_job_id_input_tag( jobs[ bad_job_id_ ]->input_tag() );
465  tr.Debug << "Master Node: Job id "
466  << job_outputter()->output_name( jobs[ bad_job_id_ ] )
467  << " failed, reporting bad input; other jobs of same input will be canceled: " << bad_job_id_input_tag << std::endl;
468  }
469 
470  Parent::mark_job_as_bad( bad_job_id_ );//this sets all jobs with this input_tag to bad!
471  obtain_new_job( true /*re_consider_current_job*/ );
472 }
473 
474 void
476 #ifdef USEMPI
477  runtime_assert( rank_ != master_rank_ );
478  Size const mpi_size( 4 );
479  int mpi_buf[ mpi_size ];
480  mpi_buf[ 0 ] = tag;
481  mpi_buf[ 1 ] = slave_current_job_id_;
482  mpi_buf[ 2 ] = slave_current_batch_id_;
483  mpi_buf[ 3 ] = static_cast< int > ( slave_current_runtime_ );
484  MPI_Send( &mpi_buf, mpi_size, MPI_INT, master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
485 #endif
486 }
487 
488 void
490 {
492 }
493 
494 ///@brief dummy for master/slave version
495 void
497 {
498  if ( rank_ == master_rank_ ) {
499  master_job_succeeded( pose );
500  } else {
501  slave_current_runtime_ = runtime;
502  slave_job_succeeded( pose );
503  }
504 }
505 
506 void
509  bool will_retry
510 )
511 {
512  assert( rank_ >= min_client_rank_ );
513  if ( ! will_retry ) {
514  // tell the master node that this job has failed and will not be
515  // re-attempted
517  }
518 }
519 
520 
521 void
523 {
524 #ifdef USEMPI
525  runtime_assert( rank_ == master_rank_ );
526  tr.Debug << "Master Node: Job Succeeded" << std::endl;
527  utility_exit_with_message( "Master Node: master_job_succeeded() should never be called" );
528 #endif
529 }
530 
531 void
533 {
534  runtime_assert( !( rank_ == master_rank_ ) );
535  job_outputter()->final_pose( current_job(), pose );
537 }
538 
539 
540 }//jd2
541 }//protocols