Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPIArchiveJobDistributor.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // This file is part of the Rosetta software suite and is made available under license.
5 // The Rosetta software is developed by the contributing members of the Rosetta Commons consortium.
6 // (C) 199x-2009 Rosetta Commons participating institutions and developers.
7 // For more information, see http://www.rosettacommons.org/.
8 
9 /// @file MPIArchiveJobDistributor.cc
10 /// @brief implementation of MPIArchiveJobDistributor
11 /// @author Oliver Lange olange@u.washington.edu
12 
13 // MPI headers
14 #ifdef USEMPI
15 #include <mpi.h> //keep this first
16 #endif
17 
18 //testing memory
19 
20 // Unit headers
23 // AUTO-REMOVED #include <protocols/jd2/BatchJobInputter.hh> //for BOGUS_BATCH_ID
24 // Package headers
25 // AUTO-REMOVED #include <protocols/jd2/JobOutputter.hh>
26 #include <protocols/jd2/Job.hh>
27 
28 #include <protocols/moves/Mover.hh>
29 
31 #include <utility/io/ozstream.hh> //to toggle MPI rerouting
32 
33 // Utility headers
34 #include <basic/Tracer.hh>
35 #include <basic/MemTracer.hh>
36 #include <basic/options/option.hh>
37 #include <utility/exit.hh>
38 // AUTO-REMOVED #include <utility/assert.hh>
39 #include <basic/prof.hh>
40 #include <ObjexxFCL/string.functions.hh>
41 
42 // Option headers
43 #include <basic/options/keys/out.OptionKeys.gen.hh>
44 // AUTO-REMOVED #include <basic/options/keys/jd2.OptionKeys.gen.hh>
45 #include <basic/options/keys/archive.OptionKeys.gen.hh>
46 #include <basic/options/keys/run.OptionKeys.gen.hh>
47 
48 // C++ headers
49 #include <string>
50 // AUTO-REMOVED #include <ctime>
51 // AUTO-REMOVED #include <math.h>
52 #include <basic/prof.hh>
53 //Auto Headers
54 #include <utility/vector1.hh>
55 
56 static basic::Tracer tr("protocols.jd2.MPIArchiveJobDistributor");
57 using basic::mem_tr;
58 
59 namespace protocols {
60 namespace jd2 {
61 namespace archive {
62 
/// Rank numbers reserved for the dedicated processes of this job distributor,
/// listed in ascending rank order.
int const in_file_buf_rank_ = 0;
int const in_master_rank_ = 1; // keep const for now
int const in_archive_rank_ = 2;
int const in_min_client_rank_ = 3;
68 
69 using namespace basic::options;
70 using namespace basic::options::OptionKeys;
71 using namespace core;
72 
73 
74 ///@details constructor. Notice it calls the parent class! It also builds some internal variables for determining
75 ///which processor it is in MPI land.
78  nr_notify_( option[ OptionKeys::archive::completion_notify_frequency] ),
79  archive_rank_( in_archive_rank_ )
80 {
81 
82  //if we are testing we want to send JOB_COMPLETION more often
83  if ( option[ OptionKeys::run::test_cycles ] || option[ OptionKeys::run::dry_run ] ) {
84  nr_notify_ = std::min( nr_notify_, Size(10) );
85  }
86 }
87 
88 void
90  if ( rank() == archive_rank() ) {
91  theArchive_ = archive;
92  }
93 }
94 ///@brief dummy for master/slave version -- start the appropriate process depending on rank()
95 void
97 {
98  //copied MPIFileJobDistributor, because in this case the archive - process sends stop to FileBuf.
99  utility::io::ozstream::enable_MPI_reroute( min_client_rank(), file_buf_rank() );
100  mem_tr << "MPIArchiveJobDistributor::go" << std::endl;
101  /// JD
102  if ( rank() == master_rank() ) {
103  tr.Warning << "Master JD starts" << std::endl;
104  master_go( mover );
105  } else if ( rank() == file_buf_rank() ) {
106  /// FileBuffer
108  tr.Warning << "FileBuffer starts " << std::endl;
109  buffer.run();
110  } else if ( rank() == archive_rank() ) {
111  /// Archive
112  tr.Warning << "Archive starts... " << std::endl;
114  runtime_assert( theArchive_ );
115  archive.go( theArchive_ );
116  tr.Warning << "send STOP to FileBuffer " << std::endl;
118  buffer.stop();
119  } else if( rank() >= min_client_rank() ){
120  /// Slave/Runner/Worker
121  go_main( mover );
122  }
123 
124  // ideally these would be called in the dtor but the way we have the singleton pattern set up the dtors don't get
125  // called
126 #ifdef USEMPI
127  MPI_Barrier( MPI_COMM_WORLD );
128  MPI_Finalize();
129 #endif
130  if ( rank() == master_rank() ) {
131  std::cerr << "MPI FINALIZED closing down... " << std::endl;
132  std::cout << "MPI FINALIZED closing down... " << std::endl;
133  }
134 }
135 
136 ///@detail receive a new batch from ArchiveManager -- interpret batch_nr == 0 as STOP
137 bool
138 MPIArchiveJobDistributor::receive_batch( Size MPI_ONLY( source_rank ) ) {
139 basic::prof_show();
140 #ifdef USEMPI
141  MPI_Status status;
142  int buf[ 2 ];
143  //receive size of string
144  MPI_Recv( buf, 2, MPI_INT, source_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status );
145  Size size( buf[ 0 ]);
146  Size id( buf[ 1 ] );
147  //receive string
148  std::string new_batch;
149  char *cbuf = new char[ size+1 ];
150  MPI_Recv( cbuf, size, MPI_CHAR, source_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status );
151 
152  //STOP?
153  if ( id == 0 ) { //use this as STOP signal!
154  tr.Debug << "received STOP signal from Archive " << std::endl;
155  delete[] cbuf;
156  return false;
157  }
158 
159  ///assign C++ string to cbuf
160  new_batch.assign( cbuf, size );
161  delete[] cbuf;
162 
163  tr.Info << "received new batch " << new_batch << " with id " << id << std::endl;
164  add_batch( new_batch, id );
165 #endif
166  return true;
167 }
168 
169 ///@detail sync batches with worker nodes.. this is called if they get a job for a batch they don't know yet...
170 /// this method will send ALL batches they don't have yet.
171 void
172 MPIArchiveJobDistributor::sync_batches( Size MPI_ONLY( slave_rank ) ) {
173  PROF_START( basic::ARCHIVE_SYNC_BATCHES );
174 #ifdef USEMPI
175  tr.Trace << "Node " << rank() << " sync batches with " << slave_rank << std::endl;
176  int buf[ 4 ];
177  buf[ 1 ] = ADD_BATCH;
178  MPI_Status status;
179 
180  ///send last known batch from SLAVE --> MASTER
181  Size slave_batch_size( nr_batches() );
182  Size nr_to_have;
183  if ( rank() != master_rank() ) { //SLAVE -- SEND
184  buf[ 0 ] = slave_batch_size;
185  MPI_Send( &buf, 1, MPI_INT, master_rank(), MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
186  } else { //MASTER -- RECEIVE
187  MPI_Recv( &buf, 1, MPI_INT, slave_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status );
188  slave_batch_size = buf[ 0 ];
189  }
190 
191  tr.Trace << "Node " << rank() << " slave_batch_size " << slave_batch_size << std::endl;
192 
193  //MASTER --> SLAVE how many batches will be sent
194  nr_to_have = nr_batches();
195  if ( rank() != master_rank() ) { //SLAVE
196  MPI_Recv( &buf, 1, MPI_INT, master_rank(), MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status );
197  nr_to_have = buf[ 0 ];
198  } else { //MASTER
199  buf[ 0 ] = nr_to_have;
200  MPI_Send( &buf, 1, MPI_INT, slave_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
201  }
202  tr.Trace << "Node " << rank() << " master_batch_size " << nr_to_have << std::endl;
203 
204  //MASTER --> SLAVE now send the individual batches
205  for ( Size send_id = slave_batch_size + 1; send_id <= nr_to_have; ++send_id ) {
206  if ( rank() != master_rank() ) { //SLAVE
208  tr.Trace << "nr_batches() " << nr_batches() << " send_id " << send_id << std::endl;
209  runtime_assert( nr_batches() == send_id );
210  } else { //MASTER
211  //send size of string
212  buf[ 0 ] = batch( send_id ).size();
213  buf[ 1 ] = send_id;
214  MPI_Send(buf, 2, MPI_INT, slave_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
215  //send string
216  MPI_Send(const_cast<char*> ( batch( send_id ).data()), batch( send_id ).size(), MPI_CHAR, slave_rank, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
217  }
218  }
219  #endif
220  PROF_STOP( basic::ARCHIVE_SYNC_BATCHES );
221 }
222 
223 ///@detail send message to ArchiveManager .. eg. QueueEmpty
224 /// always send current_batch_id with the message ... used to determine if QueueEmpty is outdated
225 void
227 #ifdef USEMPI
228  runtime_assert( rank() == master_rank() );
229  runtime_assert( rank() != archive_rank() );
230  Size const mpi_size( 6 );
231  int mpi_buf[ mpi_size ];
232  mpi_buf[ 0 ] = tag;
233  mpi_buf[ 1 ] = current_batch_id();
234  MPI_Send( &mpi_buf, mpi_size, MPI_INT, archive_rank(), MPI_ARCHIVE_TAG, MPI_COMM_WORLD );
235 #endif
236 }
237 
238 ///@detail called if JD is at the end of the BatchQueue...
239 /// for a worker node that might mean it needs to sync batches with the Master
240 /// for a master node it means it sends QUEUE-EMPTY to the ArchiveManager
241 void
243  if ( !( rank() == master_rank() ) ) {
245  sync_batches( rank() );
246  } else if ( rank() == master_rank() ) {
247  PROF_START( basic::MPI_JD2_WAITS_FOR_ARCHIVE );
248  tr.Debug << "no more batches... ask ArchiveManager if there is some more to do... wait..." << std::endl;
249  _notify_archive();
250  basic::show_time( tr, "no more batches: send QUEUE_EMPTY to archive" );
252  tr.Info << "wait for answer on QUEUE-EMPTY msg... send with " << current_batch_id() << " batch_id " << std::endl;
254  receive_batch( archive_rank() ); //how about some time-out
255  tr.Debug << "...received " << std::endl;
256  basic::show_time( tr, "refilled queue: received new batches after QUEUE_EMPTY" );
257  PROF_STOP( basic::MPI_JD2_WAITS_FOR_ARCHIVE );
258  }
259 }
260 
261 ///@detail process messages... BATCH_SYNC, ADD_BATCH, or delegate to Parent class
262 /// also send pending CompletionMessages out...
263 ////this is a good place to do it, since CompletionMessages are non-blocking and we are otherwise in blocking communication with WorkerNodes
264 bool
266  core::Size msg_tag,
267  core::Size slave_rank,
268  core::Size slave_job_id,
269  core::Size slave_batch_id,
270  core::Real run_time
271 ) {
272  runtime_assert( rank() == master_rank() );
273 
274  // basic::show_time( tr, "jd2 main msg-loop: process message..." );
275 
276  // send out any pending notifications to archive if present -- this is non-blocking
277  _notify_archive(); //we should get here often enough... (basically every finished job
278  //-- unless of course we haven't started any jobs yet)
279 
280  // now go thru messages
281  switch ( msg_tag ) {
282  case BATCH_SYNC: //a slave has received a job with an unknown batch and asks for an update of the Batchlist
283  sync_batches( slave_rank );
284  break;
285  case ADD_BATCH: //the ArchiveManager adds a new BATCH
286  runtime_assert( slave_rank == archive_rank() );
288  break;
289  case CANCEL_BATCH: //the ArchiveManager cancels a certain BATCH
290  runtime_assert( slave_rank == archive_rank() );
291  {
292  //bool was_good( get_current_batch() != jd2::BatchJobInputter::BOGUS_BATCH_ID );
293  tr.Trace << "currently running batch " << get_current_batch() << std::endl;
294  receive_batch( archive_rank() ); //right now we hack it... ArchiveSends BOGUS_BATCH_ID 5 to cancel batch 5
295  // now in the batch_list the name of the respective batch willl be changed to BOGUS_BATCH_ID ... next time we
296  // get to it we will skip, if we are running it now, we will terminate.
297  tr.Trace << "now the current batch is " << get_current_batch() << std::endl;
298  // put this functionality into Baseclass: JobDistributor::obtain_new_job -- it was probably causing a dead-lock in the communication...
299  // if ( was_good && get_current_batch() == jd2::BatchJobInputter::BOGUS_BATCH_ID ) next_batch();
300  }
301  break;
302  default:
303  return Parent::process_message( msg_tag, slave_rank, slave_job_id, slave_batch_id, run_time );
304  }
305 
306  return true;
307 }
308 
309 ///@detail queue up a CompletionMessage
310 void
312  //TODO: check if there are older messages regarding this batch... if so ... remove
313  tr.Debug << "add to notification queue " << msg.batch_id << std::endl;
314  if ( pending_notifications_.size()
315  && pending_notifications_.back().batch_id == msg.batch_id
316  && pending_notifications_.back().msg_tag == msg.msg_tag
317  ) {
318  pending_notifications_.back() = msg;
319  } else {
320  pending_notifications_.push_back( msg );
321  }
322 }
323 
/// State kept between calls for the non-blocking (MPI_Isend) notification of the archive.
#ifdef USEMPI
bool notify_first = true;   ///< still true while no send has been started with notify_buf
int notify_buf[ 6 ];        ///< message buffer handed to MPI_Isend; must stay alive until that send completes
MPI_Request notify_request; ///< request handle of the most recent MPI_Isend
#endif
330 
331 ///the private implementation of notify_archive.
332 /// send JOB_COMPLETION message to Archive if a message is in the message queue.
334  PROF_START( basic::MPI_NOTIFY_ARCHIVE );
335  static basic::Tracer notification_tracer("protocols.jd2.notifications");
336 
337  //nothing in queue?
338  if ( pending_notifications_.size() == 0 ) return;
339 
340  //okay, queue is filled send something
341 #ifdef USEMPI
342  int flag( 1 );
343  if ( !notify_first ) { //if not first time, make sure last message has been received already...
344  notification_tracer.Debug << "test MPI-Send completion of last JOB_COMPLETION ( batch_" << notify_buf[ 1 ] << " ) message...";
345  basic::show_time( tr, "try to send JOB_COMPLETION" );
346  MPI_Status status;
347  MPI_Test( &notify_request, &flag, &status ); //has last communication succeeded ? --- buffer is free again.
348  int flag2;
349  MPI_Test_cancelled( &status, &flag2 );
350  notification_tracer.Debug << ( flag ? "completed " : "pending " ) << ( !flag2 ? "/ test succeeded " : "/ test cancelled" ) << std::endl;
351  }
352  if ( flag ) {
353  //okay ready to send next message
354  CompletionMessage const& msg( pending_notifications_.front() );
355  notification_tracer.Debug << "send out JOB_COMPLETION " << msg.batch_id << std::endl;
356  basic::show_time( tr, "send JOB_COMPLETION" );
357  // int notify_buf[ 6 ];
358  notify_buf[ 0 ] = msg.msg_tag;//JOB_COMPLETION, QUEUE_EMPTY;
359  notify_buf[ 1 ] = msg.batch_id;
360  notify_buf[ 2 ] = msg.final ? 1 : 0;
361  notify_buf[ 3 ] = msg.bad;
362  notify_buf[ 4 ] = msg.good;
363  notify_buf[ 5 ] = msg.njobs;
364  MPI_Isend( &notify_buf, 6, MPI_INT, archive_rank(), MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &notify_request ); //don't block JobDistributor
365  pending_notifications_.pop_front();
366  notify_first = false;
367  }
368  basic::show_time( tr, "finished _notify_archive" );
369 #endif
370  PROF_STOP( basic::MPI_NOTIFY_ARCHIVE );
371 }
372 
373 ///@detail work out if a CompletionMessage should be sent... looks at completed/bad decoys
374 /// sends "final" message if all jobs are done... sends "update" message if nr_new_completed_ >= nr_notify_
375 void
377  /// send "final" message ?
378  tr.Trace << "notify_archive for batch: " << batch_id << " now " << nr_new_completed_[ batch_id ] << " decoys " << std::endl;
379  if ( nr_completed_[ batch_id ] + nr_new_completed_[ batch_id ] + nr_bad_[ batch_id ] == nr_jobs_[ batch_id ] ) {
380  nr_completed_[ batch_id ] += nr_new_completed_[ batch_id ];
381  nr_new_completed_[ batch_id ] = 0;
382  // still send to close files in MPI-FILE-BUF if ( batch( batch_id ) != BatchJobInputter::BOGUS_BATCH_ID ) { //don't send message if this Batch has ben CANCELLED
383  notify_archive( CompletionMessage( batch_id, true, nr_bad_[ batch_id ], nr_completed_[ batch_id ], nr_jobs_[ batch_id ] ) );
384  // }
385  //// send "update" message ?
386  } else if ( nr_new_completed_[ batch_id ] >= nr_notify_ ) {
387  nr_completed_[ batch_id ] += nr_new_completed_[ batch_id ];
388  nr_new_completed_[ batch_id ] = 0;
389  //if ( batch( batch_id ) != BatchJobInputter::BOGUS_BATCH_ID ) { //don't send message if this Batch has ben CANCELLED
390  notify_archive( CompletionMessage( batch_id, false, nr_bad_[ batch_id ], nr_completed_[ batch_id ], nr_jobs_[ batch_id ] ) );
391  // }
392  }
393  tr.Trace << "nr_batches " << nr_batches() << " current_job_id() " << current_job_id() << " get_jobs().size() " << get_jobs().size()
394  << " nr_processors " << number_of_processors() << std::endl;
395  //are we quickly running out of jobs? -- checking for equality to reduce number of messages -- is this safe? do we ever skip jobs?
396  if ( nr_batches() == batch_id && ( (int) current_job_id() == ( (int) get_jobs().size() - (int) number_of_processors() ) ) ) {
397  //tr.Info << "jobs are low... send QUEUE_EMPTY with " << batch_id << " batch_id " << std::endl;
398  // pending_notifications_.push_front( CompletionMessage( batch_id, QUEUE_EMPTY ) );
399  // _notify_archive();
400  }
401 }
402 
403 ///@detail overloaded to update our job-statistics ( needed for CompletionMessages )
405  tr.Trace << "mark_job_as_completed " << job_id << " batch: " << batch_id << " " << run_time << " seconds" << std::endl;
406  Parent::mark_job_as_completed( job_id, batch_id, run_time );
407  if ( rank() == master_rank() ) {
408  runtime_assert( batch_id <= nr_jobs_.size() );
409  nr_new_completed_[ batch_id ] += 1;
410  notify_archive( batch_id );
411  }
412 }
413 
415  Parent::mark_job_as_bad( job_id, batch_id );
416  if ( rank() == master_rank() ) {
417  runtime_assert( batch_id <= nr_jobs_.size() );
418  nr_bad_[ batch_id ] += nstruct_[ batch_id ];
419  notify_archive( batch_id );
420  }
421 }
422 
423 ///@detail load new batch from BatchQueue .. overloaded to setup the statistics for CompletionMessages
425  // if ( current_batch_id() ) notify_archive( current_batch_id() );
427  if ( rank() == master_rank() ) { //in principle I'd rather do this in add_batch() but we need option of new batch for nstruct...
428  while( nr_jobs_.size() < current_batch_id() ) {
429  nr_jobs_.push_back( get_jobs().size() );
430  nr_new_completed_.push_back( 0 );
431  nr_completed_.push_back( 0 );
432  nr_bad_.push_back( 0 );
433  nstruct_.push_back( option[ out::nstruct ] ); ///Assumming that all JobInputters create nstruct jobs per input_tag...
434  }
435  runtime_assert( nr_jobs_.size() == current_batch_id() );
436  mem_tr << "MPIArchiveJobDistributor::load_new_batch()'ed" << std::endl;
437  }
438 }
439 
440 
441 
442 }//archive
443 }//jd2
444 }//protocols