30 #include <utility/io/ozstream.hh>
33 #include <basic/Tracer.hh>
34 #include <basic/options/option.hh>
35 #include <utility/exit.hh>
39 #include <basic/options/keys/out.OptionKeys.gen.hh>
41 #include <basic/options/keys/jd2.OptionKeys.gen.hh>
46 #include <basic/prof.hh>
48 #include <ObjexxFCL/string.functions.hh>
49 #include <utility/string_util.hh>
50 #include <utility/vector1.hh>
// Log channel for this translation unit: every tr.Debug / tr.Info / tr.Error
// statement below writes through this Tracer, which is registered under the
// channel name "protocols.jd2.MPIFileBufJobDistributor".
53 static basic::Tracer
tr(
"protocols.jd2.MPIFileBufJobDistributor");
// Pull the option-system names into scope so the code below can write
// option[ ... ] and refer to key namespaces such as out:: without spelling
// out basic::options::OptionKeys each time.
61 using namespace basic::options;
62 using namespace basic::options::OptionKeys;
70 slave_current_job_id_( 0 ),
71 slave_current_batch_id_( 0 ),
77 min_client_rank_( 2 ),
78 cumulated_runtime_( 0.0 ),
84 MPI_Comm_rank( MPI_COMM_WORLD, (
int* )( &
rank_ ) );
85 MPI_Comm_size( MPI_COMM_WORLD, (
int* )( &
n_rank_ ) );
88 utility_exit_with_message(
"ERROR ERROR ERROR: The MPIFileBufJobDistributor will not work unless you have compiled using extras=mpi" );
103 slave_current_job_id_( 0 ),
104 slave_current_batch_id_( 0 ),
107 repeat_job_( false ),
108 master_rank_( master_rank ),
109 file_buf_rank_(file_buf_rank ),
110 min_client_rank_( min_client_rank ),
111 cumulated_runtime_( 0.0 ),
119 MPI_Comm_rank( MPI_COMM_WORLD, (
int* )( &
rank_ ) );
120 MPI_Comm_size( MPI_COMM_WORLD, (
int* )( &
n_rank_ ) );
123 utility_exit_with_message(
"ERROR ERROR ERROR: The MPIFileBufJobDistributor will not work unless you have compiled using extras=mpi" );
138 tr.Debug <<
"finished call to buffer.run()" << std::endl;
140 tr.Debug <<
"Master JD starts" << std::endl;
142 tr.Debug <<
"send STOP to FileBuffer " << std::endl;
146 tr.Debug <<
"Slave JD finished!" << std::endl;
152 MPI_Barrier( MPI_COMM_WORLD );
173 tr.Debug <<
"Master: send new job: " << buf[ 0 ] <<
" " << buf[ 1 ] << std::endl;
176 runtime_assert(
rank_ == slave_rank );
177 int buf[ 2 ]; MPI_Status status;
181 tr.Debug <<
"Slave: receive job: " << buf[ 0 ] <<
" " << buf[ 1 ] << std::endl;
192 tr.Debug <<
"Master Node: Sending new job id " <<
current_job_id() <<
" " <<
"job: " <<
current_job()->input_tag() <<
" to node " << slave_rank << std::endl;
202 tr.Debug <<
"Master Node: Received job failure message for job id " << slave_job_id <<
" from node " << slave_rank << std::endl;
215 tr.Error <<
"[ERROR] from " << slave_rank <<
" tag: " << msg_tag <<
" " << slave_job_id << std::endl;
216 utility_exit_with_message(
" unknown tag "+ ObjexxFCL::string_of( msg_tag ) +
" in master_loop of MPIFileBufJobDistributor ");
245 Size const mpi_size( 4 );
246 int mpi_buf[ mpi_size ];
248 Size const mpi_size( 4 );
249 int mpi_buf[ mpi_size ];
251 MPI_Recv( &mpi_buf, mpi_size, MPI_INT, source,
MPI_JOB_DIST_TAG, MPI_COMM_WORLD, &status);
252 Size slave_rank( status.MPI_SOURCE );
253 Size const msg_tag ( mpi_buf[ 0 ] );
254 Size const slave_job_id( mpi_buf[ 1 ] );
255 Size const slave_batch_id( mpi_buf[ 2 ]);
256 Size const runtime( mpi_buf[ 3 ] );
257 if ( msg_tag_in != msg_tag ) {
258 tr.Debug <<
" when trying to eat signal " << msg_tag_in <<
" I received " << msg_tag <<
" going to process this now" << std::endl;
259 process_message( msg_tag, slave_rank, slave_job_id, slave_batch_id, runtime );
268 tr.Debug <<
"eating expected signal " << msg_tag_in << std::endl;
279 Size const mpi_size( 4 );
280 int mpi_buf[ mpi_size ];
292 tr.Info <<
"Starting JobDistribution with " <<
n_worker() <<
" worker processes " << std::endl;
293 using namespace basic::options;
294 using namespace basic::options::OptionKeys;
296 double timeout_wait_factor = option[ OptionKeys::jd2::mpi_timeout_factor ]();
302 tr.Debug <<
"Master Node: Waiting for job requests..." << std::endl;
308 double timeout=MPI_Wtime();
310 tr.Debug <<
"set timeout_limit to " << timeout_limit << std::endl;
312 if ( MPI_Wtime() - timeout > timeout_limit )
break;
313 MPI_Test(&request, &flag, &status);
316 utility_exit_with_message(
"quick exit from job-distributor due to flag jd2::mpi_nowait_for_remaining_jobs and timeout of "+
317 utility::to_string( MPI_Wtime()-timeout )+
" seconds\n"+
"increase time-out by using -jd2:mpi_timeout_factor" );
319 Size slave_rank( status.MPI_SOURCE );
320 Size const msg_tag ( mpi_buf[ 0 ] );
321 Size const slave_job_id( mpi_buf[ 1 ] );
322 Size const slave_batch_id( mpi_buf[ 2 ]);
323 Real const runtime( mpi_buf[ 3 ]);
324 tr.Debug <<
"Master Node: Recieved message from " << slave_rank <<
" with tag "
325 << msg_tag <<
" slave_jobid " << slave_job_id <<
" slave batchid " << slave_batch_id << std::endl;
328 process_message( msg_tag, slave_rank, slave_job_id, slave_batch_id, runtime);
333 tr.Info <<
"Master Node: Finished sending spin down signals to slaves" << std::endl;
336 if ( option[ OptionKeys::jd2::mpi_nowait_for_remaining_jobs ]() ) {
337 utility_exit_with_message(
"quick exit from job-distributor due to flag jd2::mpi_nowait_for_remaining_jobs --- this is not an error " );
360 using namespace basic::options;
361 using namespace basic::options::OptionKeys;
367 basic::show_time(
tr,
"assign job "+ObjexxFCL::string_of(next_job_to_assign)+
" batch: "+ObjexxFCL::lead_zero_string_of(
current_batch_id(),5 ) );
369 while( next_job_to_assign <= jobs.size()) {
370 if ( jobs[ next_job_to_assign ]->bad() ) {
372 }
else if ( !outputter->job_has_completed( jobs[ next_job_to_assign ] ) ) {
373 tr.Debug <<
"Master Node: Getting next job to assign from list id " << next_job_to_assign <<
" of " << jobs.size() << std::endl;
374 return next_job_to_assign;
375 }
else if ( outputter->job_has_completed( jobs[ next_job_to_assign ] ) && option[ out::overwrite ].value() ) {
376 tr.Debug <<
"Master Node: Getting next job to assign from list, overwriting id " << next_job_to_assign <<
" of " << jobs.size() << std::endl;
377 return next_job_to_assign;
381 ++next_job_to_assign;
383 tr.Debug <<
"Master Node: No more jobs to assign, setting next job id to zero" << std::endl;
406 tr.Debug <<
"Slave Node " <<
rank_ <<
": Requesting new job id from master" <<std::endl;
434 tr.Debug <<
"Master Node: Mark current job for repetition" << std::endl;
435 utility_exit_with_message(
"Master Node: master_mark_current_job_id_for_repetition() should never be called" );
442 tr.Debug <<
"Slave Node " <<
rank_ <<
": Mark current job for repetition, id " <<
current_job_id() << std::endl;
462 if (
tr.Debug.visible() ) {
465 tr.Debug <<
"Master Node: Job id "
467 <<
" failed, reporting bad input; other jobs of same input will be canceled: " << bad_job_id_input_tag << std::endl;
478 Size const mpi_size( 4 );
479 int mpi_buf[ mpi_size ];
513 if ( ! will_retry ) {
526 tr.Debug <<
"Master Node: Job Succeeded" << std::endl;
527 utility_exit_with_message(
"Master Node: master_job_succeeded() should never be called" );