25 #include <basic/message_listening/MessageListenerFactory.hh>
26 #include <basic/message_listening/MessageListener.hh>
27 #include <basic/message_listening/util.hh>
33 #include <basic/Tracer.hh>
34 #include <basic/options/option.hh>
35 #include <utility/exit.hh>
36 #include <utility/assert.hh>
37 #include <utility/mpi_util.hh>
40 #include <basic/options/keys/out.OptionKeys.gen.hh>
42 #include <basic/options/keys/jd2.OptionKeys.gen.hh>
49 #include <utility/vector1.hh>
// File-local tracer object (declared `static`), constructed with the name
// "protocols.jd2.MPIWorkPoolJobDistributor". The `TR << ...` log statements
// below write through it.
50 static basic::Tracer
TR(
"protocols.jd2.MPIWorkPoolJobDistributor");
55 using namespace basic::options;
56 using namespace basic::options::OptionKeys;
65 next_job_to_assign_( 0 ),
// Ask MPI for this process's rank within MPI_COMM_WORLD and store it in rank_.
// NOTE(review): the address of rank_ is forced to int* with a C-style cast, and
// MPI_Comm_rank writes exactly one int through that pointer. This is only
// correct if rank_ is int-sized; its declaration is not visible here, so
// TODO confirm. If it is a wider type, reading into a local int and assigning
// afterwards would avoid the partial write.
74 MPI_Comm_rank( MPI_COMM_WORLD, (
int* )( &
rank_ ) );
// Same pattern for the number of processes in MPI_COMM_WORLD, stored in npes_
// (the cast caveat noted for rank_ applies to npes_ as well).
75 MPI_Comm_size( MPI_COMM_WORLD, (
int* )( &
npes_ ) );
77 utility_exit_with_message(
"ERROR ERROR ERROR: The MPIWorkPoolJobDistributor will not work unless you have compiled using extras=mpi" );
102 MPI_Barrier( MPI_COMM_WORLD );
124 runtime_assert(
rank_ == 0 );
134 TR <<
"Master Node: Waiting for job requests..." << std::endl;
137 MPI_Recv( &slave_data, 1, MPI_INT,
MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
138 TR <<
"Master Node: Received message from " << status.MPI_SOURCE <<
" with tag " << status.MPI_TAG << std::endl;
142 switch ( status.MPI_TAG ) {
152 TR <<
"Master Node: Received job failure message for job id " << slave_data <<
" from node " << status.MPI_SOURCE << std::endl;
// Log the success notification; at this point slave_data is reported as the job id
// and status.MPI_SOURCE as the sending rank.
157 TR <<
"Master Node: Received job success message for job id " << slave_data <<
" from node " << status.MPI_SOURCE <<
" blocking till output is done " << std::endl;
// Second, blocking receive restricted to the same sender and to JOB_SUCCESS_TAG.
// It overwrites both slave_data and status, so the log line that follows prints
// the values delivered by this receive, not the ones logged above.
159 MPI_Recv( &slave_data, 1, MPI_INT, status.MPI_SOURCE,
JOB_SUCCESS_TAG, MPI_COMM_WORLD, &status);
160 TR <<
"Master Node: Received job output finish message for job id " << slave_data <<
" from node " << status.MPI_SOURCE << std::endl;
165 using namespace basic::message_listening;
167 listener_tags listener_tag((listener_tags)slave_data);
168 MessageListenerOP listener(MessageListenerFactory::get_instance()->get_listener(listener_tag));
170 std::string message_data = utility::receive_string_from_node(status.MPI_SOURCE);
172 bool request_slave_data = listener->request(message_data, return_info);
173 utility::send_string_to_node(status.MPI_SOURCE, return_info);
176 <<
"Master Node: node '" << status.MPI_SOURCE <<
"' "
177 <<
"requests from the message listener '" << listener_tag_to_name(listener_tag) <<
"' "
178 <<
"data on '" << message_data <<
"', "
179 <<
"respond with '" << return_info <<
"' "
180 << (request_slave_data ?
" and requests more data." :
".") << std::endl;
182 if(request_slave_data){
183 message_data = utility::receive_string_from_node(status.MPI_SOURCE);
185 <<
"Master Node: Received from node '" << status.MPI_SOURCE <<
"' "
186 <<
"'" << message_data <<
"'" << std::endl;
187 listener->receive(message_data);
195 std::stringstream err_msg;
197 <<
"Received unrecognized mpi_tag '" << status.MPI_TAG <<
"' " << std::endl
198 <<
"\tfrom node '" << status.MPI_SOURCE <<
"' " << std::endl
199 <<
"\twith data '" << slave_data <<
"'";
200 utility_exit_with_message(err_msg.str());
205 TR <<
"Master Node: Finished handing out jobs" << std::endl;
210 while ( n_nodes_left_to_spin_down > 0 ) {
211 TR <<
"Master Node: Waiting for " << n_nodes_left_to_spin_down <<
" slaves to finish jobs" << std::endl;
214 MPI_Recv( &slave_data, 1, MPI_INT,
MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
215 TR <<
"Master Node: Received message from " << status.MPI_SOURCE <<
" with tag " << status.MPI_TAG << std::endl;
219 switch ( status.MPI_TAG ) {
223 TR <<
"Master Node: Sending spin down signal to node " << status.MPI_SOURCE << std::endl;
225 n_nodes_left_to_spin_down--;
// Log the success notification; at this point slave_data is reported as the job id
// and status.MPI_SOURCE as the sending rank.
230 TR <<
"Master Node: Received job success message for job id " << slave_data <<
" from node " << status.MPI_SOURCE <<
" blocking till output is done " << std::endl;
// Second, blocking receive restricted to the same sender and to JOB_SUCCESS_TAG.
// It overwrites both slave_data and status, so the log line that follows prints
// the values delivered by this receive, not the ones logged above.
232 MPI_Recv( &slave_data, 1, MPI_INT, status.MPI_SOURCE,
JOB_SUCCESS_TAG, MPI_COMM_WORLD, &status);
233 TR <<
"Master Node: Received job output finish message for job id " << slave_data <<
" from node " << status.MPI_SOURCE << std::endl;
237 using namespace basic::message_listening;
239 listener_tags listener_tag((listener_tags)slave_data);
240 MessageListenerOP listener(MessageListenerFactory::get_instance()->get_listener(listener_tag));
242 std::string message_data = utility::receive_string_from_node(status.MPI_SOURCE);
244 bool request_slave_data = listener->request(message_data, return_info);
245 utility::send_string_to_node(status.MPI_SOURCE, return_info);
248 <<
"Master Node: node '" << status.MPI_SOURCE <<
"' "
249 <<
"requests from the message listener '" << listener_tag_to_name(listener_tag) <<
"' "
250 <<
"data on '" << message_data <<
"', "
251 <<
"respond with '" << return_info <<
"' "
252 << (request_slave_data ?
" and requests more data." :
".") << std::endl;
254 if(request_slave_data){
255 message_data = utility::receive_string_from_node(status.MPI_SOURCE);
257 <<
"Master Node: Received from node '" << status.MPI_SOURCE <<
"' "
258 <<
"'" << message_data <<
"'" << std::endl;
259 listener->receive(message_data);
266 std::stringstream err_msg;
268 <<
"Received unrecognized mpi_tag '" << status.MPI_TAG <<
"' " << std::endl
269 <<
"\tfrom node '" << status.MPI_SOURCE <<
"' " << std::endl
270 <<
"\twith data '" << slave_data <<
"'";
271 utility_exit_with_message(err_msg.str());
275 TR <<
"Master Node: Finished sending spin down signals to slaves" << std::endl;
282 runtime_assert( !(
rank_ == 0 ) );
304 using namespace basic::options;
305 using namespace basic::options::OptionKeys;
313 TR <<
"Master Node: No more jobs to assign, setting next job id to zero" << std::endl;
317 TR <<
"Master Node: Getting next job to assign from list id " << next_job_to_assign_ <<
" of " << jobs.size() << std::endl;
319 }
else if ( outputter->job_has_completed( jobs[ next_job_to_assign_ ] ) && option[ out::overwrite ].value() ) {
320 TR <<
"Master Node: Getting next job to assign from list, overwriting id " << next_job_to_assign_ <<
" of " << jobs.size() << std::endl;
332 runtime_assert( !(
rank_ == 0 ) );
338 TR <<
"Slave Node " <<
rank_ <<
": Requesting new job id from master" <<std::endl;
344 MPI_Send( &empty_data, 1, MPI_INT, 0,
NEW_JOB_ID_TAG, MPI_COMM_WORLD );
367 runtime_assert(
rank_ == 0 );
368 TR <<
"Master Node: Mark current job for repetition" << std::endl;
369 utility_exit_with_message(
"Master Node: master_mark_current_job_id_for_repetition() should never be called" );
376 runtime_assert( !(
rank_ == 0 ) );
396 runtime_assert(
rank_ == 0 );
402 TR <<
"Master Node: Job id " <<
bad_job_id_ <<
" failed, reporting bad input; other jobs of same input will be canceled: " <<
job_outputter()->output_name( jobs[
bad_job_id_ ] ) << std::endl;
405 TR <<
"Master Node: Job canceled without trying due to previous bad input: " <<
job_outputter()->output_name( jobs[
next_job_to_assign_ ] ) <<
" id " << next_job_to_assign_ << std::endl;
420 runtime_assert( !(
rank_ == 0 ) );
447 runtime_assert(
rank_ == 0 );
448 TR <<
"Master Node: Job Succeeded" << std::endl;
449 utility_exit_with_message(
"Master Node: master_job_succeeded() should never be called" );
457 runtime_assert( !(
rank_ == 0 ) );
459 if ( option[ OptionKeys::jd2::mpi_fast_nonblocking_output ].value() ==
true ) {
// Announce that the job finished. The send this message refers to is not among
// the lines visible here.
466 TR <<
"Slave Node " <<
rank_ <<
": Finished job successfully! Sending output request to master." << std::endl;
// NOTE(review): in the lines visible here, this "Received output confirmation"
// message is logged BEFORE the blocking MPI_Recv below that actually waits for
// rank 0. If nothing elided in between performs the wait, the log can appear
// before the confirmation has arrived — confirm the intended order.
470 TR <<
"Slave Node " <<
rank_ <<
": Received output confirmation from master. Writing output." << std::endl;
// Blocking receive of one int from rank 0 with JOB_SUCCESS_TAG.
471 MPI_Recv( &empty_data, 1, MPI_INT, 0,
JOB_SUCCESS_TAG, MPI_COMM_WORLD, &status );
// Timing samples around the (elided) output step. clock() reports processor
// time consumed by the program rather than wall-clock time, so time spent
// blocked on I/O between the two samples may be under-reported.
473 clock_t starttime = clock();
475 clock_t stoptime = clock();
// The (double) cast binds to stoptime only; the subtraction is then carried out
// in double before dividing by CLOCKS_PER_SEC to get seconds.
478 TR <<
"Slave Node " <<
rank_ <<
": Finished writing output in " << ((double) stoptime-starttime) / CLOCKS_PER_SEC <<
" seconds. Sending message to master" << std::endl;
// Send one int to rank 0 with JOB_SUCCESS_TAG. The master-side fragments in this
// file show a receive with the same tag followed by a "job output finish" log.
479 MPI_Send( &empty_data, 1, MPI_INT, 0,
JOB_SUCCESS_TAG, MPI_COMM_WORLD );