30 #include <utility/io/ozstream.hh>
33 #include <basic/Tracer.hh>
34 #include <basic/options/option.hh>
35 #include <utility/exit.hh>
39 #include <basic/options/keys/out.OptionKeys.gen.hh>
40 #include <basic/options/keys/jd2.OptionKeys.gen.hh>
46 #include <ObjexxFCL/string.functions.hh>
48 #include <utility/vector1.hh>
51 static basic::Tracer
tr(
"protocols.jd2.MPIMultiCommJobDistributor");
59 using namespace basic::options;
60 using namespace basic::options::OptionKeys;
// Fragment: single forwarding call; the enclosing definition (presumably the
// MPIMultiCommJobDistributor constructor taking sub_size — TODO confirm
// against the full file) begins before this chunk, so only a comment is added.
// NOTE(review): the leading numeral is line-number residue from extraction.
65 setup_sub_communicators( sub_size );
// Interior of setup_sub_communicators( sub_size ): partitions the client MPI
// ranks (those >= min_client_rank()) into n_comm_ groups of sub_size ranks
// each and builds one MPI communicator per group. The function's signature,
// closing braces, and several guard lines (numbering gaps 73-74, 91-96,
// 98-99) are outside this chunk; code below is left byte-identical, including
// the leading line-number residue from extraction.
//
// How many whole groups of sub_size fit among the client ranks (integer
// division; leftover ranks get no group).
69 n_comm_ = ( n_rank()-min_client_rank() ) / sub_size;
70 tr.Debug <<
 " can allocate " << n_comm_ <<
 " communication groups " << std::endl;
71 tr.Debug <<
 " n_rank: " << n_rank() <<
 " sub_size: " << sub_size << std::endl;
// One job-distributor "worker" per communicator group.
72 set_n_worker( n_comm_ );
// Error branch: the controlling condition (original lines 73-74, presumably
// n_comm_ < 1 — TODO confirm) is missing from this chunk.
75 tr.Error <<
 "requested sub-communicators of size " << sub_size <<
 " but only " << n_rank() <<
 " processes are available " << std::endl;
76 utility_exit_with_message(
 "cannot run with requested size of sub-partition" );
// Derive each sub-group from the group underlying MPI_COMM_WORLD.
79 MPI_Group world_group;
80 MPI_Comm_group(MPI_COMM_WORLD, &world_group );
// 0 == "this rank belongs to no sub-communicator" (see assertions below).
81 communicator_handle_ = 0;
// First client rank; incremented as ranks are dealt out to groups.
82 Size i_rank = min_client_rank();
// Indexed 1..n_comm_ below, which suggests 1-based utility::vector1
// containers — TODO confirm against the header.
83 mpi_groups_.resize( n_comm_, MPI_GROUP_NULL );
84 mpi_communicators_.resize( n_comm_, MPI_COMM_NULL );
85 for (
 Size i_comm = 1; i_comm <= n_comm_; ++i_comm ) {
// NOTE(review): raw new[] with no visible matching delete[] in this chunk —
// verify mpi_ranks_ entries are freed in the destructor, else this leaks.
86 mpi_ranks_.push_back(
 new int[ sub_size ] );
// Assign the next sub_size consecutive world ranks to group i_comm.
87 for (
 Size i = 0; i < sub_size; ++i, ++i_rank ) {
88 mpi_ranks_.back()[ i ] = i_rank;
// Remember which group this very process landed in.
89 if ( i_rank == rank() ) {
90 communicator_handle_ = i_comm;
// Build the MPI group from the collected world ranks, then create its
// communicator. MPI_Comm_create is collective over MPI_COMM_WORLD, so every
// rank must execute it for every group (ranks outside the group get
// MPI_COMM_NULL).
93 MPI_Group_incl( world_group, sub_size, mpi_ranks_.back(), &(mpi_groups_[ i_comm ]) );
97 MPI_Comm_create( MPI_COMM_WORLD, mpi_groups_[ i_comm ], &(mpi_communicators_[ i_comm ]) );
// Sanity: masters (rank < min_client_rank) need no handle; clients either
// have none (leftover ranks) or one within range.
100 runtime_assert( rank() < min_client_rank() || communicator_handle_ == 0 || communicator_handle_ <= mpi_communicators_.size() );
// Clients that belong to a group cache their rank within that group.
101 if ( rank() >= min_client_rank() && communicator_handle_ ) {
102 MPI_Comm_rank( mpi_communicators_[ communicator_handle_ ], &sub_rank_ );
// Interior of get_new_job_id(): master ranks delegate to the parent
// distributor directly; within a sub-communicator only the group head
// (sub_rank_ == 0) obtains a fresh job id, then broadcasts
// { job_id, batch_id } so the whole group works on the same job. The
// signature, closing braces, and the declaration of mpi_buf (missing original
// lines 115-116, 120, 122, 124) are outside this chunk; code is left
// byte-identical, including the leading line-number residue from extraction.
113 if ( rank() < min_client_rank() ) {
114 return Parent::get_new_job_id();
// -1 sentinel: "no job id obtained yet" (checked by the assert below).
117 int new_job_id( -1 );
118 if ( sub_rank_ == 0 ) {
119 new_job_id = Parent::get_new_job_id();
// sub_rank_ >= 0 means this rank belongs to a sub-communicator.
121 if ( sub_rank_ >= 0 ) {
123 runtime_assert( communicator_handle_ && communicator_handle_ <= mpi_communicators_.size() );
// mpi_buf is declared on a line not visible in this chunk — presumably
// int mpi_buf[ 2 ]; — TODO confirm.
125 mpi_buf[ 0 ] = new_job_id;
126 mpi_buf[ 1 ] = current_batch_id();
// Root 0 of the sub-communicator (the group head) broadcasts to the group.
127 MPI_Bcast( mpi_buf, 2, MPI_INT, 0, mpi_communicators_[ communicator_handle_ ] );
128 new_job_id = mpi_buf[ 0 ];
// Non-head members adopt the head's batch id; the head already has it.
129 if ( sub_rank_ > 0 ) set_batch_id( mpi_buf[ 1 ] );
130 runtime_assert( new_job_id >= 0 );
// Fragment of job_succeeded( pose, run_time ): only the sub-group head
// (sub_rank() <= 0, which also covers ranks with no sub-communicator —
// TODO confirm sub_rank() semantics for unassigned ranks) reports success
// upward, so the parent sees one report per group. Enclosing definition is
// outside this chunk; leading numerals are extraction residue.
141 if ( sub_rank() <= 0 ) {
142 Parent::job_succeeded( pose, run_time);
// Fragment of job_failed( pose, retry ): mirrors job_succeeded — only the
// sub-group head propagates the failure to the parent distributor, avoiding
// duplicate reports from the other group members. Enclosing definition is
// outside this chunk; leading numerals are extraction residue.
149 if ( sub_rank() <= 0 ) {
150 Parent::job_failed( pose, retry);
// Accessor: returns (by const reference) the MPI communicator of the
// sub-group this rank belongs to. Precondition (asserted): this rank holds a
// valid handle — communicator_handle_ is nonzero (0 means "no group") and
// within range. The 1-based indexing suggests mpi_communicators_ is a
// utility::vector1 — TODO confirm. The closing brace lies past the end of
// this chunk; leading numerals are extraction residue.
155 MPI_Comm
const& MPIMultiCommJobDistributor::current_mpi_comm() {
156 runtime_assert( communicator_handle_ );
157 runtime_assert( communicator_handle_ <= mpi_communicators_.size() );
158 return mpi_communicators_[ communicator_handle_ ];