Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPI_EndPoint.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/wum2/MPI_EndPoint.cc
11 /// @brief Handles communication between different roles in mpi
12 // Every role in mpi needs one of these for each different role it communicates with
13 // for example, slave only communicates with master so slave needs one
14 // master communicates with slave and pool, so master needs two
15 
16 // role_available_mem is a functor to the role's available memory function
17 //
// this allows a role with multiple endpoints to have its endpoints be aware of
19 // other endpoint memory usage and act accordingly
20 /// @author Ken Jung
21 
22 #ifdef USEBOOSTMPI
23 // this is useless without mpi
26 
27 #include <basic/options/option.hh>
28 #include <basic/options/keys/els.OptionKeys.gen.hh>
29 
30 #include <basic/Tracer.hh>
31 
32 namespace protocols {
33 namespace wum2 {
34 
35 static basic::Tracer TR("protocols.wum2.MPI_EndPoint");
36 
37 MPI_EndPoint::MPI_EndPoint( mpi::communicator world, function< uint64_t () > role_available_mem )
38  : world_( world ),
39  EndPoint( role_available_mem ) {
40  // set up initial channels
41  clearcommand_channel_.get<0>() = world_.irecv( mpi::any_source, CLEARCOMMAND, clearcommand_channel_.get<1>() );
42  statusrequest_channel_.get<0>() = world_.irecv( mpi::any_source, STATUSREQUEST, statusrequest_channel_.get<1>() );
43 }
44 
45 void MPI_EndPoint::check_and_act_status_request( function< void ( StatusResponse & , int ) > functor ) {
46  if ( statusrequest_channel_.get<0>().test().is_initialized() ) {
47  mpi::request req;
48  StatusResponse m;
49  m.rank = mpi_rank();
50  uint64_t free_mem = role_available_mem_();
51  uint64_t available_work = statusrequest_channel_.get<1>().max_outgoing_wu_mem;
52  m.incoming_allocated = free_mem < available_work ? free_mem : available_work;
53  m.outq_current_mem = outq_.current_mem();
54 
55  tuple< mpi::request, StatusResponse > tmp = make_tuple ( req , m );
56  outbound_statusresponse_.push_back( tmp );
57  std::list< tuple< mpi::request, StatusResponse > >::reverse_iterator itr = outbound_statusresponse_.rbegin();
58  itr->get<0>() = world_.isend( statusrequest_channel_.get<1>().rank , STATUSRESPONSE, itr->get<1>() );
59 
60  // call functor
61  functor( itr->get<1>(), statusrequest_channel_.get<1>().rank );
62  // refresh statusrequest channel
63  statusrequest_channel_.get<0>() = world_.irecv( mpi::any_source, STATUSREQUEST, statusrequest_channel_.get<1>() );
64  }
65 }
66 
67 void MPI_EndPoint::listen_wu_sendrecv( StatusResponse & r, int requesting_node ) {
68  // assumption here is that the asking node will do an isend/irecv to clear out this node's queues
69  // opens irecv from asking node in anticipation of wu isend from asking node
70  // -> master sending new wu to slave
71  // opens isend to asking node in anticipation of wu irecv from asking node
72  // -> slave sending completed wu back to master
73  receive_wus( requesting_node, r.incoming_allocated );
74  send_wus( requesting_node, r.outq_current_mem );
75 }
76 
77 void MPI_EndPoint::send_status_request( int rank ) {
78  mpi::request req;
79  StatusRequest m;
80  m.rank = mpi_rank();
81  m.max_outgoing_wu_mem = max_outgoing_wu_mem();
82  tuple< mpi::request, StatusRequest > tmpa = make_tuple(req, m);
83  outbound_statusrequest_.push_back( tmpa );
84  std::list< tuple< mpi::request, StatusRequest > >::reverse_iterator jtr = outbound_statusrequest_.rbegin();
85  jtr->get<0>() = world_.isend( rank, STATUSREQUEST, jtr->get<1>() );
86 
87  mpi::request reqn;
88  StatusResponse n;
89  tuple< mpi::request, StatusResponse > tmp = make_tuple(reqn, n);
90  inbound_statusresponse_.push_back( tmp );
91  std::list< tuple< mpi::request, StatusResponse > >::reverse_iterator itr = inbound_statusresponse_.rbegin();
92  itr->get<0>() = world_.irecv( rank, STATUSRESPONSE, itr->get<1>() );
93 
94  open_status_.insert(rank);
95 }
96 
97 
98 void MPI_EndPoint::act_on_status_response( function<bool ( StatusResponse & r )> f ){
99  std::list< tuple< mpi::request, StatusResponse > >::iterator itr = inbound_statusresponse_.begin();
100  while( itr != inbound_statusresponse_.end() ) {
101  if ( itr->get<0>().test().is_initialized() ) {
102  if( f( itr->get<1>() ) ) {
103  // only replace if status response send/recv can actually be fufilled
104  // may not be fufillfed due to memory limits and current memory use
105  open_status_.erase( itr->get<1>().rank ) ;
106  itr = inbound_statusresponse_.erase( itr );
107  } else {
108  itr++;
109  }
110  } else {
111  itr++;
112  }
113  }
114 }
115 
116 bool MPI_EndPoint::initiate_wu_sendrecv( StatusResponse & r ) {
117  // assumption here is that only reason this node initiated a
118  // StatusRequest->StatusResponse chain was to send and recv WU to the responding node
119  //
120  // we can't delete the status response until both send and recv are fulfilled
121  bool can_send = false;
122  bool can_recv = false;
123 
124  if( r.outq_current_mem == 0 ) {
125  can_recv = true;
126  } else if (r.outq_current_mem > role_available_mem_() ) {
127  // once we get in here, may never actually be able to get out of this
128  can_recv = false;
129  } else {
130  can_recv = true;
131  receive_wus( r.rank, r.outq_current_mem );
132  }
133 
134  if( r.incoming_allocated == 0 ) {
135  can_send = true;
136  /* just sending empty message seems less complicated, more overhead though
137  *
138  } else if (outq.size() == 0 ) {
139  can_send = false;
140  */
141  } else {
142  can_send = true;
143  send_wus( r.rank, r.incoming_allocated );
144  }
145  return can_send && can_recv;
146 }
147 
148 void MPI_EndPoint::cleanup_reqs() {
149  // buffer reqs
150  outbuf_.cleanup_reqs();
151  std::vector< WorkUnitSP > tmp = inbuf_.cleanup_reqs();
152  for( std::vector< WorkUnitSP >::iterator itr = tmp.begin(); itr != tmp.end(); itr++ ) {
153  if( (*itr)->prioritize() ) {
154  inq_.push_front( *itr );
155  } else {
156  inq_.push_back( *itr );
157  }
158  }
159  // outbound statusresponse reqs
160  std::list< tuple< mpi::request, StatusResponse > >::iterator itr = outbound_statusresponse_.begin();
161  while( itr != outbound_statusresponse_.end() ){
162  if ( itr->get<0>().test().is_initialized() ) {
163  // outbound statusresponse was sent successfully
164  itr = outbound_statusresponse_.erase( itr );
165  } else {
166  itr++;
167  }
168  }
169  // outbound statusrequest reqs
170  std::list< tuple< mpi::request, StatusRequest > >::iterator jtr = outbound_statusrequest_.begin();
171  while( jtr != outbound_statusrequest_.end() ) {
172  if ( jtr->get<0>().test().is_initialized() ) {
173  // outbound statusresponse was sent successfully
174  jtr = outbound_statusrequest_.erase( jtr );
175  } else {
176  jtr++;
177  }
178  }
179 }
180 
181 void MPI_EndPoint::receive_wus( int rank, uint64_t mem_size ) {
182  if( mem_size != 0 ) {
183  WUQueueBuffer::riterator itr = inbuf_.allocate_buffer( mem_size );
184  itr->get<1>() = world_.irecv( rank, WORKUNITVEC, *(itr->get<2>()) );
185  }
186 }
187 
/// @brief Pop WUs from the outbound queue (up to @p mem_size bytes total)
/// and isend them to @p rank as one WORKUNITVEC message.
void MPI_EndPoint::send_wus( int rank, uint64_t mem_size ) {
	using namespace basic::options;
	using namespace basic::options::OptionKeys;
	// this is kind of inefficient, copying the vector twice, but easiest way to track mem use
	if( mem_size != 0 ) {
		std::vector< WorkUnitSP > tmp;
		uint64_t current_size = 0;
		int counter = 0;
		// slave should not have any outbound limit
		// master should have low outbound limit to promote distributing work among slaves
		// otherwise will send all work to one slave
		// use the fact that master rank is always lower than slave rank to figure out who we are
		// only implement limit for master send for now
		// putting this in here is bad, should be one level up in master/slave/baserole

		// num_masters = ceil(num_traj / traj_per_master), computed via
		// integer division plus one extra master when there is a remainder
		int num_masters =
			(option[OptionKeys::els::num_traj]() / option[OptionKeys::els::traj_per_master]() ) +
			(!( option[OptionKeys::els::num_traj]() % option[OptionKeys::els::traj_per_master]() == 0 )) ;
		int num_slaves = (world_.size() - num_masters)/num_masters;
		// accumulate WUs from the front of the queue until the next one would
		// exceed the peer's advertised memory allowance
		while( outq_.size_front() && current_size + outq_.size_front() <= mem_size ) {
			current_size += outq_.size_front();
			tmp.push_back( outq_.pop_front() );

			// counter only advances when sending to a higher rank (master->slave),
			// capping a master at 2 WUs per send so work spreads across slaves
			if( rank > mpi_rank() )
				counter++;
			// NOTE(review): outq_.size() < num_slaves mixes what is presumably an
			// unsigned size with a signed int -- confirm num_slaves cannot be
			// negative/zero (also guards the division by num_masters above).
			// NOTE(review): this break also fires on slave->master sends whenever
			// outq_.size() < num_slaves -- verify that is intended.
			if( outq_.size() < num_slaves || counter >= 2 ) break;

		}
		// move the batch into an outbound buffer whose lifetime spans the isend;
		// cleanup_reqs() on outbuf_ reaps it after completion
		WUQueueBuffer::riterator itr = outbuf_.allocate_buffer( current_size );
		itr->get<2>()->insert( itr->get<2>()->end(), tmp.begin(), tmp.end() );
		itr->get<1>() = world_.isend( rank, WORKUNITVEC, *(itr->get<2>()) );
	}
}
221 
222 void MPI_EndPoint::check_and_act_clearcommand() {
223  // 2 potential problems here:
224  // 1) can't clear inbuf because of possible dangling isend -> zombie WUs
225  // mark and sweep?
226  // 2) "security issue" anyone can send a clear command to a node -> who cares about security
227  if ( clearcommand_channel_.get<0>().test().is_initialized() ) {
228  inq_.clear();
229  // refresh clearcommand channel
230  clearcommand_channel_.get<0>() = world_.irecv( mpi::any_source, CLEARCOMMAND, clearcommand_channel_.get<1>() );
231  }
232 }
233 
234 } // wum2
235 } // protocols
236 #endif