Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPI_WorkUnitManager.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
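10 /// @file protocols/wum/MPI_WorkUnitManager.cc
11 /// @brief MPI transport for the work unit manager: probing for messages, sending and receiving serialized work units, and timing/traffic statistics.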
12 /// @author Mike Tyka
13 
// Shorthand for the Debug channel of the file-local tracer TR (defined further down in this file).
14 #define TRDEBUG TR.Debug
15 
16 // MPI headers
17 #ifdef USEMPI
18 #include <mpi.h> //keep this first
19 #endif
20 
21 #include <utility/assert.hh> //MPI_ONLY macro
22 
26 // AUTO-REMOVED #include <core/io/silent/SilentFileData.hh>
27 // AUTO-REMOVED #include <core/io/silent/SilentStructFactory.hh>
28 // AUTO-REMOVED #include <core/scoring/ScoreFunctionFactory.hh>
29 // AUTO-REMOVED #include <core/scoring/ScoreFunction.hh>
30 // AUTO-REMOVED #include <core/chemical/ResidueTypeSet.hh>
31 
32 // AUTO-REMOVED #include <core/chemical/ChemicalManager.hh>
33 #include <basic/Tracer.hh>
34 // AUTO-REMOVED #include <core/pose/Pose.hh>
35 
36 
37 // AUTO-REMOVED #include <core/import_pose/pose_stream/MetaPoseInputStream.hh>
38 // AUTO-REMOVED #include <core/import_pose/pose_stream/util.hh>
39 #include <basic/options/option.hh>
40 #include <basic/options/keys/wum.OptionKeys.gen.hh>
41 /// ObjexxFCL headers
42 #include <numeric/random/random.hh>
43 #include <ObjexxFCL/string.functions.hh>
44 #include <ObjexxFCL/format.hh>
45 
46 #include <utility/vector1.hh>
47 
48 
49 
50 #if defined(WIN32) || defined(__CYGWIN__)
51  #include <ctime>
52 #endif
53 
54 using namespace ObjexxFCL::fmt;
55 
56 namespace protocols {
57 namespace wum {
58 
// File-local tracer; log output of this file is written to the "MPI_WUM" channel.
59 static basic::Tracer TR("MPI_WUM");
// File-local random generator constructed with the constant 23765.
// NOTE(review): no use of RG is visible in this listing — confirm it is still needed.
60 static numeric::random::RandomGenerator RG(23765);
61 
/// @brief Returns the rank of the calling process in MPI_COMM_WORLD.
/// @details When built without USEMPI, utility_exit_with_message is called with
/// an error explaining that an MPI build (extras=mpi) is required.
62 int mpi_rank(){
63  int rank = 0;
64  #ifdef USEMPI
65  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
66  #else
67  utility_exit_with_message( "ERROR: The MPI_WorkUnitManager will not work unless you have compiled using extras=mpi" );
68  #endif
69  return rank;
70 }
71 
/// @brief Returns the number of processes in MPI_COMM_WORLD.
/// @details When built without USEMPI, utility_exit_with_message is called with
/// an error explaining that an MPI build (extras=mpi) is required.
72 int mpi_npes(){
73  int nprocs = 0;
74  #ifdef USEMPI
75  MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
76  #else
77  utility_exit_with_message( "ERROR: The MPI_WorkUnitManager will not work unless you have compiled using extras=mpi" );
78  #endif
79  return nprocs;
80 }
81 
82 
83 
// Body of the time-stamp helper used for the send/receive timings in this file.
// NOTE(review): the signature line (listing line 84) is missing from this listing;
// presumably this is get_time(), which the functions below call — confirm.
85  #ifdef USEMPI
// MPI build: MPI's wall-clock timer (seconds, as a floating point value).
86  return MPI_Wtime();
87  #else
// Non-MPI build: calendar time from time(), i.e. whole-second resolution only.
88  return (core::Real) time(NULL);
89  #endif
90 }
91 
92 
93 
94 
95 
96 
97 
98 
/// @brief Constructs the manager: zeroes the traffic and send/receive timing
/// counters, stores the machine letter, records the wall-clock start time and
/// applies the wum::memory_limit option to both work unit queues.
/// @param machine_letter single character stored in machine_letter_
// NOTE(review): listing lines 100 and 115-118 are missing from this listing, so
// part of the initializer list and of the body may not be shown here.
99 MPI_WorkUnitManager::MPI_WorkUnitManager( char machine_letter ):
101 last_stats_(0),
102 traffic_total_received_(0),
103 traffic_total_sent_(0),
104 send_wu_time_(0),
105 send_wu_time_n_(0),
106 recv_wu_time_(0),
107 recv_wu_time_n_(0),
108 machine_letter_( machine_letter )
109 {
110  TR << "Starting MPI_WorkUnitManager.." << std::endl;
111  using namespace basic::options;
112  using namespace basic::options::OptionKeys;
113 
// Reference point for wall-clock reporting (whole seconds from time()).
114  start_time_wall_clock_ = time(NULL);
119 
// Note: mpi_rank()/mpi_npes() exit with an error in builds without USEMPI.
120  TR << "This is node " << mpi_rank() << " Nprocs: " << mpi_npes() << std::endl;
121 
// Both queues get the same limit: the option value scaled by 1000.
122  outbound().set_memory_limit( option[ OptionKeys::wum::memory_limit ]() * 1000 ); // memory_limit option is in Kilobytes!
123  inbound().set_memory_limit( option[ OptionKeys::wum::memory_limit ]() * 1000 );
124 }
125 
126 
127 
128 
// Accessor body: returns the machine letter stored by the constructor.
// NOTE(review): the signature line (listing line 129) is missing from this listing;
// presumably MPI_WorkUnitManager::get_machine_letter(), which the statistics output below calls.
130  return machine_letter_;
131 }
132 
133 
134 
135 
136 
/// @brief Polls MPI for pending messages and dispatches on their tag.
/// @details Only active when compiled with USEMPI; otherwise the body is empty.
/// Each pass first probes for work requests while the outbound queue is
/// non-empty, then probes for a message with any tag and switches on it.
/// @param wait_until_message if false, return as soon as no message is pending;
/// if true, keep polling until a message arrives.
// NOTE(review): listing lines 143, 162, 173, 180, 186, 190 and 205 are missing
// from this listing. The statements that actually consume the probed messages
// are presumably among them — verify against the real source.
137 void
138 MPI_WorkUnitManager::process_incoming_msgs( bool MPI_ONLY( wait_until_message ) )
139 {
140 #ifdef USEMPI
141  while(true){
142  // Check if there's anything on the line
144  MPI_Status status;
145  int result;
146  TR.Trace << "Probing for incoming messages .." << std::endl;
147 
// Remember the queue length so the number of units handed out can be reported below.
148  core::Size before_size = outbound().size();
149  while( outbound().size() > 0 ){
150  TR.Trace << "Fulfilling work requests since we have outbound work" << std::endl;
// Non-blocking probe restricted to the work-request tag.
151  MPI_Iprobe( MPI_ANY_SOURCE, WUM_MPI_REQUEST_WU, MPI_COMM_WORLD, &result, &status);
152  if( !result){ // If there are no work requests
153  break; // break out and continue to accept *any* messages
154  }
155  // if we're here that means we got a work request - deal with that
156  // sanity check - this should absolutely be true here
157  if( status.MPI_TAG != WUM_MPI_REQUEST_WU ){
158  TR.Error << "ERROR: status.MPI_TAG != WUM_MPI_REQUEST_WU" << std::endl;
159  break;
160  }
161  TR.Trace << "Someone requested work!" << std::endl;
163  }
164 
165  if( outbound().size() != before_size ){
166  TR.Trace << "Present " << (before_size - outbound().size()) << " units" << std::endl;
167  }
168 
169  TR.Trace << "Now listening for any inbound work" << std::endl;
// Non-blocking probe for a message from any source with any tag.
170  MPI_Iprobe( MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &result, &status);
171  if( !result){ // if there's nothing on the line...
172  if( !wait_until_message ){ // depending on this flag
174  return; // return to caller
175  } else { // or
176  // try again
177  continue;
178  }
179  }
181 
182  // interpret what's there
183  switch( status.MPI_TAG ){
184  case WUM_MPI_REQUEST_WU:
185  TR.Debug << "Sending WU on request to " << status.MPI_SOURCE << std::endl;
187  break;
188  case WUM_MPI_SEND_WU:
189  TR.Debug << "Receiving WU from " << status.MPI_SOURCE << std::endl;
191  return; // now, surely there is work
192  default:
// NOTE(review): the unknown message is only probed, never received, so the
// next probe would presumably report the same message again — confirm.
193  TR.Error << "Unknown MPI_Message waiting from" << status.MPI_SOURCE << " with tag " << status.MPI_TAG << std::endl;
194 
195 // // receive the message and discard result.
196 // int data;
197 // TR << "Cleaning out unknown data.." << std::endl;
198 // MPI_Recv( &data, 1, MPI_INT, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
199 // TR << "Cleaned out unknown data.." << std::endl;
200 
201  // can't break here because this could lock the master in an infinite loop
202  // should we somehow clear the queue ?
203  //return;
204  }
206 
207  }
208 #endif
209 }
210 
211 
212 
213 
214 
/// @brief Serializes a work unit and sends it to another node in two
/// synchronous messages: a size header (tag WUM_MPI_SEND_WU) followed by the
/// raw data (tag WUM_MPI_DATA_BLOCK).
/// @details Only active when compiled with USEMPI. Updates the send timing and
/// traffic counters and logs a warning when header plus data take longer than
/// warning_threshold.
/// @param wu work unit to send; its serial data is cleared afterwards
/// @param dest_rank destination rank; asserted to be below mpi_npes() and
/// different from this node's own rank
// NOTE(review): listing lines 231, 237 and 255 are missing from this listing.
215 void MPI_WorkUnitManager::send_MPI_workunit( const WorkUnitBaseOP& MPI_ONLY(wu), int MPI_ONLY( dest_rank ) ) const {
216 #ifdef USEMPI
217  const core::Real warning_threshold = 0.050000; // 50ms. usually sends take on the order of 600us so that limit is quite loose. if the limit is breached something is seriously going awry.
218 
219  runtime_assert( dest_rank < mpi_npes() );
220  runtime_assert( (int)dest_rank != mpi_rank() ); // no self-sending
221  // serialize data
222  wu->serialize();
223  // now send data
224  int size_of_raw_data;
225  unsigned char * raw_data_ptr=NULL;
// raw_data_dump hands back a buffer through raw_data_ptr; it is released with delete [] below.
226  size_of_raw_data = wu->raw_data_dump( &raw_data_ptr );
227 
228 
229  // The first blocking send will only return when the master has received the data. Since the data transfer is negligible,
230  // We'll count that as waiting time, not sending time
232  TRDEBUG << "Sending workunit to " << dest_rank << std::endl;
233  // announce that you're about to send data and the size of it
234  double start_wait = get_time();
235  MPI_Ssend( &size_of_raw_data, 1, MPI_UNSIGNED, dest_rank, WUM_MPI_SEND_WU, MPI_COMM_WORLD );
236  TR.Debug << "Sent header announcing incoming WU of size " << F(5,1,(size_of_raw_data/1024.0)) << "kB" << " to node " << dest_rank << std::endl;
238  double start_send = get_time();
239  TR.Debug << "Sending WU" << std::endl;
240  MPI_Ssend( (char*) raw_data_ptr, size_of_raw_data, MPI_CHAR, dest_rank, WUM_MPI_DATA_BLOCK, MPI_COMM_WORLD );
241  TR.Debug << "WU sent" << std::endl;
242  double end_send = get_time();
// Only the data-block transfer is accumulated; the header wait is reported separately in the log lines below.
243  send_wu_time_ += end_send - start_send;
244  send_wu_time_n_++;
// NOTE(review): last_spent is not read anywhere in the visible lines.
245  double last_spent = start_timer( TIMING_CPU );
246  TRDEBUG << "MPI_SEND: " << dest_rank << " " << F(7,1,(start_send - start_wait)*1000.0) << "ms " << F(7,1,(end_send - start_send)*1000.0)
247  << "ms " << F(5,1,(size_of_raw_data/1024.0)) << "kB" << std::endl;
248  if( (end_send - start_wait) > warning_threshold){
249  TR << "WARNING LONG MPI_SEND: " << dest_rank << " " << F(7,1,(start_send - start_wait)*1000.0) << "ms " << F(7,1,(end_send - start_send)*1000.0)
250  << "ms " << F(5,1,(size_of_raw_data/1024.0)) << "kB" << std::endl;
251  }
252 
253  delete [] raw_data_ptr;
254  TR.Trace << " Delete temp data.. " << std::endl;
256  wu->clear_serial_data();
257 
// Payload bytes plus the size header.
258  traffic_total_sent_ += size_of_raw_data + sizeof( int );
259 #endif
260 }
261 
262 
264 #ifdef USEMPI
265  MPI_Status status;
266  int size_of_raw_data;
267  unsigned char * raw_data_ptr;
268 
270 
271  double start_recv = get_time();
272  TR.Debug << "Receiving MPI header (ie how big the WU will be)" << std::endl;
273  MPI_Recv( &size_of_raw_data, 1, MPI_UNSIGNED, node_rank, WUM_MPI_SEND_WU, MPI_COMM_WORLD, &status);
274  TR.Debug << "Received MPI header, receiving WU of " << F(5,1,(size_of_raw_data/1024.0)) << "kB from node " << status.MPI_SOURCE << std::endl;
275  raw_data_ptr = new unsigned char [size_of_raw_data];
276  // now receive a datablock fromt he very same source
277  TRDEBUG << "Confirmed datablock is coming" << std::endl;
278  MPI_Recv( (char*) raw_data_ptr, size_of_raw_data, MPI_CHAR, status.MPI_SOURCE, WUM_MPI_DATA_BLOCK, MPI_COMM_WORLD, &status);
279  double end_recv = get_time();
280  TRDEBUG << "MPI_RECV: " << status.MPI_SOURCE << F(5,0,(end_recv - start_recv)/1000000.0) << "us " << F(5,1,(size_of_raw_data/1024.0)) << "kB" << std::endl;
281 
282  recv_wu_time_ += end_recv - start_recv;
283  recv_wu_time_n_++;
285 
286  if( raw_data_ptr[size_of_raw_data-1] != 0){
287  TR.Error << " ERROR: cannot load data - terminal zero not found!" << std::endl;
288  return;
289  }
290 
291  raw_data_ptr[size_of_raw_data-1] = 0;
292 
293  traffic_total_received_ += size_of_raw_data + sizeof( int );
294 
295  TRDEBUG << " RECEVIED WU: Data: " << std::endl;
296 
297  WorkUnitBaseOP wu = new WorkUnitBase;
298  runtime_assert( wu );
299  wu->raw_data_load( raw_data_ptr, size_of_raw_data );
300  delete [] raw_data_ptr;
301 
302  // Here at this point we have a WorkUnitBaseOP to a workUnitBase.
303  // Now we need to interpret the id field and upcast or somehow otherwise
304  // create the right type of work unit such that the polymorphic code
305  // for the interpretation of the serial data can take place.
306 
307  WorkUnitBaseOP qualified_wu = work_unit_list().get_work_unit( *wu )->clone();
308 
309  runtime_assert( qualified_wu );
310  // cope over data (the header and the serial data)
311  (*qualified_wu) = (*wu);
312  (*qualified_wu).last_received_from_ = status.MPI_SOURCE;
313  TRDEBUG << " Received: " << std::endl;
314  //if( TRDEBUG.visible() ) qualified_wu->print( TR );
315 
316  qualified_wu->deserialize( );
317  qualified_wu->clear_serial_data();
318  inbound().add( qualified_wu ); // add to stack of WUs to be processed
319  TRDEBUG << "DONE Receiving" << std::endl;
320 
322 #endif
323 }
324 
// Body of the request handler: consumes one pending work request
// (tag WUM_MPI_REQUEST_WU), picks the first outbound work unit that does not
// blacklist the requesting node (or queues a "waitwu" unit if none qualifies),
// sends it to the requester and removes it from the outbound queue.
// Only active when compiled with USEMPI.
// NOTE(review): the signature line (listing line 325) and listing line 364 are
// missing from this listing.
326 #ifdef USEMPI
327  MPI_Status status;
328  int data;
329 
330  // Receive the actual data (it's been merely MPI_Probed up till now, just preceding this function)
331  MPI_Recv( &data, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, WUM_MPI_REQUEST_WU, MPI_COMM_WORLD, &status);
332 
333  // Find next work unit which does not blacklist the node that's requesting a workunit
334  // (indicated by MPI_SOURCE)
335  WorkUnitQueue::iterator suitable_work_unit = outbound().begin();
336  for( ; suitable_work_unit != outbound().end(); ++suitable_work_unit )
337  {
338  // break out of loop once a WU is found that matches the above criterion
339  if( !( (*suitable_work_unit)->in_blacklist( status.MPI_SOURCE ) ) ) break;
340  // emit some debug output if in debug mode
341  TRDEBUG << "WU " << (*suitable_work_unit)->id() << " was not sent to " << status.MPI_SOURCE << " because it was blacklisted." << std::endl;
342  }
343 
344  if( suitable_work_unit == outbound().end() ) {
345  // No more work
346  TRDEBUG << "No suitable work for node " << status.MPI_SOURCE << " ( blacklisted=" << outbound().size() << ")" << std::endl;
347 
348  // create an idling command workunit - since we have no work for the slave node that's requesting work
349  WorkUnit_WaitOP wait_wu = new WorkUnit_Wait();
350  wait_wu->set_wu_type("waitwu");
351  outbound().push_back( wait_wu );
352 
353  // set suitable_work_unit to the work unit just inserted
354  suitable_work_unit = outbound().end();
355  // after setting suitable_work_unit to outbound().end() we need to
356  // decrement the iterator by one to have it point to the last element
357  --suitable_work_unit;
358  }
359 
360  // at this point there *must* be a work unit in the queue. if not, something went badly wrong.
361  runtime_assert( outbound().size() != 0 );
362 
363  TRDEBUG << "Sending next WU on request... "<< std::endl;
365 
366  // send the selected work unit to the node that requested another job
367  send_MPI_workunit( *suitable_work_unit, status.MPI_SOURCE );
368 
369  // remove the workunit that was just sent
370  outbound().erase( suitable_work_unit );
371 
372  // if error free (ERROR CHECKING!)
373  TRDEBUG << "END Send-on-request" << std::endl;
374 
375 #endif
376 }
377 
378 
379 
// Body of the timer switch: closes the currently running timing interval,
// adds its length to the total of the category that was active, starts a new
// interval for timing_mode and returns the length of the closed interval
// (0 if no interval was running).
// NOTE(review): the signature line (listing line 380) is missing from this
// listing; timing_mode is presumably its parameter — confirm.
381 {
382  core::Real current_time = get_time();
383  core::Real elapsed = 0;
384  runtime_assert( timing_mode < TIMING_end );
385  // analyse old timer
// A start time of 0 is treated as "no interval running yet".
386  if( timing_last_start_time_ != 0 ){
387  elapsed = current_time - timing_last_start_time_;
388  timing_total_[timing_last_type_] += elapsed;
389  }
390 
391  // set new timer
392  timing_last_start_time_ = current_time;
393  timing_last_type_ = timing_mode;
394 
395  return elapsed;
396 }
397 
// Rate limiter body: acts only when more than 60 seconds have passed since
// last_stats_, and then resets last_stats_ to the current time.
// NOTE(review): the signature line (listing line 398) and listing line 400 are
// missing from this listing; the missing line inside the if is presumably the
// call that prints the statistics — confirm.
399  if( time(NULL) - last_stats_ > 60 ){
401  last_stats_ = time(NULL);
402  }
403 }
404 
// Zeroes the accumulated time of every timing category (indices 0 .. TIMING_end-1).
// NOTE(review): the signature line (listing line 405) is missing from this listing.
406  for( core::Size i=0;i<TIMING_end;i++) timing_total_[i] = 0;
407 }
408 
409 
// Seconds elapsed since start_time_wall_clock_ was set in the constructor (whole-second resolution from time()).
// NOTE(review): the signature line (listing line 410) is missing from this listing.
411  return time(NULL) - start_time_wall_clock_;
412 }
413 
// Body of the statistics printer: writes one "STATW" line with wall time,
// total timed seconds, the share of each timing category in percent, traffic
// in Mb and average send/receive time per work unit in ms, followed by queue
// and memory figures for the inbound and outbound queues.
// NOTE(review): the signature line (listing line 414) and listing lines 418-424
// and 441 are missing from this listing; the right-hand side of "total_secs ="
// (presumably the sum of the timing_total_ entries) is among them.
// Change in this version: send_wu_time_ / recv_wu_time_ are accumulated in
// seconds, so the averages labelled "ms" are now multiplied by 1000 instead of
// being divided by 1000.
415 {
416  core::Real total_secs = 0;
417  total_secs =
425  TR << "STATW" << get_machine_letter() << " " <<
426  I( (int)7, (int)wall_time() ) << "s " <<
427  I( (int)7, total_secs ) << "s " <<
428  F( 4, 1, 100.0f*( timing_total_[TIMING_CPU] / total_secs)) << "% " <<
429  F( 4, 1, 100.0f*( timing_total_[TIMING_TRANSFER_SEND] / total_secs)) << "% " <<
430  F( 4, 1, 100.0f*( timing_total_[TIMING_TRANSFER_RECV] / total_secs)) << "% " <<
431  F( 4, 1, 100.0f*( timing_total_[TIMING_IO_READ] / total_secs)) << "% " <<
432  F( 4, 1, 100.0f*( timing_total_[TIMING_IO_WRITE] / total_secs)) << "% " <<
433  F( 4, 1, 100.0f*( timing_total_[TIMING_WAIT] / total_secs)) << "% " <<
434  F( 4, 1, 100.0f*( timing_total_[TIMING_IDLE] / total_secs)) << "% " <<
435  F( 6, 2, (float(traffic_total_sent_)/1024.0/1024.0) ) << "Mb " <<
436  F( 6, 2, (float(traffic_total_received_)/1024.0/1024.0) ) << "Mb " <<
 // Seconds -> milliseconds; the +1 in the divisor avoids a division by zero before the first transfer.
437  F( 6, 2, (float(send_wu_time_*1000.0/(send_wu_time_n_+1)) ) ) << "ms " <<
438  F( 6, 2, (float(recv_wu_time_*1000.0/(recv_wu_time_n_+1)) ) ) << "ms " <<
439  "";
440 
442 
443  core::Size in_total = inbound().size();
444  core::Size in_total_structs=0;
445  core::Size in_total_structs_memory=0;
446  core::Size in_total_WU_memory=0;
447  inbound().mem_stats( in_total_structs, in_total_structs_memory, in_total_WU_memory );
448 
449  core::Size out_total = outbound().size();
450  core::Size out_total_structs=0;
451  core::Size out_total_structs_memory=0;
452  core::Size out_total_WU_memory=0;
453  outbound().mem_stats( out_total_structs, out_total_structs_memory, out_total_WU_memory );
454 
 // NOTE(review): the label "OMem: " is printed twice below (mem_stats sum, then mem_foot_print()); left unchanged so the log format stays the same.
455  TR <<
456  "IWUs: " << in_total << " " <<
457  "IStruc: " << in_total_structs << " " <<
458  "IMem: " << int((in_total_WU_memory + in_total_structs_memory)/1000.0) << " " <<
459  "OWUs: " << out_total << " " <<
460  "OStruc: " << out_total_structs << " " <<
461  "OMem: " << int((out_total_WU_memory+out_total_structs_memory)/1000.0) << " " <<
462  "OMem: " << outbound().mem_foot_print() <<
463  " kB " <<
464  std::endl;
465 
466 }
467 
468 
469 
470 } // namespace wum
471 } // namespace protocols
472 
473 
474 
475