Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPIFileBufJobDistributor.hh
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/jd2/MPIFileBufJobDistributor.hh
11 /// @brief header for MPIFileBufJobDistributor - intended for MPI jobs on large numbers of nodes where the head node is dedicated to handing out new job ids
12 /// @author Oliver Lange olange@u.washington.edu
13 
14 #ifndef INCLUDED_protocols_jd2_MPIFileBufJobDistributor_hh
15 #define INCLUDED_protocols_jd2_MPIFileBufJobDistributor_hh
16 
17 // Unit headers
19 
20 // Package headers
22 #include <protocols/jd2/Job.fwd.hh>
23 // AUTO-REMOVED #include <protocols/jd2/JobDistributorFactory.hh>
24 
26 #include <core/pose/Pose.fwd.hh>
27 
28 // Utility headers
29 #include <core/types.hh>
30 
31 // C++ headers
32 #include <string>
33 
34 #include <utility/vector1.hh>
35 
36 
37 namespace protocols {
38 namespace jd2 {
39 
40 ///@brief MPI message tag used for all communication with the JobDistributor ( messages sent with this tag are received in the main MSG-loop of jd2, cf. process_message() )
41 core::Size const MPI_JOB_DIST_TAG ( 1542 ); //keep unique
42 
43 ///@brief Tags used to tag messages sent by MPI functions, used to decide whether a slave is requesting a new job id or
44 ///flagging a job as having bad input
45 //FileBufjobDistributor messages range 1-100
50 
51 ///@details This JobDistributor is intended for machines where you have a large number of processors.
52 /// Two dedicated processes are used to handle job distribution and file I/O;
53 /// all other processes (higher rank) are used for computation.
54 /// The file_buf_rank_ process runs the MpiFileBuffer, which is at the receiving end of all ozstream output that is rerouted via MPI from the
55 /// slave nodes.
56 /// This means all slaves write to the same file without file-system congestion or interleaving in the file -- I/O is handled by a single dedicated process.
57 /// The other dedicated process (master_rank_) runs the actual JobDistributor
58 /// and is only used to distribute jobs to slaves and receive their notifications of successful or failed execution.
59 /// If you have only a small number of processors, you can run, say, 10 MPI processes on 8 processors to get optimal CPU usage.
61 {
63 protected:
64  ///@brief ctor is protected; singleton pattern
66 
67  ///@brief protected ctor for child-classes
69 
70  virtual void handle_interrupt() {} // empty body: nothing is done on interrupt in this class
71 
72 public:
73  ///WARNING WARNING! SINGLETONS' DESTRUCTORS ARE NEVER CALLED IN MINI! DO NOT TRY TO PUT THINGS IN THIS FUNCTION!
74  ///here's a nice link explaining why: http://www.research.ibm.com/designpatterns/pubs/ph-jun96.txt
75  virtual ~MPIFileBufJobDistributor();
76 
78  return ++min_client_rank_;
79  }
80 
81  ///@brief return rank of first worker process (there might be more dedicated processes, e.g., ArchiveManager...)
83  return min_client_rank_;
84  }
85 
86 
87 
88  ///@brief dummy for master/slave version
89  virtual
90  void
92 
93 
94  ///@brief dummy for master/slave version
95  virtual
98 
99  ///@brief dummy for master/slave version
100  virtual
101  void
103 
104  ///@brief dummy for master/slave version
105  virtual
106  void
108 
109  ///@brief dummy for master/slave version
110  virtual
111  void
112  job_succeeded(core::pose::Pose & pose, core::Real runtime);
113 
114  virtual
115  void
116  job_failed( core::pose::Pose & pose, bool will_retry );
117 
118  friend class JobDistributorFactory; //ctor access
119 
120 protected:
121 
122  ///@brief handle a message identified by msg_tag; @return true if message was understood
123  virtual bool process_message(
124  core::Size msg_tag,
125  core::Size slave_rank, // presumably the rank of the sending slave process -- TODO confirm
126  core::Size slave_job_id,
127  core::Size slave_batch_id,
128  core::Real runtime
129  );
130 
131  //overloaded so that slave-nodes never automatically switch to next_batch when spinning down.
132  virtual bool next_batch();
133 
134  ///@brief Handles the receiving of job requests and the sending of job ids to and from slaves
135  void master_go( protocols::moves::MoverOP mover );
136 
137  ///@brief Always returns zero, simply increments next_job_to_assign_ to the next job that should be run based
138  ///on what has been completed and the overwrite flags
140 
141  ///@brief requests, receives, and returns a new job id from the master node or returns the current job id if the
142  ///repeat_job_ flag is set to true
144 
145  ///@brief This should never be called as this is handled internally by the slave nodes, it utility_exits
147 
148  ///@brief Sets the repeat_job_ flag to true
150 
151  ///@brief Simply increments next_job_to_assign_ to the next job that should be run based on what has been
152  ///completed and if the input job tag of the job marked as having bad input
154 
155  ///@brief Sends a message to the head node that contains the id of a job that had bad input
157 
158  ///@brief This should never be called as this is handled internally by the slave nodes, it utility_exits
160 
161  ///@brief Sends a message to the head node upon successful job completion to avoid output interleaving
163 
164  ///@brief send a message to master
165  void slave_to_master( core::Size tag );
166 
167  ///@brief called by master to send and by slave to receive job
168  void send_job_to_slave( core::Size slave_rank );
169 
170  ///@brief return rank of this process (the value stored in rank_)
171  core::Size rank() const {
172  return rank_;
173  }
174 
175  ///@brief return rank of master process ( where JobDistributor is running )
177  return master_rank_;
178  }
179 
180  ///@brief return rank of file-buffer process ( where output data (via ozstream )is handled )
182  return file_buf_rank_;
183  }
184 
185  ///@brief how many processes --- this includes dedicated processes
187  return n_rank_;
188  }
189 
190  ///@brief how many processes --- this includes dedicated processes
192  return n_rank_;
193  }
194 
195  ///@brief how many workers --- important to keep track during spin-down process
197  return n_worker_;
198  }
199 
200  ///@brief set the number of workers (stored in n_worker_) --- important to keep track during spin-down process
201  void set_n_worker( core::Size setting ) {
202  n_worker_=setting;
203  }
204 
205  ///@brief marks job as completed in joblist
206  virtual void mark_job_as_completed( core::Size job_id, core::Size batch_id, core::Real runtime );
207 
208  ///@brief marks job as bad in joblist
209  virtual void mark_job_as_bad( core::Size job_id, core::Size batch_id );
210 
211  ///@brief receive a certain signal and ignore it.... this is needed, for instance, when MPIArchiveJobDistributor triggers an
212  /// ADD_BATCH signal by sending QUEUE_EMPTY to the ArchiveManager...
213  void eat_signal( core::Size signal, int source );
214 
215 private:
216 
217  ///@brief total number of processing elements
219 
221 
222  ///@brief rank of the "local" instance
224 
225  ///@brief where slave jobs store current job id
226  core::Size slave_current_job_id_; //this overlays current_job_id_ of base class.... BAD
227 
228  ///@brief batch_id allow to run multiple batches of jobs -
229  core::Size slave_current_batch_id_; //i.e. next_job_to_assign_ is from this batch (for master)
230 
231  ///@brief runtime of last job
232  core::Real slave_current_runtime_; //presumably reported to the master together with job id and batch id (cf. process_message() parameters) -- TODO confirm
233 
234  ///@brief where master stores next job to assign (in a good state after get_new_job_id up until it's used)
235  //core::Size next_job_to_assign_;
236 
237  ///@brief where master temporarily stores id of jobs with bad input
239 
240  ///@brief where slave stores whether it should repeat its current job id
242 
243  ///@brief keep some statistics about the jobs
244  /// this is mostly just for silly tr.Info messages...
245  //// but we need some of this to properly spin-down at the end
246 
247  ///@brief jobs sent to slave-nodes
249 
250  ///@brief jobs that have returned (either bad or good)
252 
253  ///@brief jobs that have returned bad
255 
256  ///@brief how many more to spin down
258 
259  ///@brief keep here the ranks of different functional processes
260 
261  ///@brief the job-distributor (master)
263 
264  ///@brief the File-Buffer
266 
267  ///@brief the first slave node
268  //core::Size const min_client_rank_; //2 or 3 ...
269  core::Size min_client_rank_; //2 or 3 ... //ek made non-const
270 
271  ///@brief keep track of average timings for time-outs
273 
275 
276 };
277 
278 }//jd2
279 }//protocols
280 
281 #endif //INCLUDED_protocols_jd2_MPIFileBufJobDistributor_hh