Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
JobDistributors.hh
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/jobdist/JobDistributors.hh
11 ///
12 /// @brief
13 /// @author Ian W. Davis
14 
15 
16 #ifndef INCLUDED_protocols_jobdist_JobDistributors_hh
17 #define INCLUDED_protocols_jobdist_JobDistributors_hh
18 
19 #define TAG_NUM_FORMAT_LENGTH 8
20 
21 #ifdef USEMPI
22 #include <mpi.h>
23 #endif
24 
25 // Project headers
26 // This has to come before boinc.hh or we get this error on VC++
27 // '_read' : is not a member of 'std::basic_istream<_Elem,_Traits>'
28 // AUTO-REMOVED #include <utility/io/izstream.hh>
29 
30 #ifdef BOINC
31 #include <protocols/boinc/boinc.hh>
32 #endif // BOINC
33 
35 
36 // AUTO-REMOVED #include <protocols/checkpoint/Checkpoint.hh>
37 
38 // AUTO-REMOVED #include <core/svn_version.hh>
39 #include <core/types.hh>
40 // AUTO-REMOVED #include <core/io/pdb/file_data.hh>
41 // AUTO-REMOVED #include <core/io/atom_tree_diffs/atom_tree_diff.hh>
42 // AUTO-REMOVED #include <core/conformation/Residue.hh>
43 // AUTO-REMOVED #include <basic/options/option.hh>
44 // AUTO-REMOVED #include <core/pose/Pose.hh>
45 #include <basic/Tracer.hh>
46 // AUTO-REMOVED #include <core/scoring/Energies.hh>
47 // AUTO-REMOVED #include <core/scoring/ScoreFunction.hh>
48 // AUTO-REMOVED #include <core/scoring/ScoreFunctionFactory.hh>
49 
50 // AUTO-REMOVED #include <core/io/silent/util.hh>
51 // AUTO-REMOVED #include <core/io/raw_data/DecoyFileData.hh>
52 // AUTO-REMOVED #include <core/io/raw_data/ScoreFileData.hh>
53 // AUTO-REMOVED #include <core/io/silent/SilentFileData.hh>
54 // AUTO-REMOVED #include <core/io/silent/SilentFileData.fwd.hh>
55 // AUTO-REMOVED #include <core/io/silent/SilentStructFactory.hh>
56 
57 // AUTO-REMOVED #include <numeric/random/random.hh>
58 // AUTO-REMOVED #include <numeric/numeric.functions.hh>
59 
60 // AUTO-REMOVED #include <utility/exit.hh>
61 // AUTO-REMOVED #include <utility/file/file_sys_util.hh>
62 #include <utility/file/FileName.hh>
63 #include <utility/io/ozstream.hh>
64 // AUTO-REMOVED #include <utility/vector1.hh>
65 #include <utility/pointer/ReferenceCount.hh>
66 // AUTO-REMOVED #include <utility/string_util.hh>
67 
68 // ObjexxFCL headers
69 // AUTO-REMOVED #include <ObjexxFCL/string.functions.hh>
70 
71 #include <map>
72 #include <set>
73 #include <sstream>
74 #include <string>
75 
76 
77 // option key includes
78 
79 // AUTO-REMOVED #include <basic/options/keys/out.OptionKeys.gen.hh>
80 // AUTO-REMOVED #include <basic/options/keys/run.OptionKeys.gen.hh>
81 
83 #include <core/pose/Pose.fwd.hh>
84 #include <utility/vector1.hh>
85 
86 
87 
88 #ifdef BOINC
89 #ifdef USEMPI
90 Throw a compiler error because MPI and BOINC cannot be used together!
91 If you got this message, something is wrong with your build settings.
92 #endif
93 #endif
94 
95 
96 namespace protocols {
97 namespace jobdist {
98 
99 extern basic::Tracer JobDistributorTracer;
100 
101 ///@brief Coordinates processing of jobs across multiple Rosetta processes.
102 ///
103 ///@details
104 /// Job distributors need to be customized in three different ways:
105 /// - by cluster architecture (none/Condor, MPI, BOINC, etc)
106 /// - by local test for job completion (PDB exists, tag already in silent file, etc)
107 /// - by type of input (single PDB file, pair of PDB files, list of tag IDs, etc)
108 ///
109 /// Cluster architecture is a GLOBAL COMPILE-TIME decision: it's handled by scons,
110 /// it's the same for all executables built at that time, and it should be implemented
111 /// using ifdef's in this base class, by modifying the next_job() method directly.
112 ///
113 /// Test for job completion is a PER-EXECUTABLE decision: it's handled
114 /// by subclassing BaseJobDistributor and implementing the is_finished() method.
115 /// BaseJobDistributor will consult is_finished() in whatever way is appropriate
116 /// for the current cluster architecture to ensure that jobs are not repeated.
117 ///
118 /// Type of input is handled by templating the job distributor on a Job object.
119 /// BasicJob has been provided already, but you can subclass it if you need
120 /// to carry around additional information about the inputs.
121 ///
123 {
124 protected:
125 
127 
128 public:
129 
132  virtual ~BaseJobDistributor();
133 
134  ///@brief If true, sets the next Job and nstruct number to be processed.
135  /// Deliberately not virtual: should not be overridden. Uses the "find_available_job"
136  /// method, which is common to both MPI and standard protocols, but used in slightly
137  /// different manners.
138  bool next_job(BasicJobOP & job, int & struct_n);
139 
140  ///@brief Must be called by client before first call to next_job().
141  /// If overridden by a subclass, it MUST call the superclass implementation.
142  virtual void startup();
143 
144  ///@brief Must be called by client after last call to next_job().
145  /// If overridden by a subclass, it MUST call the superclass implementation.
146  virtual void shutdown();
147 
148  ///@brief Signal that if at all possible, we would like to not be killed while in the critical section.
149  /// If overridden by a subclass, it MUST call the superclass implementation.
150  virtual void begin_critical_section();
151 
152  ///@brief Signal that we have left the critical section (counterpart to begin_critical_section()).
153  /// If overridden by a subclass, it MUST call the superclass implementation.
154  virtual void end_critical_section();
155 
156  ///@brief Virtual function for dump_pose that is needed for main_plain_mover
157  virtual void dump_pose_and_map( std::string const &, core::pose::Pose & );
158 
159  ///@brief Virtual function for temp_file main_plain_mover
160  virtual void temp_file( std::string const & );
161 
162  ///@brief Virtual function for score_map that is needed for main_plain_mover
163  /// sets the score_map
164  virtual void score_map( std::map < std::string, core::Real> & );
165 
168 
169  void disable_output(){ nooutput_ = true; } // sets nooutput_ (the "do not write files" flag) to true
170  void enable_output(){ nooutput_ = false; } // sets nooutput_ (the "do not write files" flag) back to false
171 
172  void disable_inprogress(){ inprogress_ = false; } // sets inprogress_ to false; NOTE(review): presumably turns off writing of .in_progress files - confirm
173  void enable_inprogress(){ inprogress_ = true; } // sets inprogress_ to true; NOTE(review): presumably turns on writing of .in_progress files - confirm
174 
175  bool ignorefinished() const { return ignorefinished_; } // read-only accessor for ignorefinished_; NOTE(review): presumably the "ignore already done jobs" flag - confirm
176  bool nooutput() const { return nooutput_; } // read-only accessor for nooutput_ (true means: do not write files)
177  bool inprogress() const { return inprogress_; } // read-only accessor for inprogress_
178 
179  void set_proc_id( core::Size proc_id, core::Size nproc ) { // stores both arguments in the members of the same name
180  proc_id_ = proc_id; // NOTE(review): presumably the id compared against "job_nr mod nprocs_" in the communication-free split - confirm
181  nproc_ = nproc; // NOTE(review): presumably the process count for that same split - confirm
182  }
183  ///@brief get output_tag for current job's current nstruct
184  ///@details by default return current_job's current_nstruct' output_tag.
185  ///Overridden in derived PlainSilentFileJobDistributor to return names with "S_" at the beginning
187 
189 
190 protected:
191 
192  ///@brief Is the given nstruct number of the given input job already finished by the local process?
193  /// To be implemented by subclasses.
194  virtual bool is_finished(BasicJobOP const & job, int struct_n) = 0;
195 
196  ///@brief Restore state from checkpoint file, if it exists
197  virtual void checkpoint_read();
198  ///@brief Save state to checkpoint file, overwriting previous
199  virtual void checkpoint_write();
200  ///@brief Remove checkpoint file (at end of batch)
201  virtual void checkpoint_clear();
202 
203  ///@brief accessor for current_nstruct_
205  ///@brief accessor for current_job owning pointer
207 
208 #ifdef USEMPI
209  /// @brief Check that a call to MPI_Init has occurred already -- for use in runtime_assert statements
210  bool MPI_has_been_initialized() const;
211 
212  /// @brief read access to derived classes for MPI related (const) data
213  /// @details must not be called until startup has been called
214  int mpi_rank() const;
215 
216  /// @brief read access to derived classes for MPI related (const) data
217  /// @details must not be called until startup has been called.
218  int mpi_nprocs() const;
219 
220  /// @brief Node 0 does no work, rather, it tells the other n-1 processors
221  /// which jobs they should work on. When all the work has been distributed, it tells the
222  /// slave nodes to spin down. Once all other nodes have been spun-down, it quits.
223  void master_node_distribute_jobs();
224 
225  /// @brief Request a job from node 0. If node 0 has no work left to be done, returns false.
226  bool request_job_from_master_node();
227 
228 #endif
229 
230 private:
231  ///@brief looks for a job that has not yet been started, and stores the index for the job, and the
232  /// nstruct index in the member variables. Called by next_job() and by master_node_distribute_jobs().
233  bool find_available_job();
234 
235 private:
236 
237  bool const overwrite_; // = basic::options::option[ basic::options::OptionKeys::out::overwrite ];
242 
243  //for simple distribution without communication
244  //this Jobdistributor will only process job-nr if ( job_nr mod nprocs_ == proc_id )
247  Size curr_jobid_; //running number
248 
249 
250 #ifdef USEMPI
251  /// Private data, readable by the derived class -- valid only after startup() has been called.
252  int mpi_rank_;
253  int mpi_nprocs_;
254  /// Private data not to be read by the derived class
255  MPI_Status stat_;
256  int tag_;
257 #endif
258 
259 protected:
260  /// ignore already done jobs - redo everything !
262 
263  /// do not write files - useful for such things as statistics or rescoring!
264  bool nooutput_;
265 
266  /// write .in_progress files for multiple processor runs
268 
269  /// starttime
271 
272  /// RandomStore - needed for
273 
274 
277 
278  int get_next_random_range(int low, int high);
279 
280 }; // BaseJobDistributor
281 
282 
283 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
284 ///@brief Distributor for use with atomtree_diff silent files.
285 ///@details This class is deliberately designed for each process to work on
286 /// its own silent file (and preferably in its own directory);
287 /// unlike Rosetta++, we DO NOT share silent files among multiple processes.
288 /// First, true atomic file locking is impossible on most distributed file systems (e.g. NFS),
289 /// meaning that files may be corrupted by multiple simultaneous writes.
290 /// For long-running processes, collisions may be rare,
291 /// but as we scale to more processors it becomes increasingly dangerous.
292 /// Second, many processes writing to the same file (or even directory) can cause
293 /// tremendous file system bottlenecks, possibly bringing down the cluster;
294 /// ask Keith or Chance in the Baker lab for details.
296 {
297 public:
299 
300 protected:
301 
303 
304 public:
305 
307  virtual ~AtomTreeDiffJobDistributor();
308 
309  ///@brief Appends pose to the silent file
310  virtual void dump_pose(
311  std::string const & tag,
312  std::map< std::string, core::Real > const & scores,
313  core::pose::Pose const & ref_pose,
314  core::pose::Pose const & pose
315  );
316 
317  ///@brief Sets number of digits used in writing atomtree diff.
318  virtual void set_precision(
319  int bb_precision,
320  int sc_precision,
321  int bondlen_precision
322  );
323 
324  virtual void shutdown();
325 
326 protected:
327 
328  virtual bool is_finished(BasicJobOP const & job, int struct_n );
329 
330 private:
331 
332  utility::io::ozstream out_;
333  std::set< std::string > used_tags_;
336 
337 };
338 
339 
340 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
341 ///@brief Distributor for use with plain old PDB files.
342 /// Use is strongly discouraged in production environments!
343 ///@details This class is deliberately designed for each process to write
344 /// its own PDB files in its own directory; it checks for pre-existing files only
345 /// for use by stopped and re-started jobs, NOT for coordinating between processes.
346 /// (To coordinate, it would have to "touch" the nonexistent file in next_job()
347 /// before starting to process it, but see AtomTreeDiffJobDistributor for an
348 /// explanation of why coordinating processes via the filesystem is a bad idea.)
350 {
351 public:
353 
354 protected:
355 
357 
358 public:
359 
360  PlainPdbJobDistributor(JobVector jobs, std::string outfile_name="none");
361  virtual ~PlainPdbJobDistributor();
362 
363  ///@brief Allows setting of inprogress.
364  virtual void startup();
365 
367 
368  ///@brief Translates an output tag name to an output PDB file name.
369  virtual std::string get_output_filename(std::string const & tag);
370 
371  ///@brief Writes pose and basic score data to a standard PDB file.
372  virtual void dump_pose_and_map(
373  std::string const & tag,
374  core::pose::Pose & pose
375  );
376 
377  ///@brief Opens a temp file (.in_progress)
378  virtual void temp_file(std::string const & tag);
379 
380  void score_map( std::map < std::string, core::Real > & score_map_in ) { score_map_ = score_map_in; } // copies score_map_in into the score_map_ member
381 protected:
382 
383  virtual bool is_finished(BasicJobOP const & job, int struct_n );
384 
385  ///@brief Writes score data to PDB file in YAML format.
386  virtual void dump_scores(
387  utility::io::ozstream & out,
388  std::string const & tag,
389  core::pose::Pose & pose
390  );
391 
392 private:
393 
396  std::map< std::string, core::Real > score_map_;
397 
398 };
399 
400 
401 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
402 ///@brief Distributor for use with raw files.
403 ///@details This class is deliberately designed for each process to work on
404 /// its own silent file (and preferably in its own directory);
405 /// unlike Rosetta++, we DO NOT share silent files among multiple processes.
406 /// First, true atomic file locking is impossible on most distributed file systems (e.g. NFS),
407 /// meaning that files may be corrupted by multiple simultaneous writes.
408 /// For long-running processes, collisions may be rare,
409 /// but as we scale to more processors it becomes increasingly dangerous.
410 /// Second, many processes writing to the same file (or even directory) can cause
411 /// tremendous file system bottlenecks, possibly bringing down the cluster;
412 /// ask Keith or Chance in the Baker lab for details.
414 {
415 public:
417 
418 protected:
419 
421 
422 public:
423 
424  PlainRawJobDistributor(JobVector jobs, std::string outfile_name);
425 
426  virtual ~PlainRawJobDistributor();
427 
428  ///@brief Writes pose and basic score data to a standard silent file.
429  virtual void dump_pose_and_map(
430  std::string const & tag,
431  core::pose::Pose & pose
432  );
433 
435 
436  ///@brief Translates an output tag name to an output PDB file name.
437  virtual std::string get_output_filename(std::string const & tag);
438  virtual std::string get_output_tag( int const & struct_n );
439 
440  void score_map( std::map < std::string, core::Real > & score_map_in ) { score_map_ = score_map_in; } // copies score_map_in into the score_map_ member
441 protected:
442 
443  virtual bool is_finished(BasicJobOP const & job, int struct_n );
444 
445 private:
446 
449  std::map < std::string, core::Real > score_map_;
450 
451 };
452 
453 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
454 ///@brief Distributor for use with silent files.
455 ///@details This class is deliberately designed for each process to work on
456 /// its own silent file (and preferably in its own directory);
457 /// unlike Rosetta++, we DO NOT share silent files among multiple processes.
458 /// First, true atomic file locking is impossible on most distributed file systems (e.g. NFS),
459 /// meaning that files may be corrupted by multiple simultaneous writes.
460 /// For long-running processes, collisions may be rare,
461 /// but as we scale to more processors it becomes increasingly dangerous.
462 /// Second, many processes writing to the same file (or even directory) can cause
463 /// tremendous file system bottlenecks, possibly bringing down the cluster;
464 /// ask Keith or Chance in the Baker lab for details.
466 {
467 public:
469 
470 protected:
472 
473 public:
474 
476 
478 
479  ///@brief Writes pose and basic score data to a standard silent file.
480  virtual void dump_pose(
481  BasicJobOP const & job,
482  int const & nstruct,
483  bool const & fullatom,
484  core::pose::Pose & pose
485  );
486 
487  ///@brief Writes the silent_struct to a silent file
488  void dump_silent(
489  int const & struct_n,
490  core::io::silent::SilentStruct & silent_struct
491  );
492 
493  void dump_silent(
494  core::io::silent::SilentFileData const& silent_file
495  );
496 
497  virtual std::string get_output_tag( BasicJobOP const & job, int const & struct_n ) const;
500 
501  virtual void startup();
502 
503  virtual void shutdown();
504 
505 protected:
506 
507  virtual bool is_finished(BasicJobOP const & job, int struct_n );
508 
509 private:
510 
512 
513 };
514 
515 
516 } // namespace jobdist
517 } // namespace protocols
518 
519 #endif // INCLUDED_protocols_jobdist_JobDistributors_hh