Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MPIWorkPartitionJobDistributor.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/jd2/MPIWorkPartitionJobDistributor.cc
11 /// @brief implementation of MPIWorkPartitionJobDistributor - intended for MPI jobs on small numbers of nodes where the load can be balanced equally by the user
12 /// @author P. Douglas Renfrew (renfrew@unc.edu)
13 
14 // MPI headers
15 #ifdef USEMPI
16 #include <mpi.h> //keep this first
17 #endif
18 
19 
20 // Unit headers
22 
23 //Package headers
25 #include <protocols/jd2/Job.hh>
26 
27 #include <protocols/moves/Mover.hh>
28 
29 ///Utility headers
30 #include <basic/Tracer.hh>
31 #include <basic/options/option.hh>
32 #include <utility/exit.hh>
33 
34 // Option headers
35 #include <basic/options/keys/out.OptionKeys.gen.hh>
36 #include <basic/options/keys/run.OptionKeys.gen.hh>
37 
38 
39 ///C++ headers
40 #include <string>
41 
42 //Auto Headers
43 #include <utility/vector1.hh>
44 #include <complex>
45 
46 
47 static basic::Tracer TR("protocols.jd2.MPIWorkPartitionJobDistributor"); // file-local tracer; all log output below goes through it
48 
49 namespace protocols {
50 namespace jd2 {
51 
52 ///@details Constructor. Notice it calls the parent class! It records the process count (npes_) and this process's
53 ///index (rank_) -- from the run::nproc / run::proc_id options, overridden by MPI_COMM_WORLD in USEMPI builds -- for
54 ///later job determination. All processors share the same Jobs object (set by the parent class); they only iterate over it differently.
57  npes_( 1 ),
58  rank_( 0 ),
59  job_id_start_( 0 ),
60  job_id_end_( 0 ),
61  next_job_to_try_assigning_( 0 )
62 {
63  // set npes and rank: command-line options first, then MPI (when compiled in) overrides them
64 
65  using namespace basic::options;
66  using namespace basic::options::OptionKeys;
67  npes_ = option[ OptionKeys::run::nproc ](); //make this same as in "queue"-command in condor script
68  rank_ = option[ OptionKeys::run::proc_id ](); //condor: pass $(PROCESS) here. NOTE(review): the old hint said -jd2:condor_rank, but the key read is run::proc_id -- confirm flag name
69 #ifdef USEMPI
70  //npes_ = MPI::COMM_WORLD.Get_size(); (old C++ MPI bindings, superseded by the C calls below)
71  //rank_ = MPI::COMM_WORLD.Get_rank();
72  MPI_Comm_rank( MPI_COMM_WORLD, (int*)( &rank_ ) ); // NOTE(review): MPI writes an int; if rank_ is wider than int only part of it is overwritten -- confirm member type
73  MPI_Comm_size( MPI_COMM_WORLD, (int*)( &npes_ ) ); // NOTE(review): same int* cast concern as for rank_ above
74 #endif
75 
78  // NOTE(review): listing lines 76-77 are absent from this copy; job_id_start_/job_id_end_ are presumably computed there before being logged -- confirm
79  Jobs const & jobs( get_jobs() );
80  TR << "RANK: " << rank_ << " NUM_PROCS: " << npes_ << " NUM_JOBS: " << jobs.size()
81  << " START_ID: " << job_id_start_ << " END_ID: " << job_id_end_ << std::endl; // one summary line per process
82 }
83 
84 ///@brief Destructor; the body is intentionally empty.
85 ///WARNING WARNING! SINGLETONS' DESTRUCTORS ARE NEVER CALLED IN MINI! DO NOT TRY TO PUT THINGS IN THIS FUNCTION!
86 ///Here is a link explaining why: http://www.research.ibm.com/designpatterns/pubs/ph-jun96.txt
88 {}
89 
90 ///@details All processors get the same Jobs object; this function picks the contiguous slice that belongs to this
91 ///processor from its rank and the number of processors alone (no communication), storing it in job_id_start_/job_id_end_.
92 /// EXAMPLE CASE: 18 jobs, 4 processors
93 ///   processor rank | number of jobs assigned | range in Jobs vector
94 ///         0        |            5            |        1-5
95 ///         1        |            5            |        6-10
96 ///         2        |            4            |       11-14
97 ///         3        |            4            |       15-18
98 void
100 {
101  Jobs const & jobs( get_jobs() );
102 
103  core::Size num_jobs( 0 );
104  core::Size jobs_mod_procs( jobs.size() % npes_ ); // leftover jobs after an even split. NOTE(review): npes_ == 0 would be a modulo by zero -- confirm it cannot happen
105  core::Real jobs_div_procs( core::Real( jobs.size() ) / core::Real( npes_ ) ); // fractional jobs-per-processor; rounded up or down below
106 
107  // calculate number of jobs to run and what id to start at
108 
109  // if jobs_mod_procs == 0 (evenly divisible), an equal number of jobs go to each processor. +1 is because rank_ is
110  // 0-indexed but Jobs is 1-indexed
111  if( jobs_mod_procs == 0 ) {
112  num_jobs = core::Size( jobs_div_procs );
113  job_id_start_ = rank_ * num_jobs + 1;
114  }
115  // if the rank is less than jobs%procs, this processor takes one of the leftover jobs: its job count is the ceiling
116  // of jobs/processors. +1 is because rank_ is 0-indexed but Jobs is 1-indexed
117  else if( rank_ < jobs_mod_procs ) {
118  num_jobs = (core::Size) std::ceil( jobs_div_procs );
119  job_id_start_ = rank_ * num_jobs + 1;
120  }
121  // if the rank is more than or equal to jobs%procs, the number of jobs per processor is the floor of jobs/processors;
122  // take that many jobs. rank * num jobs accounts for bulk of earlier jobs. jobs_mod_procs accounts for all processors
123  // with rank < jobs_mod_procs getting an extra job because they use ceiling instead of floor in num_jobs. +1 is
124  // because rank_ is 0-indexed but Jobs is 1-indexed
125  else if( rank_ >= jobs_mod_procs ) {
126  num_jobs = (core::Size) std::floor( jobs_div_procs );
127  job_id_start_ = rank_ * num_jobs + jobs_mod_procs + 1;
128  }
129  else { // NOTE(review): appears unreachable -- the two preceding conditions are complementary
130  utility_exit_with_message("ERROR: Problem determining job ids to run");
131  }
132 
133  // calculate job_id_end (inclusive); with num_jobs == 0 this is job_id_start_ - 1, i.e. an empty range
134  job_id_end_ = job_id_start_ + num_jobs - 1;
135 }
136 
///@details Runs go_main( mover ). In USEMPI builds it then blocks at a barrier on MPI_COMM_WORLD and calls
///MPI_Finalize(), so no further MPI calls may be made once this function returns.
137 void
139 {
140  go_main( mover );
141 #ifdef USEMPI
142  //MPI::COMM_WORLD.Barrier(); (old C++ MPI bindings, superseded by the C calls below)
143  //MPI::Finalize();
144  MPI_Barrier( MPI_COMM_WORLD ); // wait until every process reaches this point before shutting MPI down
145  MPI_Finalize();
146 #endif
147 }
148 
149 ///@details determine which job to assign next: increment until we run out of available jobs
///A job is passed over when the outputter reports it as completed and the out::overwrite option is not set.
///Returns the id stored in next_job_to_try_assigning_ for the job to run, or 0 when no jobs remain.
///NOTE(review): listing lines 150-151, 156, 159, 165 and 167 (signature, loop/if headers, increments) are missing
///from this copy -- confirm the exact control flow against the real file.
152 {
153  Jobs const & jobs( get_jobs() );
154  JobOutputterOP outputter = job_outputter();
155 
157  if ( outputter->job_has_completed( jobs[ next_job_to_try_assigning_ ] ) &&
158  !basic::options::option[ basic::options::OptionKeys::out::overwrite ].value() ) {
160  } else {
161  break; // first job that still needs running (or may be overwritten): stop scanning
162  }
163  }
164 
166  core::Size job_to_assign = next_job_to_try_assigning_;
168  return job_to_assign;
169  }
170 
171  // indicate that no jobs remain
172  return 0;
173 }
174 
///@details Checks that the current job is the one immediately before next_job_to_try_assigning_.
///NOTE(review): listing lines 176 and 179-180 (signature and the statements after the assert) are missing from this
///copy -- what is done after the check cannot be told from here.
175 void
177 {
178  runtime_assert( current_job_id() == next_job_to_try_assigning_ - 1 ); // the cursor must sit exactly one past the current job
181 }
182 
183 ///@details This function handles the FAIL_BAD_INPUT mover status by removing other jobs with the same input from
184 ///consideration. It DOES NOT percolate across processors - so if multiple processors have jobs starting with the
185 ///same bad input, each of them will go through this function. This is less efficient than it theoretically could be
186 ///(but it's good enough). Only the consecutive jobs from next_job_to_try_assigning_ onward that share the tag are examined.
187 void
189 {
190  std::string const & current_input_tag(current_job()->input_tag());
191 
192  TR << "job failed, reporting bad input; other jobs of same input will be canceled: "
193  << job_outputter()->output_name( current_job() ) << std::endl;
194 
195  Jobs const & jobs( get_jobs() );
196 
197  while(next_job_to_try_assigning_ <= job_id_end_ && //MUST BE FIRST: short-circuit && keeps the jobs[] lookup below inside this processor's range
198  jobs[next_job_to_try_assigning_]->input_tag() == current_input_tag) {
199  TR << "job canceled without trying due to previous bad input: "
200  << job_outputter()->output_name( jobs[next_job_to_try_assigning_] ) << std::endl;
202  } // NOTE(review): listing line 201 is missing from this copy; the cursor is presumably advanced there -- confirm
203 }
204 
205 }//jd2
206 }//protocols