Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ArchiveManager.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // This file is part of the Rosetta software suite and is made available under license.
5 // The Rosetta software is developed by the contributing members of the Rosetta Commons consortium.
6 // (C) 199x-2009 Rosetta Commons participating institutions and developers.
7 // For more information, see http://www.rosettacommons.org/.
8 
9 /// @file protocols/jd2/archive/ArchiveManager.cc
10 /// @brief implementation of ArchiveManager
11 /// @author Oliver Lange olange@u.washington.edu
12 
13 
14 // MPI headers
15 #ifdef USEMPI
16 #include <mpi.h> //keep this first
17 #endif
18 
19 // Unit headers
23 #include <protocols/jd2/BatchJobInputter.hh> //for BOGUS_BATCH_ID
24 
25 // Package headers
27 
28 
29 //for factory
30 //#include <protocols/abinitio/IterativeAbrelax.hh>
31 
32 // Utility headers
33 #include <basic/Tracer.hh>
34 #include <basic/MemTracer.hh>
35 #include <basic/options/option.hh>
36 #include <utility/exit.hh>
37 
39 
40 #include <ObjexxFCL/string.functions.hh>
41 #include <utility/file/file_sys_util.hh>
42 
43 // Option headers
44 #include <basic/options/keys/out.OptionKeys.gen.hh>
45 #include <basic/options/keys/jd2.OptionKeys.gen.hh>
46 #include <basic/options/keys/in.OptionKeys.gen.hh>
47 #include <basic/options/keys/run.OptionKeys.gen.hh>
48 #include <basic/options/keys/broker.OptionKeys.gen.hh>
49 #include <basic/options/keys/abinitio.OptionKeys.gen.hh>
50 
51 //#include <basic/options/keys/jd2.OptionKeys.gen.hh>
52 #include <basic/options/option.cc.gen.hh>
53 #include <basic/options/option_macros.hh>
54 #include <basic/prof.hh>
55 
56 // C++ headers
57 #include <string>
58 // AUTO-REMOVED #include <ctime>
59 #include <sstream>
60 #include <iterator>
61 
62 //Debug headers
63 // AUTO-REMOVED #include <protocols/abinitio/AbrelaxMover.hh>
64 #include <fstream> //testing
65 #include <utility/io/izstream.hh>
66 
67 #include <utility/io/ozstream.hh>
68 
69 #if (defined WIN32) //&& (!defined WIN_PYROSETTA)
70  #include <windows.h>
71 #endif
72 
73 //Auto Headers
75 //#include <protocols/jobdist/Jobs.hh>
76 //#include <protocols/noesy_assign/CrossPeak.hh>
77 #include <utility/vector1.hh>
78 #include <boost/bind.hpp>
79 
80 static basic::Tracer tr("protocols.jd2.Archive");
81 using basic::mem_tr;
82 
83 OPT_1GRP_KEY( File, iterative, input_pool )
84 OPT_1GRP_KEY( String, iterative, input_pool_struct_type )
85 
86 bool protocols::jd2::archive::ArchiveManager::options_registered_( false );
87 
88 using namespace basic::options;
89 using namespace basic::options::OptionKeys;
90 //Mike: when you want to remove these Macros... leave them at least here as comment - since they provide documentation
91 void protocols::jd2::archive::ArchiveManager::register_options() {
92  if ( !options_registered_ ) {
93  options_registered_ = true;
94  NEW_OPT( iterative::input_pool, "read these structures into pool", "" );
95  NEW_OPT( iterative::input_pool_struct_type, "specifies the input-silent-struct type", "protein" );
96  }
97 }
98 
99 namespace protocols {
100 namespace jd2 {
101 namespace archive {
102 
103 #ifdef WIN32
104  void sleep(int seconds){
105  //#if (defined WIN32) && (!defined WIN_PYROSETTA)
106  Sleep( seconds * 1000 );
107  //#endif
108  }
109 #endif
110 
111 using namespace basic::options;
112 using namespace basic::options::OptionKeys;
113 using namespace core;
114 
115 
116 
117 
119 Batch::batch() const {
120  return "batch_" + ObjexxFCL::lead_zero_string_of( id(), 6 );
121 }
122 
124  return batch() + "/";
125 }
126 
128  return batch() + "/decoys.out";
129 }
130 
132  // if ( has_silent_in() )
133  return batch() + "/decoys.in";
134  // else
135  // return "";
136 }
137 
139  return batch() + "/score.fsc";
140 }
141 
143  return batch() + "/flags";
144 }
145 
147  return batch() + "/setup.tpb";
148 }
149 
151  return "";
152 }
153 
154 void Batch::show( std::ostream& out, bool single_line ) const {
155  std::string eol( single_line ? " " : "\n" );
156  out << "ID " << id() << eol
157  << "INPUT " << ( has_silent_in() ? "yes" : "no" ) << eol
158  << "NSTRUCT " << nstruct() << eol
159  << "RETURNED " << decoys_returned() << eol
160  << "FINISHED " << ( has_finished() ? "yes" : "no" ) << eol
161  << "CANCELLED " << ( is_cancelled() ? "yes" : "no" ) << eol
162  << "ALLOW_READING_CANCELLED_DECOYS " << ( allow_reading_cancelled_decoys() ? "yes" : "no" ) << eol;
163 }
164 
165 std::ostream& operator<< (std::ostream& out, Batch const& batch ) {
166  batch.show( out, true );
167  return out;
168 }
169 
171  utility::io::ozstream out( dir() + "BATCH_INFO" );
172  tr.Debug << "write batch info " << dir() << "BATCH_INFO" << std::endl;
173  show( out, false /*not single_line*/ );
174 }
175 
177  core::Size this_id = id(); //to detec errors
178  std::string this_batch = batch(); //for error report
179  utility::io::izstream in( dir() + "BATCH_INFO" );
180  if ( !in.good() ) throw( EXCN_Archive( "cannot find " + dir() + "BATCH_INFO" ) );
181  in >> *this;
182  if ( this_id != id() ) {
183  throw( EXCN_Archive("Inconsistency detected when reading BATCH_INFO for "+ this_batch+" ID in BATCH_INFO is " + batch() ) );
184  }
185 }
186 
187 //instead of goto statements: I think goto would be clearer... but there are coding guidelines to adhere to...
188 void report_tag_error( Batch& batch, std::string const& expected_tag, std::string const& tag ) {
189  throw( EXCN_Archive( "Error reading batch information for batch: "+batch.batch()+" expected_tag: "+expected_tag+ " found " + tag) );
190 }
191 
192 void report_value_error( Batch& batch, std::string const& tag ) {
193  throw( EXCN_Archive( "Error reading batch information for batch: "+batch.batch()+" wrong value for tag: "+tag ) );
194 }
195 
196 std::istream& operator >> (std::istream& in, Batch &batch ) {
197  std::string tag;
198  std::string expected_tag;
199 
200  in >> tag;
201  expected_tag = "ID";
202  if ( tag == expected_tag ) {
203  in >> batch.batch_id_;
204  if ( !in.good() ) report_value_error( batch, tag );
205  } else report_tag_error( batch, expected_tag, tag );;
206 
207  in >> tag;
208  expected_tag = "INPUT";
209  if ( tag == expected_tag ) {
210  std::string yesno;
211  in >> yesno;
212  if ( !in.good() ) report_value_error( batch, tag );
213  if ( yesno == "yes" ) batch.has_silent_in_ = true;
214  else if ( yesno == "no" ) batch.has_silent_in_ = false;
215  else report_value_error( batch, tag );
216  } else report_tag_error( batch, expected_tag, tag );;
217 
218  in >> tag;
219  expected_tag = "NSTRUCT";
220  if ( tag == expected_tag ) {
221  in >> batch.nstruct_;
222  if ( !in.good() ) report_value_error( batch, tag );
223  } else report_tag_error( batch, expected_tag, tag );;
224 
225  in >> tag;
226  expected_tag = "RETURNED";
227  if ( tag == expected_tag ) {
228  in >> batch.decoys_returned_to_archive_;
229  if ( !in.good() ) report_value_error( batch, tag );
230  } else report_tag_error( batch, expected_tag, tag );;
231 
232  in >> tag;
233  expected_tag = "FINISHED";
234  if ( tag == expected_tag ) {
235  std::string yesno;
236  in >> yesno;
237  if ( !in.good() ) report_value_error( batch, tag );
238  if ( yesno == "yes" ) batch.has_finished_ = true;
239  else if ( yesno == "no" ) batch.has_finished_ = false;
240  } else report_tag_error( batch, expected_tag, tag );;
241 
242  in >> tag;
243  expected_tag = "CANCELLED";
244  if ( tag == expected_tag ) {
245  std::string yesno;
246  in >> yesno;
247  if ( !in.good() ) report_value_error( batch, tag );
248  if ( yesno == "yes" ) batch.is_cancelled_ = true;
249  else if ( yesno == "no" ) batch.is_cancelled_ = false;
250  } else report_tag_error( batch, expected_tag, tag );;
251 
252  return in;
253 }
254 
255 //#ifndef WIN32
256 
257 ///@details constructor. Notice it calls the parent class! It also builds some internal variables for determining
258 ///which processor it is in MPI land.
259 ArchiveManager::ArchiveManager( core::Size archive_rank, core::Size jd_master_rank, core::Size file_buf_rank ) :
260  archive_rank_( archive_rank ),
261  jd_master_rank_( jd_master_rank ),
262  file_buf_rank_( file_buf_rank ),
263  save_archive_time_interval_( 60 )
264 {
265  runtime_assert( options_registered_ );
266 }
267 
270  for ( BatchList::const_iterator it = batches().begin(); it != batches().end(); ++it ) {
271  if ( !it->has_finished() && !it->is_cancelled() && it->valid() ) ++unfinished_batches;
272  }
273  return unfinished_batches;
274 }
275 
276 void
278 {
279  tr.Debug << "starting ArchiveManager ..." << archive_rank_ << " " << jd_master_rank_ << " " << file_buf_rank_ << std::endl;
280  theArchive_ = archive;
281  theArchive_->set_manager( this );
282  theArchive_->initialize();
283  mem_tr << "initialized IterativeAbrelax" << std::endl;
284  try {
285  if ( !restore_archive() ) {
286  if ( option[ OptionKeys::iterative::input_pool ].user() ) {
287  std::string const& decoys( option[ OptionKeys::iterative::input_pool ]() );
288  tr.Info << "reading decoys from " << decoys << " into archive " << std::endl;
289  core::io::silent::SilentFileData sfd( decoys, false, false, option[ OptionKeys::iterative::input_pool_struct_type ]() );
290  sfd.read_file( decoys );
291  theArchive_->init_from_decoy_set( sfd );
292  }
293  }
294  save_archive();
296  } catch ( utility::excn::EXCN_Base& excn ) {
298  throw;
299  }
300  // if ( batches_.size() == 0 ) theArchive_->generate_batch();
301  sleep( 5 ); //give JobDistributor time to start up...
302 #ifdef USEMPI
303  MPI_Status status;
304  MPI_Request request;
305 #endif
306  bool stop( false );
307  bool print_status( true );
308  while ( !stop || unfinished_batches() ) {
309 
310  if ( print_status && tr.Debug.visible() ) {
311  tr.Debug << "probing for message in ArchiveManager" << std::endl;
312  tr.Debug << "\nSTATUS: " << (stop ? "STOP send: " : "" ) << " ------ unfinished_batches: " << unfinished_batches() << std::endl;
313  tr.Debug << "POOL_STATUS: " << std::endl;
314  theArchive_->save_status( tr.Debug );
315  tr.Debug << "END_STATUS\n\n"<< std::endl;
316  basic::show_time( tr, "manager main msg-loop: probe for message..." );
317  print_status = false;
318  }
319  //is there a message ?
320  int flag( -1 );
321 #ifdef USEMPI
322  //no idea why... but 4 request seems to be the magical number... to receive the correct answer ... WEIRD
323  MPI_Iprobe( jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &flag, &status );
324  MPI_Iprobe( jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &flag, &status );
325  MPI_Iprobe( jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &flag, &status );
326  MPI_Iprobe( jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &flag, &status );
327  MPI_Iprobe( jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &flag, &status );
328 #endif
329  // if ( !flag ) { //nothing ...
330  // //tell JobDistributor, that we are ready to receive message
331  // int buf[ 4 ];
332  // buf[ 0 ] = NOTIFICATION_QUERY;
333  // MPI_Send( &buf, 1, MPI_INT, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
334  // sleep( 1 );
335  // //check if there is something this time...
336  // MPI_Iprobe( jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &flag, &status );
337  // }
338 
339  try {
340  //if there is a message -- go get it.
341  int buf[ 6 ]={0,0,0,0,0,0};
342  if ( flag ) {
343 #ifdef USEMPI
344  int merrno = MPI_Recv( &buf, 6, MPI_INT, jd_master_rank_, MPI_ARCHIVE_TAG, MPI_COMM_WORLD, &status );
345  if ( merrno != MPI_SUCCESS ) tr.Error << "ERROR: MPI_Recv error " << std::endl;
346 #endif
347  // basic::show_time( tr, "manager main msg-loop: received message..." );
348  } else { //nothing received
349  // basic::show_time( tr, "manager main msg-loop: no message: idle..." );
350  idle();
351  continue;
352  }
353  print_status = true;
354  // here if we got a message
355  Size const msg_tag( buf[ 0 ]);
356  tr.Debug << "received message in ArchiveManager " << msg_tag << std::endl;
357 
358  switch( msg_tag ) {
359  case JOB_COMPLETION: {
360  Size const batch_id( buf[ 1 ] );
361  bool const final( buf[ 2 ] == 1 );
362  Size const bad( buf[ 3 ] );
363  Size const good( buf[ 4 ] );
364  Size const total( buf[ 5 ] ); //total nr of jobs
365  basic::show_time( tr, "ArchiveManager receveid job-completion..." );
366  tr.Debug << "ArchiveManager received JOB_COMPLETION " << batch_id << " " << bad << " " << good << " " << total << std::endl;
367  jobs_completed_[ batch_id ] = CompletionMessage( batch_id, final, bad, good, total );
368  break; //switch
369  }
370  case QUEUE_EMPTY: {
371  Size const batch_id( buf[ 1 ] );
372 
373  //we ignore QUEUE_EMPTY if we know that a new batch has been submitted after issuing of this signal (i.e., the batch-number
374  // coming with the message would be smaller than the currently highest batch number... however, there might be invalid batches...
375  // find last valid and unfinished batch in list:
376  Size max_working_batch_id( batches_.size() );
377  if ( batches_.size() ) {
378  while ( max_working_batch_id > 0
379  && ( !batches_[ max_working_batch_id ].valid() || batches_[ max_working_batch_id ].has_finished() ) )
380  --max_working_batch_id;
381  if ( batch_id <= max_working_batch_id ) {
382  tr.Info << "ArchiveManager ignored outdated QUEUE_EMPTY with batch_id " << batch_id << " -- already submitted " << batches_.size() << std::endl;
383  break; //switch
384  }
385  }
386  //any job-completions we should work thru before generating a new batch?
387  PROF_START( basic::ARCHIVE_CRITICAL_JOBSCOMPLETE );
388  while ( jobs_completed_.size() ) {
389  jobs_completed(); //get thru these before making job decisions
390  }
391  PROF_STOP( basic::ARCHIVE_CRITICAL_JOBSCOMPLETE );
392  // theArchive_->idle(); why was this in the job-completed loop ?
393 
394  PROF_START( basic::ARCHIVE_GEN_BATCH );
395  //this is a valid QUEUE_EMPTY request: do something about it
396  tr.Info << "ArchiveManager received QUEUE_EMPTY" << std::endl;
397  tr.Debug << "JD batch_id: " << batch_id << " max_working_batch_id: " << max_working_batch_id << std::endl;
398  basic::show_time( tr, "manager main msg-loop: queue empty..." );
399  if ( !theArchive_->finished() ) {
400  //if !finished Archive should always generate a batch...
401  //but let's make sure by monitoring, since it would be bad if we hang in the communication...
402  Size ct( batches_.size() );//monitor number of batches
403  if ( !stop ) theArchive_->generate_batch();
404  if ( ct == batches_.size() ) { //if generate_batch didn't create anything --- we still owe Jobdistributor a signal
405  send_stop_to_jobdistributor(); //send stop
406  stop = true;
407  }
408  } else {
409  tr.Debug << "archive is finished ... spinning down" << std::endl;
411  stop = true;
412  }
413  PROF_STOP( basic::ARCHIVE_GEN_BATCH );
414  basic::prof_show();
415  break; //switch
416  }
417  default:
418  utility_exit_with_message( "unknown msg in ArchiveManager " + ObjexxFCL::string_of( msg_tag ) );
419  } //switch
420  } catch ( utility::excn::EXCN_Base &excn ) {
421  basic::show_time( tr, "Exception in main msg-loop !" );
422  tr.Error << "[ERROR] " << excn.msg() << std::endl;
423  tr.Error << "spinning down" << std::endl;
424  save_archive();
425  //this usually doesn't work the jobs always run to completion ... let's hard exit for now.
426  utility_exit_with_message( "error detected in ArchiveManager -- spinning down" );
428  stop = true;
429  }
430  } //while loop
431  save_archive();
432  tr.Info << "ArchiveManager finished !!!" << std::endl;
433 }
434 
435 void
437 
438  { //save archive
439  static time_t last_save( time(NULL) );
440  time_t now( time( NULL ) );
441  Size const elapsedtime( now - last_save );
442  if ( elapsedtime > save_archive_time_interval_ ) {
443  save_archive();
444  last_save = now;
445  }
446  }
447 
448  // tr.Debug << "idle..." << std::endl;
449  if ( jobs_completed_.size() ) {
450  PROF_START( basic::ARCHIVE_JOBSCOMPLETE );
451  jobs_completed();
452  PROF_STOP( basic::ARCHIVE_JOBSCOMPLETE );
453  return;
454  };
455 
456  // if ( !theArchive_->finished() && theArchive_->ready_for_batch() ) {
457  // theArchive_->generate_batch();
458  // } else {
459  time_t before( time(NULL) );
460  theArchive_->idle();
461  time_t after( time( NULL ) );
462  if ( after-before > 1 ) tr.Debug << "spend " << after-before << " seconds in archives idle method... " << std::endl;
463  //sleep some more if idle didn't use much time
464  if ( after-before < 5 ) sleep( (5 - ( after - before )) );
465  // }
466 }
467 
/// @brief Work off ONE pending completion message: read the decoys that were newly written to the
///        batch's silent-out file and hand them to the archive.
/// @details The first entry of jobs_completed_ is removed from the map and processed.
///  Only tags beyond the first batch.decoys_returned() entries of the file are read.
///  If the output file cannot be opened yet, the message is put back into jobs_completed_
///  and the method returns after a 5 second sleep.
/// @throws EXCN_Archive if the message reports no good decoys; exceptions from reading the
///         silent file are re-thrown only when the message is the final one for the batch.
/// NOTE(review): the declaration of file_buf is not visible in this listing (a line is missing
///  before the "if ( bad )" statement).
/// NOTE(review): several early returns below leave a PROF_START without the matching PROF_STOP
///  (cancelled batch, unreadable stream, ignored read error) -- confirm whether the profiler
///  tolerates that.
468 void
469 ArchiveManager::jobs_completed() {// core::Size batch_id, bool final, core::Size bad ) {
470  runtime_assert( jobs_completed_.begin() != jobs_completed_.end() );
// take a copy of the first queued message and remove it from the queue
471  CompletionMessage msg = jobs_completed_.begin()->second;
472  jobs_completed_.erase( jobs_completed_.begin() );
473  Size batch_id( msg.batch_id );
474  bool final( msg.final );
475  Size bad( msg.bad );
476  Size good_decoys( msg.good );
477  Batch const& batch( batches_[ batch_id ] );
478 
479  // here if in integration-test mode, jump out if not final
480  if ( option[ run::constant_seed ] && !final ) return;
481 
482  tr.Debug << "jobs_completed for " << batch.batch() << "..." << "already "
483  << batch.decoys_returned() << " decoys known" << std::endl;
484  runtime_assert( batch.id() == batch_id );
486  if ( bad ) {
487  ///there were some bad-jobs --- we might be at the end of this run... hard to tell
488  }
489  PROF_START( basic::ARCHIVE_BLOCK_FILE );
490  if ( !final ) {
491  tr.Debug << "not final ... block file" << std::endl;
492  //careful if file isn't in FileBuf anymore... create it just for blocking ? don't block... ?
493  file_buf.block_file( ".//"+batch.silent_out() ); //destructor will release file automatically
494  } else {
495  tr.Debug << "final ... close file " << std::endl;
496  // file_buf.close_file( ".//"+batch.silent_out() ); //that is not very nice, but file-buf isn't very smart with filenames...
497  // file_buf.close_file( ".//"+batch.score_file() ); // not required since now we have garbage-collection
498  }
// decoys of a cancelled batch are dropped unless reading them was explicitly allowed
499  if ( batch.is_cancelled() && !batch.allow_reading_cancelled_decoys() ) {
500  tr.Debug << "returned decoys of cancelled batch.. ignore..." << std::endl;
501  return;
502  }
503  PROF_STOP( basic::ARCHIVE_BLOCK_FILE );
504  //sleep( 5 );
505  PROF_START( basic::ARCHIVE_READ_DECOYS );
506  utility::vector1< std::string > tags_in_file;
507 
508 
509  if ( good_decoys ) {
510  tr.Debug << "read file " << batch.silent_out() << std::endl;
511  utility::io::izstream testin( batch.silent_out() );
512  tr.Debug << "stream is " << ( testin.good() ? "good " : "bad" ) << std::endl;
513  if ( !testin.good() ) { //this happens sometimes... usually it needs a little bit of waiting and then it works -- NFS lag ?
514  //let's look at this later...
// re-queue the message so that a later call retries this batch
515  jobs_completed_[ batch_id ] = msg;
516  sleep( 5 );
517  return;
518  }
519 
520  using namespace core::io::silent;
521  SilentFileData sfd;
522 
523  //this keeps order as in file... important since we skip already known tags by just keeping their number
524  sfd.read_tags_fast( batch.silent_out(), tags_in_file );
525 
526  if ( !final ) {
527  tr.Debug << "...and release file" << std::endl;
528  file_buf.release_file( ".//"+batch.silent_out() );
529  }
530 
531  tr.Debug << "found " << tags_in_file.size() << " decoys in " << batch.silent_out() << std::endl;
532 
// advance iter past the first decoys_returned() tags (or to the end if the file holds fewer)
533  utility::vector1< std::string >::iterator iter = tags_in_file.begin();
534  for ( Size ct = 1;
535  iter != tags_in_file.end() && ct <= batch.decoys_returned(); ++iter, ++ct ) { }; //just skipping...
536  utility::vector1< std::string > tags_to_read;
537 
538  std::copy( iter, tags_in_file.end(), std::back_inserter( tags_to_read ) );
539  if ( tags_to_read.size() ) {
540  try {
541  sfd.read_file( batch.silent_out(), tags_to_read );
542  } catch ( utility::excn::EXCN_Base& excn ) { //or should we be more specific ?
543  if ( final ) throw; //rethrow if it is the final version of the file...
544  tr.Error << "[ignored ERROR] " << excn.msg() << std::endl;
545  tr.Error << "this is not the final version of " << batch.silent_out() << "\n... maybe some data is still held in a cache of the filesystem..."
546  << " let's see if it works better the next time we have to read" << std::endl;
547  //or sleep( 5 ) and retry as above ?
548  return;
549  }
550 
551  PROF_STOP( basic::ARCHIVE_READ_DECOYS );
552  tr.Debug << "add " << tags_to_read.size() << " structures to archive " << std::endl;
553 
554  PROF_START( basic::ARCHIVE_EVAL_DECOYS );
555 
556  { //now update our batch information so that this is already known in read_structures
// this inner 'batch' is a non-const reference to the same element and shadows the outer const one
557  Batch& batch( batches_[ batch_id ] );
558  batch.set_decoys_returned( tags_in_file.size() );
559  if ( final ) {
560  batch.mark_as_finished();
561  }
562  }
563  //read structures and add to archive
564  theArchive_->read_structures( sfd, batch );
565  PROF_STOP( basic::ARCHIVE_EVAL_DECOYS );
566  } else {
567  tr.Info << "no more decoys to read from file " << batch.silent_out() << std::endl;
568  PROF_STOP( basic::ARCHIVE_READ_DECOYS );
569  }
570 
571 
// the archive is saved only once no further completion messages are pending
572  PROF_START( basic::SAVE_ARCHIVE );
573  if ( jobs_completed_.size() == 0 ) save_archive();
574  PROF_STOP( basic::SAVE_ARCHIVE );
575  } else { // no good decoys found
576  tr.Debug << " no good decoys to read " << std::endl;
577  throw EXCN_Archive( "all decoys returned with FAIL_BAD_INPUT" );
578  }
579 
580 
581  { //now update our batch information and save to disk
582  Batch& batch( batches_[ batch_id ] );
583  batch.set_decoys_returned( tags_in_file.size() );
584  if ( final ) {
585  batch.mark_as_finished();
586  }
587  batch.write_info_file();
588  }
589 }
590 
591 void
593  tr.Debug << "queue new batch into MPIArchiveJobDistributor " << batch.flag_file() << std::endl;
594 #ifdef USEMPI
595  Size const size( 3 );
596  int buf[ size ];
597  buf[ 0 ] = ADD_BATCH;
598  buf[ 1 ] = batch.id();
599  buf[ 2 ] = batch.nstruct();
600 //#ifdef USEMPI
601  MPI_Send( &buf, size, MPI_INT, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
602  //need to have MPI_JOB_DIST_TAG... since it goes into main msg-loop of JobDist
603 
604  //send size of string
605  std::string strbuf( batch.flag_file() );
606  buf[ 0 ] = strbuf.size();
607  buf[ 1 ] = batch.id();
608  MPI_Send( buf, 2, MPI_INT, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
609  //send string
610  MPI_Send(const_cast<char*> ( strbuf.data() ), strbuf.size(), MPI_CHAR, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
611 #else
613 #endif
614 
615 }
616 
617 void ArchiveManager::cancel_batches_previous_to( core::Size batch_id, bool allow_reading_of_decoys ) {
618  for ( BatchList::iterator it = batches_.begin(); it!=batches_.end(); ++it) {
619  if ( it->id() == batch_id ) break;
620  cancel_batch( *it, allow_reading_of_decoys );
621  }
622 }
623 
/// @brief Tell the job distributor to cancel a batch and record the cancellation in the batch itself.
/// @param batch                    batch to cancel; it is marked as cancelled and its info file is rewritten
/// @param allow_reading_of_decoys  forwarded to Batch::mark_as_cancelled()
/// @details When the run::constant_seed option is set, the request is ignored with a warning and
///  the batch is left untouched.
///  With USEMPI three messages are sent to jd_master_rank_ under MPI_JOB_DIST_TAG:
///  { CANCEL_BATCH, id, nstruct }, then { length of strbuf, id }, then the characters of strbuf.
/// NOTE(review): the declaration of strbuf and the body of the #else branch are not visible in
///  this listing; presumably strbuf holds batch.flag_file() as in queue_batch -- confirm.
624 void
625 ArchiveManager::cancel_batch( Batch& batch, bool allow_reading_of_decoys ) {
626  if ( option[ OptionKeys::run::constant_seed ]() ) {
627  tr.Warning << "asked to cancel batch, but ignore in constant_seed mode to enable integration test" << std::endl;
628  return;
629  }
630  tr.Debug << "cancel batch " << batch.flag_file() << std::endl;
631 #ifdef USEMPI
632  Size const size( 3 );
633  int buf[ size ];
634  buf[ 0 ] = CANCEL_BATCH;
635  buf[ 1 ] = batch.id();
636  buf[ 2 ] = batch.nstruct();
637 //#ifdef USEMPI
638  MPI_Send( &buf, size, MPI_INT, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
639  //need to have MPI_JOB_DIST_TAG... since it goes into main msg-loop of JobDist
640 
641  //send size of string
// buf is reused for the second message; only its first two entries are sent
643  buf[ 0 ] = strbuf.size();
644  buf[ 1 ] = batch.id();
645  MPI_Send( buf, 2, MPI_INT, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
646  //send string
647  MPI_Send(const_cast<char*> ( strbuf.data() ), strbuf.size(), MPI_CHAR, jd_master_rank_, MPI_JOB_DIST_TAG, MPI_COMM_WORLD );
648 #else
650 #endif
// bookkeeping happens with and without MPI
651  batch.mark_as_cancelled( allow_reading_of_decoys );
652  batch.write_info_file();
653 }
654 
656 
657  // we can also do this the quick way:
658  // a) there is still some problem and sometimes jobs-just don't finish... haven't figured out why.
659  // b) we don't really care, if we are here there was either an error, or the Archive is converged.
660  // at the current low-acceptance rate the remaining jobs are unlikely to yield anything useful...
661  // c) quick exits saves costly time on the cluster.
662  if ( option[ OptionKeys::jd2::mpi_nowait_for_remaining_jobs ]() ) {
663  save_archive();
664  utility_exit_with_message("quick exit from job-distributor due to flag jd2::mpi_nowait_for_remaining_jobs --- this is not an error " );
665 
666  //we do this by sending empty batch.
667  tr.Debug << "send STOP signal to JobDistributor " << std::endl;
668  }
669  Batch stop_batch( 0 );
670  queue_batch( stop_batch );
671 }
672 
673 void
676  using namespace basic::options::OptionKeys;
677 
678  //possible options:
679  //reads all structures from batches as if they were newly coming in
680  bool b_reread_all_structures( option[ OptionKeys::archive::reread_all_structures ]() /* default false */ );
681 
682  //don't know how to probe directory... take counter...
683  core::Size id( 1 );
684  Batch aBatch( id );
685  batches_.clear();
686  while ( file_exists( aBatch.flag_file() ) ) {
687  Batch& new_batch( start_new_batch() );
688  runtime_assert( new_batch.id() == id );
689  tr.Info << "found existing batch " << new_batch.batch() << std::endl;
690  try {
691  finalize_batch( new_batch, true /*reread */ );
692  tr.Debug << new_batch << std::endl;
693  } catch ( EXCN_Archive& excn ) {
694  //last started batch must have problems... ignore it
695  tr.Warning << "[ WARNING ] "+new_batch.batch()+" is errorneous: " + excn.msg() << std::endl;
696  tr.Warning << "[ WARNING ] ignoring this batch..." << std::endl;
697  //fill batch list if exception state has left us without it
698  if ( batches_.size() < id ) batches_.push_back( Batch( id ) );
699  batches_.back().mark_as_invalid();
700  }
701  if ( b_reread_all_structures ) {
702  if ( batches_[ id ].decoys_returned() ) {
703  jobs_completed_[ id ] =
704  CompletionMessage( id, batches_[id ].has_finished(), 0, batches_[ id ].decoys_returned(), batches_[ id ].nstruct() );
705  }
706  batches_[ id ].set_decoys_returned( 0 );
707  }
708  aBatch = Batch( ++id );
709  }
710 }
711 
712 Batch&
715  return start_new_batch( empty );
716 }
717 
718 Batch&
721 
722  core::Size batch_id( batches_.size() + 1 );
723  tr.Debug << "start new batch " << batch_id << std::endl;
724  batches_.push_back( Batch( batch_id ) );
725  Batch &new_batch( batches_.back() );
726 
727  new_batch.set_id( batch_id );
728  //make directory:
729  utility::file::create_directory( new_batch.dir() );
730  if ( start_decoys.size() ) {
731  new_batch.set_has_silent_in();
733  for ( core::io::silent::SilentStructOPs::const_iterator
734  it = start_decoys.begin(); it != start_decoys.end(); ++it ) {
735  sfd.add_structure( **it );
736  }
737  sfd.write_all( new_batch.silent_in() );
738  }
739  new_batch.user_options().add_built_in_options();
740  add_all_rosetta_options( new_batch.user_options() );
741 
742 //copy the system broker setup --- OBSOLET since Sept 20th 2010. now broker:setup is FileVector option.
743 // if ( !file_exists( new_batch.broker_file() ) && option[ OptionKeys::broker::setup ].user() ) {
744 // utility::io::ozstream batch_broker( new_batch.broker_file() );
745 // utility::io::izstream system_broker( option[ OptionKeys::broker::setup ]() );
746 // std::string line;
747 // while ( getline( system_broker, line ) ) batch_broker << line << std::endl;
748 // }
749  new_batch.nstruct() = basic::options::option[ basic::options::OptionKeys::out::nstruct ];
750  return batches_.back();
751 }
752 
753 void report_batch_inconsistency( Batch& new_batch, std::string const &tag ) {
754  throw( EXCN_Archive( "inconsistency detected when re-reading "+new_batch.batch()+" for " + tag) );
755 }
756 
/// @brief Complete the setup of a batch: merge user flags, write the final flag file and queue the batch.
/// @param new_batch  batch to finalize
/// @param reread     false for a freshly generated batch (decoys_returned is reset to 0 and the
///                   BATCH_INFO file is written); true when an existing batch directory is
///                   re-read (BATCH_INFO is loaded and checked against the flag file instead)
/// @details Steps, in order:
///  - write a placeholder broker file if the batch has none
///  - if a flag file exists, load it into a fresh OptionCollection, access the archive-controlled
///    options, and copy the remaining (inaccessed) user options into the batch's user_options()
///  - rewrite the flag file from the batch's user options plus the archive-controlled flags
///  - queue the batch unless it is finished, cancelled, or the archive is no longer interested;
///    in that case it is marked as finished
/// @throws EXCN_Archive if the flag file cannot be parsed or, when re-reading, if it is
///         inconsistent with BATCH_INFO
/// NOTE(review): one line of this function (directly after the opening brace) is missing from
///  this listing.
757 void
758 ArchiveManager::finalize_batch( Batch& new_batch, bool reread ) {
760  using namespace basic::options::OptionKeys;
761  tr.Debug << "finalize_batch " << new_batch << std::endl;
762  if ( !reread) new_batch.set_decoys_returned( 0 );
763 
764  if ( !utility::file::file_exists( new_batch.broker_file() ) ) {
765  utility::io::ozstream broker( new_batch.broker_file() );
766  broker << "# NO CLAIMERS PRESENT" << std::endl;
767  broker.close();
768  }
769 
770  if( file_exists( new_batch.flag_file() ) ) {
771  tr.Debug << "checking aBatch.flag_file()... " << std::endl;
772  utility::options::OptionCollection batch_opts;
773  batch_opts.add_built_in_options();
774  add_all_rosetta_options( batch_opts );
775  try {
776  tr.Debug << "load options from file" << std::endl;
777  batch_opts.load_options_from_file_exception( new_batch.flag_file() );
778  } catch ( utility::excn::EXCN_Msg_Exception &excn ) {
779  tr.Error << "[ERROR] problems with flags in " << new_batch.flag_file() << " aborting... " << std::endl;
780  // excn.show( tr.Error );
// NOTE(review): new_batch is used on the throw line AFTER batches_.pop_back(). If the caller passed
// a reference to batches_.back() (read_existing_batches obtains it from start_new_batch(), which
// returns batches_.back()), that reference no longer refers to a live element -- confirm, and
// consider building the message before popping.
781  batches_.pop_back();
782  throw ( EXCN_Archive( new_batch.flag_file() + " contains errors: " + excn.msg() ) );
783  }
784  if ( !reread ) {
785  //access all archive controlled options... so they are not in the "user_flags" anymore
786  if ( batch_opts[ in::file::silent ].user() )
787  tr.Warning << "option -in:file:silent will be overwritten by ArchiveMaster"
788  << " -- control directly via class Batch" << std::endl;
789  if ( batch_opts[ out::nstruct ].user() )
790  tr.Warning << "option -nstruct will be overwritten by ArchiveMaster "
791  << "-- control directly via class Batch" << std::endl;
792  if ( batch_opts[ run::intermediate_structures ].user() )
793  tr.Warning << "option -run::intermediate_structures will be overwritten by ArchiveMaster "
794  << "-- control directly via class Batch" << std::endl;
795  if ( batch_opts[ out::file::silent ].user() )
796  tr.Warning << "option -out:file:silent will be overwritten by ArchiveMaster "
797  << "-- control directly via class Batch" << std::endl;
798  if ( batch_opts[ broker::setup ].user() )
799  tr.Warning << "option -broker:setup will be overwritten by ArchiveMaster "
800  << "-- control directly via class Batch" << std::endl;
801  if ( batch_opts[ out::file::scorefile ].user() )
802  tr.Warning << "option -out:file:scorefile will be overwritten by ArchiveMaster "
803  << "-- control directly via class Batch" << std::endl;
804  }
805 
// the values read here are used below only for the consistency checks in the reread case
// (score_file is read but not compared)
806  bool has_silent( batch_opts[ in::file::silent ].user() );
807  core::Size nstruct( batch_opts[ out::nstruct ]() );
808  bool intermeds( batch_opts[ run::intermediate_structures ]() );
809  std::string silent_out( batch_opts[ out::file::silent ]() );
810  utility::vector1< std::string > broker( batch_opts[ broker::setup ]() );
// join the broker file names, each followed by a blank, for comparison with all_broker_files()
811  std::ostringstream broker_files;
812  std::copy( broker.begin(), broker.end(), std::ostream_iterator<std::string>( broker_files, " "));
813  std::string score_file( batch_opts[ out::file::scorefile ]() );
814 
815  // now the other options are "inaccessed options" and can be dumped to a stream
816  std::stringstream user_flags;
817  batch_opts.show_inaccessed_user_options( user_flags );
818  tr.Debug << "user_options: \n" << user_flags.str() << std::endl;
819 
820  // and can be added to the batch-options
821  new_batch.user_options().load_options_from_stream( user_flags, "USER_FLAGS" );
822  if ( reread ) {
823  new_batch.read_info_file();
824  new_batch.set_intermediate_structs( intermeds ); //this is not read from BATCH_INFO
825 
826  // for all other values we just double-check consistency
827  if ( new_batch.nstruct() != nstruct ) report_batch_inconsistency( new_batch, "NSTRUCT" );
828  if ( new_batch.has_silent_in() != has_silent ) report_batch_inconsistency( new_batch, "INPUT" );
829  if ( silent_out != new_batch.silent_out() ) report_batch_inconsistency( new_batch, "OUTPUT" );
830  if ( broker_files.str() != new_batch.all_broker_files() ) report_batch_inconsistency( new_batch, "BROKER_FILE" );
831  //TODO: determine how many decoys have been returned to archive...
832  }
833  }
834 
835  //now write the final flag-file
836  utility::io::ozstream flag_out( new_batch.flag_file() );
837  new_batch.user_options().show_user( flag_out );
838  flag_out << "\n\n#Archive controlled flags" << std::endl;
839  flag_out << "-out:file:silent " << new_batch.silent_out() << std::endl;
840  if ( new_batch.has_silent_in() ) flag_out << "-in:file:silent " << new_batch.silent_in() << std::endl;
841 
842  flag_out << "-out:nstruct " << new_batch.nstruct() << std::endl;
843  flag_out << "-out:file:scorefile " << new_batch.score_file() << std::endl;
844  flag_out << "-broker:setup " << new_batch.all_broker_files() << std::endl;
845 
846  if ( new_batch.intermediate_structs() ) flag_out << "-run:intermediate_structures" << std::endl;
847 
848  if ( !reread ) {
849  new_batch.write_info_file();
850  }
851 
// only batches that still have work to do and are still wanted by the archive go to the job distributor
852  if ( !new_batch.has_finished() && !new_batch.is_cancelled() && theArchive_->still_interested( new_batch ) ) {
853  tr.Debug << "queue " << new_batch.batch() << " " << new_batch.flag_file() << std::endl;
854  queue_batch( new_batch );
855  } else {
856  new_batch.mark_as_finished();
857  }
858 
859  tr.Debug << "\n" << std::endl;
860 
861 }
862 
863 
864 void
866  theArchive_->save_to_file();
867 }
868 
869 
870 bool
872  return theArchive_->restore_from_file();
873 }
874 
875 //#endif //ndef WIN32
876 
877 }//archive
878 }//jd2
879 }//protocols
880 
881 
882