Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
watchdog.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 // CVS information:
4 // $Revision: 7630 $
5 // $Date: 2006-03-10 09:37:52 -0800 (Fri, 10 Mar 2006) $
6 // $Author: mtyka & rhiju $
7 // (c) Copyright Rosetta Commons Member Institutions.
8 // (c) This file is part of the Rosetta software suite and is made available under license.
9 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
10 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
11 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
12 
13 #ifdef BOINC
14 #include <utility/io/izstream.hh>
15 #include <protocols/boinc/boinc.hh>
16 #endif
17 
18 // Unit header
20 
21 #include <iostream>
22 
23 
24 #ifdef BOINC
25 #include <protocols/boinc/boinc.hh>
26 #include <core/io/silent/util.hh>
27 
28 // Utility Headers
29 #include <utility/io/ozstream.hh>
30 
31 #include <utility/basic_sys_util.hh>
32 #include <utility/file/file_sys_util.hh>
33 
34 // C++ Headers
35 #include <cstdlib>
36 #include <iostream>
37 #include <sstream>
38 #include <ctime>
39 
40 #include <basic/options/option.hh>
41 #include <basic/options/after_opts.hh>
42 
43 #include <basic/options/keys/out.OptionKeys.gen.hh>
44 #include <basic/options/keys/boinc.OptionKeys.gen.hh>
45 
46 
47 #ifndef _WIN32
48 #include "pthread.h"
49 
50 // option key includes
51 
52 
53 #endif
54 
55 namespace protocols {
56 namespace boinc {
57 namespace watchdog {
58 
59 // protocols can set this pose as the global bailout - if the watchdog kicks in it will write out *this*
60 // pose and give it a special label to be identified as the Bailout ( W_xxx )
61 
// Mutex used around the bailout file write on pthread platforms.
// Guarded by _WIN32 rather than WIN32 so that it agrees with the guard around
// the "pthread.h" include and with every other platform check in this file;
// _WIN32 is predefined by the compiler, WIN32 is not guaranteed to be.
#ifndef _WIN32
pthread_mutex_t bailout_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif

// Text streamed into the silent output file when the watchdog bails out.
std::string bailout_silent_structure = "watchdog_failure: \n";
69 
70 //////////////////////////////////////////////////////////////////////////////////
71 //////////////////////////////////////////////////////////////////////////////////
72 //
73 // The watchdog thread has a few roles in BOINC builds.
74 //
75 // 1. If Rosetta's CPU time exceeds the user's preferred CPU run time by more than cpu_run_timeout,
76 // that's way too long. Exit gracefully. Default cpu_run_timeout = 3600*4
77 //
78 // 2. Update percentage complete periodically (every PCT_COMPLETE_UPDATE_TIME watchdog blinks), for user feedback.
79 //
80 //////////////////////////////////////////////////////////////////////////////////
81 //////////////////////////////////////////////////////////////////////////////////
82 
// Seconds the watchdog thread sleeps between two checks of the worker
// (this is the value handed to watchdog_sleep()).
int WATCHDOG_BLINK_TIME( 2 );

// update_pct_complete() acts on every PCT_COMPLETE_UPDATE_TIME-th call, and
// it is called once per blink, so this is a count of blinks, not of seconds.
int const PCT_COMPLETE_UPDATE_TIME( 5 );

#ifdef _WIN32
// Handle of the watchdog thread: assigned from _beginthreadex() when the
// watchdog is started, waited on and closed when it is shut down.
HANDLE watchdogThread;
#endif
89 
90 //////////////////////////////////////////////////////////////////////////////////
91 //////////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32
// Windows thread entry point for the watchdog.
// Adapts main_watchdog() to the UINT WINAPI (void*) signature that is passed
// to _beginthreadex(). lpParam is ignored; main_watchdog() is always called
// with NULL and its result is discarded. Always returns 0.
UINT WINAPI main_watchdog_windows( void* lpParam )
{
	UINT const thread_exit_code = 0;
	static_cast< void >( main_watchdog( NULL ) );
	return thread_exit_code;
}
#endif
99 
100 //////////////////////////////////////////////////////////////////////////////////
101 //////////////////////////////////////////////////////////////////////////////////
// Launches the watchdog thread when the boinc::watchdog option is set;
// otherwise does nothing.
// NOTE(review): listing line 103 (the function name and opening brace,
// presumably watchdog_start) is absent from this view -- confirm against the
// real source file.
102 void
104  using namespace basic::options;
105 
106  // This is default for
107  // BOINC runs -- can turn it off with no_watchdog.
108  if ( option[ OptionKeys::boinc::watchdog ] ) {
109  std::cerr << "Starting watchdog..." << std::endl;
110 #ifdef _WIN32
// Windows: run main_watchdog_windows on a new thread and keep its handle in
// the file-level watchdogThread.
111  watchdogThread = (HANDLE)_beginthreadex(
112  NULL, // default security attributes
113  0, // use default stack size
114  main_watchdog_windows, // thread function
115  NULL, // argument to thread function
116  0, // use default creation flags
117  NULL); // returns the thread identifier
118 #else
// pthreads: run main_watchdog on a new thread.
// NOTE(review): the result of pthread_create is not checked, and p_watchdog
// is a local that is neither joined nor detached in the visible code.
119  pthread_t p_watchdog;
120  pthread_create ( &p_watchdog, NULL,
121  &main_watchdog, NULL );
122 #endif
123 
124  }
125  return;
126 }
127 
// Shutdown counterpart of the watchdog start routine: when the
// boinc::watchdog option is set, logs the shutdown and, on Windows, blocks
// until the watchdog thread has exited and then closes its handle.
// NOTE(review): listing line 129 (the function name, presumably
// watchdog_finish) is absent from this view -- confirm against the real
// source file.
128 void
130 {
131  using namespace basic::options;
132  // This is default for
133  // BOINC runs -- can turn it off with no_watchdog.
134  if ( option[ OptionKeys::boinc::watchdog ] ) {
135  std::cerr << std::endl;
136  std::cerr << "BOINC :: Watchdog shutting down..." << std::endl;
137 
138  // should already be set
// NOTE(review): listing line 139 (the statement the comment above refers to)
// is absent from this view.
140 #ifdef _WIN32
141  // Wait for the watchdog thread to shutdown before leaving the function.
142  WaitForSingleObject(watchdogThread, INFINITE);
143  CloseHandle(watchdogThread);
144 #endif
// No wait is performed on non-Windows builds in the visible code.
145  }
146  return;
147 }
148 
149 ///////////////////////////////////////////////////////////////////////////////////
// Bail-out routine called by the watchdog when the CPU-time budget is
// exceeded. Reports the work unit as 100% done, makes sure a silent output
// file exists (reusing an existing plain or .gz file, otherwise writing the
// bailout text), and then ends the process through boinc_finish(0).
//
// The std::string parameter is unnamed and is not used by the visible code.
//
// NOTE(review): listing lines 156, 175, 247, 257 and 260 are absent from this
// view, so some statements of this function are not shown here.
150 void
151 get_the_hell_out(std::string )
152 {
153  using basic::options::option;
154  using namespace basic::options::OptionKeys;
155 
157 
158  boinc_fraction_done(1);
159  boinc_begin_critical_section();
160 
161  std::cerr << utility::timestamp() << " :: BOINC " << std::endl;
162  std::cerr.flush();
163 
// Path of the silent output file, taken from the out::file::silent option.
164  std::string const fullname = option[ out::file::silent ]();
165 
// max(1, decoy_count) + 1, i.e. never less than 2; passed twice to
// worker_finish_summary below.
166  unsigned int decoy_estimate = std::max(1,protocols::boinc::Boinc::decoy_count()) + 1;
167  // if structures were built, just bail quietly and return those structures.
168  if( utility::file::file_exists( fullname ) ){
169  std::cerr << "InternalDecoyCount: " << protocols::boinc::Boinc::decoy_count() << std::endl;
170 
171  // Count ?
172  // print a fake message to satisfy validator
173  protocols::boinc::Boinc::worker_finish_summary( decoy_estimate, decoy_estimate , 2 );
174 
176  boinc_finish(0); // process terminates right here, nothing else needed.
177  }
178 
179  // or maybe it's already zipped ?
180  if( utility::file::file_exists( fullname + ".gz" ) ){
181  // in that case just bail
182  std::cerr << "Output exists: " << fullname + ".gz" << " Size: " << utility::file::file_size( fullname + ".gz" ) << std::endl;
183  std::cerr << "InternalDecoyCount: " << protocols::boinc::Boinc::decoy_count() << " (GZ)" << std::endl;
184 
185  // Check that it's readable
186  std::string wholename( fullname + ".gz" );
187  utility::io::izstream infile( wholename.c_str() );
188 
189  std::cerr << "-----" << std::endl;
// NOTE(review): this streams the izstream object itself rather than data read
// from the file -- confirm that is the intended diagnostic.
190  std::cerr << infile << std::endl;
191  std::cerr << "-----" << std::endl;
192 
193  if (infile.good()) {
194  infile.close();
195  // print a fake message to satisfy validator
196  protocols::boinc::Boinc::worker_finish_summary( decoy_estimate, decoy_estimate , 2 );
197  boinc_finish(0); // process terminates right here, nothing else needed.
198  }else{
// The .gz file could not be opened cleanly: fall through and write the
// bailout output below.
199  std::cerr << "Stream information inconsistent." << std::endl;
200  }
201  }
202 
203  // ONLY IF output file does not exist, make a dummy one !
204 
205  std::cerr << "Writing W_0000001" << std::endl;
206 
207  // otherwise write a watchdog structure - however this tends to fail the
208  // validator currently.
209 
210  // change Rhiju's original logic a little bit. Instead of creating a blank
211  // .out file, we will write a watchdog_failure line so that the validator
212  // can act accordingly. This will avoid throwing a lot of client
213  // errors on BOINC as a blank file can not be gzipped. -- chu
214 
// Probe stream: opened in|out on the existing path; its state decides below
// between creating the file (header + structure) and appending (structure only).
215  utility::io::ozstream pdb_out_checkstream( fullname,
216  std::ios_base::in|std::ios_base::out );
217  utility::io::ozstream pdb_out_stream;
// NOTE(review): this whole file is wrapped in #ifdef BOINC, so the #else
// branch below (pthread_mutex_lock) is never compiled; the braces also only
// balance when the BOINC branch is the one taken.
218 #ifdef BOINC
219  if (!Boinc::trywait_semaphore()) {
220 #else
221  pthread_mutex_lock(&bailout_mutex);
222 #endif
223  if (!pdb_out_checkstream ) {
224  pdb_out_checkstream.close();
225  pdb_out_checkstream.clear();
226  pdb_out_stream.open( fullname );
227  if ( !pdb_out_stream ) {
228  std::cout << "Open failed for file: " << fullname << std::endl;
229  utility::exit( EXIT_FAILURE, __FILE__, __LINE__);
230  }
// NOTE(review): bailout_silent_structure_header is not defined in the visible
// lines (listing line 68 is absent) -- presumably declared there.
231  pdb_out_stream << bailout_silent_structure_header;
232  pdb_out_stream << bailout_silent_structure;
233  //pdb_out_stream << "REMARK " << moreinfostring << std::endl;
234  } else {
235  pdb_out_checkstream.close();
236  pdb_out_checkstream.clear();
237  pdb_out_stream.open_append( fullname );
238  if ( !pdb_out_stream ) {
239  std::cout << " Append failed for file: " << fullname << std::endl;
240  utility::exit( EXIT_FAILURE, __FILE__, __LINE__);
241  }
242  pdb_out_stream << bailout_silent_structure;
243  //pdb_out_stream << "REMARK " << moreinfostring << std::endl;
244  }
245 
// NOTE(review): on non-Windows builds this unlocks bailout_mutex although the
// matching pthread_mutex_lock above is never compiled (see note at the
// #ifdef BOINC). The Windows branch (listing line 247) is absent from this view.
246 #ifdef _WIN32
248 #else
249  pthread_mutex_unlock(&bailout_mutex);
250 #endif
251  }
252 
253  pdb_out_stream.close();
254  pdb_out_stream.clear();
255 
256  // gzip silent file(s)
258 
259  boinc_end_critical_section();
261  boinc_finish(0); // process terminates right here, nothing else needed.
262 
263  return;
264 }
265 
266 
267 //////////////////////////////////////////////////////////////////////////////////
268 // Percentage complete = cpu_time/ cpu_run_time.
269 // There is an exception, though. If we're getting close to the user's preferred
270 // run time, always set percentage complete so that it appears that we
271 // have about ten minutes left...
// Rate limiter for percent-complete updates. A function-local static counter
// starts at 1 and is post-incremented on every call, so the condition below
// is true on every PCT_COMPLETE_UPDATE_TIME-th call (5th, 10th, ...).
// NOTE(review): listing line 277 (the statement executed when the condition
// holds) is absent from this view.
272 void
273 update_pct_complete()
274 {
275  static int pct_complete_blink_counter( 1 );
276  if ( pct_complete_blink_counter++ % PCT_COMPLETE_UPDATE_TIME == 0)
278 }
279 
280 //////////////////////////////////////////////////////////////////////////////////
// Suspends the calling thread for watchdog_time seconds.
//
// watchdog_time: duration in seconds. Negative values return immediately:
// both Sleep() and sleep() take an unsigned argument, so a negative int
// would otherwise be converted into an enormous duration.
void
watchdog_sleep(int const watchdog_time){
	if ( watchdog_time < 0 ) return;
#ifdef _WIN32
	// windows -- Sleep function takes milliseconds. The multiply is done in
	// unsigned arithmetic so a large value cannot overflow a signed int.
	Sleep( 1000 * static_cast< DWORD >( watchdog_time ) );
#else
	// mac/linux -- POSIX sleep() takes seconds.
	sleep( static_cast< unsigned int >( watchdog_time ) );
#endif
}
289 
290 //////////////////////////////////////////////////////////////////////////////////
291 void*
292 main_watchdog( void* )
293 {
294  using namespace basic::options;
295  using namespace protocols::boinc;
296 
297  double current_cpu_time = 0.0;
298  std::stringstream moreinfo;
299 
300  int watchdog_time = option[ OptionKeys::boinc::watchdog_time ];
301  int cpu_run_timeout = option[ OptionKeys::boinc::cpu_run_timeout ];
302 
303  int startup_time = 30;
304  bool init = false;
305  int count_blinks = 0;
306 
307  std::cerr << "Watchdog active." << std::endl;
308  Boinc boinc_wu = Boinc::instance();
309 
310 #ifdef _WIN32
311  /* Open the Semaphore */
312  // for data sychronization
314 #endif
315 
316  // Monitor main and quit when it sends the signal ("watchdog_finish" turns
317  // off worker_running flag)
318  while (Boinc::is_worker_running()){
319  watchdog_sleep(WATCHDOG_BLINK_TIME);
320 
321  //A new role for the watchdog.
322  // Every time it blinks, update "percentage complete" so that
323  // users know that they're making progress.
324  update_pct_complete();
325 
326  count_blinks++;
327  if (count_blinks < watchdog_time) continue;
328  //Don't come in too early, allow for about 30 seconds of start up time.
329  if (!init && count_blinks < startup_time) continue;
330 
331  count_blinks = 0;
332  init = true;
333 
334  // Rosetta might be suspended or preempted, in which case don't do anything.
335  BOINC_STATUS *rosetta_status = new BOINC_STATUS;
336  boinc_get_status( rosetta_status );
337  if (!rosetta_status->suspended) {
338 
339  // get current working set size (memory use)
340  protocols::boinc::Boinc::set_working_set_size( rosetta_status->working_set_size );
341 
342  // get current cpu run time
343  boinc_wu_cpu_time(current_cpu_time);
344 
345  // get max cpu run time user preference
346  // user may have updated this
348  int cpu_run_time = boinc_wu.get_project_pref_max_cpu_run_time();
349 
350  // Are we taking too long?
351  if (current_cpu_time > (cpu_run_timeout + cpu_run_time) && cpu_run_time > 0) {
352  moreinfo << "CPU time: " << current_cpu_time << " seconds. Exceeded timeout" << cpu_run_timeout << " + " << cpu_run_time << " seconds";
353  std::cerr << "BOINC:: CPU time: " << current_cpu_time << "s, " << cpu_run_timeout << "s + " << cpu_run_time << "s";
354  get_the_hell_out(moreinfo.str());
355  return 0;
356  }
357  }
358  }
359 
360  std::cout << "Watchdog finished." << std::endl;
361  return 0;
362 }
363 
364 } // namespace watchdog
365 } // namespace boinc
366 } // namespace protocols
367 
368 #endif