// -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
// vi: set ts=2 noet:
//  CVS information:
//  $Revision: 7630 $
//  $Date: 2006-03-10 09:37:52 -0800 (Fri, 10 Mar 2006) $
//  $Author: rhiju $

#include "watchdog.h"

#ifdef BOINC

// Rosetta Headers
#include "after_opts.h"
#include "counters.h"
#include "files_paths.h"
#include "output_decoy.h"
#include "runlevel.h"
#include "trajectory.h"
#include "boinc_rosetta_util.h"

#include "monte_carlo.h" // yab: misc removal

// Utility Headers
#include <utility/file/gzip_util.hh>
#include <utility/basic_sys_util.hh>
#include <utility/file/file_sys_util.hh>
#include <utility/io/ozstream.hh>

// C++ Headers
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <ctime>

#ifndef _WIN32
#include "pthread.h"
#endif

#ifdef _WIN32
#include "boinc_win.h"
#endif
#include "boinc_api.h"
#include "diagnostics.h"

//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
//
//  The watchdog thread has a few roles in BOINC builds.
//
//  1. First, every hour or so it checks whether Rosetta has made any progress.
//      If not, assume Rosetta is stuck in an infinite loop (yes it happens!);
//      Rosetta is killed, and we attempt to exit gracefully.
//
//  2. If Rosetta has been going for 4x the user's preferred CPU run time,
//     that's way too long. Exit gracefully.
//
//  3. Update percentage complete every few seconds, for user feedback.
//
//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////


// Tuning constants and shared state used by the watchdog functions in this file.
namespace watchdog_ns{
	// Length of one watchdog "blink": main_watchdog() passes this value to
	// watchdog_sleep() on every pass of its monitoring loop.
	int WATCHDOG_BLINK_TIME = 1; // Watchdog sees if Rosetta is still running every second or so.
	// Default for the "watchdog_time" option. main_watchdog() compares the value
	// against its blink counter, so it is the number of blinks between two health checks.
	int const DEFAULT_WATCHDOG_TIME = 60*60; // With one-second blinks: check every sixty minutes.
	// Default for the "cpu_run_timeout_factor" option: the run is ended once the
	// CPU time used exceeds this multiple of the preferred cpu_run_time.
	float const DEFAULT_CPU_RUN_TIME_FACTOR = 4.0;
	// Blinks between health checks actually in use; filled in by main_watchdog()
	// from the "watchdog_time" option.
	int watchdog_time;

	// update_pct_complete() never lets the remaining time it uses for the
	// progress fraction drop below this floor (ten minutes, assuming CPU times
	// are in seconds -- as the messages in main_watchdog() suggest).
	double const MINIMUM_TIME_LEFT = 10.0 * 60.0;
	// update_pct_complete() reports progress to BOINC only on every
	// PCT_COMPLETE_UPDATE_TIME-th call.
	int const PCT_COMPLETE_UPDATE_TIME = 5;

#ifdef _WIN32
	// Handle of the watchdog thread: assigned in watchdog_start(), waited on and
	// closed in watchdog_finish().
	HANDLE watchdogThread;
#endif
}

//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32
// Windows thread entry point handed to _beginthreadex() by watchdog_start().
// It only adapts the calling convention: the real work happens in
// main_watchdog(), and the thread always reports an exit code of 0.
UINT WINAPI main_watchdog_windows( void* lpParam )
{
	static_cast< void >( lpParam ); // The watchdog takes no start-up argument.

	main_watchdog( NULL );

	UINT const thread_exit_code = 0;
	return thread_exit_code;
}
#endif

//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
// Launch the watchdog thread, provided the "watchdog" option is on.
//
// The thread runs main_watchdog() (through main_watchdog_windows() on Windows).
// If the thread cannot be created a warning is written to stderr and the run
// simply continues without a watchdog.
void
watchdog_start(){
	using namespace watchdog_ns;

	// This is default for
	// BOINC runs -- can turn it off with no_watchdog.
	if ( !truefalseoption("watchdog") ) return;

#ifdef _WIN32
	// _beginthreadex() returns 0 on failure, which leaves watchdogThread NULL.
	watchdogThread = reinterpret_cast< HANDLE >( _beginthreadex(
		NULL,                   // default security attributes
		0,                      // use default stack size
		main_watchdog_windows,  // thread function
		NULL,                   // argument to thread function
		0,                      // use default creation flags
		NULL ) );               // thread identifier is not needed
	if ( watchdogThread == NULL ) {
		std::cerr << "WARNING: could not start the watchdog thread." << std::endl;
	}
#else
	pthread_t p_watchdog;
	int const create_status = pthread_create( &p_watchdog, NULL, &main_watchdog, NULL );
	if ( create_status != 0 ) {
		std::cerr << "WARNING: could not start the watchdog thread (pthread_create returned "
			<< create_status << ")." << std::endl;
		return;
	}
	// The thread id is not kept anywhere, so nobody can ever join this thread.
	// Detach it so that its resources are released when it finishes.
	pthread_detach( p_watchdog );
#endif
}

// Tell the watchdog that Rosetta is done and, on Windows, wait for the watchdog
// thread to leave its loop before returning.
//
// Clearing boinc_params::rosetta_is_running is what makes the loop in
// main_watchdog() terminate.
void
watchdog_finish()
{
	using namespace watchdog_ns;
	boinc_params::rosetta_is_running = false; // Should have already happened, actually.

#ifdef _WIN32
	// watchdogThread is still NULL when the watchdog was never started (option
	// turned off) or when thread creation failed; waiting on or closing a NULL
	// handle is an error, so skip both in that case.
	if ( watchdogThread != NULL ) {
		// Wait for the watchdog thread to shutdown before leaving the function.
		WaitForSingleObject(watchdogThread, INFINITE);
		CloseHandle(watchdogThread);
		watchdogThread = NULL; // A repeated call must not close the handle twice.
	}
#endif
}

//////////////////////////////////////////////////////////////////////////////////
void
get_the_hell_out(std::string moreinfostring)
{
	using namespace files_paths; // contains output_all

	// Rosetta stayed the same over the period of a blink! Suspicious! Bring out the dog!
	std::cout << "**********************************************************************" << std::endl;
	std::cout << "Rosetta score is stuck or going too long. Watchdog is ending the run!" << std::endl;
	std::cout << moreinfostring << std::endl;
	std::cout << "**********************************************************************" << std::endl;

	std::cerr << "**********************************************************************" << std::endl;
	std::cerr << "Rosetta score is stuck or going too long. Watchdog is ending the run!" << std::endl;
	std::cerr << moreinfostring << std::endl;
	std::cerr << "**********************************************************************" << std::endl;

	//set_pose_flag(false);
	//store_low_info();
	//bool accepted = false;
	//output_all = true; // this flag combination will writeout a file with tag F_ instead of S_
	//output_decoy( accepted );

	// This is copied from main.cc... must be a better way to organize this code
	// so its not repeated.

	boinc_fraction_done(1);
	boinc_begin_critical_section();

	// change Rhiju's original logic a little bit. Instead of creating a blank
	// .out file, we will write a watchdog_failure line so that the validator
	// can take operators accordingly. This will avoid throwing a lot of client
	// errors on BOINC as a bland file can not be gzipped. -- chu
	std::string const fullname( pdb_out_path + code + protein_name + ".out" );
 	if ( runlevel_ns::runlevel == runlevel_ns::silent ) {
		utility::io::ozstream pdb_out_checkstream( fullname,
			std::ios_base::in|std::ios_base::out );
		utility::io::ozstream pdb_out_stream;
		if (!pdb_out_checkstream ) {
			pdb_out_checkstream.close();
			pdb_out_checkstream.clear();
			pdb_out_stream.open( fullname );
			if ( !pdb_out_stream ) {
				std::cout << "Open failed for file: " << fullname << std::endl;
				utility::exit( EXIT_FAILURE, __FILE__, __LINE__);
			}
		} else {
			pdb_out_checkstream.close();
			pdb_out_checkstream.clear();
			pdb_out_stream.open_append( fullname );
			if ( !pdb_out_stream ) {
				std::cout << " Append failed for file: " << fullname << std::endl;
				utility::exit( EXIT_FAILURE, __FILE__, __LINE__);
			}
		}
		pdb_out_stream << "watchdog_failure: " << moreinfostring << std::endl;
		pdb_out_stream.close();
		pdb_out_stream.clear();
	}

 	if ( get_output_silent_gz_flag() ) {
 		std::cout << "GZIP SILENT FILE: " << fullname << std::endl;
 		std::cerr << "GZIP SILENT FILE: " << fullname << std::endl;

		if (!utility::file::gzip( fullname, true )) {
#ifdef _WIN32
			// For some reason we have failed to compress an output file.  Make sure
			//   we get some diagnostic information out of it.  This will cause the exit
			//   code to be set to 0x80000003 and all available diagnostic information
			//   will be written to stderr.
			//
			// Returning an exit code of 0 without any data files still causes BOINC
			//   to fail the result.
			DebugBreak();
#endif
		}
 	}
 	if ( get_output_scorefile_gz_flag() ) {
 		std::string scorefile_name = get_scorefile_name();
 		std::cout << "GZIP SCORE FILE: " << scorefile_name << std::endl;
 		std::cerr << "GZIP SCORE FILE: " << scorefile_name << std::endl;
		if (!utility::file::gzip( scorefile_name, true )) {
#ifdef _WIN32
			// For some reason we have failed to compress an output file.  Make sure
			//   we get some diagnostic information out of it.  This will cause the exit
			//   code to be set to 0x80000003 and all available diagnostic information
			//   will be written to stderr.
			//
			// Returning an exit code of 0 without any data files still causes BOINC
			//   to fail the result.
			DebugBreak();
#endif
		}
 	}

	boinc_end_critical_section();
	boinc_finish(0); // process terminates right here, nothing else needed.

	return;
}


//////////////////////////////////////////////////////////////////////////////////
// Refresh the "fraction done" figure shown to the BOINC user.
//
// The fraction is cpu_time / (cpu_time + time_left), where time_left is the
// remainder of the user's preferred run time. time_left is never allowed to
// fall below MINIMUM_TIME_LEFT, so a job that is close to (or past) its
// preferred run time always appears to have about ten minutes to go.
//
// Called once per watchdog blink; the figure is pushed to BOINC only on every
// PCT_COMPLETE_UPDATE_TIME-th call.
void
update_pct_complete()
{
	using namespace watchdog_ns;
	using namespace boinc_project_prefs;

	// From now on the watchdog, not the checkpoint code in
	// boinc_rosetta_util.cc, owns the percent-complete value.
	boinc_params::pct_complete_handled_by_watchdog = true;

	double cpu_used;
	boinc_wu_cpu_time( cpu_used );

	// Hide any overrun: pretend there are always at least ten minutes left.
	double cpu_remaining = cpu_run_time - cpu_used;
	cpu_remaining = std::max( cpu_remaining, MINIMUM_TIME_LEFT );

	// Counts the calls to this function, starting at 1.
	static int call_tally = 1;
	bool const report_now = ( call_tally % PCT_COMPLETE_UPDATE_TIME == 0 );
	++call_tally;
	if ( !report_now ) return;

	boinc_params::pct_complete = cpu_used / ( cpu_used + cpu_remaining );
	boinc_fraction_done( boinc_params::pct_complete );
}

//////////////////////////////////////////////////////////////////////////////////
// Put the calling thread to sleep for watchdog_time seconds.
//
// A negative duration returns immediately: converted to the unsigned argument
// of Sleep()/sleep() it would otherwise turn into an enormous wait and
// effectively hang the watchdog.
//
// boinc_sleep in boinc/lib/util.C does much the same, but using it would mean
// setting up the include for that library.
void
watchdog_sleep(int const watchdog_time){
	if ( watchdog_time < 0 ) return;

#ifdef _WIN32
	// windows -- Sleep function takes milliseconds. Multiply as unsigned so a
	// large duration cannot overflow a signed int.
	Sleep( static_cast< DWORD >( watchdog_time ) * 1000 );
#else
	// mac/linux -- POSIX sleep() takes seconds.
	sleep( static_cast< unsigned int >( watchdog_time ) );
#endif
}

//////////////////////////////////////////////////////////////////////////////////
// Body of the watchdog thread. The argument is unused and the return value is
// always 0.
//
// While boinc_params::rosetta_is_running is set, the loop wakes up once per
// blink (WATCHDOG_BLINK_TIME) and refreshes the percent-complete figure. Every
// watchdog_time blinks (option "watchdog_time") it also runs a health check,
// unless BOINC reports the job as suspended:
//   * if the CPU time used exceeds cpu_run_timeout_factor (option
//     "cpu_run_timeout_factor") times the preferred cpu_run_time, the run is
//     ended;
//   * if more than a quarter of watchdog_time of CPU time has gone by since the
//     previous check and neither the score nor the trial counter has changed,
//     the run is considered stuck and is ended.
// Ending the run goes through get_the_hell_out().
void*
main_watchdog( void* )
{
	using namespace watchdog_ns;
	using namespace mc_global_track::mc_score;
	using namespace counters::monte_carlo_ints;
	using namespace boinc_project_prefs;

	float prev_score = 0.0;
	int prev_ntrials = 0;
	float cpu_run_timeout_factor;
	double current_cpu_time = 0.0;
	double prev_cpu_time = 0.0;
	std::stringstream moreinfo;

	intafteroption("watchdog_time", DEFAULT_WATCHDOG_TIME, watchdog_time);
	realafteroption("cpu_run_timeout_factor", DEFAULT_CPU_RUN_TIME_FACTOR, cpu_run_timeout_factor);

#ifdef _WIN32
//	diagnostics_set_thread_name("Watchdog");
#endif

	// The very first health check never happens before this many blinks.
	int const startup_time = 30;
	bool init = false;

	int count_blinks = 0;
	reread_boinc_project_prefs();
	// Monitor main and quit when it sends the signal ("watchdog_finish" turns off rosetta_is_running flag)
	while (boinc_params::rosetta_is_running){
		watchdog_sleep(WATCHDOG_BLINK_TIME);

		// Every time the watchdog blinks, update "percentage complete" so that
		// users know that they're making progress.
		update_pct_complete();

		count_blinks++;
		if (count_blinks < watchdog_time) continue;
		// Don't come in too early, allow for about 30 seconds of start up time.
		if (!init && count_blinks < startup_time) continue;

		count_blinks = 0;
		init = true;

		// Rosetta might be suspended or preempted, in which case don't do anything.
		// The status lives on the stack: allocating it with new on every check
		// (and never deleting it) leaked one BOINC_STATUS per check.
		BOINC_STATUS rosetta_status;
		boinc_get_status( &rosetta_status );
		//		std::cout << "GET STATUS: " << rosetta_status.no_heartbeat << " " << rosetta_status.suspended << " "  << rosetta_status.quit_request << std::endl;
		if  (!rosetta_status.suspended) {

			boinc_wu_cpu_time(current_cpu_time);

			//			std::cout << "WATCHDOG Time: " << current_cpu_time << " cpu_run_time "
			//					<< cpu_run_time << " ; Score: " << score
			//					<< "  Previous score: " << prev_score
			//					<< " Time since last score: " << current_cpu_time - prev_cpu_time << std::endl;

			// Pick up any change the user made to the preferred run time.
			reread_boinc_project_prefs();
			if (current_cpu_time > cpu_run_timeout_factor * cpu_run_time && cpu_run_time > 0) {
				// Are we taking too long?
				moreinfo << "CPU time: " << current_cpu_time << " seconds. Greater than " << cpu_run_timeout_factor <<
					"X preferred time: " << cpu_run_time << " seconds";
				get_the_hell_out(moreinfo.str());
				return 0;
			}

			// There may not have been a whole lot of computation since the last time we checked, if Rosetta
			//  was preempted or suspended in the meanwhile. In that case it won't be totally fair
			// to require Rosetta to have increased its score.
			if ((current_cpu_time - prev_cpu_time) > 0.25*watchdog_time) {
				// Exact comparison is intended: an unchanged score together with an
				// unchanged trial counter means nothing at all has happened.
				if ( (prev_score == score) && (prev_ntrials == ntrials) ){ // Are we stuck?
					moreinfo << "Stuck at score " << score << " for " << 0.25*watchdog_time << " seconds";
					get_the_hell_out(moreinfo.str());
					return 0;
				}
			}
			prev_score = score;
			prev_ntrials = ntrials;
			prev_cpu_time = current_cpu_time;
		}
	}

	std::cout << "Watchdog finished." << std::endl;
	return 0;
}

#endif
