#!/usr/bin/perl
#Almighty module : check walltimes and jobs to frag
use strict;
use DBI();
use Data::Dumper;
use oar_iolib;
use oar_Judas qw(oar_debug oar_warn oar_error);
use oar_conflib qw(init_conf dump_conf get_conf is_conf);
use IO::Socket::INET;
use oar_Tools;

# Load the delays used when a job has to be deleted.
# Both values start from the defaults provided by oar_Tools and are
# overridden by JOBDEL_SOFTWALLTIME / JOBDEL_WALLTIME when oar.conf
# defines them.
my $leonSoftWalltime = oar_Tools::getDefaultLeonSoftWalltime();
my $leonWalltime     = oar_Tools::getDefaultLeonWalltime();
init_conf("oar.conf");
$leonSoftWalltime = get_conf("JOBDEL_SOFTWALLTIME") if is_conf("JOBDEL_SOFTWALLTIME");
$leonWalltime     = get_conf("JOBDEL_WALLTIME")     if is_conf("JOBDEL_WALLTIME");

# The hard delay must be strictly greater than the soft one; otherwise push
# it one unit past the soft delay and warn about the adjustment.
if ($leonSoftWalltime >= $leonWalltime){
    $leonWalltime = $leonSoftWalltime + 1;
    oar_warn("[sarko] (JOBDEL_WALLTIME <= JOBDEL_SOFTWALLTIME) so I must set JOBDEL_WALLTIME to $leonWalltime\n");
}

oar_debug("[sarko] JOBDEL_SOFTWALLTIME = $leonSoftWalltime; JOBDEL_WALLTIME = $leonWalltime\n");

# Timeout used further down for Absent/Suspected nodes; it stays at 0 when
# DEAD_SWITCH_TIME is not configured, which disables that check.
my $dead_switch_time = 0;
$dead_switch_time = get_conf("DEAD_SWITCH_TIME") if is_conf("DEAD_SWITCH_TIME");

# Open the database connection; give up with exit code 1 when it fails
my $base = iolib::connect();
unless (defined $base){
    oar_error("[sarko] Can not connect to the database\n");
    exit(1);
}

oar_debug("[sarko] Hello, identity control !!!\n");

# Becomes 1 as soon as one job has to be (re)fragged or exterminated;
# it is the exit status of the script.
my $guilty_found = 0;

# Reference "now": the date returned by get_date, converted with sql_to_local
# so that it can be compared numerically with the other dates below.
my $current = iolib::get_date($base);
$current = iolib::sql_to_local($current);
oar_debug("[sarko] Current time : $current\n");

# Look at leon timers
# For every job returned by get_timered_job, decide if OAR must retry to
# delete the job or just change values in the database.
my @JobToFrag = iolib::get_timered_job($base);
foreach my $Jid (@JobToFrag){
    my $refJob = iolib::get_job($base,$Jid);
    if (($refJob->{'state'} eq "Terminated") || ($refJob->{'state'} eq "Error")){
        # The job is already finished: only record it as fragged
        iolib::job_fragged($base,$Jid);
        oar_debug("[sarko] I set to FRAGGED the job $Jid\n");
        next;
    }

    # The job is still there: compare the time elapsed since the frag
    # request with the soft and hard delays.
    my $fragDate = iolib::get_frag_date($base,$Jid);
    $fragDate = iolib::sql_to_local($fragDate);
    oar_debug("[sarko] frag date : $fragDate\n");
    if ($current > $fragDate+$leonWalltime){
        # Hard delay exceeded
        oar_debug("[sarko] Leon will EXTERMINATE bipbip of job $Jid\n");
        iolib::job_leon_exterminate($base,$Jid);
        $guilty_found=1;
    }elsif ($current > $fragDate+$leonSoftWalltime){
        # Only the soft delay is exceeded: ask for another frag
        oar_debug("[sarko] Leon will RE-FRAG bipbip of job $Jid\n");
        iolib::job_refrag($base,$Jid);
        $guilty_found=1;
    }else{
        oar_debug("[sarko] The leon timer is not yet expired for the job $Jid; I do nothing\n");
    }
}


# Look at job walltimes
# For each Running job:
#   - if start + maxTime is in the past, frag the job and log a WALLTIME event;
#   - otherwise, if the job has a positive checkpoint delay, has not been
#     auto-checkpointed yet and the remaining time is within that delay,
#     send SIGUSR2 to the job through oar_Tools::signalOarexec.
my @result = iolib::get_jobs_in_state($base, "Running");
foreach my $job (@result){
    # Starting time and maximum duration, converted to comparable numbers
    my $start = iolib::sql_to_local($job->{startTime});
    my $max = iolib::sql_to_duration($job->{maxTime});

    # No trailing newline on purpose: the line is completed by " (Elapsed)"
    # and closed by the oar_debug("\n") at the end of the iteration.
    oar_debug("[sarko] Job [$job->{idJob}] from $start with $max; current time=$current");
    if ($current > $start+$max){
        oar_debug(" (Elapsed)");
        $guilty_found=1;
        iolib::frag_job($base, $job->{idJob});
        iolib::add_new_event($base,"WALLTIME",$job->{idJob},"[sarko] Job [$job->{idJob}] from $start with $max; current time=$current (Elapsed)");
    }elsif (($job->{checkpoint} > 0) && ($job->{autoCheckpointed} eq "NO") && ($current >= ($start+$max-$job->{checkpoint}))){
        # OAR must notify the job to checkpoint itself
        oar_debug("[sarko] Send checkpoint signal to the job $job->{idJob}\n");
        # Retrieve node names used by the job; the signal goes to the first one
        my @hosts = iolib::get_job_host_distinct($base,$job->{idJob});
        iolib::add_new_event($base,"CHECKPOINT",$job->{idJob},"User oar (sarko) requested a checkpoint on the job $job->{idJob}");
        my $timeoutSSH = oar_Tools::getSSHTimeout();
        my $strComment;
        my @exitCodes;
        # Timeout the ssh command
        eval {
            # "local" restores the previous ALRM handler when the eval is
            # left, so the dying handler cannot fire later outside any eval.
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm($timeoutSSH);
            @exitCodes = oar_Tools::signalOarexec($hosts[0],$job->{idJob},"SIGUSR2");
            alarm(0);
        };
        # Copy $@ right away (it is a global), then make sure no alarm stays
        # armed when the eval was left by an error other than the timeout.
        my $evalError = $@;
        alarm(0);
        if ($evalError){
            if ($evalError eq "alarm\n"){
                $strComment = "[sarko] Cannot contact $hosts[0], operation timouted ($timeoutSSH s). So I cannot send checkpoint signal to the job $job->{idJob}";
            }else{
                $strComment = "[sarko] An unknown error occured during the sending of the checkpoint signal to the job $job->{idJob} on the host $hosts[0]";
            }
            oar_warn("$strComment\n");
            iolib::add_new_event($base,"CHECKPOINT_ERROR",$job->{idJob},$strComment);
        }elsif ($exitCodes[0] == 0){
            # First returned code is 0: remember that the job has been
            # notified so that the signal is not sent again on the next run.
            iolib::set_job_autoCheckpointed($base, $job->{idJob});
            $strComment = "[sarko] The job $job->{idJob} was notified to checkpoint itself (send SIGUSR2) on the node $hosts[0]";
            oar_debug("$strComment\n");
            iolib::add_new_event($base,"CHECKPOINT_SUCCESSFULL",$job->{idJob},$strComment);
        }else{
            $strComment = "[sarko] The kill command return a bad exit code (@exitCodes) for the job $job->{idJob} on the node $hosts[0]";
            oar_warn("$strComment\n");
            iolib::add_new_event($base,"CHECKPOINT_ERROR",$job->{idJob},$strComment);
        }
    }
    oar_debug("\n");
}



# Retrieve nodes with expiryDates in the past
# special for Desktop computing
my @nodes = iolib::get_expired_nodes($base);
# Plain emptiness test: "defined @array" is deprecated and is a compile-time
# fatal error since Perl 5.22.
if (@nodes) {
    # First set the next state of every expired node to Suspected
    foreach my $node (@nodes) {
        iolib::set_node_nextState($base, $node, 'Suspected');
    }
    # Then notify Almighty by sending "ChState" to the configured server.
    # NOTE(review): the dead-switch section of this script sends the same
    # message through oar_Tools::notifyAlmighty; consider unifying once the
    # failure behaviour of that helper is confirmed.
    my $remote_host = get_conf("SERVER_HOSTNAME");
    my $remote_port = get_conf("SERVER_PORT");
    my $socket = IO::Socket::INET->new( PeerAddr => $remote_host,
                                        PeerPort => $remote_port,
                                        Type => SOCK_STREAM,
                                        Proto => "tcp")
        or die("Couldn't connect executor $remote_host:$remote_port\n");
    print $socket "ChState\n";
    close $socket;
}

# Nodes that have been Absent or Suspected for longer than DEAD_SWITCH_TIME
# get "Dead" as next state. Skipped entirely when the timeout is 0.
if ($dead_switch_time > 0){
    my @stale_nodes = iolib::get_node_absent_suspected_nodes_for_a_timeout($base,$dead_switch_time);
    foreach my $node_name (@stale_nodes){
        iolib::set_node_nextState($base,$node_name,"Dead");
        iolib::update_node_nextFinaudDecision($base,$node_name,"YES");
        oar_debug("[Sarko] Set the next state of $node_name to Dead\n");
    }
    # Tell Almighty about the state changes only if at least one node was updated
    if (@stale_nodes){
        my $remote_host = get_conf("SERVER_HOSTNAME");
        my $remote_port = get_conf("SERVER_PORT");
        oar_Tools::notifyAlmighty($remote_host,$remote_port,"ChState");
    }
}

# Release the database connection before leaving
iolib::disconnect($base);

# Exit status: 1 when at least one job had to be (re)fragged or exterminated, 0 otherwise
exit($guilty_found);
