#!/usr/bin/perl
# $Id: bipbip,v 1.44 2005/10/03 14:36:36 capitn Exp $

use oar_iolib;
use Sys::Hostname;
use oar_conflib qw(init_conf dump_conf get_conf is_conf);
use IO::Socket::INET;
use oar_Judas qw(oar_debug oar_warn oar_error);
use ping_checker qw(test_hosts);
use POSIX qw(:signal_h :sys_wait_h);

#essential : you become oar instead of the user
#UID=EUID
# (the real uid is set to the effective uid, so the commands started later
# run with the effective identity)
$< = $>;

# Command line :
#   arg1 --> id of the job to launch
#   arg2 --> optional; when it is given the main code does not launch
#            anything and only calls checkEndOfJob() with this value
my $Jid= shift;
my $OarExecReattachExitValue = shift;

#overwrite env variables because we execute this script suid
$ENV{PATH}="/bin:/usr/bin:/usr/local/bin";
# These variables change the behaviour of the shell started by system();
# they are inherited from the calling user, so remove them as well.
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{USER}="oar";
$ENV{USERNAME}="oar";
$ENV{LOGNAME}="oar";

#hang signals
# SIGINT, SIGQUIT and SIGUSR1 are blocked for the rest of the script; the
# previous mask is stored in $sigset.
my $sigset   = POSIX::SigSet->new;
my $blockset = POSIX::SigSet->new(SIGINT, SIGQUIT, SIGUSR1);
sigprocmask(SIG_BLOCK, $blockset, $sigset);

#get server conf
init_conf("oar.conf");
# address used by notifyAlmighty() to send its notifications
my $remote_host = get_conf("SERVER_HOSTNAME");
my $remote_port = get_conf("SERVER_PORT");

# host used for the ssh connection of the jobs of the "deploy" queue;
# falls back on SERVER_HOSTNAME when DEPLOY_HOSTNAME is not configured
my $deploy_hostname = get_conf("DEPLOY_HOSTNAME");
if (!defined($deploy_hostname)){
    $deploy_hostname = $remote_host;
}


# 1 (the default when DETACH_JOB is not configured) makes the main code use
# its "detach the oarexec" branch for non interactive jobs
my $detach_job = get_conf("DETACH_JOB");
if (!defined($detach_job)){
    $detach_job = 1;
}

# read a line on a socket, one character at a time
# arg1 --> socket
# arg2 --> timeout (in seconds, given to select() before each character)
# return a list (status, line) :
#   status --> 0 if the read times out, -1 if select() or sysread() fails,
#              a positive value otherwise (end of line or end of file)
#   line   --> characters read before the "\n" (which is not included);
#              always a defined string, "" when nothing was read
sub readSocketLine($$){
    my $sock = shift;
    my $timeout= shift;

    my $line = '';
    my $rin = '';
    vec($rin,fileno($sock),1) = 1;

    my $res = 1;
    while (1){
        # select() overwrites its bit mask, so work on a copy
        my $rinTmp;
        $res = select($rinTmp=$rin, undef, undef, $timeout);
        # 0 --> timeout, -1 --> select error
        last if ($res <= 0);

        my $char;
        my $nbRead = sysread($sock,$char,1);
        if (!defined($nbRead)){
            # read error : report it like a select error instead of
            # appending a stale character and looping again
            $res = -1;
            last;
        }
        # 0 byte read --> end of file
        last if ($nbRead == 0);
        last if ($char eq "\n");
        $line .= $char;
    }
    return($res,$line);
}

#Launch the oarexec command through ssh and wait for its end
# arg1 --> command
# arg2 --> host
# arg3 --> enable or not pseudo tty option for ssh (1 adds "-t")
# return the exit value of the ssh command :
#   - its exit code when it ends normally
#   - 128 + signal number when it is killed by a signal
#   - 255 when it can not be started at all
# The command line is given as ONE string to system(), so it is parsed by the
# local shell : the caller is responsible for the quoting of arg1.
sub launch($$$){
    my $usercmd=shift;
    my $host=shift;
    my $tty=shift;

    my $opt="";

    if($tty==1){
        $opt="-t";
    }
    my $cmd = "ssh -x $opt ".$host." $usercmd ";

    oar_debug("[bipbip $Jid] Launch command : $cmd\n");

    system($cmd);
    # keep both values now : the calls below may overwrite them
    my $status = $?;
    my $sysError = "$!";

    my $exit_value;
    if ($status == -1){
        # nothing was executed; "$? >> 8" would give a meaningless number
        oar_error("[bipbip $Jid] Cannot execute the launch command : $sysError\n");
        $exit_value = 255;
    }elsif (($status & 127) != 0){
        # death by signal : the high byte is 0 here, so it must not be read
        # as a successful exit code
        my $signal = $status & 127;
        oar_error("[bipbip $Jid] The launch command was killed by the signal $signal\n");
        $exit_value = 128 + $signal;
    }else{
        $exit_value = $status >> 8;
    }
    oar_debug("[bipbip $Jid] oarexec return code = $exit_value\n");

    return($exit_value);
}

# notify Almighty
# arg1 --> type of notification
# A TCP connection is opened on $remote_host:$remote_port and the type is
# written on it followed by "\n". When the connection fails, the error is
# only logged : the script goes on.
sub notifyAlmighty($){
    my $tag = shift;

    my @peer = (
        PeerAddr => $remote_host,
        PeerPort => $remote_port,
        Proto    => "tcp",
        Type     => SOCK_STREAM,
    );
    my $almighty = IO::Socket::INET->new(@peer);

    # no connection --> log and give up (no exit here)
    return oar_error("[bipbip $Jid] Couldn't connect executor $remote_host:$remote_port\n")
        unless (defined($almighty));

    print $almighty "$tag\n";
}

#my $error=0;
my $base;
my @hosts;

# Protect a remote command line from the local shell started by launch()
# arg1 --> string to protect
# return the string enclosed in single quotes; every single quote of the
# string is rewritten '\'' so the job fields can not close the quoting and
# be interpreted by the local shell
sub quoteForLocalShell{
    my $str = shift;
    $str =~ s/'/'\\''/g;
    return("'".$str."'");
}

$base = iolib::connect();
if (!defined($base)){
    oar_error("[bipbip $Jid] Can t connect to the data base\n");
    exit(1);
}

# fields of the job, in the order given by iolib::get_job_cmd_user
my ($user,$usercmd,$launchingDirectory,$weight,$queue) = iolib::get_job_cmd_user($base,$Jid);

@hosts = iolib::get_job_host_distinct($base,$Jid);

if(scalar(@hosts) == 0){
    oar_error("[bipbip $Jid] A Job is requested on NO HOSTS. This job must be already killed (exterminated) and come back from hell ???\n");
    exit 1;
}

# ssh goes on the first host of the job ...
my $hostToConnectViaSSH = $hosts[0];
#deploy part
# ... except for the deploy queue, which uses the configured deploy host
if ($queue eq "deploy"){
    $hostToConnectViaSSH = $deploy_hostname;
}
#deploy part

#check if we must reconnect an oarexec?
# When a second argument is given, nothing is launched : the value is used as
# the oarexec exit value and only the end of job processing is done.
if (defined($OarExecReattachExitValue)){
    iolib::disconnect($base);
    # the whole argument must be a number, not only a part of it
    if ($OarExecReattachExitValue =~ m/\A\d+\z/){
        checkEndOfJob($OarExecReattachExitValue);
        exit(0);
    }else{
        oar_error("[bipbip $Jid] Bad argument for bipbip : $OarExecReattachExitValue\n");
        exit(2);
    }
}

#record bipbip pid and host in data base (I'm bipbip :-) )
my $hostname = hostname;
my $hostpid="$hostname:$$";
iolib::set_job_bpid($base,$Jid,$hostpid);

oar_debug("[bipbip $Jid] JOB: $Jid User: $user Command: $usercmd ==> hosts :[@hosts]\n");


# check nodes
my @bad;

# csr changes
if ($queue eq 'csr') {
    oar_debug("[bipbip $Jid] Job is in the csr queue, skipping nodes check\n");
    @bad = ();
}else{
    oar_debug("[bipbip $Jid] Check nodes\n");
    @bad = test_hosts(@hosts);
}
# csr changes- end

if (scalar(@bad) > 0){
    # At least one node was reported by test_hosts : the job is put in the
    # Error state, an event is recorded with the bad nodes and Almighty is
    # notified.
    $base->do("LOCK TABLE jobs WRITE, jobState_log WRITE, nodes WRITE, processJobs WRITE, processJobs_log WRITE, fragJobs WRITE,nodeState_log WRITE");
    # NOTE(review): the result of get_job is not used below
    my $refHashJob = iolib::get_job($base,$Jid);
    iolib::decrease_weight($base,$Jid);
    iolib::delete_job_process($base,$Jid);
    oar_debug("[bipbip $Jid] COMMAND=$usercmd\n");
    iolib::set_job_state($base,$Jid,"Error");
    iolib::set_job_message($base,$Jid,"Blacklist job nodes");
    $base->do("UNLOCK TABLES");
    oar_error("[bipbip $Jid] /!\\ Sorry but some nodes are inaccessible\n");
    iolib::add_new_event_with_host($base,"PING_CHECKER_NODE_SUSPECTED",$Jid,"[bipbip] Ping_checker suspects  nodes for the job $Jid : @bad",\@bad);
    notifyAlmighty("ChState");
    exit(2);
}else{
    oar_debug("[bipbip $Jid] No bad nodes\n");
}

# end of verif


iolib::set_job_state($base,$Jid,"Running");

#CALL OAREXEC ON THE FIRST NODE
# The oarexec command line contains fields of the job ($usercmd,
# $launchingDirectory, ...), so it is always given to launch() through
# quoteForLocalShell() : the local shell passes it untouched to ssh.
# NOTE(review): the remote side still receives it as a single command
# string, as before.
# csr changes
if ($queue eq 'csr') {
    oar_debug("[bipbip $Jid] job is in the csr queue: not launching $usercmd on $hostToConnectViaSSH\n");
    exit(0);
# csr changes- end
}elsif($usercmd eq ""){
    # no command --> interactive job : ssh is started with a pseudo tty
    iolib::disconnect($base);
    oar_debug("[bipbip $Jid] execute an interactive command\n");
    my $error = launch(quoteForLocalShell("oarexec $Jid $user $launchingDirectory $weight 0 @hosts "),$hostToConnectViaSSH,1);
    checkEndOfJob($error);
}else{
    if ($detach_job == 1){
        oar_debug("[bipbip $Jid] execute a NON interactive command and detach the oarexec\n");
        my $error = launch(quoteForLocalShell(" oarexec $Jid $user $launchingDirectory $weight 1 @hosts -- $usercmd  "),$hostToConnectViaSSH,0);

        # bipbip does not follow the job any more : forget its pid
        iolib::set_job_bpid($base,$Jid,"");
        iolib::disconnect($base);

        if ($error == 0){
            oar_debug("[bipbip $Jid] Exit from bipbip normaly\n");
            exit(0);
        }else{
            oar_debug("[bipbip $Jid] Oarexec returned an early error --> check end of job\n");
            checkEndOfJob($error);
        }
    }else{
        iolib::disconnect($base);
        oar_debug("[bipbip $Jid] execute a NON interactive command WITHOUT detaching oarexec\n");
        my $error = launch(quoteForLocalShell(" oarexec $Jid $user $launchingDirectory $weight 0 @hosts -- $usercmd "),$hostToConnectViaSSH,0);
        checkEndOfJob($error);
    }
}


# Describe how a non zero exit value of oarexec must be reported
# arg1 --> exit value of oarexec
# return a hash ref :
#   msg    --> text used for the log, the events and the job message
#   debug  --> true if msg is logged with oar_debug instead of oar_warn
#   events --> array ref of the event types to record with msg
#   state  --> new state of the job
#   setmsg --> true if msg must be stored as the job message
#   notify --> true if "ChState" must be sent to Almighty
sub oarexecErrorReport{
    my $error = shift;

    my $tag = "[bipbip $Jid]";
    my $node = $hosts[0];
    my $cantChdir = "$tag Can't go into the working directory $launchingDirectory of the job on node $node";
    my $cantWriteOutput = "$tag Can't create .stdout and .stderr files in $launchingDirectory on the node $node";

    my %reports = (
        #Prologue error
        1  => { msg => "$tag error of oarexec prologue; the job $Jid is in Error and the node $node is Suspected",
                events => ["PROLOGUE_ERROR"],
                state => "Error", setmsg => 1, notify => 1 },
        #Epilogue error : the job itself is Terminated and keeps its message
        2  => { msg => "$tag error of oarexec epilogue; the node $node is Suspected; (jobId = $Jid)",
                events => ["EPILOGUE_ERROR"],
                state => "Terminated", setmsg => 0, notify => 1 },
        #Oarexec is killed by Leon normaly
        3  => { msg => "$tag oarexec of the job $Jid was killed by Leon",
                debug => 1, events => [],
                state => "Error", setmsg => 1, notify => 0 },
        #Oarexec was killed by Leon and epilogue of oarexec is in error
        4  => { msg => "$tag The job $Jid was killing by Leon and oarexec epilogue was in error",
                events => ["EPILOGUE_ERROR"],
                state => "Error", setmsg => 1, notify => 1 },
        #Oarexec is not able write in the node file
        5  => { msg => "$tag oarexec can t create the node file",
                events => ["CAN_NOT_WRITE_NODE_FILE"],
                state => "Error", setmsg => 1, notify => 1 },
        #Oarexec can not write its pid file
        6  => { msg => "$tag oarexec cannot write its pid file",
                events => ["CAN_NOT_WRITE_PID_FILE"],
                state => "Error", setmsg => 1, notify => 1 },
        #Can t get shell of user
        7  => { msg => "$tag Can't get shell of user $user, so I suspect node $node",
                events => ["USER_SHELL"],
                state => "Error", setmsg => 1, notify => 1 },
        #oarexecuser.sh can not go into working directory
        10 => { msg => $cantChdir,
                events => ["WORKING_DIRECTORY"],
                state => "Error", setmsg => 1, notify => 0 },
        #oarexecuser.sh can not write stdout and stderr files
        20 => { msg => $cantWriteOutput,
                events => ["OUTPUT_FILES"],
                state => "Error", setmsg => 1, notify => 0 },
        #oarexecuser.sh can not go into working directory and epilogue is in error
        12 => { msg => "$cantChdir AND epilogue is in error",
                events => ["WORKING_DIRECTORY","EPILOGUE_ERROR"],
                state => "Error", setmsg => 1, notify => 1 },
        #oarexecuser.sh can not write stdout and stderr files and epilogue is in error
        22 => { msg => "$cantWriteOutput AND epilogue is in error",
                events => ["OUTPUT_FILES","EPILOGUE_ERROR"],
                state => "Error", setmsg => 1, notify => 1 },
    );

    # "+ 0" : the key is the numeric value of the argument, like the "=="
    # comparisons it replaces
    my $report = $reports{$error + 0};
    if (!defined($report)){
        # every other exit value
        $report = { msg => "$tag error of oarexec, exit value = $error; the job $Jid is in Error and the node $node is Suspected",
                    events => ["EXIT_VALUE_OAREXEC"],
                    state => "Error", setmsg => 1, notify => 1 };
    }
    return($report);
}

# Record the end of the job in the data base
# arg1 --> exit value of oarexec (0 means that everything is ok)
# A new data base connection is opened (exit(1) if it fails). When the job is
# still in the Running state its finish date is set, its nodes are released
# and its state becomes Terminated or Error according to arg1; otherwise the
# data base is not modified. "BipBip" is sent to Almighty in both cases.
sub checkEndOfJob($){
    my $error = shift;

    my $base = iolib::connect();
    if (!$base){
        oar_debug("[bipbip $Jid] Can t extablish connection with DataBase WARNING finish date not fixed\n");
        oar_error("[bipbip $Jid] Can t connect to the database\n");
        exit(1);
    }

    $base->do("LOCK TABLE jobs WRITE, jobState_log WRITE, nodes WRITE, processJobs WRITE, processJobs_log WRITE, nodeState_log WRITE, event_log WRITE");
    my $refJob = iolib::get_job($base,$Jid);
    if ($refJob->{'state'} eq "Running"){
        oar_debug("[bipbip $Jid] Job $Jid is ended\n");
        iolib::set_finish_date($base,$Jid);
        oar_debug("[bipbip $Jid] Release nodes \n");
        iolib::decrease_weight($base,$Jid);
        iolib::delete_job_process($base,$Jid);
        if($error == 0){
            oar_debug("[bipbip $Jid] User Launch completed OK\n");
            iolib::set_job_state($base,$Jid,"Terminated");
            iolib::set_job_message($base,$Jid,"ALL is GOOD");
        }else{
            my $report = oarexecErrorReport($error);
            my $strWARN = $report->{msg};

            # same order for every exit value : log, events, state, message
            # and then the notification
            if ($report->{debug}){
                oar_debug("$strWARN\n");
            }else{
                oar_warn("$strWARN\n");
            }
            foreach my $eventType (@{$report->{events}}){
                iolib::add_new_event($base,$eventType,$Jid,"$strWARN");
            }
            iolib::set_job_state($base,$Jid,$report->{state});
            if ($report->{setmsg}){
                iolib::set_job_message($base,$Jid,"$strWARN");
            }
            if ($report->{notify}){
                notifyAlmighty("ChState");
            }
        }
    }else{
        oar_debug("[bipbip $Jid] I was previously killed or Terminated but I did not know that!!\n");
    }
    $base->do("UNLOCK TABLES");
    iolib::disconnect($base);


    notifyAlmighty("BipBip");
}

# Normal end of the script : reached when the top-level launch code called
# checkEndOfJob() and did not call exit() itself.
exit(0);

